In [37]:
import numpy as np
import pandas as pd

# 1. Pandas对象

## 1.1 Series

把Series当作NumPy数组来用

In [38]:
# Wrap a plain list of floats in a Series; no index is supplied here,
# so pandas attaches its default integer index.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

数据绑定了一组索引

In [39]:
# The underlying data of the Series, exposed as a NumPy array.
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [40]:
# The index object bound to the data; a RangeIndex here because no explicit index was given.
data.index

RangeIndex(start=0, stop=4, step=1)

把Series当作字典来用

In [41]:
# Same constructor, now with explicit string labels instead of the default integer index.
data = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
data

a    1
b    2
c    3
d    4
dtype: int64

In [42]:
# Dictionary-style lookup: fetch the value stored under the label 'c'.
data['c']

3

In [43]:
# Build a Series straight from a dict: the keys (state names) become the index labels
# and the values become the data.
population = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135
})
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

如果索引是object类型（例如字符串标签），也可以直接用整数下标按位置取值（如`data[1]`），不过这种写法在较新版本的pandas（2.1起）中已被弃用。
因此按位置取值时，建议一律使用`.iloc[pos]`方法。

In [44]:
# Explicit positional access: .iloc[1] is the second element, whatever the labels are.
data.iloc[1]

2

创建Series的几种方法

In [45]:
# Passing an existing Series together with an explicit index selects its entries by label,
# in the order the labels are listed (note 'c' comes before 'b' here).
index = ['a', 'c', 'b', 'd']
pd.Series(data, index=index)

a    1
c    3
b    2
d    4
dtype: int64

In [46]:
# A scalar value is repeated for every label in the given index.
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [47]:
# From a dict: keys become the index labels, values become the data.
pd.Series({
    'a': 1,
    'b': 2,
    'c': 3
})

a    1
b    2
c    3
dtype: int64

## 1.2 DataFrame

DataFrame可以看成是既可以行索引，也可以列索引的二维numpy数组

In [48]:
# Area figure per state, keyed by state name, then wrapped in a Series
# so the state names become the index labels.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)

In [49]:
# Build a DataFrame from a dict of Series: each dict key becomes a column name,
# and the rows are labelled by the state names the two Series are indexed by.
states = pd.DataFrame({
    'population': population,
    'area': area
})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


DataFrame也可以看成字典

如何创建DataFrame对象

In [50]:
# Ways to construct a DataFrame:
# (1) From a Series object
# (2) From a list of dicts
# (3) From a dict of Series objects (as done for `states` above)
# (4) From a two-dimensional NumPy array
# (5) From a NumPy structured array

## 1.3 Index对象

In [51]:
# Build an Index object directly from a list of integers.
ind = pd.Index([2, 3, 5, 7, 11])
ind

Index([2, 3, 5, 7, 11], dtype='int64')

看做不可变数组

In [52]:
# Array-style positional indexing: the element at position 1.
ind[1]

3

In [53]:
# Slicing with a step of 2 yields another Index holding every second element.
ind[::2]

Index([2, 5, 11], dtype='int64')

In [54]:
# Index exposes NumPy-array-like attributes: element count, shape, dimensionality and dtype.
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


看成有序集合

In [55]:
# Two integer indexes with partly overlapping values, used for the set operations below.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([0, 2, 3, 5, 7, 11])

In [56]:
# Bitwise AND of two plain Python ints (0b01 & 0b10 -> 0). This is not an Index set operation;
# the intersection of indA and indB is computed with .intersection() instead.
1 & 2

0

In [57]:
# Set intersection: the values present in both indA and indB.
indA.intersection(indB)

Index([3, 5, 7], dtype='int64')

In [58]:
# Set union: every value that appears in indA or indB.
indA.union(indB)

Index([0, 1, 2, 3, 5, 7, 9, 11], dtype='int64')

# 2. 数据取值与选择

## 2.1 Series数据选择方法

In [59]:
# Rebinds `data` to a float Series with string labels for the selection examples in this section.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

## 2.2 DataFrame数据选择方法

# 合并与连接

一对一连接

In [60]:
# Two frames that share an `employee` column holding the same four names,
# listed in a different row order in each frame.
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']
})

df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire_date': [2004, 2008, 2012, 2014]
})

In [61]:
# Show both input frames as plain text, one after the other.
print(df1, df2, sep='\n')

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014


`pd.merge`会自动寻找两个DataFrame中同名的列，并以该列作为键进行连接

In [62]:
# No key is specified: merge joins on the column name the two frames have in common (`employee`).
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


多对一连接

In [63]:
# Many-to-one join: each `group` appears once in df4 but may appear several times in df3,
# so the matching supervisor is repeated for every employee of that group.
df4 = pd.DataFrame({
    'group': ['Accounting', 'Engineering', 'HR'],
    'supervisor': ['Carly', 'Guido', 'Steve']
})
print(df3)
print(df4)
print(pd.merge(df3, df4))

  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve
  employee        group  hire_date supervisor
0      Bob   Accounting       2008      Carly
1     Jake  Engineering       2012      Guido
2     Lisa  Engineering       2004      Guido
3      Sue           HR       2014      Steve


多对多连接

In [64]:
# Many-to-many join: `group` values repeat in both df1 and df5, so every employee row
# is paired with every skill row of the same group.
df5 = pd.DataFrame({
    'group': ['Accounting', 'Accounting', 'Engineering', 'Engineering', 'HR', 'HR'],
    'skills': ['math', 'spreadsheets', 'coding', 'linux', 'spreadsheets', 'organization']
})
print(df5)
print(pd.merge(df1, df5))

         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR  organization
  employee        group        skills
0      Bob   Accounting          math
1      Bob   Accounting  spreadsheets
2     Jake  Engineering        coding
3     Jake  Engineering         linux
4     Lisa  Engineering        coding
5     Lisa  Engineering         linux
6      Sue           HR  spreadsheets
7      Sue           HR  organization


设置数据合并的键

In [65]:
# Display df1 again for reference before choosing merge keys explicitly.
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [66]:
# Display df2 again for reference before choosing merge keys explicitly.
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [67]:
# Name the key column explicitly with `on` instead of relying on automatic detection.
pd.merge(df1, df2, on='employee')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [68]:
# A frame whose key column is called `name` rather than `employee`.
# NOTE: this rebinds `df3`, which earlier held the result of merging df1 and df2;
# cells that expect that merge result must be run before this one.
df3 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'salary': [70000, 80000, 120000, 90000]
})
df3

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000


In [69]:
# The key column is named differently in each frame, so name both sides explicitly,
# then discard the redundant `name` column that the join carries over from df3.
(
    df1
    .merge(df3, left_on='employee', right_on='name')
    .drop(columns='name')
)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


In [70]:
# Copies of df1 and df2 with the `employee` column moved into the index.
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')

连接的集合操作规则

重复列名：suffixes参数

# 向量化字符串操作

In [73]:
data = ['peter', 'Paul', 'MARY', 'gUIDO', None]
# A plain comprehension calls .capitalize() on every element, including the missing value (None),
# which raises AttributeError. That failure is the point of this cell, so it is caught and
# printed instead of being left to abort a "Restart Kernel -> Run All".
try:
    [s.capitalize() for s in data]
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [76]:
# The vectorized .str accessor applies capitalize() element by element and
# passes the missing value (None) through instead of raising.
names = pd.Series(data)
names.str.capitalize()

0    Peter
1     Paul
2     Mary
3    Guido
4     None
dtype: object