In [30]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
# Build a Series from a plain list. No index is passed, so pandas assigns
# the default integer labels.
obj = pd.Series(data=[4, 7, -5, 3])
obj


0    4
1    7
2   -5
3    3
dtype: int64

Series 的表现形式为：索引在左边，值在右边。由于我们没有指定索引，pandas 会自动创建一个 0 到 N-1（N 为数据长度）的整数索引

获取index属性

In [31]:
obj.index


RangeIndex(start=0, stop=4, step=1)

创建 Series 时为各个数据点指定索引标签

In [32]:
# Attach explicit labels: each value is paired positionally with one label.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('adbc'))
obj2

a    4
d    7
b   -5
c    3
dtype: int64

通过索引获取Series的单一或者一组值

In [33]:
obj2['d']

7

In [34]:
obj2[['a','b','c']]

a    4
b   -5
c    3
dtype: int64

根据布尔数组进行过滤，标量乘法，应用数学函数

In [35]:
obj2[obj2>0]

a    4
d    7
c    3
dtype: int64

In [36]:
obj2 * 2

a     8
d    14
b   -10
c     6
dtype: int64

Series 还可以看成一个定长的有序字典，因为它是索引值到数据值的一个映射

In [37]:
'b' in obj2


True

In [38]:
'f' in obj2


False

根据python 字典创建Series

In [39]:
# A dict converts directly to a Series: keys become the index labels and
# values become the data.
sdata = dict(Ohio=35000, Texas=71000)
obj3 = pd.Series(data=sdata)
obj3

Ohio     35000
Texas    71000
dtype: int64

传入字典默认index 就是原字典的键

In [40]:
# An explicit index selects and orders the dict entries; labels without a
# matching key ('California', 'Oregon') come out as NaN.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(data=sdata, index=states)
obj4


California        NaN
Ohio          35000.0
Oregon            NaN
Texas         71000.0
dtype: float64

由于 sdata 中没有 California 和 Oregon 这两个键，所以它们对应的值为 NaN
pandas的isnull 和 notnull 函数可以用于检测缺失数据


In [41]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon         True
Texas         False
dtype: bool

In [42]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon        False
Texas          True
dtype: bool

Series 也有类似方法

In [43]:
obj4.isnull()

California     True
Ohio          False
Oregon         True
Texas         False
dtype: bool

Series最重要的一个功能就是，它会根据运算的索引标签自动对齐数据

In [44]:
obj3


Ohio     35000
Texas    71000
dtype: int64

In [45]:
obj4

California        NaN
Ohio          35000.0
Oregon            NaN
Texas         71000.0
dtype: float64

In [46]:
obj3 + obj4


California         NaN
Ohio           70000.0
Oregon             NaN
Texas         142000.0
dtype: float64

Series对象本身以及其索引都有一个属性name

In [47]:
obj4.name = "population"
obj4.index.name = "state"
obj4

state
California        NaN
Ohio          35000.0
Oregon            NaN
Texas         71000.0
Name: population, dtype: float64

Series的索引可以通过赋值方式修改

In [48]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [49]:
obj.index = ['Bob','Tom','Mary','Lily']
obj

Bob     4
Tom     7
Mary   -5
Lily    3
dtype: int64

DataFrame
DataFrame是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型（数字、字符串、布尔值等）
DataFrame既有行索引也有列索引，它可以被看成Series字典

创建 DataFrame 的方法有很多，最常用的一种是直接传入一个由等长列表或 NumPy 数组组成的字典


In [50]:
# Column-oriented input: each key becomes a column and every list has the
# same length (six rows). pandas is already imported in the first cell, so
# the import is not repeated here.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)



In [51]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


指定列序列，则DataFrame的列就会按照指定顺序进行排列

In [52]:
pd.DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


如果传入的列在数据中找不到，就会在结果中产生缺失值（NaN）

In [53]:
# 'debt' is requested as a column but is not a key of `data`, so it is
# created empty; the rows get explicit string labels.
frame2 = pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)

In [54]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


DataFrame可以获取为一个Series

In [55]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

第二种获取Series写法

In [56]:
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

访问frame行

In [57]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [58]:
# Looking up a row label that does not exist raises KeyError. Catch it and
# print the message so this demonstration does not abort "Run All" and
# leave the cells below unexecuted.
try:
    frame2.loc['eight']
except KeyError as err:
    print("KeyError:", err)

KeyError: 'eight'

给整列赋值

In [None]:
frame2['debt']=16.5


In [None]:
frame2


将列表或数组赋值给某个列时候，其长度必须跟DataFrame的长度匹配，
如果赋值是一个Series就精确匹配到DataFrame 索引，其他会填上NaN

In [None]:
# Assigning a Series aligns on the row labels: only 'two', 'four' and
# 'five' receive a value from `val`.
val = pd.Series(data=[-1, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2


为不存在的列赋值会创建出一个新列，关键字 del 用于删除列

In [None]:
frame2['happy'] = frame2.state == 'Ohio'

In [None]:
# Show frame2 with the boolean 'happy' column added by the previous cell.
frame2

In [None]:
frame2

In [None]:
frame2['nice'] = frame2['pop'] * 2
frame2


In [None]:
# `del` removes exactly one column per statement. Writing
# `del frame2['nice', 'happy']` looks up the single tuple key
# ('nice', 'happy') and raises KeyError. Delete 'happy' here; to remove
# several columns in one step, use frame2.drop(columns=[...]) instead.
del frame2['happy']

In [None]:
del frame2['nice']

In [None]:
frame2


通过索引方式返回的列只是相应数据的视图，并不是副本；通过 Series 的 copy 方法可以显式地复制列

另一种常见的数据形式嵌套字典

In [None]:
# Nested mapping: the outer keys are state names, the inner keys are years.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}


如果将嵌套字典传给 DataFrame，pandas 会将外层字典的键解释为列，内层字典的键则作为行索引


In [59]:
frame3 = pd.DataFrame(pop)
frame3


NameError: name 'pop' is not defined

对DataFrame进行转置（交换行和列）

In [60]:
frame3.T

NameError: name 'frame3' is not defined

DataFrame的index 和column的name 属性 设置

In [61]:
frame3.index.name = "year"
frame3.columns.name = "state"

NameError: name 'frame3' is not defined

In [None]:
frame3


In [None]:
frame3.T

values属性会以二维ndarray 返回DataFrame的数据

In [None]:
frame3.values

5.2 基本功能

In [62]:
# Five-row sample table built from equal-length lists, one list per column.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 2,
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
frame = pd.DataFrame(data=data)

In [63]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [64]:
obj = Series([1,3,-5,5])
obj

0    1
1    3
2   -5
3    5
dtype: int64

In [65]:
obj.values

array([ 1,  3, -5,  5])

In [66]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [67]:
obj2 = Series([4,5,7,-3], index=['a','b','c','d'])
obj2

a    4
b    5
c    7
d   -3
dtype: int64

In [68]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [69]:
obj2['a']

4

In [70]:
obj2['d'] = 101

In [71]:
obj2

a      4
b      5
c      7
d    101
dtype: int64

In [72]:
obj2[obj2>10]

d    101
dtype: int64

In [73]:
obj2>100

a    False
b    False
c    False
d     True
dtype: bool

In [74]:
obj2 * 2


a      8
b     10
c     14
d    202
dtype: int64

In [75]:
'c' in obj2

True

In [76]:
8 in obj2

False

In [77]:
# Dict keys become the index labels of the resulting Series.
sdata = dict(Ohio=3500, Texas=71000, Oregon=16000)
obj3 = Series(data=sdata)

In [78]:
obj3

Ohio       3500
Texas     71000
Oregon    16000
dtype: int64

In [79]:
# Re-select the dict entries by an explicit label list; 'California' has no
# entry in `sdata` and shows up as NaN, while 'Oregon' is left out.
states = ['California', 'Texas', 'Ohio']
obj4 = Series(data=sdata, index=states)
obj4

California        NaN
Texas         71000.0
Ohio           3500.0
dtype: float64

In [80]:
pd.isnull(obj4)

California     True
Texas         False
Ohio          False
dtype: bool

In [81]:
obj4.isnull()

California     True
Texas         False
Ohio          False
dtype: bool

In [82]:
obj4 + obj3

California         NaN
Ohio            7000.0
Oregon             NaN
Texas         142000.0
dtype: float64

In [83]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Texas         71000.0
Ohio           3500.0
Name: population, dtype: float64

In [84]:
# Same five-row sample table, built with the directly imported DataFrame.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [85]:
DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [86]:
DataFrame(data, columns=['year', 'pop'])

Unnamed: 0,year,pop
0,2000,1.5
1,2001,1.7
2,2002,3.6
3,2001,2.4
4,2002,2.9


In [87]:
DataFrame(data, columns=['year', 'state', 'pop','debt'])

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,


In [88]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop','debt'])
frame2


Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,


In [89]:
frame['year']

0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64

In [90]:
frame2.index = ['one', 'two', 'three', 'four', 'five']

In [91]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [92]:
# Assign an array to the column; its length (5) matches the five rows of
# frame2. numpy is already imported in the first cell, so the import is not
# repeated here.
frame2['debt'] = np.arange(5)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [93]:
# Assigning a Series aligns on the row labels: 'one', 'three' and 'five'
# receive values, and the remaining rows become NaN.
val = Series(data=[-1.2, -1.5, -1.7], index=['one', 'three', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,-1.2
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,-1.5
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,-1.7


In [94]:
frame2['eastern'] = 0
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,-1.2,0
two,2001,Ohio,1.7,,0
three,2002,Ohio,3.6,-1.5,0
four,2001,Nevada,2.4,,0
five,2002,Nevada,2.9,-1.7,0


In [95]:
frame2['large'] = frame2['pop'] > 2
frame2

Unnamed: 0,year,state,pop,debt,eastern,large
one,2000,Ohio,1.5,-1.2,0,False
two,2001,Ohio,1.7,,0,False
three,2002,Ohio,3.6,-1.5,0,True
four,2001,Nevada,2.4,,0,True
five,2002,Nevada,2.9,-1.7,0,True


In [96]:
# Nested mapping: state name -> {year: value}.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)

In [97]:
# Float Series with labels deliberately out of alphabetical order.
obj = Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [98]:
obj2 = obj.reindex(['a', 'b', 'c', 'd','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [99]:
obj2 = obj.reindex(['a', 'b', 'c', 'd','e'], fill_value=0)

In [100]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [101]:
# String Series on a sparse integer index (0, 2, 4) with gaps at 1 and 3.
obj3 = Series(data=['blue', 'purple', 'yellow'], index=list(range(0, 6, 2)))
obj3

0      blue
2    purple
4    yellow
dtype: object

In [103]:
# Reindex onto 0..5 with forward fill: each new label takes the value of
# the nearest preceding existing label.
obj3.reindex(index=range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object