In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
# Build a Series from a plain list; no index is passed, so pandas assigns
# the default integer labels 0..N-1.
obj = pd.Series(data=[4, 7, -5, 3])
obj


0    4
1    7
2   -5
3    3
dtype: int64

Series 的表现形式为：索引在左边，值在右边。由于我们没有指定索引，pandas 会自动创建一个 0 到 N-1（N 为数据长度）的整数索引

获取index属性

In [2]:
obj.index


RangeIndex(start=0, stop=4, step=1)

为各个数据点指定标记索引

In [3]:
# Same values as before, but each data point gets an explicit string label.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('adbc'))
obj2

a    4
d    7
b   -5
c    3
dtype: int64

通过索引获取Series的单一或者一组值

In [4]:
obj2['d']

7

In [5]:
obj2[['a','b','c']]

a    4
b   -5
c    3
dtype: int64

根据布尔数组进行过滤，标量乘法，应用数学函数

In [6]:
obj2[obj2>0]

a    4
d    7
c    3
dtype: int64

In [7]:
obj2 * 2

a     8
d    14
b   -10
c     6
dtype: int64

Series 还可以看成一个定长的有序字典，因为它是索引值到数据值的一个映射

In [8]:
'b' in obj2


True

In [9]:
'f' in obj2


False

根据python 字典创建Series

In [10]:
# Build a Series straight from a dict: the dict keys become the index labels.
sdata = dict(Ohio=35000, Texas=71000)
obj3 = pd.Series(data=sdata)
obj3

Ohio     35000
Texas    71000
dtype: int64

传入字典默认index 就是原字典的键

In [11]:
# Passing an explicit index picks values out of sdata by label, in the given
# order; labels that sdata does not contain show up as NaN in the result.
states = [
    'California',
    'Ohio',
    'Oregon',
    'Texas',
]
obj4 = pd.Series(data=sdata, index=states)
obj4


California        NaN
Ohio          35000.0
Oregon            NaN
Texas         71000.0
dtype: float64

由于 sdata 中没有 California 和 Oregon 对应的值，所以它们的结果为 NaN
pandas的isnull 和 notnull 函数可以用于检测缺失数据


In [12]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon         True
Texas         False
dtype: bool

In [13]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon        False
Texas          True
dtype: bool

Series 也有类似方法

In [14]:
obj4.isnull()

California     True
Ohio          False
Oregon         True
Texas         False
dtype: bool

Series最重要的一个功能就是，它会根据运算的索引标签自动对齐数据

In [15]:
obj3


Ohio     35000
Texas    71000
dtype: int64

In [16]:
obj4

California        NaN
Ohio          35000.0
Oregon            NaN
Texas         71000.0
dtype: float64

In [17]:
obj3 + obj4


California         NaN
Ohio           70000.0
Oregon             NaN
Texas         142000.0
dtype: float64

Series对象本身以及其索引都有一个属性name

In [18]:
obj4.name = "population"
obj4.index.name = "state"
obj4

state
California        NaN
Ohio          35000.0
Oregon            NaN
Texas         71000.0
Name: population, dtype: float64

Series的索引可以通过赋值方式修改

In [19]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [20]:
obj.index = ['Bob','Tom','Mary','Lily']
obj

Bob     4
Tom     7
Mary   -5
Lily    3
dtype: int64

DataFrame
DataFrame是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型（数字、字符串、布尔值等）
DataFrame既有行索引也有列索引，它可以被看成Series字典

最常用的一种创建方式是直接传入一个由等长列表或NumPy数组组成的字典


In [21]:
# Column-oriented input: every key is a column name and every value is an
# equal-length list holding that column's data.
# Uses `pd` from the import cell at the top of the notebook; imports are kept
# in that single cell rather than repeated here.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)



In [22]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


指定列序列，则DataFrame的列就会按照指定顺序进行排列

In [23]:
pd.DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


如果传入的列在数据中找不到，就会在结果中产生缺失值（NaN）

In [24]:
frame2 = pd.DataFrame(data,columns=['year','state','pop','debt'],
                      index=['one','two','three','four','five','six'])

In [25]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


DataFrame的列可以通过类似字典标记的方式获取为一个Series

In [26]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

第二种获取Series写法

In [27]:
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

访问frame行

In [28]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [29]:
# Looking up a row label that is not in the index raises KeyError. Catch it
# and print the message so the demonstration is visible but the notebook
# still runs top to bottom without stopping at this cell.
try:
    frame2.loc['eight']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'eight'

给整列赋值

In [None]:
frame2['debt']=16.5


In [None]:
frame2


将列表或数组赋值给某个列时候，其长度必须跟DataFrame的长度匹配，
如果赋值是一个Series就精确匹配到DataFrame 索引，其他会填上NaN

In [None]:
val = pd.Series([-1,-1.5,-1.7],index=['two','four','five'])
frame2['debt'] = val
frame2


为不存在的列赋值会创建出一个新列，关键字 del 用于删除列

In [None]:
frame2['happy'] = frame2.state == 'Ohio'

In [None]:
# Display frame2 to confirm the boolean 'happy' column was added.
# (The name `frames` is never defined in this notebook and raised NameError.)
frame2

In [None]:
frame2

In [None]:
frame2['nice'] = frame2['pop'] * 2
frame2


In [None]:
# `del` removes one column per statement. A key written as 'nice','happy' is
# treated as a single tuple label, which does not exist and raises KeyError.
# Remove 'happy' here; 'nice' is removed by the next cell.
del frame2['happy']

In [None]:
del frame2['nice']

In [None]:
frame2


通过索引方式返回的列只是相应数据的视图，并不是副本；通过Series的copy方法即可显式地复制列

另一种常见的数据形式嵌套字典

In [None]:
# Nested dict: outer keys are state names, inner keys are years mapped to
# that state's value for the year.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}


如果将嵌套字典传给DataFrame，pandas会将外层字典的键解释为列，内层字典的键则作为行索引


In [None]:
frame3 = pd.DataFrame(pop)
frame3


对DataFrame进行转置（交换行和列）

In [None]:
frame3.T

设置DataFrame的index和columns的name属性

In [None]:
frame3.index.name = "year"
frame3.columns.name = "state"

In [None]:
frame3


In [None]:
frame3.T

values属性会以二维ndarray 返回DataFrame的数据

In [None]:
frame3.values

5.2 基本功能

In [None]:
# Five-row sample data set: one equal-length list per column.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
frame = pd.DataFrame(data=data)

In [None]:
frame

In [None]:
obj = Series([1,3,-5,5])
obj

In [None]:
obj.values

In [None]:
obj.index

In [None]:
obj2 = Series([4,5,7,-3], index=['a','b','c','d'])
obj2

In [None]:
obj2.index

In [None]:
obj2['a']

In [None]:
obj2['d'] = 101

In [None]:
obj2

In [None]:
obj2[obj2>10]

In [None]:
obj2>100

In [None]:
obj2 * 2


In [None]:
'c' in obj2

In [None]:
8 in obj2

In [None]:
sdata = {'Ohio':3500,'Texas':71000,'Oregon':16000}
obj3 = Series(sdata)

In [None]:
obj3

In [None]:
states = ['California', 'Texas','Ohio']
obj4 = Series(sdata, index=states)
obj4

In [None]:
pd.isnull(obj4)

In [None]:
obj4.isnull()

In [None]:
obj4 + obj3

In [None]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

In [None]:
data = {'state':['Ohio', 'Ohio','Ohio','Nevada','Nevada'],
        'year':[2000,2001,2002,2001,2002],
        'pop':[1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(data)
frame

In [None]:
DataFrame(data, columns=['year', 'state', 'pop'])

In [None]:
DataFrame(data, columns=['year', 'pop'])

In [None]:
DataFrame(data, columns=['year', 'state', 'pop','debt'])

In [None]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop','debt'])
frame2


In [None]:
frame['year']

In [None]:
frame2.index = ['one', 'two', 'three', 'four', 'five']

In [None]:
frame2

In [None]:
# Assign a NumPy array to the 'debt' column; the array has 5 elements, one
# per row of frame2. `np` comes from the import cell at the top of the
# notebook, so it is not imported again here.
frame2['debt'] = np.arange(5)
frame2

In [None]:
val = Series([-1.2, -1.5, -1.7], index=['one','three','five'])
frame2['debt'] = val
frame2

In [None]:
frame2['eastern'] = 0
frame2

In [None]:
frame2['large'] = frame2['pop'] > 2
frame2

In [None]:
pop = {'Nevada':{2001:2.4, 2002:2.9},'Ohio':{2000:1.5, 2001:1.7,2002:3.6}}

In [30]:
# Float Series whose string labels are deliberately not in sorted order.
obj = Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [31]:
# reindex returns a new Series arranged by the requested labels; 'e' has no
# value in obj, so it comes back as NaN.
obj2 = obj.reindex(index=list('abcde'))
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [32]:
obj2 = obj.reindex(['a', 'b', 'c', 'd','e'], fill_value=0)

In [33]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [34]:
# String Series labelled with the even integers 0, 2, 4, leaving gaps at 1, 3, 5.
obj3 = Series(
    data=['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)
obj3

0      blue
2    purple
4    yellow
dtype: object

In [35]:
# Reindex onto 0..5 and forward-fill: each missing label takes the value of
# the closest preceding label that has one.
obj3.reindex(index=range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object