## pandas的基本功能

In [2]:
from pandas import Series,DataFrame
import numpy as np
import pandas as pd

In [3]:
obj=Series([4.5,7.2,-5.3,3.6],index=['d','b','c','a'])
obj

d    4.5
b    7.2
c   -5.3
a    3.6
dtype: float64

### 1 重新索引
使用reindex根据新的索引进行排序
#### 若某个索引值不存在，就引入缺失值

In [4]:
obj2=obj.reindex(['a','b','c','d','e'])
obj2

a    3.6
b    7.2
c   -5.3
d    4.5
e    NaN
dtype: float64

In [5]:
# 对不存在的值设置缺失值
obj.reindex(['a','b','c','d','e'],fill_value=0)

a    3.6
b    7.2
c   -5.3
d    4.5
e    0.0
dtype: float64

常用的差值处理：  

使用method函数


In [6]:
obj3=Series(['blue','red','yellow'],index=[0,2,4])
obj3
# ffill 前向值填充
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2       red
3       red
4    yellow
5    yellow
dtype: object

### 
#### 

In [7]:
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['ohio','texas','california'])
frame

Unnamed: 0,ohio,texas,california
a,0,1,2
c,3,4,5
d,6,7,8


修改列索行

In [8]:
frame2=frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,ohio,texas,california
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


使用columns关键字重新索引列

In [9]:
states=['texas','utah','california']
frame.reindex(columns=states)

Unnamed: 0,texas,utah,california
a,1,,2
c,4,,5
d,7,,8


同时对行和列进行重新索引

In [10]:
print(frame)
frame.reindex(index=['a','b','c','d'],method='ffill')
#frame.reindex(index=['a','b','c','d'],method='ffill',columns=['texas','utah','california'])
#frame.reindex(index=['a','b','c'],method='ffill',columns=states)
#frame
# 出现错误如下：ValueError: index must be monotonic increasing or decreasing

   ohio  texas  california
a     0      1           2
c     3      4           5
d     6      7           8


Unnamed: 0,ohio,texas,california
a,0,1,2
b,0,1,2
c,3,4,5
d,6,7,8


*可以使用loc替代ix*

In [11]:
frame.ix[['a','b','c','d'],states]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,texas,utah,california
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


### 2 丢弃指定轴上的项

In [12]:
# 当使用np.arange(5.0)即生成的ndarray类型是float类型
obj=Series(np.arange(5.),index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [13]:
new_obj=obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [14]:
# 注意创建dataframe时，若指定index或columns需要与实际行和列总数一致
data=DataFrame(np.arange(16).reshape((4,4)),
              index=['beijing','shanghai','guangzhou','qingdao'],
              columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
beijing,0,1,2,3
shanghai,4,5,6,7
guangzhou,8,9,10,11
qingdao,12,13,14,15


In [15]:
data.drop(['beijing','shanghai'])

Unnamed: 0,one,two,three,four
guangzhou,8,9,10,11
qingdao,12,13,14,15


In [16]:
# axis何用？
data.drop('two',axis=1)

Unnamed: 0,one,three,four
beijing,0,2,3
shanghai,4,6,7
guangzhou,8,10,11
qingdao,12,14,15


### 3 索引、选取和过滤
#### 1 Series索引

In [17]:
obj=Series(np.arange(4.),index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [18]:
obj['b']

1.0

In [19]:
obj[1]

1.0

In [20]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [21]:
obj[['b','c','d']]

b    1.0
c    2.0
d    3.0
dtype: float64

In [22]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [23]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

除了以上的方式外，还可利用标签的切片运算  

>标签的切片运算与python切片运算不同，末端是包含的

In [24]:
# 索引从b-c
obj['b':'c']

b    1.0
c    2.0
dtype: float64

赋值

In [25]:
obj['b':'d']=666
obj

a      0.0
b    666.0
c    666.0
d    666.0
dtype: float64

#### DataFrame索引

In [26]:
data=DataFrame(np.arange(16).reshape((4,4)),
               index=['beijing','shanghai','guangzhou','sanya'],
              columns=['a','b','c','d'])
data

Unnamed: 0,a,b,c,d
beijing,0,1,2,3
shanghai,4,5,6,7
guangzhou,8,9,10,11
sanya,12,13,14,15


In [27]:
data['b']

beijing       1
shanghai      5
guangzhou     9
sanya        13
Name: b, dtype: int64

In [28]:
data[['c','d']]

Unnamed: 0,c,d
beijing,2,3
shanghai,6,7
guangzhou,10,11
sanya,14,15


*除了以上的索引方式外，还支持切片或布尔型数组选取行*

In [29]:
data[:2]

Unnamed: 0,a,b,c,d
beijing,0,1,2,3
shanghai,4,5,6,7


In [30]:
data[data['c']>5]

Unnamed: 0,a,b,c,d
shanghai,4,5,6,7
guangzhou,8,9,10,11
sanya,12,13,14,15


In [31]:
data<5

Unnamed: 0,a,b,c,d
beijing,True,True,True,True
shanghai,True,False,False,False
guangzhou,False,False,False,False
sanya,False,False,False,False


In [32]:
data[data<5]=0
data

Unnamed: 0,a,b,c,d
beijing,0,0,0,0
shanghai,0,5,6,7
guangzhou,8,9,10,11
sanya,12,13,14,15


*在dataframe的行上进行标签索引，使用ix  
 可以使用loc替代ix
*

In [33]:
data.loc['beijing',['a','b']]

a    0
b    0
Name: beijing, dtype: int64

In [34]:
data.ix['beijing',[3,0,1]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


d    0
a    0
b    0
Name: beijing, dtype: int64

In [35]:
data

Unnamed: 0,a,b,c,d
beijing,0,0,0,0
shanghai,0,5,6,7
guangzhou,8,9,10,11
sanya,12,13,14,15


In [36]:
# 第3行（下表为2的行）数据
data.ix[2]

a     8
b     9
c    10
d    11
Name: guangzhou, dtype: int64

In [37]:
data.ix[:'guangzhou','b']

beijing      0
shanghai     5
guangzhou    9
Name: b, dtype: int64

In [38]:
data.ix[data.c>8]

Unnamed: 0,a,b,c,d
guangzhou,8,9,10,11
sanya,12,13,14,15


In [39]:
data.ix[data.c>8,:3]

Unnamed: 0,a,b,c
guangzhou,8,9,10
sanya,12,13,14


#### 4 算数运算和数据对齐

*对不同的索引对象进行算数运算时，若存在不同的索引集，结果的索引集是索引对的并集*

##### 4.1 使用在Series上

In [40]:
s1=Series([7.,6.,-2.,1.5],index=['a','b','c','d'])
s1

a    7.0
b    6.0
c   -2.0
d    1.5
dtype: float64

In [41]:
s2=Series([5.2,6.2,-10.2,8.2],index=['a','b','e','f'])
s2

a     5.2
b     6.2
e   -10.2
f     8.2
dtype: float64

In [42]:
s1+s2

a    12.2
b    12.2
c     NaN
d     NaN
e     NaN
f     NaN
dtype: float64

##### 4.2 对于DataFrame的对齐操作

In [43]:
df1=DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['beijing','shanghai','tianjin'])
df2=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['qingdao','beijing','shanghai','nanning'])

In [44]:
df1

Unnamed: 0,b,c,d
beijing,0.0,1.0,2.0
shanghai,3.0,4.0,5.0
tianjin,6.0,7.0,8.0


In [45]:
df2

Unnamed: 0,b,d,e
qingdao,0.0,1.0,2.0
beijing,3.0,4.0,5.0
shanghai,6.0,7.0,8.0
nanning,9.0,10.0,11.0


In [46]:
df1+df2

Unnamed: 0,b,c,d,e
beijing,3.0,,6.0,
nanning,,,,
qingdao,,,,
shanghai,9.0,,12.0,
tianjin,,,,


##### 4.3 在算数方法中填充值

*默认，若两个dataframe之间相加时，若没有重叠的位置回产生NA值*

In [47]:
df1=DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['beijing','shanghai','tianjin'])
df2=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['qingdao','beijing','shanghai','nanning'])

In [48]:
df1

Unnamed: 0,b,c,d
beijing,0.0,1.0,2.0
shanghai,3.0,4.0,5.0
tianjin,6.0,7.0,8.0


In [49]:
df2

Unnamed: 0,b,d,e
qingdao,0.0,1.0,2.0
beijing,3.0,4.0,5.0
shanghai,6.0,7.0,8.0
nanning,9.0,10.0,11.0


In [50]:
df1+df2

Unnamed: 0,b,c,d,e
beijing,3.0,,6.0,
nanning,,,,
qingdao,,,,
shanghai,9.0,,12.0,
tianjin,,,,


*两个对象中某个轴标签在另一个对象中找不到时填充一个特殊值*

In [51]:
df1.add(df2,fill_value=0)

Unnamed: 0,b,c,d,e
beijing,3.0,1.0,6.0,5.0
nanning,9.0,,10.0,11.0
qingdao,0.0,,1.0,2.0
shanghai,9.0,4.0,12.0,8.0
tianjin,6.0,7.0,8.0,


In [52]:
df2.columns

Index(['b', 'd', 'e'], dtype='object')

*对于df1，只取df2的列索引*

In [53]:
df1.reindex(columns=df2.columns,fill_value=0)

Unnamed: 0,b,d,e
beijing,0.0,2.0,0
shanghai,3.0,5.0,0
tianjin,6.0,8.0,0


#### 5. DataFrame和Series之间的运算

##### 二维数组与其某行之间的差
### 广播

In [54]:
arr=np.arange(12.).reshape((3,4))
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [55]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [56]:
arr-arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

### 缺少部分代码（详见书中p137）

### 6、函数应用和映射

In [57]:
frame=DataFrame(np.random.randn(4,3),columns=list('bcd'),index=['a1','b2','c3','d4'])
frame

Unnamed: 0,b,c,d
a1,-0.577682,0.687606,0.49176
b2,-1.263236,0.884682,0.351244
c3,0.853563,0.031112,-0.028329
d4,-0.291853,-0.338898,1.196752


In [58]:
# 取绝对值
np.abs(frame)

Unnamed: 0,b,c,d
a1,0.577682,0.687606,0.49176
b2,1.263236,0.884682,0.351244
c3,0.853563,0.031112,0.028329
d4,0.291853,0.338898,1.196752


#### 6.1 通过广播返回一个值

In [59]:
# 将函数应用到每一行或列形成的一维数组上
# 此处是广播
f=lambda x:x.max()-x.min()
f

<function __main__.<lambda>>

In [60]:
frame.apply(f)

b    2.116799
c    1.223580
d    1.225081
dtype: float64

In [61]:
frame.apply(f,axis=1)

a1    1.265288
b2    2.147918
c3    0.881892
d4    1.535650
dtype: float64

#### 6.2 使用apply返回series
相当于b，c，d是三个series

In [62]:
def f(x):
    return Series([x.min(),x.max()],index=['min','max'])
frame.apply(f)

Unnamed: 0,b,c,d
min,-1.263236,-0.338898,-0.028329
max,0.853563,0.884682,1.196752


#### 6.3 applymap与map

In [64]:
# 取两位小数
format=lambda x:'%.2f'%x
frame.applymap(format)

Unnamed: 0,b,c,d
a1,-0.58,0.69,0.49
b2,-1.26,0.88,0.35
c3,0.85,0.03,-0.03
d4,-0.29,-0.34,1.2


### 7 排序和排名

In [65]:
obj=Series(range(4),index=['d','b','c','a'])
obj

d    0
b    1
c    2
a    3
dtype: int64

#### 7.1 对索引进行排序

（1）Series根据索引进行排序

In [66]:

obj.sort_index()

a    3
b    1
c    2
d    0
dtype: int64

（2）DataFrame根据任意一个轴的索引进行排序

In [69]:
frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','c','b','a'])
frame

Unnamed: 0,d,c,b,a
three,0,1,2,3
one,4,5,6,7


In [74]:
#根据行索引进行排序
frame.sort_index()

Unnamed: 0,d,c,b,a
one,4,5,6,7
three,0,1,2,3


In [75]:
#根据列索引进行排序
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,3,2,1,0
one,7,6,5,4


In [72]:
frame.sort_index().sort_index(axis=1)

Unnamed: 0,a,b,c,d
one,7,6,5,4
three,3,2,1,0


默认排序是生序排序，改为降序排序

In [76]:
frame.sort_index(axis=1,ascending=False)

Unnamed: 0,d,c,b,a
three,0,1,2,3
one,4,5,6,7


#### 7.2 以上使用的是对索引进行排序，也可对值进行排序

（1）Series

In [78]:
obj=Series([4,7,-3,2])
obj.order()

AttributeError: 'Series' object has no attribute 'order'

In [80]:
obj=Series([4,np.nan,7,np.nan,-3,2])
obj.order()

AttributeError: 'Series' object has no attribute 'order'

(2)DataFrame

In [82]:
frame=DataFrame({'b':[4,7,-3,2],'a':[0,1,0,-1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,-1,2


##### 对某一个列进行排序

In [83]:
frame.sort_index(by='b')

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
2,0,-3
3,-1,2
0,0,4
1,1,7


In [84]:
# 若给一个不存在的列名，则会报错
frame.sort_index(by=['a','b','c'])

  """Entry point for launching an IPython kernel.


KeyError: 'c'

In [85]:
frame.sort_index(by=['a','b'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
3,-1,2
2,0,-3
0,0,4
1,1,7


#### 7.3 排名（ranking）
排名会增设一个排名值（从1开始）

In [87]:
# Series
obj=Series([7,-5,4,2,0])
obj

0    7
1   -5
2    4
3    2
4    0
dtype: int64

In [88]:
obj.rank()

0    5.0
1    1.0
2    4.0
3    3.0
4    2.0
dtype: float64

In [89]:
obj.rank(method="first")

0    5.0
1    1.0
2    4.0
3    3.0
4    2.0
dtype: float64

In [92]:
# 根据降序进行排名
obj.rank(ascending=False,method='max')

0    1.0
1    5.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [93]:
frame=DataFrame({'b':[4.3,8,10.2,-4],'a':[2.3,8,4,1],'c':[1,2,3,4]})
frame

Unnamed: 0,a,b,c
0,2.3,4.3,1
1,8.0,8.0,2
2,4.0,10.2,3
3,1.0,-4.0,4


对于DataFrame的值进行排序，并返回排名

In [94]:
frame.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,2.5,2.5,1.0
2,2.0,3.0,1.0
3,2.0,1.0,3.0


#### 7.4 带有重复值的轴索引

In [95]:
obj=Series(range(5),index=['a','a','b','c','d'])
obj

a    0
a    1
b    2
c    3
d    4
dtype: int64

In [96]:
# 通过is_unique属性判断他的值是否唯一
obj.index.is_unique

False

In [99]:
# 若某个索引对应多个值，则返回一个series
obj['a']

a    0
a    1
dtype: int64

In [100]:
obj['c']

3