## pandas的基本功能

In [11]:
from pandas import Series,DataFrame
import numpy as np
import pandas as pd

In [12]:
obj=Series([4.5,7.2,-5.3,3.6],index=['d','b','c','a'])
obj

d    4.5
b    7.2
c   -5.3
a    3.6
dtype: float64

### 1 重新索引
使用reindex根据新的索引进行排序
#### 若某个索引值不存在，就引入缺失值

In [13]:
obj2=obj.reindex(['a','b','c','d','e'])
obj2

a    3.6
b    7.2
c   -5.3
d    4.5
e    NaN
dtype: float64

In [14]:
# 对不存在的值设置缺失值
obj.reindex(['a','b','c','d','e'],fill_value=0)

a    3.6
b    7.2
c   -5.3
d    4.5
e    0.0
dtype: float64

常用的差值处理：  

使用method函数


In [15]:
obj3=Series(['blue','red','yellow'],index=[0,2,4])
obj3
# ffill 前向值填充
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2       red
3       red
4    yellow
5    yellow
dtype: object

### 
#### 

In [17]:
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['ohio','texas','california'])
frame

Unnamed: 0,ohio,texas,california
a,0,1,2
c,3,4,5
d,6,7,8


修改列索行

In [22]:
frame2=frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,ohio,texas,california
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


使用columns关键字重新索引列

In [23]:
states=['texas','utah','california']
frame.reindex(columns=states)

Unnamed: 0,texas,utah,california
a,1,,2
c,4,,5
d,7,,8


同时对行和列进行重新索引

In [35]:
print(frame)
frame.reindex(index=['a','b','c','d'],method='ffill')
#frame.reindex(index=['a','b','c','d'],method='ffill',columns=['texas','utah','california'])
#frame.reindex(index=['a','b','c'],method='ffill',columns=states)
#frame
# 出现错误如下：ValueError: index must be monotonic increasing or decreasing

   ohio  texas  california
a     0      1           2
c     3      4           5
d     6      7           8


Unnamed: 0,ohio,texas,california
a,0,1,2
b,0,1,2
c,3,4,5
d,6,7,8


*可以使用loc替代ix*

In [36]:
frame.ix[['a','b','c','d'],states]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,texas,utah,california
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


### 2 丢弃指定轴上的项

In [39]:
# 当使用np.arange(5.0)即生成的ndarray类型是float类型
obj=Series(np.arange(5.),index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [40]:
new_obj=obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [42]:
# 注意创建dataframe时，若指定index或columns需要与实际行和列总数一致
data=DataFrame(np.arange(16).reshape((4,4)),
              index=['beijing','shanghai','guangzhou','qingdao'],
              columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
beijing,0,1,2,3
shanghai,4,5,6,7
guangzhou,8,9,10,11
qingdao,12,13,14,15


In [44]:
data.drop(['beijing','shanghai'])

Unnamed: 0,one,two,three,four
guangzhou,8,9,10,11
qingdao,12,13,14,15


In [45]:
# axis何用？
data.drop('two',axis=1)

Unnamed: 0,one,three,four
beijing,0,2,3
shanghai,4,6,7
guangzhou,8,10,11
qingdao,12,14,15


### 3 索引、选取和过滤
#### 1 Series索引

In [47]:
obj=Series(np.arange(4.),index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [48]:
obj['b']

1.0

In [49]:
obj[1]

1.0

In [50]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [51]:
obj[['b','c','d']]

b    1.0
c    2.0
d    3.0
dtype: float64

In [52]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [53]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

除了以上的方式外，还可利用标签的切片运算  

>标签的切片运算与python切片运算不同，末端是包含的

In [54]:
# 索引从b-c
obj['b':'c']

b    1.0
c    2.0
dtype: float64

赋值

In [55]:
obj['b':'d']=666
obj

a      0.0
b    666.0
c    666.0
d    666.0
dtype: float64

#### DataFrame索引

In [56]:
data=DataFrame(np.arange(16).reshape((4,4)),
               index=['beijing','shanghai','guangzhou','sanya'],
              columns=['a','b','c','d'])
data

Unnamed: 0,a,b,c,d
beijing,0,1,2,3
shanghai,4,5,6,7
guangzhou,8,9,10,11
sanya,12,13,14,15


In [57]:
data['b']

beijing       1
shanghai      5
guangzhou     9
sanya        13
Name: b, dtype: int64

In [59]:
data[['c','d']]

Unnamed: 0,c,d
beijing,2,3
shanghai,6,7
guangzhou,10,11
sanya,14,15


*除了以上的索引方式外，还支持切片或布尔型数组选取行*

In [60]:
data[:2]

Unnamed: 0,a,b,c,d
beijing,0,1,2,3
shanghai,4,5,6,7


In [62]:
data[data['c']>5]

Unnamed: 0,a,b,c,d
shanghai,4,5,6,7
guangzhou,8,9,10,11
sanya,12,13,14,15


In [63]:
data<5

Unnamed: 0,a,b,c,d
beijing,True,True,True,True
shanghai,True,False,False,False
guangzhou,False,False,False,False
sanya,False,False,False,False


In [65]:
data[data<5]=0
data

Unnamed: 0,a,b,c,d
beijing,0,0,0,0
shanghai,0,5,6,7
guangzhou,8,9,10,11
sanya,12,13,14,15


*在dataframe的行上进行标签索引，使用ix  
 可以使用loc替代ix
*

In [67]:
data.loc['beijing',['a','b']]

a    0
b    0
Name: beijing, dtype: int64

In [69]:
data.ix['beijing',[3,0,1]]

d    0
a    0
b    0
Name: beijing, dtype: int64

In [71]:
data

Unnamed: 0,a,b,c,d
beijing,0,0,0,0
shanghai,0,5,6,7
guangzhou,8,9,10,11
sanya,12,13,14,15


In [70]:
# 第3行（下表为2的行）数据
data.ix[2]

a     8
b     9
c    10
d    11
Name: guangzhou, dtype: int64

In [73]:
data.ix[:'guangzhou','b']

beijing      0
shanghai     5
guangzhou    9
Name: b, dtype: int64

In [75]:
data.ix[data.c>8]

Unnamed: 0,a,b,c,d
guangzhou,8,9,10,11
sanya,12,13,14,15


In [74]:
data.ix[data.c>8,:3]

Unnamed: 0,a,b,c
guangzhou,8,9,10
sanya,12,13,14


#### 4 算数运算和数据对其

*对不同的索引对象进行算数运算时，若存在不同的索引集，结果的索引集是索引对的并集*

In [76]:
s1=Series([7.,6.,-2.,1.5],index=['a','b','c','d'])
s1

a    7.0
b    6.0
c   -2.0
d    1.5
dtype: float64

In [78]:
s2=Series([5.2,6.2,-10.2,8.2],index=['a','b','e','f'])
s2

a     5.2
b     6.2
e   -10.2
f     8.2
dtype: float64

In [79]:
s1+s2

a    12.2
b    12.2
c     NaN
d     NaN
e     NaN
f     NaN
dtype: float64