# 《利用Python进行数据分析》 pandas 章节

## 入门——数据结构

### Series 序列：
Series是ndarray与字典的结合体；拥有二者的一些优良性质

In [2]:
import pandas as pd
from numpy import random as rand
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
s = dict(Ohio=100, Texas=200, Utah=100)
obj1 = pd.Series(data=s)
obj1

Ohio     100
Texas    200
Utah     100
dtype: int64

In [4]:
obj1.name = '第一个序列'
obj1

Ohio     100
Texas    200
Utah     100
Name: 第一个序列, dtype: int64

In [5]:
# Give the index itself a name; it is shown above the labels when the
# Series is displayed
obj1.index.name = '序列'
obj1

序列
Ohio     100
Texas    200
Utah     100
Name: 第一个序列, dtype: int64

In [6]:
obj1[:2]

序列
Ohio     100
Texas    200
Name: 第一个序列, dtype: int64

In [7]:
# Look up a single value by its index label. Series.get_value() was
# deprecated in pandas 0.21 and removed in 1.0; plain label indexing
# returns the same scalar.
obj1['Ohio']

100

### DataFrame 数据框


In [8]:
# Column-oriented input: each dict key is a column name and each list holds
# that column's values, one entry per row.
data = {
    'state': ['Ohio', 'Texas', 'Neveda', 'Ohio'],
    'year': [2000, 2001, 2002, 2001],
    'pop': [1.5, 1.7, 3.6, 2.4],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Texas,2001
2,3.6,Neveda,2002
3,2.4,Ohio,2001


In [9]:
pd.DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Texas,1.7
2,2002,Neveda,3.6
3,2001,Ohio,2.4


In [10]:
# Same data, this time with an explicit column order and string row labels.
df2 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four'],
    columns=['year', 'state', 'pop'],
)
df2

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Texas,1.7
three,2002,Neveda,3.6
four,2001,Ohio,2.4


In [11]:
# Each column is itself a Series
df2.year

one      2000
two      2001
three    2002
four     2001
Name: year, dtype: int64

In [12]:
df2['year']

one      2000
two      2001
three    2002
four     2001
Name: year, dtype: int64

In [13]:
df2['new'] = 'hha'

In [14]:
df2

Unnamed: 0,year,state,pop,new
one,2000,Ohio,1.5,hha
two,2001,Texas,1.7,hha
three,2002,Neveda,3.6,hha
four,2001,Ohio,2.4,hha


In [15]:
# Assign a Series to the column: values are aligned on index labels, not on
# position, so row 'one' receives 34 and row 'two' receives 12.
# Item assignment is used instead of ``df2.new = val``: attribute assignment
# only updates a column that already exists, and otherwise sets a plain
# Python attribute instead of creating a column.
val = pd.Series([12,34,34,23], index=['two','one','three','four'])
df2['new'] = val
df2

Unnamed: 0,year,state,pop,new
one,2000,Ohio,1.5,34
two,2001,Texas,1.7,12
three,2002,Neveda,3.6,34
four,2001,Ohio,2.4,23


In [16]:
# Selecting a single row by label also yields a Series.
# The Python 2 print statement is a syntax error on Python 3. print is
# called with one pre-formatted argument so the cell runs on both versions
# and still prints the row and its type separated by a single space.
row = df2.loc['one']
print('{} {}'.format(row, type(row)))

year     2000
state    Ohio
pop       1.5
new        34
Name: one, dtype: object <class 'pandas.core.series.Series'>


In [17]:
# Row-oriented input: a list of dicts, one dict per row. A key that is
# missing from a row ('c' in the second one) shows up as NaN in the result.
data2 = [
    dict(a=1, b=2, c=34),
    dict(a=23, b=342),
]
pd.DataFrame(data=data2)

Unnamed: 0,a,b,c
0,1,2,34.0
1,23,342,


### 索引对象

+ Index对象是不可修改的（immutable）；它支持类似集合的操作（如成员判断、交集、并集），但与集合不同的是，Index可以包含重复的标签

### reindex 重新索引

In [18]:
df2

Unnamed: 0,year,state,pop,new
one,2000,Ohio,1.5,34
two,2001,Texas,1.7,12
three,2002,Neveda,3.6,34
four,2001,Ohio,2.4,23


In [19]:
df2.reindex(['two','one','three','four'])

Unnamed: 0,year,state,pop,new
two,2001,Texas,1.7,12
one,2000,Ohio,1.5,34
three,2002,Neveda,3.6,34
four,2001,Ohio,2.4,23


In [20]:
# When reindexing, fill_value supplies the value used for labels that do
# not exist in the original index
df2.reindex(['two','one','three','four','没有这一项'], fill_value='傻了吧')

Unnamed: 0,year,state,pop,new
two,2001,Texas,1.7,12
one,2000,Ohio,1.5,34
three,2002,Neveda,3.6,34
four,2001,Ohio,2.4,23
没有这一项,傻了吧,傻了吧,傻了吧,傻了吧


In [21]:
# A fill method can be specified when reindexing; this Series has gaps in
# its integer index (0, 2, 4) to demonstrate it
obj3 = pd.Series(['blue','purple','yellow'],index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [22]:
obj3.reindex(range(6), method='ffill') # forward fill: each new label takes the previous existing value

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [23]:
obj3.reindex(range(6), method='bfill') # backward fill: each new label takes the next existing value (NaN when there is none)

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

+ 至于DataFrame中的reindex，如果只传入一个列表，则会重新索引行；
通过columns参数可以重新索引列，也可以同时重新索引行和列

In [24]:
df2.reindex(columns=['state','pop','new','year'])

Unnamed: 0,state,pop,new,year
one,Ohio,1.5,34,2000
two,Texas,1.7,12,2001
three,Neveda,3.6,34,2002
four,Ohio,2.4,23,2001


In [25]:
df2.reindex(['two','one','three','four','傻了吧'])

Unnamed: 0,year,state,pop,new
two,2001.0,Texas,1.7,12.0
one,2000.0,Ohio,1.5,34.0
three,2002.0,Neveda,3.6,34.0
four,2001.0,Ohio,2.4,23.0
傻了吧,,,,


In [26]:
# No fill method is passed here, so the row for the new label is left as NaN.
# NOTE(review): the original note says fill methods can only be applied
# along the rows — confirm against the pandas version in use.
df2.reindex(['one','two','three','four','傻了吧'])

Unnamed: 0,year,state,pop,new
one,2000.0,Ohio,1.5,34.0
two,2001.0,Texas,1.7,12.0
three,2002.0,Neveda,3.6,34.0
four,2001.0,Ohio,2.4,23.0
傻了吧,,,,


In [27]:
# Select rows by label, tolerating a label that does not exist.
# The original called the ``.ix`` indexer like a function, which only
# returned the indexer object, and ``.ix`` itself was deprecated in pandas
# 0.20 and removed in 1.0. ``reindex`` is used rather than ``.loc`` because
# '傻了吧' is not in the index: ``.loc`` raises KeyError for missing labels,
# while ``reindex`` inserts an all-NaN row for them.
df2.reindex(['one','three','two','four','傻了吧'])

In [28]:
obj1

序列
Ohio     100
Texas    200
Utah     100
Name: 第一个序列, dtype: int64

In [29]:
obj1.drop('Ohio')

序列
Texas    200
Utah     100
Name: 第一个序列, dtype: int64

In [30]:
df

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Texas,2001
2,3.6,Neveda,2002
3,2.4,Ohio,2001


In [31]:
df.drop(0)

Unnamed: 0,pop,state,year
1,1.7,Texas,2001
2,3.6,Neveda,2002
3,2.4,Ohio,2001


In [32]:
df.drop('pop',axis=1)

Unnamed: 0,state,year
0,Ohio,2000
1,Texas,2001
2,Neveda,2002
3,Ohio,2001


### 索引，选取，过滤
Series的索引类似于numpy数组

不过有点不同的是：按标签（label）切片时，末端是包含在内的；按整数位置切片则与numpy相同，不包含末端

In [33]:
obj1

序列
Ohio     100
Texas    200
Utah     100
Name: 第一个序列, dtype: int64

In [34]:
obj1['Ohio':'Texas'] = 291289128
obj1

序列
Ohio     291289128
Texas    291289128
Utah           100
Name: 第一个序列, dtype: int64

In [35]:
obj1[:1]  #

序列
Ohio    291289128
Name: 第一个序列, dtype: int64

In [36]:
df['pop'] # selecting a single column reduces the result to a Series

0    1.5
1    1.7
2    3.6
3    2.4
Name: pop, dtype: float64

In [37]:
df[:2] # slicing with [] selects rows

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Texas,2001


In [38]:
df.iloc[:2,:2]

Unnamed: 0,pop,state
0,1.5,Ohio
1,1.7,Texas


In [39]:
df.loc[:1,['pop','year']]


Unnamed: 0,pop,year
0,1.5,2000
1,1.7,2001


In [40]:
# ``.ix`` mixed label-based and positional indexing; it was deprecated in
# pandas 0.20 and removed in 1.0. The same selection is spelled out
# explicitly: rows by label (``:1`` includes label 1) and the first two
# columns by position.
df.loc[:1, df.columns[:2]]


Unnamed: 0,pop,state
0,1.5,Ohio
1,1.7,Texas


### pandas算术运算的数据对齐功能
会根据索引来对齐

In [41]:
# Two random Series whose indexes overlap only partially ('e' exists only
# in s2) and are ordered differently.
s1 = pd.Series(rand.randn(4), index=list('abcd'))
s2 = pd.Series(rand.randn(5), index=list('abced'))
s1

a   -0.105365
b    0.088516
c    0.326349
d   -0.955939
dtype: float64

In [42]:
s2

a    0.162483
b    0.452959
c    1.464806
e   -0.902208
d    1.193063
dtype: float64

In [43]:
s1 + s2

a    0.057118
b    0.541475
c    1.791155
d    0.237125
e         NaN
dtype: float64

In [44]:
# 3x3 frame of random normal values with columns b, c, d.
df1 = pd.DataFrame(
    rand.randn(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df1

Unnamed: 0,b,c,d
Ohio,0.178517,1.939606,0.852514
Texas,-0.728696,-0.054076,-0.173614
Colorado,-0.477223,0.212371,2.364698


In [45]:
# Second 3x3 random frame; it shares columns b and d with df1 but has e
# instead of c.
df2 = pd.DataFrame(
    rand.randn(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'd', 'e'],
)
df2

Unnamed: 0,b,d,e
Ohio,-0.385532,-1.445656,-0.218515
Texas,-0.222913,-0.461583,-0.121234
Colorado,1.961249,0.561735,1.521926


In [46]:
df1 + df2

Unnamed: 0,b,c,d,e
Ohio,-0.207015,,-0.593143,
Texas,-0.951609,,-0.635198,
Colorado,1.484026,,2.926434,


In [47]:
# Add the two frames, treating a value missing on one side as 0 instead of
# letting it turn the result into NaN.
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Ohio,-0.207015,1.939606,-0.593143,-0.218515
Texas,-0.951609,-0.054076,-0.635198,-0.121234
Colorado,1.484026,0.212371,2.926434,1.521926


### DataFrame与Series之间的运算
默认情况下，Series的索引会与DataFrame的列对齐，然后沿着行向下“广播”；
当然也可以通过算术方法（如sub）的axis参数，改为与行索引对齐、沿着列广播

In [48]:
# Build a 4x3 frame holding 0..11 and take its first row as a Series.
# ``.ix`` was deprecated in pandas 0.20 and removed in 1.0; the row is
# selected by integer position, so ``.iloc`` is the explicit equivalent.
frame = pd.DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
series = frame.iloc[0]
  


In [49]:
series

b    0
d    1
e    2
Name: Utah, dtype: int64

In [50]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [51]:
frame - series  # the Series index is matched against the columns and the Series is subtracted from every row

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [52]:
# How can the operation be applied along the columns instead?
series2 = frame['b']
series2

Utah      0
Ohio      3
Texas     6
Oregon    9
Name: b, dtype: int64

In [53]:
# Use the sub method with axis=0: series2 is matched against the row index
# and subtracted from every column
frame.sub(series2, axis = 0)

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,0,1,2
Texas,0,1,2
Oregon,0,1,2


In [54]:
# The original row-wise operation can of course be written this way too
frame.sub(series, axis=1)

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


### 函数应用(apply)与映射applymap


In [55]:
def f(x):
    """Return the spread (maximum minus minimum) of *x*."""
    return x.max() - x.min()

# axis=0 hands each column to f in turn.
frame.apply(f, axis=0)

b    9
d    9
e    9
dtype: int64

In [56]:
frame.apply(f, axis=1)

Utah      2
Ohio      2
Texas     2
Oregon    2
dtype: int64

元素级别的python函数，其实类似于做了一个映射 applymap

In [57]:
frame.applymap(lambda x: x ** 2)

Unnamed: 0,b,d,e
Utah,0,1,4
Ohio,9,16,25
Texas,36,49,64
Oregon,81,100,121


### 排序与排名
按照索引来排序，实质上是使用字典顺序排序

In [58]:
obj = pd.Series(range(4), index=['d','a','c','b'])
obj

d    0
a    1
c    2
b    3
dtype: int64

In [59]:
obj.sort_index()

a    1
b    3
c    2
d    0
dtype: int64

In [60]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [61]:
frame.sort_index(axis=1, ascending=False) # sort the column labels in descending order

Unnamed: 0,e,d,b
Utah,2,1,0
Ohio,5,4,3
Texas,8,7,6
Oregon,11,10,9


In [62]:
obj.sort_values()

d    0
a    1
c    2
b    3
dtype: int64

#### 根据一个或多个列中的值进行排序
利用by选项

In [63]:
# Sort the rows by the values in column 'b'. ``sort_index(by=...)`` was
# deprecated in pandas 0.17 and later removed; ``sort_values`` is the
# supported way to sort by column values.
frame.sort_values(by='b')


Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [64]:
# Sort the rows by column 'd', breaking ties with column 'b'.
# ``sort_index(by=...)`` was deprecated in pandas 0.17 and later removed;
# ``sort_values`` is the supported way to sort by column values.
frame.sort_values(by=['d','b'])


Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


#### 排名
得到每个值在序列中的排名（名次从1开始）；出现并列时的处理方式由method参数决定

In [65]:
obj.rank(method='first')

d    1.0
a    2.0
c    3.0
b    4.0
dtype: float64

In [66]:
obj.rank(ascending=False,method='max')

d    4.0
a    3.0
c    2.0
b    1.0
dtype: float64

In [67]:
frame.rank(axis=1)

Unnamed: 0,b,d,e
Utah,1.0,2.0,3.0
Ohio,1.0,2.0,3.0
Texas,1.0,2.0,3.0
Oregon,1.0,2.0,3.0


### 带有重复值的轴索引
有时候索引会重复

In [68]:
obj = pd.Series(range(5), index=['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [69]:
obj.index.is_unique

False

### 汇总和计算描述统计


In [70]:
# Small frame with missing values, used for the summary-statistics examples.
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [71]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [72]:
df.mean(axis=1,skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [73]:
# Index label at which each column reaches its maximum
# (idxmin is the counterpart for the minimum)
df.idxmax()

one    b
two    d
dtype: object

In [74]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [75]:
df.pct_change()

Unnamed: 0,one,two
a,,
b,4.071429,
c,,
d,-0.894366,-0.711111


### 唯一值，值计数，以及成员资格 isin


In [76]:
obj.unique()

array([0, 1, 2, 3, 4])

In [77]:
obj.value_counts()

4    1
3    1
2    1
1    1
0    1
dtype: int64

In [78]:
# NOTE(review): isin tests the Series *values* (the integers 0-4 here), not
# the index labels, so matching against 'b'/'c' is False for every entry.
# If label membership was intended, obj.index.isin(['b','c']) would test it.
obj.isin(['b','c'])

a    False
a    False
b    False
b    False
c    False
dtype: bool

### 处理缺失数据
Python内置的None值也会被当做NA处理

In [79]:
string_data = pd.Series(['as','dbhj',np.nan,'avods'])
string_data

0       as
1     dbhj
2      NaN
3    avods
dtype: object

In [80]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [81]:
from numpy import nan as NA
# dropna leaves out the missing entries; the surviving values keep their
# original index labels
data = pd.Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [82]:
# With the default how='any', dropna discards every row that contains at
# least one missing value, so only the first row survives.
data = pd.DataFrame([
    [1, 6.5, 3],
    [1, NA, NA],
    [NA, NA, NA],
    [NA, 3, 2],
])
cleaned = data.dropna(how='any')
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [83]:
data.dropna(how='all') # drop only the rows in which every value is NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,3.0,2.0


In [84]:
data.dropna(how='all', axis=1) # axis=1 applies the same rule to columns instead of rows

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,3.0,2.0


In [85]:
# Fill in missing data with a single value
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,3.0,2.0


In [86]:
# Fill each column with a different value (the dict keys are column labels)
data.fillna({1:0.5,2:12})

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,12.0
2,,0.5,12.0
3,,3.0,2.0


In [99]:
# fillna returns a new object by default, but it can also modify the
# existing object in place.
# NOTE(review): key 3 matches no column (the columns are 0, 1, 2), so only
# column 1 is filled here and column 2 keeps its NaN values — confirm
# whether 2 was intended.
data.fillna({1:0.5,3:-1}, inplace=True)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,
2,,0.5,
3,,3.0,2.0


## 利用pandas进行文件读写，数据获取(Web)

### 读取
+ read_csv:默认逗号分隔
+ read_table：默认'\t'分隔
+ read_clipboard:从剪切板中读取
+ read_fwf: 读取固定宽度列格式的数据（没有分隔符）

In [88]:
data1 = pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/ex1.csv')
data1

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [89]:
pd.read_table('/Users/yanghao/笔记/pydata-book-master/ch06/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [90]:
pd.read_table('/Users/yanghao/笔记/pydata-book-master/ch06/ex1.csv', sep=',', header=None)

Unnamed: 0,0,1,2,3,4
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo


In [100]:
pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/ex1.csv')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [101]:
# Use one of the columns as the index
pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/ex1.csv', index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [102]:
# Use several columns to build a hierarchical index
data2 = pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/csv_mindex.csv', index_col=['key1','key2'])

In [103]:
data2

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


#### 分隔符不确定
可以考虑使用正则表达式

In [104]:
# Peek at the raw lines: the columns are separated by a variable number of
# spaces. The file is opened in a ``with`` block so the handle is closed
# instead of being leaked.
with open('/Users/yanghao/笔记/pydata-book-master/ch06/ex3.csv') as f:
    raw_lines = list(f)
raw_lines

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491']

In [105]:
# Split the fields on runs of whitespace. The pattern is a raw string so the
# backslash reaches the regex engine untouched; '\s' in a normal string
# literal is an invalid escape sequence that newer Python versions warn about.
result = pd.read_table('/Users/yanghao/笔记/pydata-book-master/ch06/ex3.csv', sep=r'\s+')

In [106]:
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


#### 跳过某些行


In [109]:
pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/ex4.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### 处理缺失数据


In [111]:
data3 = pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/ex5.csv')
data3

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [115]:
pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/ex5.csv', na_values=['world'])

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,
2,three,9,10,11.0,12,foo


In [116]:
# Different NA sentinels can be specified for each column
pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/ex5.csv', na_values={'message':["foo","NA"],"something":['two']})

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


#### 逐块读取文本文件
通过chunksize参数指定每块的行数，read_csv会返回一个可迭代的TextFileReader，从而逐块读取文件。

In [128]:
chunker = pd.read_csv('/Users/yanghao/笔记/pydata-book-master/ch06/ex6.csv', chunksize=100)
chunker

<pandas.io.parsers.TextFileReader at 0x11130dc10>

In [129]:
# Accumulate the per-chunk counts of the 'key' column into a running total.
# The empty accumulator is given an explicit float64 dtype: without it,
# newer pandas versions warn about (or change) the default dtype of an
# empty Series. fill_value=0 treats a key that is missing on one side as a
# count of 0 instead of producing NaN.
tot = pd.Series([], dtype='float64')
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot

0    151.0
1    146.0
2    152.0
3    162.0
4    171.0
5    157.0
6    166.0
7    164.0
8    162.0
9    150.0
A    320.0
B    302.0
C    286.0
D    320.0
E    368.0
F    335.0
G    308.0
H    330.0
I    327.0
J    337.0
K    334.0
L    346.0
M    338.0
N    306.0
O    343.0
P    324.0
Q    340.0
R    318.0
S    308.0
T    304.0
U    326.0
V    328.0
W    305.0
X    364.0
Y    314.0
Z    288.0
dtype: float64

#### 文本输出，写入到文本格式中


In [130]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,3.0,2.0


In [131]:
data.to_csv('test.csv')

#### 手工处理分隔符格式


In [136]:
import csv

# Parse the file with the csv module. The ``with`` block closes the file
# handle once every row has been read instead of leaking it.
with open('/Users/yanghao/笔记/pydata-book-master/ch06/ex7.csv') as f:
    lines = list(csv.reader(f))
header, values = lines[0], lines[1:]
# Transpose the data rows into columns and pair each column with its header.
# zip stops at the shortest row, so extra trailing fields in a longer row
# are dropped.
data_dict = dict(zip(header, zip(*values)))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [137]:
lines

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]

### Json数据
### XML
### 二进制数据格式
### HDF5格式
### Excel格式
### HTML和WEB API

### 使用数据库


In [140]:
from pymysql import connect

In [141]:
connection = connect?

In [146]:
import os

# Connection settings come from the environment so that no credential is
# stored in the notebook; the password that was previously hard-coded here
# should be treated as leaked and rotated.
# 'localhost@192.168.1.49' is not a resolvable host name (that is what the
# OperationalError 2003 reported), so only the address is used. The default
# port is 3306, MySQL's standard port; the previous value 22 is the SSH port.
connection = connect(
    host=os.environ.get('MYSQL_HOST', '192.168.1.49'),
    user=os.environ.get('MYSQL_USER', 'root'),
    password=os.environ['MYSQL_PASSWORD'],
    port=int(os.environ.get('MYSQL_PORT', '3306')),
    database='JD',
)