In [5]:
import pandas as pd
import numpy as np

## 函数的：应用和映射

In [18]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.669804,0.230671,0.009192
Ohio,-0.29083,0.803477,-0.292001
Texas,1.167537,-0.223561,0.08058
Oregon,-0.342557,-0.540908,0.603223


In [7]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.495049,1.351594,2.585709
Ohio,0.265554,0.582069,0.284151
Texas,0.290076,0.636013,0.594938
Oregon,3.760555,2.42339,0.447319


### 函数的应用
应用指的是把用在一维数组上的函数，用到每一行或每一列上。使用**apply**函数

#### lambda函数

+ lambda函数也叫匿名函数，即，函数没有具体的名称
+ 使用Python写一些执行脚本时，使用lambda可以省去定义函数的过程，让代码更加精简
+ 有时候给函数起个名字也是个难题，使用lambda不需要考虑命名的问题
+ 使用lambda在某些时候让代码更容易理解

**lambda语句中，冒号前是参数，可以有多个，用逗号隔开，冒号右边的返回值。**

In [20]:
def ff(x):
    return 3.1415926*x

ff(4)

12.5663704

In [21]:
gg = lambda x : 3.1415926*x

gg(4)

12.5663704

 #### 应用到列上

In [19]:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    1.510095
d    1.344385
e    0.895224
dtype: float64

#### 应用到行上

In [22]:
frame.apply(f, axis='columns')

Utah      0.660612
Ohio      1.095478
Texas     1.391098
Oregon    1.144130
dtype: float64

In [23]:
def f0(x): 
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

frame

Unnamed: 0,b,d,e
Utah,0.669804,0.230671,0.009192
Ohio,-0.29083,0.803477,-0.292001
Texas,1.167537,-0.223561,0.08058
Oregon,-0.342557,-0.540908,0.603223


In [24]:
frame.apply(f0)

Unnamed: 0,b,d,e
min,-0.342557,-0.540908,-0.292001
max,1.167537,0.803477,0.603223


### 函数的映射
+ 映射则到了cell水平    
+ 其实是series有个map函数，可以用来实现cell水平的计算

In [27]:
frame

Unnamed: 0,b,d,e
Utah,0.669804,0.230671,0.009192
Ohio,-0.29083,0.803477,-0.292001
Texas,1.167537,-0.223561,0.08058
Oregon,-0.342557,-0.540908,0.603223


In [25]:
format = lambda x: '%.2f' % x

In [26]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.67,0.23,0.01
Ohio,-0.29,0.8,-0.29
Texas,1.17,-0.22,0.08
Oregon,-0.34,-0.54,0.6


## 排序和排名

In [28]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

### 按inedex对series的排序

**sort_index**函数的使用

In [29]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [30]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


### 按index对DataFrame的排序

+ 升序

In [31]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


+ 降序

In [34]:
frame.sort_index(ascending=False)

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


### 按列名对DataFrame排序
+ 升序

In [32]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


+ 降序

In [33]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


### 按取值对seriex排序

**sort_values**函数的使用


In [35]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

缺失值排最后

In [36]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

### 按取值对DataFrame排序

In [37]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [38]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


+ 存在两个以上排序“关键词”

In [39]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


### 排名

**rank**函数的使用


In [40]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [41]:
obj.sort_values()

1   -5
5    0
4    2
3    4
6    4
0    7
2    7
dtype: int64

+ 默认方法是排名相同时，取排名平均作为rank

In [42]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

+ 使用method='first'，先按大小，再按index的顺序给出的排序

In [43]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

+ 使用method='max'，先按大小，再按序号给出的排序

In [44]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

#### 每一行里列数据的排名

In [45]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2],
                      'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [46]:
frame.rank(axis='columns') 

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


## index出现重复label的情况

### series里出现重复label的index

In [49]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

**index.is_unique**函数的使用

In [50]:
obj.index.is_unique

False

In [51]:
obj['a']

a    0
a    1
dtype: int64

In [52]:
obj['c']

4

### DataFrame里出现重复label的index

In [53]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,0.369695,0.191508,-0.508626
a,1.063741,-1.119369,-0.291784
b,1.02254,2.125172,0.275522
b,-0.358809,1.571457,-1.134023


In [54]:
df.loc['b']

Unnamed: 0,0,1,2
b,1.02254,2.125172,0.275522
b,-0.358809,1.571457,-1.134023


## 描述统计

In [56]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


### 各类简单的描述统计函数

In [57]:
df.sum()

one    9.25
two   -5.80
dtype: float64

+ 使用axis='columns' or axis=1，按行加和

In [58]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

+ 排除缺失的加和  
skipna的使用，skip+nan

In [59]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [60]:
df.idxmax()

one    b
two    d
dtype: object

In [61]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [62]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [71]:
df.pct_change()

Unnamed: 0,one,two
a,,
b,4.071429,
c,0.0,0.0
d,-0.894366,-0.711111


In [73]:
from pandas import Series, DataFrame

In [76]:
df2 = DataFrame(np.arange(50).reshape(10,5),columns=list('abcde'))
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29
6,30,31,32,33,34
7,35,36,37,38,39
8,40,41,42,43,44
9,45,46,47,48,49


In [77]:
df2.skew()

a    0.0
b    0.0
c    0.0
d    0.0
e    0.0
dtype: float64

In [78]:
df2.kurt()

a   -1.2
b   -1.2
c   -1.2
d   -1.2
e   -1.2
dtype: float64

In [79]:
df2.cumsum()

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,7,9,11,13
2,15,18,21,24,27
3,30,34,38,42,46
4,50,55,60,65,70
5,75,81,87,93,99
6,105,112,119,126,133
7,140,148,156,164,172
8,180,189,198,207,216
9,225,235,245,255,265


In [81]:
df2.cumprod()

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,0,6,14,24,36
2,0,66,168,312,504
3,0,1056,2856,5616,9576
4,0,22176,62832,129168,229824
5,0,576576,1696464,3616704,6664896
6,0,17873856,54286848,119351232,226606464
7,0,643458816,2008613376,240379520,247717504
8,0,612007680,-1537584128,1746384768,-1985331712
9,0,-1912417792,747990016,-2072877056,1502993920


In [82]:
df2.cummin()

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,0,1,2,3,4
2,0,1,2,3,4
3,0,1,2,3,4
4,0,1,2,3,4
5,0,1,2,3,4
6,0,1,2,3,4
7,0,1,2,3,4
8,0,1,2,3,4
9,0,1,2,3,4


In [83]:
df2.cummax()

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29
6,30,31,32,33,34
7,35,36,37,38,39
8,40,41,42,43,44
9,45,46,47,48,49


In [84]:
df2.mad()

a    12.5
b    12.5
c    12.5
d    12.5
e    12.5
dtype: float64

In [85]:
df2.diff()

Unnamed: 0,a,b,c,d,e
0,,,,,
1,5.0,5.0,5.0,5.0,5.0
2,5.0,5.0,5.0,5.0,5.0
3,5.0,5.0,5.0,5.0,5.0
4,5.0,5.0,5.0,5.0,5.0
5,5.0,5.0,5.0,5.0,5.0
6,5.0,5.0,5.0,5.0,5.0
7,5.0,5.0,5.0,5.0,5.0
8,5.0,5.0,5.0,5.0,5.0
9,5.0,5.0,5.0,5.0,5.0


In [86]:
df2.pct_change()

Unnamed: 0,a,b,c,d,e
0,,,,,
1,inf,5.0,2.5,1.666667,1.25
2,1.0,0.833333,0.714286,0.625,0.555556
3,0.5,0.454545,0.416667,0.384615,0.357143
4,0.333333,0.3125,0.294118,0.277778,0.263158
5,0.25,0.238095,0.227273,0.217391,0.208333
6,0.2,0.192308,0.185185,0.178571,0.172414
7,0.166667,0.16129,0.15625,0.151515,0.147059
8,0.142857,0.138889,0.135135,0.131579,0.128205
9,0.125,0.121951,0.119048,0.116279,0.113636


In [90]:
# 安装pandas-datareader
# conda install pandas-datareader

SyntaxError: invalid syntax (<ipython-input-90-990bbb6ee910>, line 1)

In [91]:
import pandas_datareader.data as web

In [108]:
price = pd.read_pickle('yahoo_volume.pkl')

KeyError: 10

In [104]:
df = pd.DataFrame(np.arange(20).reshape(4,5))
df


Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [102]:
df.to_pickle('foo.pkl')

In [103]:
pd.read_pickle('foo.pkl')

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
