# 数据聚合和分组运算

In [1]:
import numpy as np
from numpy import NaN as NA
import pandas as pd
from numpy.random import randn
from numpy.linalg import inv,qr, eig ,det ,svd

import matplotlib.pyplot as plt
import random

from pandas import DataFrame , Series

In [2]:
df = DataFrame({ 'key1':list('aabba'),
                'key2':'one,two,one,two,one'.split(','),
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)
            
})

In [3]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.99905,-0.094037,a,one
1,1.640705,0.587479,a,two
2,-1.202413,0.539482,b,one
3,1.517189,0.056848,b,two
4,-1.939885,0.142478,a,one


In [4]:
grouped = df['data1'].groupby(df['key1'])

In [5]:
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f6195d5c908>

In [6]:
grouped.mean()

key1
a    0.233290
b    0.157388
Name: data1, dtype: float64

In [7]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [8]:
means

key1  key2
a     one    -0.470417
      two     1.640705
b     one    -1.202413
      two     1.517189
Name: data1, dtype: float64

In [9]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.470417,1.640705
b,-1.202413,1.517189


In [10]:
states = np.array('Ohio,Calfornia,California,Ohio,Ohio'.split(','))

In [11]:
years = np.array([2005, 2005, 2006, 2005, 2006])

In [12]:
df['data1'].groupby([states, years]).mean()

Calfornia   2005    1.640705
California  2006   -1.202413
Ohio        2005    1.258120
            2006   -1.939885
Name: data1, dtype: float64

In [13]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.99905,-0.094037,a,one
1,1.640705,0.587479,a,two
2,-1.202413,0.539482,b,one
3,1.517189,0.056848,b,two
4,-1.939885,0.142478,a,one


In [14]:
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.23329,0.211973
b,0.157388,0.298165


In [15]:
df.groupby(['key1', 'key2']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.470417,0.024221
a,two,1.640705,0.587479
b,one,-1.202413,0.539482
b,two,1.517189,0.056848


In [16]:
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## 对分组进行迭代

In [17]:
for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0  0.999050 -0.094037    a  one
1  1.640705  0.587479    a  two
4 -1.939885  0.142478    a  one
b
      data1     data2 key1 key2
2 -1.202413  0.539482    b  one
3  1.517189  0.056848    b  two


In [18]:
for (k1, k2), group in df.groupby('key1,key2'.split(',')):
    print(k1,k2)
    print(group)
    print('=========')

a one
      data1     data2 key1 key2
0  0.999050 -0.094037    a  one
4 -1.939885  0.142478    a  one
a two
      data1     data2 key1 key2
1  1.640705  0.587479    a  two
b one
      data1     data2 key1 key2
2 -1.202413  0.539482    b  one
b two
      data1     data2 key1 key2
3  1.517189  0.056848    b  two


In [19]:
pieces =dict(list(df.groupby(['key1'])))

In [20]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,-1.202413,0.539482,b,one
3,1.517189,0.056848,b,two


In [21]:
list(df.groupby(['key1']))

[('a',       data1     data2 key1 key2
  0  0.999050 -0.094037    a  one
  1  1.640705  0.587479    a  two
  4 -1.939885  0.142478    a  one), ('b',       data1     data2 key1 key2
  2 -1.202413  0.539482    b  one
  3  1.517189  0.056848    b  two)]

In [22]:
dict([('a', 'n')])

{'a': 'n'}

In [23]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [24]:
grouped = df.groupby(df.dtypes, axis = 1)

In [25]:
dict(list(grouped))

{dtype('float64'):       data1     data2
 0  0.999050 -0.094037
 1  1.640705  0.587479
 2 -1.202413  0.539482
 3  1.517189  0.056848
 4 -1.939885  0.142478, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [26]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.99905,-0.094037,a,one
1,1.640705,0.587479,a,two
2,-1.202413,0.539482,b,one
3,1.517189,0.056848,b,two
4,-1.939885,0.142478,a,one


In [27]:
grouped.mean()

DataError: No numeric types to aggregate

## 选取一个或者一组列

In [None]:
df.groupby('key1')['data1']

In [None]:
df.groupby('key1')[['data2']]

In [None]:
df.groupby('key1')['data1'].mean()

In [None]:
df.groupby('key1')[['data2']].mean()

In [28]:
s_grouped = df.groupby(['key1', 'key2'])['data2']

In [29]:
s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f6195d3b198>

In [30]:
s_grouped.mean()

key1  key2
a     one     0.024221
      two     0.587479
b     one     0.539482
      two     0.056848
Name: data2, dtype: float64

## 通过字典和Series进行分组

In [31]:
people = DataFrame(np.random.randn(5, 5),
                   columns=list('abcde'),
                   index='Joe,Steve,Wes,Jim,Travis'.split(',')

)

In [32]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.978465,1.153454,-0.038719,0.734191,0.405998
Steve,-0.353598,-1.040794,-1.265739,1.38471,0.719764
Wes,1.609915,0.548832,0.45352,0.578642,0.446704
Jim,0.182879,-2.079879,-0.141831,-1.372441,-0.425942
Travis,-0.173727,2.653995,-0.516163,1.027742,0.23426


In [33]:
people.loc[2:3, list('bc')] = np.nan

In [34]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.978465,1.153454,-0.038719,0.734191,0.405998
Steve,-0.353598,-1.040794,-1.265739,1.38471,0.719764
Wes,1.609915,,,0.578642,0.446704
Jim,0.182879,-2.079879,-0.141831,-1.372441,-0.425942
Travis,-0.173727,2.653995,-0.516163,1.027742,0.23426


In [35]:
mapping = {
    'a':'red',
    'b':'red',
    'c':'blue',
    'b':'blue',
    'e':'red',
    'f':'orange',
    
}

In [36]:
by_column = people.groupby(mapping, axis=1)

In [37]:
by_column.sum()

Unnamed: 0,blue,red
Joe,1.114734,-1.572468
Steve,-2.306533,0.366166
Wes,,2.056619
Jim,-2.221709,-0.243063
Travis,2.137832,0.060534


In [38]:
map_series = Series(mapping)

In [39]:
map_series

a       red
b      blue
c      blue
e       red
f    orange
dtype: object

In [40]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,2
Steve,2,2
Wes,0,2
Jim,2,2
Travis,2,2


## 通过函数进行分组

In [41]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.185671,-0.926425,-0.18055,-0.059608,0.426759
5,-0.353598,-1.040794,-1.265739,1.38471,0.719764
6,-0.173727,2.653995,-0.516163,1.027742,0.23426


In [42]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.978465,1.153454,-0.038719,0.734191,0.405998
Steve,-0.353598,-1.040794,-1.265739,1.38471,0.719764
Wes,1.609915,,,0.578642,0.446704
Jim,0.182879,-2.079879,-0.141831,-1.372441,-0.425942
Travis,-0.173727,2.653995,-0.516163,1.027742,0.23426


## 根据索引级别进行分组

In [43]:
columns = pd.MultiIndex.from_arrays([
    'US,US,US,JP,JP'.split(','),
    [1, 3, 5, 1, 3]],
    names = ['cty', 'tenor']


)

In [44]:
columns

MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
           labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=['cty', 'tenor'])

In [45]:
hier_df = DataFrame(np.random.randn(4, 5), columns = columns)

In [46]:
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.818437,0.529596,0.461068,0.820986,0.495715
1,0.03683,-0.535788,0.259759,-1.881591,-1.365679
2,-0.702715,-2.013339,0.299547,-0.220129,-0.333428
3,-0.006247,-0.442238,-0.713599,1.216152,-0.149945


In [47]:
hier_df.groupby(level='cty', axis =1 ).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 数据聚合

In [48]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.99905,-0.094037,a,one
1,1.640705,0.587479,a,two
2,-1.202413,0.539482,b,one
3,1.517189,0.056848,b,two
4,-1.939885,0.142478,a,one


In [49]:
grouped = df.groupby('key1')

In [50]:
grouped

<pandas.core.groupby.DataFrameGroupBy object at 0x7f6195c8d7b8>

In [51]:
grouped['data1']

<pandas.core.groupby.SeriesGroupBy object at 0x7f6195c8d080>

In [52]:
grouped['data1'].quantile(0.9)

key1
a    1.512374
b    1.245229
Name: data1, dtype: float64

In [53]:
def peak_to_peak(arr):
    return arr.max() - arr.min()

In [54]:
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.58059,0.681516
b,2.719602,0.482634


In [55]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.23329,1.909175,-1.939885,-0.470417,0.99905,1.319878,1.640705,3.0,0.211973,0.346032,-0.094037,0.024221,0.142478,0.364979,0.587479
b,2.0,0.157388,1.923049,-1.202413,-0.522512,0.157388,0.837288,1.517189,2.0,0.298165,0.341274,0.056848,0.177507,0.298165,0.418824,0.539482


In [56]:
! ll

/bin/sh: 1: ll: not found


In [57]:
! ls

arr.txt
chapter1_data.txt
comparison of NaN from numpy and None from Python.ipynb
contain_duplicate_columns.csv
df_to_df.ipynb
dual_test_npy.npz
foods-2011-10-03.json
Haiti.csv
how to use DataFrame.loc to select columns&rows assigned AND compare boolean.ipynb
joined_by_contain_duplicate_columns.csv
macrodata.csv
movies.dat
names
NaN_behavior_in_merge.ipynb
pandas_contain_duplicate_columns.ipynb
PortAuPrince_Roads
Python_data_analysis_chapter_1.ipynb
Python_data_analysis_chapter_2.ipynb
Python_data_analysis_chapter_3.ipynb
Python_data_analysis_chapter_4.ipynb
Python_data_analysis_chapter_6.ipynb
Python_data_analysis_chapter_7.ipynb
Python_data_analysis_chapter_8.ipynb
Python_data_analysis_chapter_9.ipynb
ratings.dat
README
spx.csv
test_npy.npy
three_shape1.svg
three_shape2.svg
tips.csv
users.dat


In [58]:
tips = pd.read_csv('tips.csv')

In [59]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [60]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [61]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


## 面向列的多函数应用

In [62]:
grouped = tips.groupby(['sex', 'smoker'])

In [63]:
grouped_pct = grouped['tip_pct']

In [64]:
grouped_pct.agg('mean')

sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64

In [65]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


In [66]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,0.156921,0.036421
Female,Yes,0.18215,0.071595
Male,No,0.160669,0.041849
Male,Yes,0.152771,0.090588


In [67]:
funtions = 'count,mean,max'.split(',')

In [68]:
result = grouped['tip_pct', 'total_bill'].agg(funtions)

In [69]:
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,No,54,0.156921,0.252672,54,18.105185,35.83
Female,Yes,33,0.18215,0.416667,33,17.977879,44.3
Male,No,97,0.160669,0.29199,97,19.791237,48.33
Male,Yes,60,0.152771,0.710345,60,22.2845,50.81


In [70]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,54,0.156921,0.252672
Female,Yes,33,0.18215,0.416667
Male,No,97,0.160669,0.29199
Male,Yes,60,0.152771,0.710345


In [71]:
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]

In [72]:
grouped['tip_pct', 'total_bill'].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,No,0.156921,0.001327,18.105185,53.092422
Female,Yes,0.18215,0.005126,17.977879,84.451517
Male,No,0.160669,0.001751,19.791237,76.152961
Male,Yes,0.152771,0.008206,22.2845,98.244673


### 对不同的列应用不同的函数


In [73]:
grouped.agg({'tip':np.max, 'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,5.2,140
Female,Yes,6.5,74
Male,No,9.0,263
Male,Yes,10.0,150


In [74]:
grouped.agg({
    'tip_pct':'min,max,mean,std'.split(','),
    'size':'sum'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip_pct,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max,mean,std
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,No,140,0.056797,0.252672,0.156921,0.036421
Female,Yes,74,0.056433,0.416667,0.18215,0.071595
Male,No,263,0.071804,0.29199,0.160669,0.041849
Male,Yes,150,0.035638,0.710345,0.152771,0.090588


## 以“无索引”的形式返回聚合数据

In [75]:
tips.groupby('sex,smoker'.split(','), as_index = False).mean()

Unnamed: 0,sex,smoker,total_bill,tip,size,tip_pct
0,Female,No,18.105185,2.773519,2.592593,0.156921
1,Female,Yes,17.977879,2.931515,2.242424,0.18215
2,Male,No,19.791237,3.113402,2.71134,0.160669
3,Male,Yes,22.2845,3.051167,2.5,0.152771


# 分组级运算和转换

In [76]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.99905,-0.094037,a,one
1,1.640705,0.587479,a,two
2,-1.202413,0.539482,b,one
3,1.517189,0.056848,b,two
4,-1.939885,0.142478,a,one


In [77]:
k1_means = df.groupby('key1').mean().add_prefix('mean_')

In [78]:
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.23329,0.211973
b,0.157388,0.298165


In [79]:
pd.merge(df, k1_means, left_on='key1', right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,0.99905,-0.094037,a,one,0.23329,0.211973
1,1.640705,0.587479,a,two,0.23329,0.211973
4,-1.939885,0.142478,a,one,0.23329,0.211973
2,-1.202413,0.539482,b,one,0.157388,0.298165
3,1.517189,0.056848,b,two,0.157388,0.298165


In [80]:
key = 'one,two,one,two,one'.split(',')

In [81]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.978465,1.153454,-0.038719,0.734191,0.405998
Steve,-0.353598,-1.040794,-1.265739,1.38471,0.719764
Wes,1.609915,,,0.578642,0.446704
Jim,0.182879,-2.079879,-0.141831,-1.372441,-0.425942
Travis,-0.173727,2.653995,-0.516163,1.027742,0.23426


In [82]:
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-0.180759,1.903724,-0.277441,0.780192,0.362321
two,-0.085359,-1.560336,-0.703785,0.006134,0.146911


In [83]:
people.groupby(key).transform(np.mean)

Unnamed: 0,a,b,c,d,e
Joe,-0.180759,1.903724,-0.277441,0.780192,0.362321
Steve,-0.085359,-1.560336,-0.703785,0.006134,0.146911
Wes,-0.180759,1.903724,-0.277441,0.780192,0.362321
Jim,-0.085359,-1.560336,-0.703785,0.006134,0.146911
Travis,-0.180759,1.903724,-0.277441,0.780192,0.362321


In [84]:
def demean(arr):
    return arr - arr.mean()


In [85]:
demeaned = people.groupby(key).transform(demean)

In [86]:
demeaned

Unnamed: 0,a,b,c,d,e
Joe,-1.797706,-0.75027,0.238722,-0.046001,0.043677
Steve,-0.268238,0.519543,-0.561954,1.378575,0.572853
Wes,1.790674,,,-0.201549,0.084383
Jim,0.268238,-0.519543,0.561954,-1.378575,-0.572853
Travis,0.007032,0.75027,-0.238722,0.247551,-0.12806


In [87]:
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-2.775558e-17,0.0,1.387779e-17,1.480297e-16,5.5511150000000004e-17
two,0.0,0.0,0.0,0.0,0.0


## apply:一般性的拆分应用和合并

In [92]:
def top(df, n=5, column='tip_pct'):
    return df.sort_values(by=column)[-n:]

In [93]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [91]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [94]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [95]:
tips.groupby('smoker,day'.split(',')).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


In [96]:
result = tips.groupby('smoker')['tip_pct'].describe()

In [102]:
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [98]:
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [101]:
result.unstack('smoker').unstack('smoker')

smoker,No,Yes
count,151.0,93.0
mean,0.159328,0.163196
std,0.03991,0.085119
min,0.056797,0.035638
25%,0.136906,0.106771
50%,0.155625,0.153846
75%,0.185014,0.195059
max,0.29199,0.710345


In [99]:
f = lambda x:x.describe()

In [100]:
grouped.apply(f)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,size,tip_pct
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,No,count,54.0,54.0,54.0,54.0
Female,No,mean,18.105185,2.773519,2.592593,0.156921
Female,No,std,7.286455,1.128425,1.073146,0.036421
Female,No,min,7.25,1.0,1.0,0.056797
Female,No,25%,12.65,2.0,2.0,0.139708
Female,No,50%,16.69,2.68,2.0,0.149691
Female,No,75%,20.8625,3.4375,3.0,0.18163
Female,No,max,35.83,5.2,6.0,0.252672
Female,Yes,count,33.0,33.0,33.0,33.0
Female,Yes,mean,17.977879,2.931515,2.242424,0.18215


## 禁止分组键

In [103]:
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [104]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


## 分位数和桶分析

In [105]:
frame = DataFrame({
        'data1':np.random.randn(1000),
        'data2':np.random.randn(1000),
})

In [107]:
frame.head()

Unnamed: 0,data1,data2
0,-0.773288,1.313637
1,0.274176,0.065855
2,-0.937285,0.20454
3,3.55926,-0.757382
4,0.905283,0.718121


In [108]:
factor = pd.cut(frame.data1, 4)

In [111]:
factor.head()

0    (-2.114, -0.223]
1     (-0.223, 1.668]
2    (-2.114, -0.223]
3      (1.668, 3.559]
4     (-0.223, 1.668]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-4.013, -2.114] < (-2.114, -0.223] < (-0.223, 1.668] < (1.668, 3.559]]

In [112]:
def get_stats(group):
    return {
        'min':group.min(), 'max':group.max(),
        'count':group.count(), 'mean':group.mean()
    }

In [113]:
grouped = frame.data2.groupby(factor)

In [114]:
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f619483a7f0>

In [117]:
grouped1 = grouped.apply(get_stats)
grouped1

data1                  
(-4.013, -2.114]  count     19.000000
                  max        2.664174
                  mean       0.559221
                  min       -1.696209
(-2.114, -0.223]  count    389.000000
                  max        2.669489
                  mean      -0.002269
                  min       -2.968103
(-0.223, 1.668]   count    532.000000
                  max        3.322866
                  mean       0.027052
                  min       -2.961104
(1.668, 3.559]    count     60.000000
                  max        1.366887
                  mean      -0.230178
                  min       -2.333047
Name: data2, dtype: float64

In [118]:
grouped1.rename_axis(['data1', 'inner_index'])

data1             inner_index
(-4.013, -2.114]  count           19.000000
                  max              2.664174
                  mean             0.559221
                  min             -1.696209
(-2.114, -0.223]  count          389.000000
                  max              2.669489
                  mean            -0.002269
                  min             -2.968103
(-0.223, 1.668]   count          532.000000
                  max              3.322866
                  mean             0.027052
                  min             -2.961104
(1.668, 3.559]    count           60.000000
                  max              1.366887
                  mean            -0.230178
                  min             -2.333047
Name: data2, dtype: float64

In [119]:
grouped1.unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-4.013, -2.114]",19.0,2.664174,0.559221,-1.696209
"(-2.114, -0.223]",389.0,2.669489,-0.002269,-2.968103
"(-0.223, 1.668]",532.0,3.322866,0.027052,-2.961104
"(1.668, 3.559]",60.0,1.366887,-0.230178,-2.333047


In [125]:
grouping = pd.qcut(frame.data1, 10, labels = False)

In [126]:
grouping.head()

0    2
1    6
2    1
3    9
4    8
Name: data1, dtype: int64

In [127]:
grouping = pd.qcut(frame.data1, 10)

In [129]:
grouping.head()

0     (-0.847, -0.53]
1      (0.259, 0.519]
2    (-1.263, -0.847]
3      (1.322, 3.559]
4       (0.84, 1.322]
Name: data1, dtype: category
Categories (10, interval[float64]): [(-4.007, -1.263] < (-1.263, -0.847] < (-0.847, -0.53] < (-0.53, -0.251] ... (0.259, 0.519] < (0.519, 0.84] < (0.84, 1.322] < (1.322, 3.559]]

In [130]:
grouped = frame.data2.groupby(grouping)

In [131]:
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f61950afac8>

In [132]:
grouped.apply(get_stats)

data1                  
(-4.007, -1.263]  count    100.000000
                  max        2.664174
                  mean       0.126349
                  min       -2.292074
(-1.263, -0.847]  count    100.000000
                  max        2.397324
                  mean       0.040678
                  min       -2.007415
(-0.847, -0.53]   count    100.000000
                  max        2.380677
                  mean      -0.068717
                  min       -2.632056
(-0.53, -0.251]   count    100.000000
                  max        2.669489
                  mean      -0.030666
                  min       -2.968103
(-0.251, 0.0315]  count    100.000000
                  max        2.682684
                  mean      -0.043839
                  min       -1.865749
(0.0315, 0.259]   count    100.000000
                  max        2.241176
                  mean       0.108288
                  min       -2.391551
(0.259, 0.519]    count    100.000000
                  max     

In [133]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-4.007, -1.263]",100.0,2.664174,0.126349,-2.292074
"(-1.263, -0.847]",100.0,2.397324,0.040678,-2.007415
"(-0.847, -0.53]",100.0,2.380677,-0.068717,-2.632056
"(-0.53, -0.251]",100.0,2.669489,-0.030666,-2.968103
"(-0.251, 0.0315]",100.0,2.682684,-0.043839,-1.865749
"(0.0315, 0.259]",100.0,2.241176,0.108288,-2.391551
"(0.259, 0.519]",100.0,2.115313,-0.043604,-2.498519
"(0.519, 0.84]",100.0,3.322866,-0.001621,-2.794288
"(0.84, 1.322]",100.0,2.086364,0.172444,-2.961104
"(1.322, 3.559]",100.0,2.633902,-0.156076,-2.498664


## 示例：用特定于分组的值填充缺失值

In [134]:
s = Series(np.random.randn(6)) 

In [135]:
s

0   -0.119769
1    1.634720
2   -0.557050
3   -0.317983
4    0.607688
5   -0.077004
dtype: float64

In [138]:
s[::2] = NA

In [139]:
s

0         NaN
1    1.634720
2         NaN
3   -0.317983
4         NaN
5   -0.077004
dtype: float64

In [140]:
s.fillna(s.mean())

0    0.413244
1    1.634720
2    0.413244
3   -0.317983
4    0.413244
5   -0.077004
dtype: float64

In [144]:
states = 'Ohio,New York,Vermont,Florida,Oregon,Nevada,California,Idaho'.split(',')

In [145]:
group_key = ['East'] * 4 + ['West'] * 4

In [146]:
data = Series(np.random.randn(8), index=states)

In [147]:
data

Ohio          0.453170
New York     -0.065638
Vermont      -0.619933
Florida       0.369855
Oregon       -0.243560
Nevada        0.127147
California   -1.055054
Idaho        -0.080726
dtype: float64

In [148]:
data['Vermont,Nevada,Idaho'.split(',')] = np.nan

In [149]:
data

Ohio          0.453170
New York     -0.065638
Vermont            NaN
Florida       0.369855
Oregon       -0.243560
Nevada             NaN
California   -1.055054
Idaho              NaN
dtype: float64

In [150]:
data.groupby(group_key).mean()

East    0.252463
West   -0.649307
dtype: float64

In [152]:
fill_mean = lambda g:g.fillna(g.mean())

In [153]:
data.groupby(group_key).apply(fill_mean)

Ohio          0.453170
New York     -0.065638
Vermont       0.252463
Florida       0.369855
Oregon       -0.243560
Nevada       -0.649307
California   -1.055054
Idaho        -0.649307
dtype: float64

In [154]:
data

Ohio          0.453170
New York     -0.065638
Vermont            NaN
Florida       0.369855
Oregon       -0.243560
Nevada             NaN
California   -1.055054
Idaho              NaN
dtype: float64

In [155]:
fill_values = {
    'East':0.5,
    'West':-1
}

In [156]:
fill_func = lambda g:g.fillna(fill_values[g.name])

In [157]:
data.groupby(group_key).apply(fill_func)

Ohio          0.453170
New York     -0.065638
Vermont       0.500000
Florida       0.369855
Oregon       -0.243560
Nevada       -1.000000
California   -1.055054
Idaho        -1.000000
dtype: float64

## 示例：随机采样和排列



In [158]:
suits = list('HSCD')

In [160]:
card_val = (list(range(1,11)) + [10]*3)*4

In [163]:
base_names = ['A'] + list(range(2, 11)) +list('JKQ') 

In [164]:
base_names

['A', 2, 3, 4, 5, 6, 7, 8, 9, 10, 'J', 'K', 'Q']

In [165]:
cards  = []

In [167]:
for suit in list('HSCD'):
    cards.extend(str(num) + suit for num in base_names)
deck = Series(card_val, index=cards)

In [168]:
deck

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
KS     10
QS     10
AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
KC     10
QC     10
AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
KD     10
QD     10
dtype: int64

In [185]:
def draw(deck, n=5):
    return deck.take(np.random.permutation(len(deck))[:n] )

In [187]:
draw(deck)

5S      5
4S      4
10D    10
10S    10
KD     10
dtype: int64

In [188]:
get_suit = lambda card:card[-1]

In [189]:
deck.groupby(get_suit).apply(draw, n=2)

C  7C      7
   4C      4
D  6D      6
   10D    10
H  JH     10
   10H    10
S  6S      6
   AS      1
dtype: int64

In [191]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

2C      2
7C      7
AD      1
10D    10
5H      5
JH     10
4S      4
QS     10
dtype: int64

## 示例：分组加权平均数和相关系数

In [192]:
df = DataFrame({
    'category':list('a'*4+'b'*4),
    'data':np.random.randn(8),
    'weights':np.random.randn(8)
})

In [193]:
df

Unnamed: 0,category,data,weights
0,a,-1.7517,1.169088
1,a,-0.222518,0.478498
2,a,-0.851611,-0.406632
3,a,-0.212275,0.159139
4,b,-0.102189,-0.568136
5,b,0.596911,0.25111
6,b,-1.293784,-0.387172
7,b,0.105142,-0.446348


In [194]:
grouped = df.groupby('category')

In [201]:
get_wavg = lambda g:np.average(g['data'],weights=g['weights'])

In [202]:
grouped.apply(get_wavg)

category
a   -1.315523
b   -0.575322
dtype: float64

In [203]:
close_px = pd.read_csv('stock_px.csv', parse_dates=True, index_col=0)

In [205]:
close_px.head()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-02,7.4,21.11,29.22,909.03
2003-01-03,7.45,21.14,29.24,908.59
2003-01-06,7.45,21.52,29.96,929.01
2003-01-07,7.43,21.93,28.95,922.93
2003-01-08,7.28,21.31,28.83,909.93


In [206]:
close_px[-4:]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [207]:
close_px.pct_change()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-02,,,,
2003-01-03,0.006757,0.001421,0.000684,-0.000484
2003-01-06,0.000000,0.017975,0.024624,0.022474
2003-01-07,-0.002685,0.019052,-0.033712,-0.006545
2003-01-08,-0.020188,-0.028272,-0.004145,-0.014086
2003-01-09,0.008242,0.029094,0.021159,0.019386
2003-01-10,0.002725,0.001824,-0.013927,0.000000
2003-01-13,-0.005435,0.008648,-0.004134,-0.001412
2003-01-14,-0.002732,0.010379,0.008993,0.005830
2003-01-15,-0.010959,-0.012506,-0.013713,-0.014426


In [208]:
rets = close_px.pct_change().dropna()

In [210]:
rets.head()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-03,0.006757,0.001421,0.000684,-0.000484
2003-01-06,0.0,0.017975,0.024624,0.022474
2003-01-07,-0.002685,0.019052,-0.033712,-0.006545
2003-01-08,-0.020188,-0.028272,-0.004145,-0.014086
2003-01-09,0.008242,0.029094,0.021159,0.019386


In [211]:
spx_corr = lambda x:x.corrwith(x['SPX'])

In [213]:
by_year = rets.groupby(lambda x:x.year)

In [215]:
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [216]:
by_year.apply(lambda g:g['AAPL'].corr(g['MSFT']))  

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

## 示例：面向分组的线性回归

In [219]:
import statsmodels.api as sm

  from pandas.core import datetools


In [220]:
def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1
    result = sm.OLS(Y, X).fit()
    return result.params

In [221]:
by_year.apply(regress, 'AAPL', ['SPX'])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


# 透视表和交叉表

In [222]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
5,25.29,4.71,Male,No,Sun,Dinner,4,0.186240
6,8.77,2.00,Male,No,Sun,Dinner,2,0.228050
7,26.88,3.12,Male,No,Sun,Dinner,4,0.116071
8,15.04,1.96,Male,No,Sun,Dinner,2,0.130319
9,14.78,3.23,Male,No,Sun,Dinner,2,0.218539


In [225]:
tips.pivot_table(index=['sex','smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,No,2.592593,2.773519,0.156921,18.105185
Female,Yes,2.242424,2.931515,0.18215,17.977879
Male,No,2.71134,3.113402,0.160669,19.791237
Male,Yes,2.5,3.051167,0.152771,22.2845


In [228]:
tips.pivot_table('tip_pct,size'.split(','), index='sex,day'.split(','), columns='smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
sex,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,Fri,2.5,2.0,0.165296,0.209129
Female,Sat,2.307692,2.2,0.147993,0.163817
Female,Sun,3.071429,2.5,0.16571,0.237075
Female,Thur,2.48,2.428571,0.155971,0.163073
Male,Fri,2.0,2.125,0.138005,0.14473
Male,Sat,2.65625,2.62963,0.162132,0.139067
Male,Sun,2.883721,2.6,0.158291,0.173964
Male,Thur,2.5,2.3,0.165706,0.164417


In [229]:
tips.pivot_table('tip_pct,size'.split(','), index='sex,day'.split(','), columns='smoker', margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
sex,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,Fri,2.5,2.0,2.111111,0.165296,0.209129,0.199388
Female,Sat,2.307692,2.2,2.25,0.147993,0.163817,0.15647
Female,Sun,3.071429,2.5,2.944444,0.16571,0.237075,0.181569
Female,Thur,2.48,2.428571,2.46875,0.155971,0.163073,0.157525
Male,Fri,2.0,2.125,2.1,0.138005,0.14473,0.143385
Male,Sat,2.65625,2.62963,2.644068,0.162132,0.139067,0.151577
Male,Sun,2.883721,2.6,2.810345,0.158291,0.173964,0.162344
Male,Thur,2.5,2.3,2.433333,0.165706,0.164417,0.165276
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [230]:
tips.pivot_table('tip_pct', index='sex,smoker'.split(','), columns='day',aggfunc=len, margins=True)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,No,2.0,13.0,14.0,25.0,54.0
Female,Yes,7.0,15.0,4.0,7.0,33.0
Male,No,2.0,32.0,43.0,20.0,97.0
Male,Yes,8.0,27.0,15.0,10.0,60.0
All,,19.0,87.0,76.0,62.0,244.0


## 交叉表

In [231]:
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244


## 2012年联邦选举委员会数据库

In [232]:
fec = pd.read_csv('P00000001-ALL.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [234]:
fec

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,3.6601e+08,RETIRED,RETIRED,250.0,20-JUN-11,,,,SA17A,736166
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,3.6601e+08,RETIRED,RETIRED,50.0,23-JUN-11,,,,SA17A,736166
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,3.68633e+08,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,05-JUL-11,,,,SA17A,749073
3,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,NONE,RETIRED,250.0,01-AUG-11,,,,SA17A,749073
4,C00410118,P20002978,"Bachmann, Michelle","WARDENBURG, HAROLD",HOT SPRINGS NATION,AR,7.19016e+08,NONE,RETIRED,300.0,20-JUN-11,,,,SA17A,736166
5,C00410118,P20002978,"Bachmann, Michelle","BECKMAN, JAMES",SPRINGDALE,AR,7.27647e+08,NONE,RETIRED,500.0,23-JUN-11,,,,SA17A,736166
6,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,21-JUN-11,,,,SA17A,736166
7,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,NONE,RETIRED,250.0,05-JUL-11,,,,SA17A,749073
8,C00410118,P20002978,"Bachmann, Michelle","COLLINS, SARAH",MESA,AZ,8.52107e+08,ST. JOSEPH HOSPITAL,RN,250.0,21-JUN-11,,,,SA17A,736166
9,C00410118,P20002978,"Bachmann, Michelle","COLEMAN, RONALD",TUCSON,AZ,8.57499e+08,RAYTHEON,ELECTRICAL ENGINEER,250.0,20-JUN-11,,,,SA17A,736166


In [236]:
fec.ix[123456]

cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object

In [237]:
unique_cards = fec.cand_nm.unique()

In [238]:
unique_cards

array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick', 'Cain, Herman',
       'Gingrich, Newt', 'McCotter, Thaddeus G', 'Huntsman, Jon',
       'Perry, Rick'], dtype=object)

In [241]:
parties = {
    'Bachmann, Michelle':'Republican',
    'Romney, Mitt':'Republican',
    'Obama, Barack':'Democrat',
       "Roemer, Charles E. 'Buddy' III":'Democrat',
    'Pawlenty, Timothy':'Republican',
       'Johnson, Gary Earl':'Republican',
    'Paul, Ron':'Republican',
    'Santorum, Rick':'Republican',
    'Cain, Herman':'Republican',
       'Gingrich, Newt':'Republican',
    'McCotter, Thaddeus G':'Republican', 
    'Huntsman, Jon':'Republican',
       'Perry, Rick':'Democrat'
}

In [242]:
fec.cand_nm[123456:123461].map(parties)

123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object

In [243]:
fec['party'] = fec.cand_nm.map(parties)

In [244]:
fec['party'].value_counts()

Democrat      613241
Republican    388490
Name: party, dtype: int64

In [247]:
(fec.contb_receipt_amt>0).value_counts()

True     991475
False     10256
Name: contb_receipt_amt, dtype: int64

In [248]:
fec = fec[fec.contb_receipt_amt>0]

In [251]:
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])] 

In [252]:
fec_mrbo

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,party
411,C00431171,P80003353,"Romney, Mitt","ELDERBAUM, WILLIAM",DPO,AA,3.4023e+08,US GOVERNMENT,FOREIGN SERVICE OFFICER,25.0,01-FEB-12,,,,SA17A,780124,Republican
412,C00431171,P80003353,"Romney, Mitt","ELDERBAUM, WILLIAM",DPO,AA,3.4023e+08,US GOVERNMENT,FOREIGN SERVICE OFFICER,110.0,01-FEB-12,,,,SA17A,780124,Republican
413,C00431171,P80003353,"Romney, Mitt","CARLSEN, RICHARD",APO,AE,9.128e+07,DEFENSE INTELLIGENCE AGENCY,INTELLIGENCE ANALYST,250.0,13-APR-12,,,,SA17A,785689,Republican
414,C00431171,P80003353,"Romney, Mitt","DELUCA, PIERRE",APO,AE,9.128e+07,CISCO,ENGINEER,30.0,21-AUG-11,,,,SA17A,760261,Republican
415,C00431171,P80003353,"Romney, Mitt","SARGENT, MICHAEL",APO,AE,9.01201e+07,RAYTHEON TECHNICAL SERVICES CORP,COMPUTER SYSTEMS ENGINEER,100.0,07-MAR-12,,,,SA17A,780128,Republican
416,C00431171,P80003353,"Romney, Mitt","WILSON, ANDREW C. MR.",DPO,AE,9.87e+07,US DEPT OF STATE,FOREIGN SERVICE OFFICER,50.0,17-MAR-12,,,,SA17A,780128,Republican
417,C00431171,P80003353,"Romney, Mitt","GRIFFIS, JOHN",APO,AE,9.128e+07,US ARMY,MILITARY OFFICER,250.0,20-MAR-12,,,,SA17A,780128,Republican
418,C00431171,P80003353,"Romney, Mitt","SARGENT, MICHAEL",APO,AE,9.01201e+07,RAYTHEON TECHNICAL SERVICES CORP,COMPUTER SYSTEMS ENGINEER,100.0,09-APR-12,,,,SA17A,785689,Republican
419,C00431171,P80003353,"Romney, Mitt","GRIFFIS, JOHN",APO,AE,9.128e+07,US ARMY,MILITARY OFFICER,250.0,28-JAN-12,,,,SA17A,771933,Republican
420,C00431171,P80003353,"Romney, Mitt","DELUCA, PIERRE MR.",APO,AE,9.128e+07,US ARMY,ENGINEER,50.0,01-FEB-12,,,,SA17A,780124,Republican


## 根据职业和雇主统计赞助信息

In [253]:
fec.contbr_occupation.value_counts()[:10]

RETIRED                                   233990
INFORMATION REQUESTED                      35107
ATTORNEY                                   34286
HOMEMAKER                                  29931
PHYSICIAN                                  23432
INFORMATION REQUESTED PER BEST EFFORTS     21138
ENGINEER                                   14334
TEACHER                                    13990
CONSULTANT                                 13273
PROFESSOR                                  12555
Name: contbr_occupation, dtype: int64

In [254]:
occ_mapping ={
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O.':'CEO'
    
}

In [255]:
f = lambda x:occ_mapping.get(x,x)

In [257]:
fec.contbr_occupation = fec.contbr_occupation.apply(f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [259]:
fec.head()

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,party
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010000.0,RETIRED,RETIRED,250.0,20-JUN-11,,,,SA17A,736166,Republican
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010000.0,RETIRED,RETIRED,50.0,23-JUN-11,,,,SA17A,736166,Republican
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,368633000.0,INFORMATION REQUESTED,NOT PROVIDED,250.0,05-JUL-11,,,,SA17A,749073,Republican
3,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,724548000.0,NONE,RETIRED,250.0,01-AUG-11,,,,SA17A,749073,Republican
4,C00410118,P20002978,"Bachmann, Michelle","WARDENBURG, HAROLD",HOT SPRINGS NATION,AR,719016000.0,NONE,RETIRED,300.0,20-JUN-11,,,,SA17A,736166,Republican


In [262]:
by_occupation = fec.pivot_table('contb_receipt_amt',
                                index = 'contbr_occupation',
                                columns='party',
                                aggfunc='sum'

)

In [263]:
by_occupation

party,Democrat,Republican
contbr_occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
MIXED-MEDIA ARTIST / STORYTELLER,100.0,
AREA VICE PRESIDENT,250.0,
RESEARCH ASSOCIATE,100.0,
TEACHER,500.0,
THERAPIST,3900.0,
'MIS MANAGER,,177.60
(PART-TIME) SALES CONSULTANT & WRITER,,285.00
(RETIRED),250.0,
-,5000.0,2114.80
--,,75.00


In [265]:
over_2mm = by_occupation[by_occupation.sum(1)>2000000]

In [268]:
over_2mm.head()

party,Democrat,Republican
contbr_occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
ATTORNEY,11965397.89,6653779.51
CEO,2640072.58,3645942.73
CONSULTANT,2781327.71,2223310.45
ENGINEER,1156978.95,1612920.3
EXECUTIVE,2100655.99,3393355.15


In [270]:
%matplotlib
over_2mm.plot(kind='barh')

Using matplotlib backend: TkAgg


<matplotlib.axes._subplots.AxesSubplot at 0x7f61847d4f28>

In [280]:
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.sort_values(ascending=False)[n:]

In [272]:
grouped = fec_mrbo.groupby('cand_nm')

In [275]:
get_top_amounts7 = lambda x:get_top_amounts(x,'contbr_occupation', n=7)

In [282]:
grouped.apply(get_top_amounts7).head()

cand_nm        contbr_occupation
Obama, Barack  PROFESSOR            2165071.08
               CEO                  2073284.79
               PRESIDENT            1878509.95
               NOT EMPLOYED         1709188.20
               EXECUTIVE            1355161.05
Name: contb_receipt_amt, dtype: float64

## 对出资金额分组

In [283]:
bins  = np.array([0,1,10,100,1000,10000,100000,1000000,10000000])

In [284]:
bins

array([       0,        1,       10,      100,     1000,    10000,
         100000,  1000000, 10000000])

In [285]:
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)

In [287]:
labels.head()

411      (10, 100]
412    (100, 1000]
413    (100, 1000]
414      (10, 100]
415      (10, 100]
Name: contb_receipt_amt, dtype: category
Categories (8, interval[int64]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]

In [288]:
grouped = fec_mrbo.groupby(['cand_nm', labels])

In [289]:
grouped.size()

cand_nm        contb_receipt_amt  
Obama, Barack  (0, 1]                    493
               (1, 10]                 40070
               (10, 100]              372280
               (100, 1000]            153991
               (1000, 10000]           22284
               (10000, 100000]             2
               (100000, 1000000]           3
               (1000000, 10000000]         4
Romney, Mitt   (0, 1]                     77
               (1, 10]                  3681
               (10, 100]               31853
               (100, 1000]             43357
               (1000, 10000]           26186
               (10000, 100000]             1
dtype: int64

In [291]:
grouped.size().unstack(0)

cand_nm,"Obama, Barack","Romney, Mitt"
contb_receipt_amt,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 1]",493.0,77.0
"(1, 10]",40070.0,3681.0
"(10, 100]",372280.0,31853.0
"(100, 1000]",153991.0,43357.0
"(1000, 10000]",22284.0,26186.0
"(10000, 100000]",2.0,1.0
"(100000, 1000000]",3.0,
"(1000000, 10000000]",4.0,


In [292]:
bucket_sums = grouped.contb_receipt_amt.sum()

In [293]:
bucket_sums

cand_nm        contb_receipt_amt  
Obama, Barack  (0, 1]                      318.24
               (1, 10]                  337267.62
               (10, 100]              20288981.41
               (100, 1000]            54798531.46
               (1000, 10000]          51753705.67
               (10000, 100000]           59100.00
               (100000, 1000000]       1490683.08
               (1000000, 10000000]     7148839.76
Romney, Mitt   (0, 1]                       77.00
               (1, 10]                   29819.66
               (10, 100]               1987783.76
               (100, 1000]            22363381.69
               (1000, 10000]          63942145.42
               (10000, 100000]           12700.00
Name: contb_receipt_amt, dtype: float64

In [294]:
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)

In [295]:
bucket_sums

cand_nm,"Obama, Barack","Romney, Mitt"
contb_receipt_amt,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 1]",318.24,77.0
"(1, 10]",337267.62,29819.66
"(10, 100]",20288981.41,1987783.76
"(100, 1000]",54798531.46,22363381.69
"(1000, 10000]",51753705.67,63942145.42
"(10000, 100000]",59100.0,12700.0
"(100000, 1000000]",1490683.08,
"(1000000, 10000000]",7148839.76,


In [298]:
bucket_sums.sum(axis=1)

contb_receipt_amt
(0, 1]                 3.952400e+02
(1, 10]                3.670873e+05
(10, 100]              2.227677e+07
(100, 1000]            7.716191e+07
(1000, 10000]          1.156959e+08
(10000, 100000]        7.180000e+04
(100000, 1000000]      1.490683e+06
(1000000, 10000000]    7.148840e+06
dtype: float64

In [296]:
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)

In [297]:
normed_sums

cand_nm,"Obama, Barack","Romney, Mitt"
contb_receipt_amt,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 1]",0.805182,0.194818
"(1, 10]",0.918767,0.081233
"(10, 100]",0.910769,0.089231
"(100, 1000]",0.710176,0.289824
"(1000, 10000]",0.447326,0.552674
"(10000, 100000]",0.82312,0.17688
"(100000, 1000000]",1.0,
"(1000000, 10000000]",1.0,


In [299]:
normed_sums[:-2].plot(kind='barh', stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7f61898a3978>

## 根据州统计赞助赞助信息

In [301]:
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])

In [302]:
grouped

<pandas.core.groupby.DataFrameGroupBy object at 0x7f618976cba8>

In [305]:
totals = grouped.contb_receipt_amt.sum().unstack().fillna(0)

In [306]:
totals

contbr_st,AA,AB,AE,AK,AL,AP,AR,AS,AZ,CA,...,UT,VA,VI,VT,WA,WI,WV,WY,XX,ZZ
cand_nm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Obama, Barack",56405.0,2048.0,42973.75,281840.15,543123.48,37130.5,359247.28,2955.0,1506476.98,23824984.24,...,519851.37,4259977.19,80712.0,986510.59,4250933.16,1130155.46,169154.47,194046.74,0.0,5963.0
"Romney, Mitt",135.0,0.0,5680.0,86204.24,527303.51,1655.0,105556.0,0.0,1888436.23,11237636.6,...,3717300.48,3465765.85,3500.0,55229.44,1341521.56,270316.32,126725.12,252595.84,400250.0,0.0


In [307]:
totals= totals[totals.sum(1)>100000]

In [308]:
totals

contbr_st,AA,AB,AE,AK,AL,AP,AR,AS,AZ,CA,...,UT,VA,VI,VT,WA,WI,WV,WY,XX,ZZ
cand_nm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Obama, Barack",56405.0,2048.0,42973.75,281840.15,543123.48,37130.5,359247.28,2955.0,1506476.98,23824984.24,...,519851.37,4259977.19,80712.0,986510.59,4250933.16,1130155.46,169154.47,194046.74,0.0,5963.0
"Romney, Mitt",135.0,0.0,5680.0,86204.24,527303.51,1655.0,105556.0,0.0,1888436.23,11237636.6,...,3717300.48,3465765.85,3500.0,55229.44,1341521.56,270316.32,126725.12,252595.84,400250.0,0.0


In [309]:
totals.sum(1)>100000

cand_nm
Obama, Barack    True
Romney, Mitt     True
dtype: bool

In [311]:
percent = totals.div(totals.sum(1),axis=0)
percent

contbr_st,AA,AB,AE,AK,AL,AP,AR,AS,AZ,CA,...,UT,VA,VI,VT,WA,WI,WV,WY,XX,ZZ
cand_nm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Obama, Barack",0.000415,1.5e-05,0.000316,0.002074,0.003997,0.000273,0.002644,2.2e-05,0.011087,0.175344,...,0.003826,0.031352,0.000594,0.00726,0.031285,0.008318,0.001245,0.001428,0.0,4.4e-05
"Romney, Mitt",2e-06,0.0,6.4e-05,0.000976,0.005969,1.9e-05,0.001195,0.0,0.021378,0.127215,...,0.042081,0.039234,4e-05,0.000625,0.015187,0.00306,0.001435,0.002859,0.004531,0.0


In [312]:
percent = totals.div(totals.sum(1),axis=1)
percent

Unnamed: 0_level_0,AA,AB,AE,AK,AL,AP,AR,AS,AZ,CA,...,UT,VA,VI,VT,WA,WI,WV,WY,XX,ZZ
cand_nm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Obama, Barack",,,,,,,,,,,...,,,,,,,,,,
"Romney, Mitt",,,,,,,,,,,...,,,,,,,,,,


In [313]:
totals.sum(1)

cand_nm
Obama, Barack    1.358756e+08
Romney, Mitt     8.833591e+07
dtype: float64

## 赞助信息的美国地图

In [334]:
from mpl_toolkits.basemap import Basemap , cm
import numpy as np
from matplotlib import rcParams
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt

# from shapelib import ShapeFile
# import dbflib
obama = percent['Obama, Barack']

In [346]:
fig = plt.figure(figsize=(12,12))

In [347]:
ax = fig.add_axes([0.1,0.1,0.8,0.8])

In [348]:
lllat = 21; urlat=53;lllon=-118;urlon=-62

In [349]:
m = Basemap(ax= ax,
           projection='stere',
            lon_0=(urlon + lllon)/2,
            lat_0=(urlat + lllat)/2,
           llcrnrlat = lllat,
            urcrnrlat = urlat,
            llcrnrlon = lllon,
            urcrnrlon = urlon,
            resolution = 'l'
           )

In [350]:
m.drawcoastlines()
m.drawcountries()

<matplotlib.collections.LineCollection at 0x7f615a199ef0>

In [351]:
import dbflib


ImportError: No module named 'dbflib'

In [352]:
from matplotlib import dbflib

ImportError: cannot import name 'dbflib'