# p385 할차례

# Data Aggregation and Group Operations
p345

In [1]:
import numpy as np
import pandas as pd

In [None]:
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20

In [2]:
np.random.seed(12345)

In [2]:
import matplotlib.pyplot as plt

In [3]:
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [6]:
%matplotlib inline

## GroupBy Mechanics

In [12]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,1.239695,-0.109786,a,one
1,-0.737723,0.031947,a,two
2,-1.16725,-0.867356,b,one
3,1.449556,1.037156,b,two
4,2.164399,1.047967,a,one


##### groupby객체 만으로는 아무것도 안 뜸 ㅇㅇ

In [19]:
#key1기준으로 본 data1. groupby 객체
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x0000000007FCEDD8>

In [22]:
dict(list(grouped))

{'a': 0    1.239695
1   -0.737723
4    2.164399
Name: data1, dtype: float64, 'b': 2   -1.167250
3    1.449556
Name: data1, dtype: float64}


In [20]:
# groupby 객체에서 평균값
grouped.mean()

key1
a    0.746672
b   -0.537585
Name: data1, dtype: float64

In [21]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.880536
      two     0.478943
b     one    -0.519439
      two    -0.555730
Name: data1, dtype: float64

In [22]:
type(means)

pandas.core.series.Series

In [30]:
# import pandas as pd

In [53]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.880536,0.478943
b,-0.519439,-0.55573


In [None]:
means.unstack()

In [None]:
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

In [None]:
df.groupby('key1').mean()
df.groupby(['key1', 'key2']).mean()

In [None]:
df.groupby(['key1', 'key2']).size()

### Iterating Over Groups

In [14]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -0.204708  1.393406    a  one
1  0.478943  0.092908    a  two
4  1.965781  1.246435    a  one
b
      data1     data2 key1 key2
2 -0.519439  0.281746    b  one
3 -0.555730  0.769023    b  two


In [15]:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 'one')
      data1     data2 key1 key2
0 -0.204708  1.393406    a  one
4  1.965781  1.246435    a  one
('a', 'two')
      data1     data2 key1 key2
1  0.478943  0.092908    a  two
('b', 'one')
      data1     data2 key1 key2
2 -0.519439  0.281746    b  one
('b', 'two')
     data1     data2 key1 key2
3 -0.55573  0.769023    b  two


In [18]:
pieces = dict(list(df.groupby('key1')))
pieces['b']
# pieces

Unnamed: 0,data1,data2,key1,key2
2,-0.519439,0.281746,b,one
3,-0.55573,0.769023,b,two


In [55]:
df.dtypes
grouped = df.groupby(df.dtypes, axis=1)

In [56]:
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.204708  1.393406
1  0.478943  0.092908
2 -0.519439  0.281746
3 -0.555730  0.769023
4  1.965781  1.246435
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### Selecting a Column or Subset of Columns

df.groupby('key1')['data1']
df.groupby('key1')[['data2']]

df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])

In [57]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,1.31992
a,two,0.092908
b,one,0.281746
b,two,0.769023


In [58]:
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
s_grouped.mean()

key1  key2
a     one     1.319920
      two     0.092908
b     one     0.281746
      two     0.769023
Name: data2, dtype: float64

### Grouping with Dicts and Series

In [59]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,1.007189,-1.296221,0.274992,0.228913,1.352917
Steve,0.886429,-2.001637,-0.371843,1.669025,-0.43857
Wes,-0.539741,,,-1.021228,-0.577087
Jim,0.124121,0.302614,0.523772,0.00094,1.34381
Travis,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757


In [61]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red'}

In [62]:
by_column = people.groupby(mapping, axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,0.503905,1.063885
Steve,1.297183,-1.553778
Wes,-1.021228,-1.116829
Jim,0.524712,1.770545
Travis,-4.230992,-2.405455


In [63]:
map_series = pd.Series(mapping)
map_series
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### Grouping with Functions
p355

In [None]:
people.groupby(len).sum()

In [None]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

### Grouping by Index Levels

In [None]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

In [None]:
hier_df.groupby(level='cty', axis=1).count()

## Data Aggregation

In [26]:
df

Unnamed: 0,data1,data2,key1,key2
0,1.239695,-0.109786,a,one
1,-0.737723,0.031947,a,two
2,-1.16725,-0.867356,b,one
3,1.449556,1.037156,b,two
4,2.164399,1.047967,a,one


In [27]:
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)

key1
a    1.979458
b    1.187875
Name: data1, dtype: float64

In [29]:
dict(list(grouped))

{'a':       data1     data2 key1 key2
0  1.239695 -0.109786    a  one
1 -0.737723  0.031947    a  two
4  2.164399  1.047967    a  one, 'b':       data1     data2 key1 key2
2 -1.167250 -0.867356    b  one
3  1.449556  1.037156    b  two}

In [15]:
df.groupby('key1')['data1'].quantile(0.9)

### agg는 max, min가능한 숫자만 알아서 연산
그래서 grouped에서 칼럼만 골라내지 않고, 통째로 넣어도 잘 되는 것

In [43]:
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.902122,1.157753
b,2.616806,1.904513


In [30]:
grouped['data1'].agg(peak_to_peak)

key1
a    2.902122
b    2.616806
Name: data1, dtype: float64

In [31]:
grouped['data1'].apply(peak_to_peak)

key1
a    2.902122
b    2.616806
Name: data1, dtype: float64

In [None]:
grouped.describe()

### .apply(fct) 는 골라 넣어주지 않으면 에러!
인자에 숫자만 와야 함
> if not,  unsupported operand type(s) for -: 'float' and 'str'

In [17]:
df.groupby('key1')['data1'].apply(peak_to_peak)

key1
a    2.902122
b    2.616806
Name: data1, dtype: float64

In [1]:
df.groupby('key1').apply(peak_to_peak)

### Column-Wise and Multiple Function Application

#### groupby객체.agg([fct1, fct2, ...])

In [49]:
tips = pd.read_csv('examples/tips.csv')
# Add tip percentage of total bill
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [50]:
grouped = tips.groupby(['day', 'smoker'])

In [51]:
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [37]:
grouped.agg([peak_to_peak])

Unnamed: 0_level_0,data1,data2
Unnamed: 0_level_1,peak_to_peak,peak_to_peak
key1,Unnamed: 1_level_2,Unnamed: 2_level_2
a,2.902122,1.157753
b,2.616806,1.904513


In [42]:
grouped.agg([('최솟',min), 'mean', ('픽2픽',peak_to_peak)])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,최솟,mean,픽2픽,최솟,mean,픽2픽
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,-0.737723,0.88879,2.902122,-0.109786,0.323376,1.157753
b,-1.16725,0.141153,2.616806,-0.867356,0.0849,1.904513


In [47]:
grouped.agg([('최댓',max), 'mean', 'count'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,최댓,mean,count,최댓,mean,count
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,2.164399,0.88879,3,1.047967,0.323376,3
b,1.449556,0.141153,2,1.037156,0.0849,2


In [52]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [53]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [54]:
functions = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [55]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


##### 차피 (칼럼,fct)할거, 깔끔히 list로
p361

In [57]:
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)

<class 'list'>


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [58]:
grouped.agg({'tip' : np.max, 'size' : 'sum'})
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
             'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


### Returning Aggregated Data Without Row Indexes
p362

In [60]:
tips.groupby(['day', 'smoker'], as_index=True).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,18.42,2.8125,2.25,0.15165
Fri,Yes,16.813333,2.714,2.066667,0.174783
Sat,No,19.661778,3.102889,2.555556,0.158048
Sat,Yes,21.276667,2.875476,2.47619,0.147906
Sun,No,20.506667,3.167895,2.929825,0.160113
Sun,Yes,24.12,3.516842,2.578947,0.18725
Thur,No,17.113111,2.673778,2.488889,0.160298
Thur,Yes,19.190588,3.03,2.352941,0.163863


In [427]:
tips.groupby(['day', 'smoker'], as_index=False).mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


## data.groupby(key).transform(fct)
p364

In [436]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,-0.165275,2.608034,-0.173721,1.543488,0.005439
Steve,1.122628,0.356349,-0.260007,-0.248941,0.969121
Wes,0.953721,,,-0.27436,0.350032
Jim,1.479273,-0.085289,0.855879,-1.201002,0.601366
Travis,0.639231,0.148032,0.832724,1.295909,-0.212503


In [437]:
key= ['one','two','one','two','one']

In [438]:
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,0.475892,1.378033,0.329502,0.855012,0.047656
two,1.300951,0.13553,0.297936,-0.724972,0.785244


In [439]:
people.groupby(key).transform(np.mean)

Unnamed: 0,a,b,c,d,e
Joe,0.475892,1.378033,0.329502,0.855012,0.047656
Steve,1.300951,0.13553,0.297936,-0.724972,0.785244
Wes,0.475892,1.378033,0.329502,0.855012,0.047656
Jim,1.300951,0.13553,0.297936,-0.724972,0.785244
Travis,0.475892,1.378033,0.329502,0.855012,0.047656


#### 평균값을 빼고 싶다면
뭔소리??

In [440]:
def demean(arr):
    return arr-arr.mean()

In [441]:
demeaned= people.groupby(key).transform(demean)
demeaned

Unnamed: 0,a,b,c,d,e
Joe,-0.641167,1.230001,-0.503223,0.688476,-0.042217
Steve,-0.178323,0.220819,-0.557943,0.47603,0.183878
Wes,0.477828,,,-1.129372,0.302376
Jim,0.178323,-0.220819,0.557943,-0.47603,-0.183878
Travis,0.163339,-1.230001,0.503223,0.440897,-0.260159


In [442]:
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-3.700743e-17,0.0,0.0,-7.401487e-17,0.0
two,-1.110223e-16,-1.387779e-17,0.0,0.0,0.0


In [451]:
!start .

In [449]:
# %save result/transform.py 436-442

## Apply: General split-apply-combine
p365

In [10]:
def top(df, n=5, column='tip_pct'):
    return df.sort_values(by=column)[-n:]
top(tips, n=16)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
214,28.17,6.5,Yes,Sat,Dinner,3,0.230742
88,24.71,5.85,No,Thur,Lunch,2,0.236746
174,16.82,4.0,Yes,Sun,Dinner,2,0.237812
194,16.58,4.0,Yes,Thur,Lunch,2,0.241255
185,20.69,5.0,No,Sun,Dinner,5,0.241663
181,23.33,5.65,Yes,Sun,Dinner,2,0.242177
51,10.29,2.6,No,Sun,Dinner,2,0.252672
221,13.42,3.48,Yes,Fri,Lunch,2,0.259314
93,16.32,4.3,Yes,Fri,Dinner,2,0.26348
149,7.51,2.0,No,Thur,Lunch,2,0.266312


In [9]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


> apply에 넘기는 함수에 인자를 여럿 넘기는 경우

#### data...apply(fct, param2, param3, ...)

In [452]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Fri,91,22.49,3.50,No,Fri,Dinner,2,0.155625
No,Fri,223,15.98,3.00,No,Fri,Lunch,3,0.187735
No,Fri,99,12.46,1.50,No,Fri,Dinner,2,0.120385
No,Sat,212,48.33,9.00,No,Sat,Dinner,4,0.186220
No,Sat,59,48.27,6.73,No,Sat,Dinner,4,0.139424
No,Sat,23,39.42,7.58,No,Sat,Dinner,4,0.192288
No,Sat,238,35.83,4.67,No,Sat,Dinner,3,0.130338
No,Sat,39,31.27,5.00,No,Sat,Dinner,3,0.159898
No,Sat,239,29.03,5.92,No,Sat,Dinner,3,0.203927


In [458]:
result = tips.groupby('smoker')['tip_pct'].describe()
result
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

f = lambda x: x.describe()
grouped.apply(f)

### Suppressing the Group Keys
groupby 그룹이름을 index로 만들지 않음. group_keys=False

In [461]:
def top(df, n=5, column='tip_pct'):
    return df.sort_values(by=column)[-n:]
# top(tips, n=16)

In [462]:
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [465]:
tips.groupby('smoker', group_keys=True).apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### Quantile and Bucket Analysis
#### 변위치 분석과 버킷분석
p368

In [466]:
frame = pd.DataFrame({'data1': np.random.randn(1000),
                      'data2': np.random.randn(1000)})
frame

Unnamed: 0,data1,data2
0,-1.403417,-0.573991
1,-0.201379,0.033141
2,-0.221990,-0.213133
3,-0.070069,-1.465230
4,0.206460,-0.165568
5,-0.463194,0.741549
6,-0.083519,1.770889
7,2.170929,-1.197838
8,0.233897,-0.648082
9,0.003892,0.690386


In [468]:
quartiles = pd.cut(frame.data1, 4)
quartiles

0      (-1.44, 0.0918]
1      (-1.44, 0.0918]
2      (-1.44, 0.0918]
3      (-1.44, 0.0918]
4      (0.0918, 1.624]
5      (-1.44, 0.0918]
6      (-1.44, 0.0918]
7       (1.624, 3.156]
8      (0.0918, 1.624]
9      (-1.44, 0.0918]
10      (1.624, 3.156]
11     (0.0918, 1.624]
12     (-1.44, 0.0918]
13      (1.624, 3.156]
14     (0.0918, 1.624]
15     (-1.44, 0.0918]
16     (-1.44, 0.0918]
17     (0.0918, 1.624]
18     (0.0918, 1.624]
19     (0.0918, 1.624]
20     (0.0918, 1.624]
21     (0.0918, 1.624]
22     (0.0918, 1.624]
23     (0.0918, 1.624]
24     (-1.44, 0.0918]
25     (0.0918, 1.624]
26     (-2.978, -1.44]
27     (0.0918, 1.624]
28     (-1.44, 0.0918]
29     (0.0918, 1.624]
            ...       
970    (-1.44, 0.0918]
971    (0.0918, 1.624]
972    (-1.44, 0.0918]
973    (-1.44, 0.0918]
974    (0.0918, 1.624]
975    (-1.44, 0.0918]
976    (0.0918, 1.624]
977    (0.0918, 1.624]
978    (0.0918, 1.624]
979    (-1.44, 0.0918]
980    (0.0918, 1.624]
981    (-1.44, 0.0918]
982    (-1.

In [469]:
quartiles[:10]

0    (-1.44, 0.0918]
1    (-1.44, 0.0918]
2    (-1.44, 0.0918]
3    (-1.44, 0.0918]
4    (0.0918, 1.624]
5    (-1.44, 0.0918]
6    (-1.44, 0.0918]
7     (1.624, 3.156]
8    (0.0918, 1.624]
9    (-1.44, 0.0918]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.978, -1.44] < (-1.44, 0.0918] < (0.0918, 1.624] < (1.624, 3.156]]

In [471]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
grouped = frame.data2.groupby(quartiles)

In [477]:
grouped = frame.data2.groupby(quartiles)

In [478]:
grouped.apply(get_stats)

data1                 
(-2.978, -1.44]  count     80.000000
                 max        2.443808
                 mean       0.165706
                 min       -2.879827
(-1.44, 0.0918]  count    452.000000
                 max        2.741751
                 mean      -0.011044
                 min       -3.462111
(0.0918, 1.624]  count    411.000000
                 max        3.178797
                 mean      -0.081387
                 min       -3.942319
(1.624, 3.156]   count     57.000000
                 max        2.269090
                 mean      -0.183248
                 min       -1.656536
Name: data2, dtype: float64

In [479]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.978, -1.44]",80.0,2.443808,0.165706,-2.879827
"(-1.44, 0.0918]",452.0,2.741751,-0.011044,-3.462111
"(0.0918, 1.624]",411.0,3.178797,-0.081387,-3.942319
"(1.624, 3.156]",57.0,2.26909,-0.183248,-1.656536


In [480]:
# Return quantile numbers
grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.443808,0.142893,-3.41254
1,100.0,2.741751,0.057415,-3.079084
2,100.0,2.582006,-0.078146,-2.883187
3,100.0,2.07983,-0.149049,-3.462111
4,100.0,2.437535,0.107914,-2.450386
5,100.0,2.336102,-0.056423,-3.942319
6,100.0,3.178797,-0.117671,-2.505761
7,100.0,2.066329,-0.061843,-1.886632
8,100.0,2.068562,-0.088147,-2.121087
9,100.0,2.406391,-0.113247,-1.656536


### Example: Filling Missing Values with Group-Specific       Values

In [None]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s
s.fillna(s.mean())

In [None]:
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
data

In [None]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data
data.groupby(group_key).mean()

In [None]:
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

In [None]:
fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)

### Example: Random Sampling and Permutation

In [None]:
# Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in ['H', 'S', 'C', 'D']:
    cards.extend(str(num) + suit for num in base_names)

deck = pd.Series(card_val, index=cards)

In [None]:
deck[:13]

In [None]:
def draw(deck, n=5):
    return deck.sample(n)
draw(deck)

In [None]:
get_suit = lambda card: card[-1] # last letter is suit
deck.groupby(get_suit).apply(draw, n=2)

In [None]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

### Example: Group Weighted Average and Correlation

In [None]:
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
                                'b', 'b', 'b', 'b'],
                   'data': np.random.randn(8),
                   'weights': np.random.rand(8)})
df

In [None]:
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)

In [None]:
close_px = pd.read_csv('examples/stock_px_2.csv', parse_dates=True,
                       index_col=0)
close_px.info()
close_px[-4:]

In [None]:
spx_corr = lambda x: x.corrwith(x['SPX'])

In [None]:
rets = close_px.pct_change().dropna()

In [None]:
get_year = lambda x: x.year
by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

In [None]:
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

### Example: Group-Wise Linear Regression

In [None]:
import statsmodels.api as sm
def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params

In [None]:
by_year.apply(regress, 'AAPL', ['SPX'])

## Pivot Tables and Cross-Tabulation
### 피벗 테이블과 교차표
p377
1. 피벗이 더 보기 좋다
    - 기존 groupby- 같은걸 뽑아내긴 한데, 3차 색인이 생겼을것

In [4]:
import pandas as pd

In [5]:
tips = pd.read_csv('examples/tips.csv')
# Add tip percentage of total bill
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [6]:
tips.pivot_table(index=['day', 'smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [7]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [None]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker', margins=True)

In [None]:
tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',
                 aggfunc=len, margins=True)

In [None]:
tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],
                 columns='day', aggfunc='mean', fill_value=0)

### Cross-Tabulations: Crosstab

In [None]:
from io import StringIO
data = """\
Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""
data = pd.read_table(StringIO(data), sep='\s+')

In [None]:
data

In [None]:
pd.crosstab(data.Nationality, data.Handedness, margins=True)

In [None]:
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)

In [None]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS

## 따라치기: 2012년 연방 선거관리위원회 DB
p381

In [2]:
import pandas as pd

In [10]:
pd.read_csv?

In [None]:
#얘는 8MB짜리 작은 거
fec= pd.read_csv('https://raw.githubusercontent.com/lemonbalm/pandas-exercises/master/P00000001-ALL.txt', low_memory=False)

In [17]:
#이건안됨 fec= pd.read_csv('https://github.com/wesm/pydata-book/tree/2nd-edition/datasets/fec/P00000001-ALL.csv', low_memory=False)

In [5]:
#123MB 가까이
fec = pd.read_csv('https://github.com/wesm/pydata-book/blob/2nd-edition/datasets/fec/P00000001-ALL.csv?raw=true'
                  , low_memory=False)

In [18]:
fec

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010290,RETIRED,RETIRED,250.0,20-JUN-11,,,,SA17A,736166
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010290,RETIRED,RETIRED,50.0,23-JUN-11,,,,SA17A,736166
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,368633403,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,05-JUL-11,,,,SA17A,749073
3,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,724548253,NONE,RETIRED,250.0,01-AUG-11,,,,SA17A,749073
4,C00410118,P20002978,"Bachmann, Michelle","WARDENBURG, HAROLD",HOT SPRINGS NATION,AR,719016467,NONE,RETIRED,300.0,20-JUN-11,,,,SA17A,736166
5,C00410118,P20002978,"Bachmann, Michelle","BECKMAN, JAMES",SPRINGDALE,AR,727647190,NONE,RETIRED,500.0,23-JUN-11,,,,SA17A,736166
6,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,724548253,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,21-JUN-11,,,,SA17A,736166
7,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,724548253,NONE,RETIRED,250.0,05-JUL-11,,,,SA17A,749073
8,C00410118,P20002978,"Bachmann, Michelle","COLLINS, SARAH",MESA,AZ,852106725,ST. JOSEPH HOSPITAL,RN,250.0,21-JUN-11,,,,SA17A,736166
9,C00410118,P20002978,"Bachmann, Michelle","COLEMAN, RONALD",TUCSON,AZ,857498865,RAYTHEON,ELECTRICAL ENGINEER,250.0,20-JUN-11,,,,SA17A,736166


In [19]:
fec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
cmte_id              1001731 non-null object
cand_id              1001731 non-null object
cand_nm              1001731 non-null object
contbr_nm            1001731 non-null object
contbr_city          1001712 non-null object
contbr_st            1001727 non-null object
contbr_zip           1001620 non-null object
contbr_employer      988002 non-null object
contbr_occupation    993301 non-null object
contb_receipt_amt    1001731 non-null float64
contb_receipt_dt     1001731 non-null object
receipt_desc         14166 non-null object
memo_cd              92482 non-null object
memo_text            97770 non-null object
form_tp              1001731 non-null object
file_num             1001731 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 122.3+ MB


In [23]:
fec.iloc[123456]

cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object

#### 정당가입여부 추가

In [None]:
unique_cands= fec.cand_nm.unique()

In [61]:
tuc= tuple(unique_cands)
tuc

('Bachmann, Michelle',
 'Romney, Mitt',
 'Obama, Barack',
 "Roemer, Charles E. 'Buddy' III",
 'Pawlenty, Timothy',
 'Johnson, Gary Earl',
 'Paul, Ron',
 'Santorum, Rick',
 'Cain, Herman',
 'Gingrich, Newt',
 'McCotter, Thaddeus G',
 'Huntsman, Jon',
 'Perry, Rick')

In [62]:
print(unique_cands.shape)
print(len(part_val))

(13,)
13


In [63]:
unique_cands[2]

'Obama, Barack'

In [64]:
part_val= ['Republican']*6
part_val.append('Democrat')
part_val+= ['Republican']*6
part_val

['Republican',
 'Republican',
 'Republican',
 'Republican',
 'Republican',
 'Republican',
 'Democrat',
 'Republican',
 'Republican',
 'Republican',
 'Republican',
 'Republican',
 'Republican']

In [65]:
# 안됨 parties= dict(keys= unique_cands, vars= part_val)
# 안됨 parties= dict(unique_cands=part_val)

In [None]:
# python 2 list to dict
dict(zip(unique_cands, part_val))

In [66]:
parties= {}
for key, val in zip(tuc, part_val):
    parties[key]=val

In [67]:
parties

{'Bachmann, Michelle': 'Republican',
 'Cain, Herman': 'Republican',
 'Gingrich, Newt': 'Republican',
 'Huntsman, Jon': 'Republican',
 'Johnson, Gary Earl': 'Republican',
 'McCotter, Thaddeus G': 'Republican',
 'Obama, Barack': 'Republican',
 'Paul, Ron': 'Democrat',
 'Pawlenty, Timothy': 'Republican',
 'Perry, Rick': 'Republican',
 "Roemer, Charles E. 'Buddy' III": 'Republican',
 'Romney, Mitt': 'Republican',
 'Santorum, Rick': 'Republican'}

In [69]:
# %save mkDic 61-67
# !start .

### p383 dictionary 만들기- 필요 코드만

In [111]:
unique_cands= fec.cand_nm.unique()
tuc= tuple(unique_cands)
part_val= ['Republican']*6
part_val.append('Democrat')
part_val+= ['Republican']*6
parties= {}
for key, val in zip(tuc, part_val):
    parties[key]=val
parties

{'Bachmann, Michelle': 'Republican',
 'Cain, Herman': 'Republican',
 'Gingrich, Newt': 'Republican',
 'Huntsman, Jon': 'Republican',
 'Johnson, Gary Earl': 'Republican',
 'McCotter, Thaddeus G': 'Republican',
 'Obama, Barack': 'Republican',
 'Paul, Ron': 'Democrat',
 'Pawlenty, Timothy': 'Republican',
 'Perry, Rick': 'Republican',
 "Roemer, Charles E. 'Buddy' III": 'Republican',
 'Romney, Mitt': 'Republican',
 'Santorum, Rick': 'Republican'}

- 더 좋아보이지만 몰라서 못 쓴 방법
https://stackoverflow.com/questions/12905999/python-dict-how-to-create-key-or-append-an-element-to-key
- dic.setdefault(key,[]).append(value)

In [106]:
# whos

In [110]:
# parties= {}
# parties.setdefault(tuc,[]).append(part_val)
# # 안되는데 ;;>  {(...):[[...]]} 로 들어감

### 후보이름으로부터 정당 배열 알아내기
Series_데이터.map(딕셔너리)
> 데이터로 키 전달 되서, 1:1대응하는 값 뽑아내기

In [71]:
fec.cand_nm[123456:123461]

123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object

In [74]:
#일부만 봄. map()잘 작동하나 보려고
fec.cand_nm[123456:123461].map(parties)

123456    Republican
123457    Republican
123458    Republican
123459    Republican
123460    Republican
Name: cand_nm, dtype: object

#### 새 칼럼 추가

In [75]:
fec['party']= fec.cand_nm.map(parties)
fec['party'].value_counts()

Republican    857974
Democrat      143757
Name: party, dtype: int64

### 기부금만 추려내기
환급금: 기부금이 마이너스

In [81]:
fec.columns

Index(['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city',
       'contbr_st', 'contbr_zip', 'contbr_employer', 'contbr_occupation',
       'contb_receipt_amt', 'contb_receipt_dt', 'receipt_desc', 'memo_cd',
       'memo_text', 'form_tp', 'file_num', 'party'],
      dtype='object')

In [77]:
# fec.contb_receipt_amt

In [79]:
(fec.contb_receipt_amt> 0).value_counts()

True     991475
False     10256
Name: contb_receipt_amt, dtype: int64

In [82]:
fec= fec[fec.contb_receipt_amt>0]

##### 버락오바마와 미트롬니가 양대 후보 이므로 이 2명의 기부금만 추려내기

In [89]:
fec_mrbo= fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]

In [97]:
bm= fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt']).sum()
bm_no= (fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])==False).sum()
print(bm, "나머지: ", bm_no)
print("두 후보 기부금/전체:", bm/(bm+bm_no)*100,"%")

694282 나머지:  297193
두 후보 기부금/전체: 70.0251645276 %


## 직업에 따른 기부 내역 통계
p384

In [102]:
# dict이라 그런지 알아서 정렬되어 있음
# fec.contbr_occupation#.value_counts()[:10]
# fec.contbr_occupation.value_counts()#[:10]
fec.contbr_occupation.value_counts()[:10]

RETIRED                                   233990
INFORMATION REQUESTED                      35107
ATTORNEY                                   34286
HOMEMAKER                                  29931
PHYSICIAN                                  23432
INFORMATION REQUESTED PER BEST EFFORTS     21138
ENGINEER                                   14334
TEACHER                                    13990
CONSULTANT                                 13273
PROFESSOR                                  12555
Name: contbr_occupation, dtype: int64

#### 유형은 같으나, 이름이 다른 경우- 같은걸로 만들기

In [115]:
fec.columns

Index(['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city',
       'contbr_st', 'contbr_zip', 'contbr_employer', 'contbr_occupation',
       'contb_receipt_amt', 'contb_receipt_dt', 'receipt_desc', 'memo_cd',
       'memo_text', 'form_tp', 'file_num', 'party'],
      dtype='object')

In [118]:
con_occu= fec.contbr_occupation.unique()
con_occu.shape

(45068,)

In [119]:
con_emp= fec.contbr_employer.unique()
con_emp.shape

(135643,)

#### split(' ')로 쪼개서 칼럼 따로 만들고, ..
## stemming
1. Err- contbr_occupation 에 float이 있다구 ?? ㄷㄷ
2. Err- 글자가 박살남. RETIRED -> retir. ATTORNEY -> RN

In [121]:
# from nltk.stem import PorterStemmer
# from nltk.tokenize import sent_tokenize, word_tokenize

In [124]:
# ps= PorterStemmer()
# for word in con_occu:
#     print(ps.stem(word))
# # AttributeError: 'float' object has no attribute 'lower'

### stemming 말고 그냥 일단 책대로 ㅇㅇ

p385 할 차례