# `merge()` : 데이터 프레임 병합

- `merge()` 함수는 SQL의 join 과 유사
- 기준이 되는 열/인덱스를 키(key)라고 함
- 키 열이나 인덱스는 반드시 양쪽 데이터 프레임에 모두 존재


In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import xlrd

## `xlrd` : 엑셀 파일 읽기 위한 라이브러리

- 주식 데이터로 데이터 프레임 만들기

In [4]:
# Load the two stock workbooks into DataFrames (default integer row index).
df1 = pd.read_excel('/content/stock price.xlsx')
df2 = pd.read_excel('/content/stock valuation.xlsx')

# Show each frame; for df1 also print its type to confirm it is a DataFrame.
print (df1, '\n', type (df1), '\n')
print (df2, '\n')

       id stock_name          value   price
0  128940       한미약품   59385.666667  421000
1  130960     CJ E&M   58540.666667   98900
2  138250      엔에스쇼핑   14558.666667   13200
3  139480        이마트  239230.833333  254500
4  142280     녹십자엠에스     468.833333   10200
5  145990        삼양사   82750.000000   82000
6  185750        종근당   40293.666667  100500
7  192400      쿠쿠홀딩스  179204.666667  177500
8  199800         툴젠   -2514.333333  115400
9  204210     모두투어리츠    3093.333333    3475 
 <class 'pandas.core.frame.DataFrame'> 

       id       name           eps     bps        per       pbr
0  130960     CJ E&M   6301.333333   54068  15.695091  1.829178
1  136480         하림    274.166667    3551  11.489362  0.887074
2  138040    메리츠금융지주   2122.333333   14894   6.313806  0.899691
3  139480        이마트  18268.166667  295780  13.931338  0.860437
4  145990        삼양사   5741.000000  108090  14.283226  0.758627
5  161390      한국타이어   5648.500000   51341   7.453306  0.820007
6  181710  NHN엔터테인먼트   211

## `pd.merge()`
- 기본 option : `on=None, how='inner'`
  *  `on=None` : 두 데이터 프레임에 공통으로 속하는 열을 키로 병합

In [5]:
# With neither `on` nor `how` given, pd.merge joins on the columns the two
# frames have in common and keeps only the keys present in both (inner join).
merge_inner = pd.merge(left=df1, right=df2)
display(merge_inner)

Unnamed: 0,id,stock_name,value,price,name,eps,bps,per,pbr
0,130960,CJ E&M,58540.666667,98900,CJ E&M,6301.333333,54068,15.695091,1.829178
1,139480,이마트,239230.833333,254500,이마트,18268.166667,295780,13.931338,0.860437
2,145990,삼양사,82750.0,82000,삼양사,5741.0,108090,14.283226,0.758627
3,185750,종근당,40293.666667,100500,종근당,3990.333333,40684,25.185866,2.470259
4,204210,모두투어리츠,3093.333333,3475,모두투어리츠,85.166667,5335,40.802348,0.651359


- 왼쪽 dataframe 기준으로 키 값 분리
- `how='left'` : 왼쪽 df 키 열에 속하는 데이터 값 기준 병합
- `left_on='stock_name', right_on='name'` : 좌우 df에 각각 다르게 키 값 지정, `id_x`, `id_y` 구분하여 표시

In [6]:
# Left join: every row of df1 is kept, and df2 rows are attached where
# df2['name'] equals df1['stock_name']. Both frames also carry an `id` column
# that is not a join key, so the result labels them `id_x` and `id_y`.
merge_left = pd.merge(
    df1,
    df2,
    how='left',
    left_on='stock_name',
    right_on='name',
)
display(merge_left)

Unnamed: 0,id_x,stock_name,value,price,id_y,name,eps,bps,per,pbr
0,128940,한미약품,59385.666667,421000,,,,,,
1,130960,CJ E&M,58540.666667,98900,130960.0,CJ E&M,6301.333333,54068.0,15.695091,1.829178
2,138250,엔에스쇼핑,14558.666667,13200,,,,,,
3,139480,이마트,239230.833333,254500,139480.0,이마트,18268.166667,295780.0,13.931338,0.860437
4,142280,녹십자엠에스,468.833333,10200,,,,,,
5,145990,삼양사,82750.0,82000,145990.0,삼양사,5741.0,108090.0,14.283226,0.758627
6,185750,종근당,40293.666667,100500,185750.0,종근당,3990.333333,40684.0,25.185866,2.470259
7,192400,쿠쿠홀딩스,179204.666667,177500,,,,,,
8,199800,툴젠,-2514.333333,115400,,,,,,
9,204210,모두투어리츠,3093.333333,3475,204210.0,모두투어리츠,85.166667,5335.0,40.802348,0.651359


In [7]:
# Right join: every row of df2 is kept, and df1 rows are attached where
# df1['stock_name'] equals df2['name']; unmatched df1 columns become NaN.
merge_right = pd.merge(
    df1,
    df2,
    how='right',
    left_on='stock_name',
    right_on='name',
)
display(merge_right)

Unnamed: 0,id_x,stock_name,value,price,id_y,name,eps,bps,per,pbr
0,130960.0,CJ E&M,58540.666667,98900.0,130960,CJ E&M,6301.333333,54068,15.695091,1.829178
1,139480.0,이마트,239230.833333,254500.0,139480,이마트,18268.166667,295780,13.931338,0.860437
2,145990.0,삼양사,82750.0,82000.0,145990,삼양사,5741.0,108090,14.283226,0.758627
3,185750.0,종근당,40293.666667,100500.0,185750,종근당,3990.333333,40684,25.185866,2.470259
4,204210.0,모두투어리츠,3093.333333,3475.0,204210,모두투어리츠,85.166667,5335,40.802348,0.651359
5,,,,,136480,하림,274.166667,3551,11.489362,0.887074
6,,,,,138040,메리츠금융지주,2122.333333,14894,6.313806,0.899691
7,,,,,161390,한국타이어,5648.5,51341,7.453306,0.820007
8,,,,,181710,NHN엔터테인먼트,2110.166667,78434,30.755864,0.827447
9,,,,,207940,삼성바이오로직스,4644.166667,60099,89.790059,6.938551


In [8]:
# Keep only the stocks whose price is below 50,000 (boolean-mask selection).
price = df1.loc[df1['price'].lt(50000)]
display(price.head())

Unnamed: 0,id,stock_name,value,price
2,138250,엔에스쇼핑,14558.666667,13200
4,142280,녹십자엠에스,468.833333,10200
9,204210,모두투어리츠,3093.333333,3475


In [9]:
# Inner-join the low-priced stocks with the valuation table on their shared
# column(s); only stocks present in both frames remain.
value = pd.merge(left=price, right=df2)
display(value)

Unnamed: 0,id,stock_name,value,price,name,eps,bps,per,pbr
0,204210,모두투어리츠,3093.333333,3475,모두투어리츠,85.166667,5335,40.802348,0.651359


In [10]:
# Rows of the valuation table that satisfy both conditions:
# PER above 30 and PBR above 0.8.
pcon = df2.loc[df2['per'].gt(30) & df2['pbr'].gt(0.8)]
display(pcon)

Unnamed: 0,id,name,eps,bps,per,pbr
6,181710,NHN엔터테인먼트,2110.166667,78434,30.755864,0.827447
9,207940,삼성바이오로직스,4644.166667,60099,89.790059,6.938551


# 데이터 결합

- pandas `join()` 은 `merge()` 기반으로 만들어졌기 때문에 기본 작동방식 유사
- `join()` 은 두 데이터프레임의 행 인덱스를 기준으로 결합하는 점에서 `merge()`와 차이 있으나, `on=keys` 옵션 설정하면 열 기준으로 병합

In [11]:
# Reload both workbooks, this time using the `id` column as the row index so
# that join() can align the two frames on it.
df1 = pd.read_excel('/content/stock price.xlsx', index_col='id')
df2 = pd.read_excel('/content/stock valuation.xlsx', index_col='id')

print (df1, '\n')
print (df2, '\n')

       stock_name          value   price
id                                      
128940       한미약품   59385.666667  421000
130960     CJ E&M   58540.666667   98900
138250      엔에스쇼핑   14558.666667   13200
139480        이마트  239230.833333  254500
142280     녹십자엠에스     468.833333   10200
145990        삼양사   82750.000000   82000
185750        종근당   40293.666667  100500
192400      쿠쿠홀딩스  179204.666667  177500
199800         툴젠   -2514.333333  115400
204210     모두투어리츠    3093.333333    3475 

             name           eps     bps        per       pbr
id                                                          
130960     CJ E&M   6301.333333   54068  15.695091  1.829178
136480         하림    274.166667    3551  11.489362  0.887074
138040    메리츠금융지주   2122.333333   14894   6.313806  0.899691
139480        이마트  18268.166667  295780  13.931338  0.860437
145990        삼양사   5741.000000  108090  14.283226  0.758627
161390      한국타이어   5648.500000   51341   7.453306  0.820007
181710  NHN엔터테인먼트 

In [12]:
# join() aligns the two frames on their row index; `how='left'` is its default,
# so every df1 row is kept and unmatched df2 columns are filled with NaN.
display(df1.join(df2, how='left'))

Unnamed: 0_level_0,stock_name,value,price,name,eps,bps,per,pbr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
128940,한미약품,59385.666667,421000,,,,,
130960,CJ E&M,58540.666667,98900,CJ E&M,6301.333333,54068.0,15.695091,1.829178
138250,엔에스쇼핑,14558.666667,13200,,,,,
139480,이마트,239230.833333,254500,이마트,18268.166667,295780.0,13.931338,0.860437
142280,녹십자엠에스,468.833333,10200,,,,,
145990,삼양사,82750.0,82000,삼양사,5741.0,108090.0,14.283226,0.758627
185750,종근당,40293.666667,100500,종근당,3990.333333,40684.0,25.185866,2.470259
192400,쿠쿠홀딩스,179204.666667,177500,,,,,
199800,툴젠,-2514.333333,115400,,,,,
204210,모두투어리츠,3093.333333,3475,모두투어리츠,85.166667,5335.0,40.802348,0.651359


In [13]:
# Outer join keyed on `id`. df1 was loaded with index_col='id', so it has no
# `id` column here; `on='id'` therefore refers to df1's index level name and is
# matched against df2's index.
# NOTE(review): a plain df1.join(df2, how='outer') would be the usual
# index-on-index outer join - confirm which result is intended.
display (df1.join(df2, on='id', how='outer'))

Unnamed: 0,id,stock_name,value,price,name,eps,bps,per,pbr
128940.0,128940,한미약품,59385.666667,421000.0,,,,,
130960.0,130960,CJ E&M,58540.666667,98900.0,CJ E&M,6301.333333,54068.0,15.695091,1.829178
138250.0,138250,엔에스쇼핑,14558.666667,13200.0,,,,,
139480.0,139480,이마트,239230.833333,254500.0,이마트,18268.166667,295780.0,13.931338,0.860437
142280.0,142280,녹십자엠에스,468.833333,10200.0,,,,,
145990.0,145990,삼양사,82750.0,82000.0,삼양사,5741.0,108090.0,14.283226,0.758627
185750.0,185750,종근당,40293.666667,100500.0,종근당,3990.333333,40684.0,25.185866,2.470259
192400.0,192400,쿠쿠홀딩스,179204.666667,177500.0,,,,,
199800.0,199800,툴젠,-2514.333333,115400.0,,,,,
204210.0,204210,모두투어리츠,3093.333333,3475.0,모두투어리츠,85.166667,5335.0,40.802348,0.651359


# 그룹연산

- 복잡한 데이터를 어떤 기준에 따라 여러 그룹으로 나누어서 관찰 가능, 이런 방식으로 분할 처리하는 것을 그룹 연산이라 함
- 그룹연산은 데이터를 집계, 변환, 필터링하는데 효율적, pandas `groupby()` 사용
- 그룹 객체 만들기 (분할)
- 그룹 연산 메소드 (적용-결합)

In [14]:
# Load seaborn's bundled titanic dataset and keep only the five columns used
# in the group-operation examples.
titanic = sns.load_dataset('titanic')
df = titanic[['age', 'sex', 'class', 'fare', 'survived']]

print('승객 수 : ', len(df))
display(df.head())

승객 수 :  891


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0


In [15]:
# Split the passengers by cabin class.
# The key is passed as a plain string rather than a one-element list: with a
# list, pandas >= 2.0 yields 1-tuples such as ('First',) when iterating over
# the groups, and recent releases also expect a tuple in get_group(). A scalar
# key keeps the group labels as plain strings ('First', 'Second', 'Third').
grouped = df.groupby('class')
print(grouped)  # a DataFrameGroupBy object; no aggregation has happened yet
# list(grouped)  # uncomment to materialize the (key, sub-frame) pairs

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f31cc9af780>


In [16]:
# Iterating a groupby object yields (group key, sub-DataFrame) pairs.
for key, group in grouped :
    print (' - key : ', key)
    print (' - number : ', len(group))  # number of rows in this group
    print (group.head(), '\n')          # first five rows of this group

 - key :  First
 - number :  216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1 

 - key :  Second
 - number :  184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1 

 - key :  Third
 - number :  491
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0 



In [17]:
# Per-class averages. A mean is only defined for numeric data, and
# pandas >= 2.0 raises TypeError on the string column `sex` instead of
# silently dropping it, so numeric columns are requested explicitly.
print(grouped.mean(numeric_only=True), '\n')
print(grouped.survived.mean(), '\n')  # one column -> a Series

# max/min also work on strings (compared lexicographically), so `sex` is
# included in these results.
print(grouped.max(), '\n')
print(grouped.fare.max(), '\n')

print(grouped.min(), '\n')

              age       fare  survived
class                                 
First   38.233441  84.154687  0.629630
Second  29.877630  20.662183  0.472826
Third   25.140620  13.675550  0.242363 

class
First     0.629630
Second    0.472826
Third     0.242363
Name: survived, dtype: float64 

         age   sex      fare  survived
class                                 
First   80.0  male  512.3292         1
Second  70.0  male   73.5000         1
Third   74.0  male   69.5500         1 

class
First     512.3292
Second     73.5000
Third      69.5500
Name: fare, dtype: float64 

         age     sex  fare  survived
class                               
First   0.92  female   0.0         0
Second  0.67  female   0.0         0
Third   0.42  female   0.0         0 



In [18]:
# Pull out the rows of a single group as an ordinary DataFrame.
group3 = grouped.get_group('Third')
# sample(5) draws five random rows, so the output differs between runs.
display (group3.sample(5))

Unnamed: 0,age,sex,fare,survived
4,35.0,male,8.05,0
459,,male,7.75,0
636,32.0,male,7.925,0
5,,male,8.4583,0
442,25.0,male,7.775,0


In [19]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# columns for third-class passengers.
group3.describe()

Unnamed: 0,age,fare,survived
count,355.0,491.0,491.0
mean,25.14062,13.67555,0.242363
std,12.495398,11.778142,0.428949
min,0.42,0.0,0.0
25%,18.0,7.75,0.0
50%,24.0,8.05,0.0
75%,32.0,15.5,0.0
max,74.0,69.55,1.0


In [20]:
# Same as above for first class: extract the group, then show five random rows.
group1 = grouped.get_group('First')
display (group1.sample(5))

Unnamed: 0,age,sex,fare,survived
453,49.0,male,89.1042,1
3,35.0,female,53.1,1
434,50.0,male,55.9,0
139,24.0,male,79.2,0
1,38.0,female,71.2833,1


In [21]:
# Optional quick check of the mean fare of each class:
# print (group1.fare.mean(), '\n', group3.fare.mean())
# Show the summary statistics of first and third class one after the other
# for comparison.
display (group1.describe(), group3.describe())
# Optional: number of missing values per column in first class:
# print (group1.isnull().sum())

Unnamed: 0,age,fare,survived
count,186.0,216.0,216.0
mean,38.233441,84.154687,0.62963
std,14.802856,78.380373,0.484026
min,0.92,0.0,0.0
25%,27.0,30.92395,0.0
50%,37.0,60.2875,1.0
75%,49.0,93.5,1.0
max,80.0,512.3292,1.0


Unnamed: 0,age,fare,survived
count,355.0,491.0,491.0
mean,25.14062,13.67555,0.242363
std,12.495398,11.778142,0.428949
min,0.42,0.0,0.0
25%,18.0,7.75,0.0
50%,24.0,8.05,0.0
75%,32.0,15.5,0.0
max,74.0,69.55,1.0


In [22]:
# describe() returns a DataFrame whose columns are the original numeric
# columns, so `.fare` selects the fare statistics as a Series.
print (grouped.get_group('First').describe().fare)
print (type(grouped.get_group('First').describe()))
# The statistics are the row labels, so a single value is read row-first:
# print (grouped.get_group('First').describe().loc['std', 'fare'])

count    216.000000
mean      84.154687
std       78.380373
min        0.000000
25%       30.923950
50%       60.287500
75%       93.500000
max      512.329200
Name: fare, dtype: float64
<class 'pandas.core.frame.DataFrame'>


In [0]:
# Group by two keys at once; each group is identified by a (class, sex) tuple.
grouped_two = df.groupby(['class', 'sex'])

In [24]:
# Walk through every (class, sex) combination and preview its rows.
for key, group in grouped_two :
    print (' - key : ', key)             # key is a (class, sex) tuple
    print (' - numbers : ', len(group))  # number of rows in this group
    display (group.head())
    print ()

 - key :  ('First', 'female')
 - numbers :  94


Unnamed: 0,age,sex,class,fare,survived
1,38.0,female,First,71.2833,1
3,35.0,female,First,53.1,1
11,58.0,female,First,26.55,1
31,,female,First,146.5208,1
52,49.0,female,First,76.7292,1



 - key :  ('First', 'male')
 - numbers :  122


Unnamed: 0,age,sex,class,fare,survived
6,54.0,male,First,51.8625,0
23,28.0,male,First,35.5,1
27,19.0,male,First,263.0,0
30,40.0,male,First,27.7208,0
34,28.0,male,First,82.1708,0



 - key :  ('Second', 'female')
 - numbers :  76


Unnamed: 0,age,sex,class,fare,survived
9,14.0,female,Second,30.0708,1
15,55.0,female,Second,16.0,1
41,27.0,female,Second,21.0,0
43,3.0,female,Second,41.5792,1
53,29.0,female,Second,26.0,1



 - key :  ('Second', 'male')
 - numbers :  108


Unnamed: 0,age,sex,class,fare,survived
17,,male,Second,13.0,1
20,35.0,male,Second,26.0,0
21,34.0,male,Second,13.0,1
33,66.0,male,Second,10.5,0
70,32.0,male,Second,10.5,0



 - key :  ('Third', 'female')
 - numbers :  144


Unnamed: 0,age,sex,class,fare,survived
2,26.0,female,Third,7.925,1
8,27.0,female,Third,11.1333,1
10,4.0,female,Third,16.7,1
14,14.0,female,Third,7.8542,0
18,31.0,female,Third,18.0,0



 - key :  ('Third', 'male')
 - numbers :  347


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0
12,20.0,male,Third,8.05,0





In [25]:
# Mean of the numeric columns for each (class, sex) group, computed first by
# hand inside the loop and then in a single call on the groupby object.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# columns (`sex`, and `class` inside each sub-frame) raises TypeError instead
# of being skipped.
for key, group in grouped_two:
    print(key)
    print(group.mean(numeric_only=True))
    print()

print(grouped_two.mean(numeric_only=True))

('First', 'female')
age          34.611765
fare        106.125798
survived      0.968085
dtype: float64

('First', 'male')
age         41.281386
fare        67.226127
survived     0.368852
dtype: float64

('Second', 'female')
age         28.722973
fare        21.970121
survived     0.921053
dtype: float64

('Second', 'male')
age         30.740707
fare        19.741782
survived     0.157407
dtype: float64

('Third', 'female')
age         21.75000
fare        16.11881
survived     0.50000
dtype: float64

('Third', 'male')
age         26.507589
fare        12.661633
survived     0.135447
dtype: float64

                     age        fare  survived
class  sex                                    
First  female  34.611765  106.125798  0.968085
       male    41.281386   67.226127  0.368852
Second female  28.722973   21.970121  0.921053
       male    30.740707   19.741782  0.157407
Third  female  21.750000   16.118810  0.500000
       male    26.507589   12.661633  0.135447


In [26]:
# With several grouping keys, get_group takes the key as a tuple.
group3f = grouped_two.get_group(('Third','female'))
print (group3f.head())

     age     sex  class     fare  survived
2   26.0  female  Third   7.9250         1
8   27.0  female  Third  11.1333         1
10   4.0  female  Third  16.7000         1
14  14.0  female  Third   7.8542         0
18  31.0  female  Third  18.0000         0


# 적용 - 결합
- `agg`
- `transform`
- `filter`
- `apply`

In [0]:
# Reload the dataset and rebuild the per-class grouping used by the
# agg / transform / filter / apply examples that follow.
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age','sex','class','fare','survived']]

grouped = df.groupby(['class'])

In [28]:
# Standard deviation of every numeric column within each class.
# The numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops the string column `sex` silently and raises TypeError instead; column
# selection also works on older pandas releases.
std_all = grouped[['age', 'fare', 'survived']].std()
print(std_all, '\n', type(std_all))  # several columns -> DataFrame

              age       fare  survived
class                                 
First   14.802856  78.380373  0.484026
Second  14.001077  13.417399  0.500623
Third   12.495398  11.778142  0.428949 
 <class 'pandas.core.frame.DataFrame'>


In [29]:
# Selecting a single column before aggregating gives a Series instead of a
# DataFrame: here, the per-class standard deviation of the fare.
std_fare = grouped['fare'].std()
print(std_fare, '\n', type(std_fare))

class
First     78.380373
Second    13.417399
Third     11.778142
Name: fare, dtype: float64 
 <class 'pandas.core.series.Series'>


## `agg()` 적용 : 사용자 정의 함수를 인수로 전달

In [30]:
def agg_minmax(df):
    """Return the spread of *df*: its maximum minus its minimum.

    Works on any object exposing ``max()`` and ``min()`` whose results can be
    subtracted, e.g. a pandas Series (scalar result) or DataFrame (one value
    per column).
    """
    lowest = df.min()
    highest = df.max()
    return highest - lowest

# print (agg_minmax(std_fare))

# agg() applies the user-defined function to each column of each group.
# Only the numeric columns are passed in: subtracting the strings in `sex`
# is undefined, and pandas >= 2.0 raises TypeError for it rather than
# dropping the column.
agg_mm = grouped[['age', 'fare', 'survived']].agg(agg_minmax)
print(agg_mm.head())

          age      fare  survived
class                            
First   79.08  512.3292         1
Second  69.33   73.5000         1
Third   73.58   69.5500         1


In [31]:
# Several aggregations at once: the result has two column levels
# (original column, statistic). 'std' and 'mean' are undefined for the string
# column `sex` and raise TypeError on pandas >= 2.0, so the numeric columns
# are selected explicitly.
agg_all = grouped[['age', 'fare', 'survived']].agg(['max', 'min', 'std', 'mean'])
display(agg_all.head())

Unnamed: 0_level_0,age,age,age,age,fare,fare,fare,fare,survived,survived,survived,survived
Unnamed: 0_level_1,max,min,std,mean,max,min,std,mean,max,min,std,mean
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
First,80.0,0.92,14.802856,38.233441,512.3292,0.0,78.380373,84.154687,1,0,0.484026,0.62963
Second,70.0,0.67,14.001077,29.87763,73.5,0.0,13.417399,20.662183,1,0,0.500623,0.472826
Third,74.0,0.42,12.495398,25.14062,69.55,0.0,11.778142,13.67555,1,0,0.428949,0.242363


In [32]:
# A dict maps each column to its own aggregation(s):
# min and max for the fare, mean for the age.
agg_sep = grouped.agg({
    'fare': ['min', 'max'],
    'age': 'mean',
})
print(agg_sep.head())

       fare                  age
        min       max       mean
class                           
First   0.0  512.3292  38.233441
Second  0.0   73.5000  29.877630
Third   0.0   69.5500  25.140620


In [37]:
# min/max work on the string column `sex` (lexicographic comparison).
print ( grouped.agg({'sex' : ['min', 'max']}).head())
# print ( grouped.agg({'sex' : 'mean'}).head())
# `sex` holds strings, so a mean cannot be computed for it

           sex      
           min   max
class               
First   female  male
Second  female  male
Third   female  male


In [69]:
# df = titanic.loc[:,['age','sex','class','fare','survived']]
# Build the per-class grouping in one expression, straight from `titanic`.
grouped = titanic.loc[:,['age','sex','class','fare','survived']].groupby(['class'])
display (grouped)
# count() returns the number of non-missing values per column in each group,
# which is why `age` can be lower than the other columns.
print (grouped.count())
print (type (grouped.count()))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f31cc953160>

        age  sex  fare  survived
class                           
First   186  216   216       216
Second  173  184   184       184
Third   355  491   491       491
<class 'pandas.core.frame.DataFrame'>


In [74]:
# filter() evaluates the function once per group and keeps all rows of the
# groups for which it returns True - here, classes with more than 200 rows.
grouped_filter = grouped.filter (lambda x : len(x) > 200 )
display (grouped_filter.head())
print (grouped_filter.count())

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0


age         541
sex         707
class       707
fare        707
survived    707
dtype: int64


In [83]:
# Alternative that filters only the `age` column of each group:
# grouped_filter = grouped.age.filter (lambda x : x.mean() < 30 )
# Keep every row of the classes whose average age is below 30.
grouped_filter = grouped.filter (lambda x : x.age.mean() < 30 )
display (grouped_filter)
print (grouped_filter.count())

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
2,26.0,female,Third,7.9250,1
4,35.0,male,Third,8.0500,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.0750,0
...,...,...,...,...,...
884,25.0,male,Third,7.0500,0
885,39.0,female,Third,29.1250,0
886,27.0,male,Second,13.0000,0
888,,female,Third,23.4500,0


age         528
sex         675
class       675
fare        675
survived    675
dtype: int64


In [90]:
# titanic = sns.load_dataset('titanic')
# Reload the data and group it by class in a single expression.
group = sns.load_dataset('titanic').loc[:,['age','sex','class','fare','survived']].groupby(['class'])
# Built-in describe(): one row per class, statistics spread across columns.
display (group.describe())
print ()
# apply() runs describe() on each group's sub-frame and stacks the results:
# one row per (class, statistic) pair.
display (group.apply(lambda x : x.describe()))

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,fare,fare,fare,fare,fare,fare,fare,fare,survived,survived,survived,survived,survived,survived,survived,survived
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
First,186.0,38.233441,14.802856,0.92,27.0,37.0,49.0,80.0,216.0,84.154687,78.380373,0.0,30.92395,60.2875,93.5,512.3292,216.0,0.62963,0.484026,0.0,0.0,1.0,1.0,1.0
Second,173.0,29.87763,14.001077,0.67,23.0,29.0,36.0,70.0,184.0,20.662183,13.417399,0.0,13.0,14.25,26.0,73.5,184.0,0.472826,0.500623,0.0,0.0,0.0,1.0,1.0
Third,355.0,25.14062,12.495398,0.42,18.0,24.0,32.0,74.0,491.0,13.67555,11.778142,0.0,7.75,8.05,15.5,69.55,491.0,0.242363,0.428949,0.0,0.0,0.0,0.0,1.0





Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,count,186.0,216.0,216.0
First,mean,38.233441,84.154687,0.62963
First,std,14.802856,78.380373,0.484026
First,min,0.92,0.0,0.0
First,25%,27.0,30.92395,0.0
First,50%,37.0,60.2875,1.0
First,75%,49.0,93.5,1.0
First,max,80.0,512.3292,1.0
Second,count,173.0,184.0,184.0
Second,mean,29.87763,20.662183,0.472826


In [106]:
def zscore(x):
    """Standardize *x*: subtract its mean, then divide by its standard deviation.

    *x* is expected to offer ``mean()`` and ``std()`` and to support
    element-wise arithmetic, as a pandas Series does.
    """
    centered = x - x.mean()
    return centered / x.std()
# app = group.age.apply(zscore)
# head() on a grouped column returns the first five rows of every group,
# so more than five values are printed.
print (group.age.head(), '\n')
# apply(zscore) standardizes each age against the mean and standard deviation
# of its own class; head() then shows the first five results.
print (group.age.apply(zscore).head())

0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
5      NaN
6     54.0
7      2.0
9     14.0
11    58.0
15    55.0
17     NaN
20    35.0
21    34.0
23    28.0
Name: age, dtype: float64 

0   -0.251342
1   -0.015770
2    0.068776
3   -0.218434
4    0.789041
Name: age, dtype: float64
