# [transform](https://blog.naver.com/PostView.naver?blogId=sw4r&logNo=222392753166&categoryNo=136&parentCategoryNo=0&viewDate=&currentPage=1&postListTopCurrentPage=1&from=postView)

In [1]:
import pandas as pd
import seaborn as sns

# titanic 데이터셋에서 age, sex 등 5개 열을 선택하여 데이터프레임 만들기
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age','sex', 'class', 'fare', 'survived']]

print('승객 수:', len(df))
display(df.head())

승객 수: 891


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0


In [2]:
# class 열을 기준으로 분할
grouped = df.groupby(['class']) 
print(grouped)
print(0x000001FA2B3CF760)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D2BB294BE0>
2173978867552


In [3]:
# 그룹 객체를 iteration으로 출력: head() 메소드로 첫 5행만을 출력
for key, group in grouped:
    print('* key :', key)
    print('* number :', len(group))    
    display(group.head())
    print()

* key : First
* number : 216


Unnamed: 0,age,sex,class,fare,survived
1,38.0,female,First,71.2833,1
3,35.0,female,First,53.1,1
6,54.0,male,First,51.8625,0
11,58.0,female,First,26.55,1
23,28.0,male,First,35.5,1



* key : Second
* number : 184


Unnamed: 0,age,sex,class,fare,survived
9,14.0,female,Second,30.0708,1
15,55.0,female,Second,16.0,1
17,,male,Second,13.0,1
20,35.0,male,Second,26.0,0
21,34.0,male,Second,13.0,1



* key : Third
* number : 491


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
2,26.0,female,Third,7.925,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0





In [5]:
# 연산 메소드 적용    
average = grouped.mean()                                                                                # 데이터프레임으로 출력된다.
display(average)                                                                            # mean을 사용해 모든 데이터프레임을 계산했기 때문이다.
type(average)

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,38.233441,84.154687,0.62963
Second,29.87763,20.662183,0.472826
Third,25.14062,13.67555,0.242363


pandas.core.frame.DataFrame

In [6]:
average1 = grouped['age'].mean()                                                                         # 하나의 컬럼을 선택하고 mean을 사용하면 시리즈로 출력된다.
display(average1)                                                                         # 여기서 계산한 mean 값을 데이터프레임에 넣기 위해서는 merge를 사용해야한다.
type(average1)

class
First     38.233441
Second    29.877630
Third     25.140620
Name: age, dtype: float64

pandas.core.series.Series

In [8]:
df_1 = average1.rename('age_mean').reset_index()                                          # 이런 식으로 데이터프레임을 만들어야한다.
df_new = df.merge(df_1)
display(df_new)

Unnamed: 0,age,sex,class,fare,survived,age_mean
0,22.0,male,Third,7.2500,0,25.14062
1,26.0,female,Third,7.9250,1,25.14062
2,35.0,male,Third,8.0500,0,25.14062
3,,male,Third,8.4583,0,25.14062
4,2.0,male,Third,21.0750,0,25.14062
...,...,...,...,...,...,...
886,27.0,female,Second,13.8583,1,29.87763
887,28.0,female,Second,24.0000,1,29.87763
888,25.0,female,Second,26.0000,1,29.87763
889,28.0,male,Second,10.5000,0,29.87763


In [None]:
# 위의 내용을 trnasform 을 사용하면 한 줄에 끝낼 수 있다.

In [9]:
df

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.9250,1
3,35.0,female,First,53.1000,1
4,35.0,male,Third,8.0500,0
...,...,...,...,...,...
886,27.0,male,Second,13.0000,0
887,19.0,female,First,30.0000,1
888,,female,Third,23.4500,0
889,26.0,male,First,30.0000,1


In [12]:
df['mean_age'] = df.groupby('class')['age'].transform('mean')
df

Unnamed: 0,age,sex,class,fare,survived,mean_age
0,22.0,male,Third,7.2500,0,25.140620
1,38.0,female,First,71.2833,1,38.233441
2,26.0,female,Third,7.9250,1,25.140620
3,35.0,female,First,53.1000,1,38.233441
4,35.0,male,Third,8.0500,0,25.140620
...,...,...,...,...,...,...
886,27.0,male,Second,13.0000,0,29.877630
887,19.0,female,First,30.0000,1,38.233441
888,,female,Third,23.4500,0,25.140620
889,26.0,male,First,30.0000,1,38.233441


## transform은 한 열을 가지고 계산할 수 있는 상황에서만 사용이 가능하다.
## 그렇기 때문에 모든 열을 끌어 모아 계산할 수 있는 apply 함수와는 다르다고 할 수 있다.

In [15]:
df_p = pd.DataFrame({'a' : [1, 2, 3],
                     'b' : [2, 3, 4],
                     'c' : [5, 6, 7]})
display(df_p)

Unnamed: 0,a,b,c
0,1,2,5
1,2,3,6
2,3,4,7


In [18]:
df_p['d'] = df_p.apply(lambda x: x.a + x.b + x.c, axis=1)
df_p

Unnamed: 0,a,b,c,d
0,1,2,5,8
1,2,3,6,11
2,3,4,7,14
