### 2.8 그룹 분석

In [1]:
import numpy as np 
import pandas as pd

#### 1. 피벗(pivot) 테이블

In [2]:
# Population per city and census year, plus the region each city belongs to.
# The year column ("연도") is stored as strings, not integers.
# NOTE(review): the last population value (263203, 인천 / 2010) is an order of
# magnitude below 인천's 2015 figure (2890451) -- possibly a dropped digit;
# confirm against the source data before relying on it.
columns = ["도시", "연도", "인구", "지역"]
data = {
    "도시": ["서울"] * 3 + ["부산"] * 3 + ["인천"] * 2,
    "연도": ["2015", "2010", "2005"] * 2 + ["2015", "2010"],
    "인구": [
        9904312, 9631482, 9762546,
        3448737, 3393191, 3512547,
        2890451, 263203,
    ],
    "지역": ["수도권"] * 3 + ["경상권"] * 3 + ["수도권"] * 2,
}
# Passing `columns` fixes the column order of the resulting frame.
df1 = pd.DataFrame(data=data, columns=columns)
df1

Unnamed: 0,도시,연도,인구,지역
0,서울,2015,9904312,수도권
1,서울,2010,9631482,수도권
2,서울,2005,9762546,수도권
3,부산,2015,3448737,경상권
4,부산,2010,3393191,경상권
5,부산,2005,3512547,경상권
6,인천,2015,2890451,수도권
7,인천,2010,263203,수도권


In [3]:
# Reshape to a city-by-year table of population values.
df1.pivot(index='도시', columns='연도', values='인구')

연도,2005,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,3512547.0,3393191.0,3448737.0
서울,9762546.0,9631482.0,9904312.0
인천,,263203.0,2890451.0


In [4]:
# Same reshape, but with a two-level row index of (region, city).
df1.pivot(index=['지역', '도시'], columns='연도', values='인구')

Unnamed: 0_level_0,연도,2005,2010,2015
지역,도시,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
경상권,부산,3512547.0,3393191.0,3448737.0
수도권,서울,9762546.0,9631482.0,9904312.0
수도권,인천,,263203.0,2890451.0


In [5]:
# pivot_table with its default aggregation; for this data it yields the same
# (region, city)-by-year table as the pivot() call on the same arguments.
df1.pivot_table(values='인구', index=['지역', '도시'], columns='연도')

Unnamed: 0_level_0,연도,2005,2010,2015
지역,도시,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
경상권,부산,3512547.0,3393191.0,3448737.0
수도권,서울,9762546.0,9631482.0,9904312.0
수도권,인천,,263203.0,2890451.0


#### 2. 그룹 분석

In [6]:
import seaborn as sns

# Load the two example datasets (iris first, then tips) via seaborn.
iris, tips = (sns.load_dataset(name) for name in ('iris', 'tips'))

- group by

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the iris columns.
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
# Number of rows for each species label.
iris['species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [9]:
# Number of non-null entries in each column, per iris species
iris.groupby(by='species').count()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,50,50,50,50
versicolor,50,50,50,50
virginica,50,50,50,50


In [10]:
# Mean of every feature, per iris species
iris.groupby(by='species').mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [11]:
# Mean sepal length per species: average every feature first, then pick the
# sepal_length column out of the result (the result is a Series).
iris.groupby(by='species').mean().loc[:, 'sepal_length']

species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal_length, dtype: float64

In [12]:
# Same Series as above, but selecting sepal_length before taking the mean.
iris.groupby(by='species').sepal_length.mean()

species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal_length, dtype: float64

In [13]:
# Mean sepal length per species, returned as a one-column DataFrame
iris.loc[:, ['sepal_length', 'species']].groupby(by='species').mean()

Unnamed: 0_level_0,sepal_length
species,Unnamed: 1_level_1
setosa,5.006
versicolor,5.936
virginica,6.588


In [14]:
# Mean and standard deviation of every feature, per species
iris.groupby(by='species').aggregate(['mean', 'std'])

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_width,sepal_width,petal_length,petal_length,petal_width,petal_width
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
setosa,5.006,0.35249,3.428,0.379064,1.462,0.173664,0.246,0.105386
versicolor,5.936,0.516171,2.77,0.313798,4.26,0.469911,1.326,0.197753
virginica,6.588,0.63588,2.974,0.322497,5.552,0.551895,2.026,0.27465


In [15]:
# Per-species mean, standard deviation, maximum and minimum of sepal length
iris.groupby(by='species').sepal_length.aggregate(['mean', 'std', 'max', 'min'])

Unnamed: 0_level_0,mean,std,max,min
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,0.35249,5.8,4.3
versicolor,5.936,0.516171,7.0,4.9
virginica,6.588,0.63588,7.9,4.9


In [16]:
#################################################################################
# From here on the examples use the tips dataset; show its (rows, columns) size.
tips.shape

(244, 7)

In [17]:
# Preview the first five rows of the tips data.
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [18]:
# Add a tip_pct column: tip as a percentage of the total bill, rounded to
# two decimal places.
tips['tip_pct'] = tips['tip'].div(tips['total_bill']).mul(100).round(2)
tips.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
239,29.03,5.92,Male,No,Sat,Dinner,3,20.39
240,27.18,2.0,Female,Yes,Sat,Dinner,2,7.36
241,22.67,2.0,Male,Yes,Sat,Dinner,2,8.82
242,17.82,1.75,Male,No,Sat,Dinner,2,9.82
243,18.78,3.0,Female,No,Thur,Dinner,2,15.97


In [19]:
# Mean, maximum and minimum tip percentage by sex
tips.groupby(by='sex')['tip_pct'].aggregate(['mean', 'max', 'min'])

Unnamed: 0_level_0,mean,max,min
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,15.764713,71.03,3.56
Female,16.648276,41.67,5.64


In [20]:
# Mean, standard deviation, minimum and maximum tip percentage by smoker status
tips.groupby(by='smoker')['tip_pct'].aggregate(['mean', 'std', 'min', 'max'])

Unnamed: 0_level_0,mean,std,min,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Yes,16.31914,8.511748,3.56,71.03
No,15.932318,3.990701,5.68,29.2


In [21]:
# Mean and standard deviation of tip percentage by sex and smoker status
tips.groupby(by=['sex', 'smoker'])['tip_pct'].aggregate(['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,15.276667,9.05849
Male,No,16.066598,4.184634
Female,Yes,18.214545,7.159585
Female,No,15.691111,3.641717


In [22]:
# Count, mean and standard deviation of tip percentage by day and time of day.
# The expected result includes (day, time) pairs that never occur (count 0 for
# Sat/Sun Lunch), which presumably means `day` and `time` are categorical --
# verify with tips.dtypes. For categorical keys that behaviour comes from
# observed=False; it is passed explicitly because relying on the default is
# deprecated in recent pandas releases and the default is changing to True.
tips.groupby(['day', 'time'], observed=False)['tip_pct'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std
day,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thur,Lunch,61,16.129016,3.89698
Thur,Dinner,1,15.97,
Fri,Lunch,7,18.875714,4.588834
Fri,Dinner,12,15.8925,4.703129
Sat,Lunch,0,,
Sat,Dinner,87,15.314598,5.129187
Sun,Lunch,0,,
Sun,Dinner,76,16.689605,8.473462


#### 3. 피벗 테이블(pivot_table)

In [23]:
# Mean tip percentage, with sex as rows and smoker status as columns
tips.pivot_table(values='tip_pct', index='sex', columns='smoker', aggfunc='mean')

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,15.276667,16.066598
Female,18.214545,15.691111


In [24]:
# Totals (margins) feature
# Number of tip_pct entries per sex / smoker status, with row and column
# totals labelled '계'
tips.pivot_table(
    values='tip_pct',
    index='sex',
    columns='smoker',
    aggfunc='count',
    margins=True,
    margins_name='계',
)

smoker,Yes,No,계
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,60,97,157
Female,33,54,87
계,93,151,244


#### 4. groupby vs pivot_table

- 성별 팁 비율의 평균

In [25]:
# groupby version: mean tip percentage by sex, as a one-column DataFrame.
tips.groupby(by='sex')[['tip_pct']].mean()

Unnamed: 0_level_0,tip_pct
sex,Unnamed: 1_level_1
Male,15.764713
Female,16.648276


In [26]:
# pivot_table version of the same computation, with the aggregation spelled out.
tips.pivot_table(values='tip_pct', index='sex', aggfunc='mean')

Unnamed: 0_level_0,tip_pct
sex,Unnamed: 1_level_1
Male,15.764713
Female,16.648276


In [27]:
# When aggfunc is omitted, pivot_table aggregates with the mean.
tips.pivot_table(values='tip_pct', index='sex')

Unnamed: 0_level_0,tip_pct
sex,Unnamed: 1_level_1
Male,15.764713
Female,16.648276


- 흡연 유무에 따른 평균 팁 비율과 최소값, 최대값

In [28]:
# groupby version: mean, minimum and maximum tip percentage by smoker status.
tips.groupby(by='smoker')['tip_pct'].aggregate(['mean', 'min', 'max'])

Unnamed: 0_level_0,mean,min,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Yes,16.31914,3.56,71.03
No,15.932318,5.68,29.2


In [29]:
# pivot_table version: same statistics, but the columns carry an extra level
# naming the aggregated column (tip_pct).
tips.pivot_table(values='tip_pct', index='smoker', aggfunc=['mean', 'min', 'max'])

Unnamed: 0_level_0,mean,min,max
Unnamed: 0_level_1,tip_pct,tip_pct,tip_pct
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Yes,16.31914,3.56,71.03
No,15.932318,5.68,29.2


- 연습문제 4.7.3

In [30]:
# 1. Examine how the tip percentage is affected by the day of the week,
#    lunch vs. dinner, and party size.
# By day of the week. Two spellings of the same summary; a notebook cell only
# displays its last expression, so the groupby result is computed but not shown.
tips.groupby(by='day')['tip_pct'].aggregate(['mean', 'std'])
tips.pivot_table(values='tip_pct', index='day', aggfunc=['mean', 'std'])

Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,tip_pct,tip_pct
day,Unnamed: 1_level_2,Unnamed: 2_level_2
Thur,16.126452,3.864958
Fri,16.991579,4.766803
Sat,15.314598,5.129187
Sun,16.689605,8.473462


In [31]:
# By time of day (lunch / dinner). Only the last expression (pivot_table) is
# displayed by the notebook; the groupby result is computed but not shown.
tips.groupby(by='time')['tip_pct'].aggregate(['mean', 'std'])
tips.pivot_table(values='tip_pct', index='time', aggfunc=['mean', 'std'])

Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,tip_pct,tip_pct
time,Unnamed: 1_level_2,Unnamed: 2_level_2
Lunch,16.411765,4.024006
Dinner,15.951477,6.74751


In [32]:
# By party size. Only the last expression (pivot_table) is displayed by the
# notebook; the groupby result is computed but not shown.
tips.groupby(by='size')['tip_pct'].aggregate(['mean', 'std'])
tips.pivot_table(values='tip_pct', index='size', aggfunc=['mean', 'std'])

Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,tip_pct,tip_pct
size,Unnamed: 1_level_2,Unnamed: 2_level_2
1,21.7275,8.034203
2,16.57141,6.684833
3,15.214737,4.545196
4,14.594865,4.238842
5,14.152,6.773143
6,15.62,4.213716


In [33]:
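# 2. 어떤 요인이 가장 크게 작용하는지 판단할 수 있는 방법이 있는가?
# 그룹별 표준편차의 차이가 가장 큰 요일이 팁 비율에 가장 큰 영향을 끼친 것으로 보았다.
# 다만 요인의 영향력은 보통 그룹 간 평균의 차이를 그룹 내 산포와 비교하여(예: 분산분석) 판단하며,
# 위 결과에서 그룹별 평균의 차이는 인원수별(14.15 ~ 21.73)이 가장 크다.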

- 연습문제 4.7.4

In [37]:
# Load the titanic example dataset via seaborn and preview the first three rows.
df = sns.load_dataset('titanic')
df.head(n=3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [38]:
# 1. Use qcut to split passengers into three quantile-based age groups,
#    labelled A1, A2 and A3.
df['age_group'] = pd.qcut(x=df['age'], q=3, labels=['A1', 'A2', 'A3'])

In [39]:
# 2. Survival rate by sex, cabin class and age group, as a DataFrame
#    (the default aggregation is the mean of the `survived` column).
df.pivot_table(values='survived', index=['sex', 'class'], columns='age_group')

Unnamed: 0_level_0,age_group,A1,A2,A3
sex,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,First,0.954545,0.947368,0.977273
female,Second,1.0,0.909091,0.857143
female,Third,0.508475,0.481481,0.25
male,First,0.5,0.5,0.347826
male,Second,0.357143,0.076923,0.0625
male,Third,0.158879,0.195652,0.055556


In [40]:
# 3. Survival rate by sex (rows) and cabin class (columns)
df.pivot_table(values='survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [42]:
# groupby version of the same survival rates, in long form with a
# (sex, class) row index.
df.groupby(by=['sex', 'class'])[['survived']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,class,Unnamed: 2_level_1
female,First,0.968085
female,Second,0.921053
female,Third,0.5
male,First,0.368852
male,Second,0.157407
male,Third,0.135447
