In [1]:
import pandas as pd
import pingouin as pg

In [2]:
# Load the vehicle safety workbook into a DataFrame and preview the first rows.
sf = pd.read_excel(io='data/safety.xlsx')
sf.head(n=5)

Unnamed: 0,risk,fuel,aspiration,doors,body,wheels,engine_location,wheel_base,length,width,height,curb_weight,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
0,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,3.19,3.4,10.0,102,5500,24,30
1,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,3.19,3.4,8.0,115,5500,18,22
2,1,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,3.19,3.4,8.5,110,5500,19,25
3,1,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,3.13,3.4,8.3,140,5500,17,20
4,2,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,3.5,2.8,8.8,101,5800,23,29


# 기술 통계(descriptive statistics)
    1-1. 데이터가 어디에 몰려 있나?
    1-2. 데이터를 어떻게 나눌 수 있나?
    1-3. 데이터가 어떻게 분포되어 있나?

## 데이터가 어디에 몰려 있나?

### 평균(mean)

In [3]:
# Arithmetic mean of the width column.
width_mean = sf['width'].mean()

### 중앙값(median)

In [4]:
# Median (middle value) of the width column.
width_median = sf['width'].median()

### 최빈값(mode)

In [5]:
# Most frequent body type; mode() returns a Series because ties are possible.
body_mode = sf['body'].mode()

## 데이터를 어떻게 나눌 수 있나?

### 최대값

In [6]:
# Largest observed width.
width_max = sf['width'].max()

### 최소값

In [7]:
# Smallest observed width.
width_min = sf['width'].min()

### 범위

In [9]:
# Range: gap between the largest and the smallest observed width.
width_range = (width_max - width_min)
width_range

11.400000000000006

### 분위수

In [10]:
# First and third quartiles (25th and 75th percentiles) of width.
width_1q, width_3q = (sf['width'].quantile(q) for q in (0.25, 0.75))

### IQR

In [11]:
# Interquartile range: distance between the third and the first quartile.
width_iqr = (width_3q - width_1q)
width_iqr

2.5

## 데이터가 어떻게 분포되어 있나?

### 편차: 평균과의 거리

In [13]:
# Deviation for the row labelled 4, computed as mean minus observation
# (a positive result means this row is narrower than the average).
width_mean - sf.at[4, 'width']

0.8075471698113432

### 분산(variance): 편차 제곱의 평균 (pandas `var()`는 기본적으로 n-1로 나누는 표본 분산)

In [14]:
# Variance of width. pandas defaults to ddof=1, i.e. the sample variance
# (sum of squared deviations divided by n - 1).
sf['width'].var()

3.7942464771913063

### 표준편차(standard deviation): 분산의 제곱근

In [15]:
# Standard deviation of width (square root of the variance above).
sf['width'].std()

1.9478825624742644

# 집단 간 비교

자료형, 비교 집단 수에 따라 분석 방법이 다름

1. t-test: 연속형 자료. 2개 이하 집단.
2. 분산 분석: 연속형 자료. 3개 이상 집단
3. 카이제곱 검정: 범주형 자료.

## t-test

### 1개 집단
- 1개 집단의 표본 평균을 모평균 추정치(기준값)와 비교할 때 사용
- 예시: 초등학교 6학년 평균 키가 160cm라고 할 때, 서울 광진 초등학교 6학년 1반의 평균 키를 측정해 비교

In [25]:
# One-sample t-test on width at a 95% confidence level.
# NOTE(review): the reference value is 0, so the test only asks whether the mean
# width differs from zero (trivially true; the recorded T is ~424.7). Confirm
# whether a meaningful baseline was intended.
t = pg.ttest(x=sf['width'], y=0, confidence=0.95)
t

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,424.707171,158,two-sided,1.6897650000000002e-243,"[65.3, 65.91]",33.68147,5.0310000000000005e+238,1.0


In [26]:
# Statistical significance: is the p-value below the 5% level?
p = t.loc[:, 'p-val']
p.lt(0.05)

T-test    True
Name: p-val, dtype: bool

In [28]:
# Confidence interval reported by the t-test.
c_level = t.loc[:, 'CI95%']
c_level

T-test    [65.3, 65.91]
Name: CI95%, dtype: object

In [30]:
# Load the HR workbook into a DataFrame and preview the first rows.
hr = pd.read_excel(io='data/hr.xlsx')
hr.head(n=5)

Unnamed: 0,department,job_level,marriage,rating,overtime
0,Sales,Salaried,single,4,14
1,Engineering,Hourly,single,4,8
2,Engineering,Hourly,single,4,4
3,Engineering,Salaried,married,4,0
4,Engineering,Hourly,married,2,21


In [32]:
# One-sample t-test: does mean overtime differ from the reference value 12
# at the 5% significance level?
pg.ttest(x=hr['overtime'], y=12)['p-val'].lt(0.05)

T-test    True
Name: p-val, dtype: bool

### 2개 집단
2개 집단의 표본 평균을 비교해서 차이가 통계적으로 유의하면 귀무가설(A - B = 0) 기각

1. 유의수준 5%에서 결혼 여부에 따른 rating 비교

In [46]:
# 1. Split rating into two samples by marital status.
single_rating = hr.loc[hr['marriage'] == 'single', 'rating']
married_rating = hr.loc[hr['marriage'] == 'married', 'rating']

In [47]:
# 2. Two-sample t-test on rating (single vs. married), 95% confidence level.
t = pg.ttest(x=single_rating, y=married_rating, confidence=0.95)
t

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-3.126582,1425.438569,two-sided,0.001804,"[-0.28, -0.07]",0.163818,7.294,0.88023


In [45]:
# 3. Significance check at the 5% level.
t['p-val'].lt(0.05)

T-test    True
Name: p-val, dtype: bool

2. 유의수준 5%에서 결혼 여부에 따른 overtime 비교

In [48]:
# Split overtime into two samples by marital status.
single_overtime = hr.loc[hr['marriage'] == 'single', 'overtime']
married_overtime = hr.loc[hr['marriage'] == 'married', 'overtime']

In [49]:
# Two-sample t-test on overtime (single vs. married).
# Note: this rebinds `t`, replacing the rating test result.
t = pg.ttest(x=single_overtime, y=married_overtime)
t

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.498229,1465.031434,two-sided,0.618398,"[-0.72, 1.21]",0.025978,0.066,0.078781


In [50]:
# Significance check at the 5% level for the overtime comparison.
t['p-val'].lt(0.05)

T-test    False
Name: p-val, dtype: bool

### 효과 크기(effect size)
    1. 코헨의 d
    2. 에타 제곱

In [58]:
# 1. Cohen's d for the rating comparison (single vs. married).
# By this point `t` holds the overtime test, so t['cohen-d'] would report the
# wrong comparison. Re-run the rating test under its own name instead.
rating_ttest = pg.ttest(single_rating, married_rating)

# Only a cell's last expression is rendered automatically, so show this one
# explicitly (display is provided by the IPython kernel).
display(rating_ttest['cohen-d'])

# Alternatively, compute the effect size directly from the two samples.
# In the recorded outputs the t-test table shows 0.163818 and this call returns
# -0.1638: same magnitude, opposite sign.
pg.compute_effsize(single_rating, married_rating, eftype='cohen')

-0.16381785049867503

In [57]:
# 2. Eta-squared effect size for the rating comparison.
pg.compute_effsize(x=single_rating, y=married_rating, eftype='eta-square')

0.006664360361763945

## 분산 분석과 사후 검정

### 분산 분석(ANOVA)

분산이 같다는 가정 하에 ANOVA 분석 실시

1. 유의수준 5%에서 휠 종류에 따라 risk에 차이가 있는지 비교

In [62]:
# Preview the safety data again before running the ANOVA.
sf.head(n=5)

Unnamed: 0,risk,fuel,aspiration,doors,body,wheels,engine_location,wheel_base,length,width,height,curb_weight,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
0,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,3.19,3.4,10.0,102,5500,24,30
1,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,3.19,3.4,8.0,115,5500,18,22
2,1,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,3.19,3.4,8.5,110,5500,19,25
3,1,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,3.13,3.4,8.3,140,5500,17,20
4,2,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,3.5,2.8,8.8,101,5800,23,29


In [67]:
# One-way ANOVA: does risk differ across wheel types?
anova = pg.anova(dv='risk', between='wheels', data=sf)
anova

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,np2
0,wheels,2,156,2.841743,0.061347,0.035152


In [68]:
# Significance check at the 5% level using the uncorrected p-value.
anova['p-unc'].lt(0.05)

0    False
Name: p-unc, dtype: bool

휠 종류에 따른 리스크에는 유의미한 차이가 없음

2. 유의수준 5%에서 인종에 따라 교육 수준(education_num)에 차이가 있는지 비교

In [69]:
# Load the census workbook into a DataFrame and preview the first rows.
census = pd.read_excel(io='data/census.xlsx')
census.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [74]:
# One-way ANOVA: does education_num differ across race groups?
# Rebinds `anova`, replacing the wheels result.
anova = pg.anova(dv='education_num', between='race', data=census)
anova['p-unc'].lt(0.05)

0    True
Name: p-unc, dtype: bool

인종에 따른 교육 수준에는 유의미한 차이가 있음. 사후 검정 진행

### 사후 검정(Tukey HSD)
ANOVA 분석에서 통계적 유의성이 확인되면 사후 검정을 통해 집단 간 차이 비교

In [76]:
# Tukey HSD post-hoc test across race groups; keep only the pairs whose
# adjusted p-value is below the 5% level.
tukey = pg.pairwise_tukey(dv='education_num', between='race', data=census)
tukey.loc[tukey['p-tukey'] < 0.05]

Unnamed: 0,A,B,mean(A),mean(B),diff,se,T,p-tukey,hedges
0,Amer-Indian-Eskimo,Asian-Pac-Islander,9.311897,10.960539,-1.648642,0.1653,-9.973665,0.001,-0.644306
3,Amer-Indian-Eskimo,White,9.311897,10.135246,-0.823349,0.145823,-5.64621,0.001,-0.321943
4,Asian-Pac-Islander,Black,10.960539,9.486236,1.474303,0.091587,16.097347,0.001,0.576389
5,Asian-Pac-Islander,Other,10.960539,8.841328,2.119211,0.174436,12.148936,0.001,0.828195
6,Asian-Pac-Islander,White,10.960539,10.135246,0.825293,0.080807,10.21316,0.001,0.322704
7,Black,Other,9.486236,8.841328,0.644907,0.161947,3.982216,0.001,0.252121
8,Black,White,9.486236,10.135246,-0.64901,0.048256,-13.449361,0.001,-0.253775
9,Other,White,8.841328,10.135246,-1.293917,0.156104,-8.288831,0.001,-0.505944


p-tukey 값이 0.05 미만인 쌍에 대해 hedges 값으로 효과 크기 확인

### 등분산성 검정

- 통상적으로 분산이 같다는 가정 하에 anova 분석과 tukey 검정을 진행
- 정석대로라면 분산 분석 이전에 분산이 같은지 확인하고 그 결과에 따라 분석을 달리해야함
- 만약 분산이 같지 않다면 Welch ANOVA(`pg.welch_anova`) 분석 후에 Games-Howell 검정(`pg.pairwise_gameshowell`)
- `pg.homoscedasticity(data=DataFrame, group='col', dv='col')`

In [77]:
# Equal-variance check for education_num across race groups
# (the recorded output row is labelled "levene").
pg.homoscedasticity(dv='education_num', group='race', data=census)

Unnamed: 0,W,pval,equal_var
levene,45.022712,9.087049999999999e-38,False


## 카이제곱 검정

### 피봇 테이블

In [82]:
# Contingency table: number of rows for each doors x aspiration combination.
pd.pivot_table(sf, index='doors', columns='aspiration', aggfunc='size')

aspiration,std,turbo
doors,Unnamed: 1_level_1,Unnamed: 2_level_1
four,76,19
two,56,8


### 카이제곱 검정