In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import pingouin as pg

In [2]:
# Load the HR workbook into a DataFrame and preview its first five rows.
hr = pd.read_excel(io='data/hr.xlsx')
hr.head(n=5)

Unnamed: 0,department,job_level,marriage,rating,overtime
0,Sales,Salaried,single,4,14
1,Engineering,Hourly,single,4,8
2,Engineering,Hourly,single,4,4
3,Engineering,Salaried,married,4,0
4,Engineering,Hourly,married,2,21


In [3]:
# Central tendency: arithmetic mean of the overtime column.
hr['overtime'].mean()

9.153061224489797

In [4]:
# Central tendency: median of the overtime column.
hr['overtime'].median()

6.0

In [5]:
# Most frequent department; mode() returns a Series because ties are possible.
hr['department'].mode()

0    Engineering
dtype: object

In [7]:
# Two toy samples with the same minimum, maximum and mean but different spread.
a = pd.Series(range(10, 51, 10))
b = pd.Series([10] + [30] * 3 + [50])

(a, b)

(0    10
 1    20
 2    30
 3    40
 4    50
 dtype: int64,
 0    10
 1    30
 2    30
 3    30
 4    50
 dtype: int64)

In [8]:
# Range (max - min) of each toy sample, returned as a pair.
tuple(sample.max() - sample.min() for sample in (a, b))

(40, 40)

In [9]:
# Sample variance of each toy sample (ddof=1 is the pandas default, spelled out here).
(a.var(ddof=1), b.var(ddof=1))

(250.0, 200.0)

In [10]:
# Range of the overtime column: largest value minus smallest value.
hr['overtime'].pipe(lambda col: col.max() - col.min())

55

In [11]:
# Interquartile range of overtime: 75th percentile minus 25th percentile.
hr['overtime'].quantile([0.25, 0.75]).diff().iloc[-1]

11.0

In [12]:
# Sample variance of overtime (ddof=1 is the pandas default, spelled out here).
hr['overtime'].var(ddof=1)

89.51161417596366

# 가설 검정(t-test, p-value)

In [13]:
# Null hypothesis: the mean overtime is 0 (overtime = 0).
# One-sample t-test against the constant 0, reporting a 95% confidence interval.
pg.ttest(x=hr['overtime'], y=0, confidence=0.95)

Unnamed: 0,T,dof,tail,p-val,CI95%,cohen-d,BF10,power
T-test,37.092435,1469,two-sided,4.4343470000000003e-213,"[8.67, 9.64]",0.967446,9.627e+208,1.0


In [14]:
# Same one-sample test against 0, but with a wider 99% confidence interval.
pg.ttest(x=hr['overtime'], y=0, confidence=0.99)

Unnamed: 0,T,dof,tail,p-val,CI99%,cohen-d,BF10,power
T-test,37.092435,1469,two-sided,4.4343470000000003e-213,"[8.516613519590663, 9.789508929388928]",0.967446,9.627e+208,1.0


In [19]:
# Null hypothesis: mean overtime = 12; significance level: 5%.
p = pg.ttest(hr.overtime, 12, confidence=0.95)['p-val']
s = 0.05

# `p` is a one-row Series indexed by 'T-test', so the comparison below yields a
# boolean Series rather than a bare bool; True means the null hypothesis is
# rejected at level `s`. It is left as the cell's last expression (no print())
# so the notebook displays it the same way as the other hypothesis-test cells.
p < s

T-test    True
Name: p-val, dtype: bool

# 집단 간 비교

## 두개 집단 평균 비교: t-test

In [23]:
# Load the used-car workbook into a DataFrame and preview its first five rows.
car = pd.read_excel(io='data/car.xlsx')
car.head(n=5)

Unnamed: 0,mileage,model,price,year,my_car_damage,other_car_damage
0,63608,K3,970,2017,0,564596
1,69336,K3,1130,2015,1839700,1140150
2,36000,K3,1380,2016,446520,2244910
3,19029,K3,1390,2017,889000,4196110
4,97090,K3,760,2015,2339137,2029570


In [27]:
# Pull the price column for each model, then show the two group means side by side.
k3_price = car.loc[car['model'] == 'K3', 'price']
avante_price = car.loc[car['model'] == 'Avante', 'price']

(k3_price.mean(), avante_price.mean())

(913.8115942028985, 833.4146341463414)

In [30]:
# Null hypothesis: avante_price.mean() = k3_price.mean()
# Two-sample t-test; keep only the p-value column of the result table.
p = pg.ttest(x=avante_price, y=k3_price, confidence=0.95)['p-val']
s = 0.05  # significance level

# True -> p-value is below the significance level.
p.lt(s)

T-test    True
Name: p-val, dtype: bool

In [49]:
# Null hypothesis: single_r.mean() = married_r.mean()
# Ratings split by marital status, followed by the two group means.
single_r = hr.loc[hr['marriage'] == 'single', 'rating']
married_r = hr.loc[hr['marriage'] == 'married', 'rating']

(single_r.mean(), married_r.mean())

(2.745740498034076, 2.9207920792079207)

In [50]:
# Two-sample t-test on ratings (single vs. married); keep only the p-value column.
p = pg.ttest(x=single_r, y=married_r, confidence=0.95)['p-val']
s = 0.05  # significance level

# True -> p-value is below the significance level.
p.lt(s)

T-test    True
Name: p-val, dtype: bool

In [53]:
# Null hypothesis: single_o.mean() = married_o.mean()
# Overtime split by marital status, followed by the two group means.
single_o = hr.loc[hr['marriage'] == 'single', 'overtime']
married_o = hr.loc[hr['marriage'] == 'married', 'overtime']

(single_o.mean(), married_o.mean())

(9.271297509829619, 9.025459688826025)

In [54]:
# Two-sample t-test on overtime (single vs. married); keep only the p-value column.
p_ = pg.ttest(x=single_o, y=married_o, confidence=0.95)['p-val']
s_ = 0.05  # significance level

# True -> p-value is below the significance level.
p_.lt(s_)

T-test    False
Name: p-val, dtype: bool

In [59]:
# Mean rating per marital status, kept as a one-column DataFrame
# (double brackets select a frame rather than a Series).
hr.groupby('marriage')[['rating']].mean()

Unnamed: 0_level_0,rating
marriage,Unnamed: 1_level_1
married,2.920792
single,2.74574


## 효과 크기

In [60]:
# Full t-test result table for ratings (single vs. married), including the
# effect-size column alongside the p-value.
pg.ttest(x=single_r, y=married_r, confidence=0.95)

Unnamed: 0,T,dof,tail,p-val,CI95%,cohen-d,BF10,power
T-test,-3.126582,1425.438569,two-sided,0.001804,"[-0.28, -0.07]",0.163818,7.294,0.88023


In [62]:
# Effect size between the two rating groups, expressed as eta-square.
pg.compute_effsize(x=single_r, y=married_r, eftype='eta-square')

0.006664360361763945

## 분산 분석

In [64]:
# Load the vehicle-safety workbook into a DataFrame and preview its first five rows.
sf = pd.read_excel(io='data/safety.xlsx')
sf.head(n=5)

Unnamed: 0,risk,fuel,aspiration,doors,body,wheels,engine_location,wheel_base,length,width,height,curb_weight,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
0,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,3.19,3.4,10.0,102,5500,24,30
1,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,3.19,3.4,8.0,115,5500,18,22
2,1,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,3.19,3.4,8.5,110,5500,19,25
3,1,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,3.13,3.4,8.3,140,5500,17,20
4,2,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,3.5,2.8,8.8,101,5800,23,29


In [71]:
# One-way ANOVA of risk across the wheels groups; keep only the uncorrected p-value.
p = pg.anova(data=sf, dv='risk', between='wheels')['p-unc']
s = 0.05  # significance level

# True -> p-value is below the significance level.
p.lt(s)

0    False
Name: p-unc, dtype: bool

In [70]:
# Full one-way ANOVA table (F statistic, p-value, effect size) for risk by wheels.
pg.anova(data=sf, dv='risk', between='wheels')

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,np2
0,wheels,2,156,2.841743,0.061347,0.035152


In [4]:
?pg.ttest