In [1]:
import pandas as pd
import numpy as np
np.random.seed(104)
df = pd.read_csv('data/MDA_09_coffee_dataset.csv')
print(df.info())
print(df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2974 entries, 0 to 2973
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        2974 non-null   int64  
 1   age            2974 non-null   object 
 2   drinks_coffee  2974 non-null   bool   
 3   height         2974 non-null   float64
dtypes: bool(1), float64(1), int64(1), object(1)
memory usage: 72.7+ KB
None
   user_id   age  drinks_coffee     height
0     4509   <21          False  64.538179
1     1864  >=21           True  65.824249
2     2060   <21          False  71.319854
3     7875  >=21           True  68.569404
4     6254   <21           True  64.020226


In [2]:
# 모집단에서 무작위로 200개의 샘플을 추출합니다.
df_sample = df.sample(200)
print(df_sample.info())
print(df_sample.head())


<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 486 to 914
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        200 non-null    int64  
 1   age            200 non-null    object 
 2   drinks_coffee  200 non-null    bool   
 3   height         200 non-null    float64
dtypes: bool(1), float64(1), int64(1), object(1)
memory usage: 6.4+ KB
None
      user_id   age  drinks_coffee     height
486      4951   <21          False  68.539632
2297     5383  >=21           True  70.594166
911      3904   <21          False  61.114035
146      4351  >=21           True  68.952444
2255     3998   <21          False  65.493818


In [3]:
##### 부트스트랩을 이용한 신뢰 구간 #####
# 커피를 마시지 않는 사람과 커피를 마시는 사람 사이의 키 차이에 대한 99.7% 신뢰 구간을 찾기 위해 부트스트랩을 10,000번 반복합니다.
# 1. 커피를 마시지 않는 사람과 커피를 마시는 사람의 평균 키 차이
iterationNum = 1000
diffHeightList = []
for _ in range(iterationNum):
    bootSample = df_sample.sample(200, replace=True) # 복원 추출(뺀게 사라지지 않음)
    nonCoffeeHeightMean = bootSample[bootSample['drinks_coffee'] == False].height.mean()
    # 커피를 마시는 사람의 평균 키
    coffeeHeightMean = bootSample[bootSample['drinks_coffee'] == True].height.mean()
    diff = nonCoffeeHeightMean - coffeeHeightMean
    diffHeightList.append(diff)
print(diffHeightList)


[-2.3929667770768503, -2.0647578175229455, -2.32933016661255, -2.6223135326238918, -2.5263249774364596, -2.1554450102931924, -2.072318551245317, -2.4804111167239853, -2.2028504931912494, -2.7655965824984747, -1.8479659231709746, -1.9811933635816814, -1.9115400542464727, -1.7141440784259743, -2.2514928597209263, -2.707708321185521, -1.697445299755742, -2.45954446934752, -1.3526580509481931, -2.9077439480872016, -1.555719804323843, -2.6004039920906052, -2.1863029384512913, -1.9707099488349797, -1.8899046345292447, -2.5872088550983534, -1.9516748565569912, -2.530864326736946, -1.3770599285108034, -1.5617915080703568, -2.4739879363049653, -2.8362440094340826, -2.711936457278867, -2.3251594966263696, -2.1404143201666983, -2.2922368871256964, -2.2381982122964814, -1.6652385646013812, -1.6748272599637346, -1.911526735231476, -2.37524377636538, -1.21913423271522, -1.5087655634795425, -2.2129684639498635, -1.6581479117318736, -2.162129632618374, -1.551961033315564, -1.9382113696158285, -2.22181

In [4]:
print("mean of height diff:", np.mean(diffHeightList))
print("SE of Height diff:", np.std(diffHeightList))
print("Lowerbound(0.3):", np.percentile(diffHeightList, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightList, 99.7))

mean of height diff: -1.9686647364200545
SE of Height diff: 0.46858231596131156
Lowerbound(0.3): -3.1053882693534907
Uppperbound(99.7): -0.8117815664737708


실제 모집단이 잘 모사되었는지 확인

In [5]:
print("##### Height differences in the population #####")
# # 1. Average height difference between non-coffee drinkers and coffee drinkers
diffHeight = df[df['drinks_coffee'] == False].height.mean() - df[df['drinks_coffee'] == True].height.mean()
print("diffHeight : ",diffHeight)

##### Height differences in the population #####
diffHeight :  -1.9568024933368093


In [6]:
print("Lowerbound(0.3):", np.percentile(diffHeightList, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightList, 99.7))

Lowerbound(0.3): -3.1053882693534907
Uppperbound(99.7): -0.8117815664737708


심슨의 역설 테스트

In [7]:
# 2. Average height difference between people over 21 years old and under 21 years old
diffHeightListByAge = []
for _ in range(iterationNum):
    bootSample = df_sample.sample(200, replace=True) # sampling with replacement
    over21HeightMean = bootSample[bootSample['age'] == '>=21'].height.mean() # Avg.Height for over 21
    under21HeightMean = bootSample[bootSample['age'] == '<21'].height.mean() # Avg.Height for under 21
    diff = over21HeightMean - under21HeightMean
    diffHeightListByAge.append(diff)
# When the confidence level is 99%.7, the confidence interval for the average height difference
print("Lowerbound(0.3):", np.percentile(diffHeightListByAge, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightListByAge, 99.7))

Lowerbound(0.3): 3.1355516936531616
Uppperbound(99.7): 5.016793651153235


In [8]:
# 3. Average height difference between non-coffee drinkers and coffee drinkers among people under 21 years of age
diffHeightListUnder21 = []
for _ in range(iterationNum):
    bootSample = df_sample.sample(200, replace=True) # sampling with replacement
    # Average height of people under 21 years of age who do not drink coffee
    nonCoffeeHeightMeanUnder21 = bootSample.query("age == '<21' and drinks_coffee == False").height.mean()
    # Average height of people under 21 years of age who drink coffee
    coffeeHeightMeanUnder21 = bootSample.query("age == '<21' and drinks_coffee == True").height.mean()
    diff = nonCoffeeHeightMeanUnder21 - coffeeHeightMeanUnder21
    diffHeightListUnder21.append(diff)
# When the confidence level is 99%.7, the confidence interval for the average height difference
print("Lowerbound(0.3):", np.percentile(diffHeightListUnder21, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightListUnder21, 99.7))

Lowerbound(0.3): 0.26628329321498534
Uppperbound(99.7): 2.7262103458140956


In [9]:
# 4. # Average height difference between non-coffee drinkers and coffee drinkers among people over 21 years of age
diffHeightListOver21 = []
for _ in range(iterationNum):
    bootSample = df_sample.sample(200, replace=True) # sampling with replacement
    # Average height of people over 21 years of age who do not drink coffee
    nonCoffeeHeightMeanOver21 = bootSample.query("age != '<21' and drinks_coffee == False").height.mean()
    # Average height of people over 21 years of age who drink coffee
    coffeeHeightMeanOver21 = bootSample.query("age != '<21' and drinks_coffee == True").height.mean()
    diff = nonCoffeeHeightMeanOver21 - coffeeHeightMeanOver21
    diffHeightListOver21.append(diff)
# When the confidence level is 99%.7, the confidence interval for the average height difference
print("Lowerbound(0.3):", np.percentile(diffHeightListOver21, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightListOver21, 99.7))

Lowerbound(0.3): 0.44352554901581825
Uppperbound(99.7): 3.3338172914183075


In [10]:
print("##### Height differences in the population #####")
# # 1. Average height difference between non-coffee drinkers and coffee drinkers
diffHeight = df[df['drinks_coffee'] == False].height.mean() - df[df['drinks_coffee'] == True].height.mean()
print("1. diffHeight : ",diffHeight)
# 2. Average height difference between people over 21 years old and under 21 years old
diffHeightByAge = df[df['age'] == '>=21'].height.mean() - df[df['age'] == '<21'].height.mean()
print("2. diffHeight : ",diffHeightByAge)
# 3. Average height difference between non-coffee drinkers and coffee drinkers among people under 21 years of age
diffHeightUnder21 = df.query("age == '<21' and drinks_coffee == False").height.mean() - df.query("age == '<21' and drinks_coffee == True").height.mean()
print("3. diffHeight : ",diffHeightUnder21)
# 4. Average height difference between non-coffee drinkers and coffee drinkers among people over 21 years of age
diffHeightOver21 = df.query("age != '<21' and drinks_coffee == False").height.mean() - df.query("age != '<21' and drinks_coffee == True").height.mean()
print("4. diffHeight : ",diffHeightOver21)

##### Height differences in the population #####
1. diffHeight :  -1.9568024933368093
2. diffHeight :  3.882291249920982
3. diffHeight :  1.69939009355123
4. diffHeight :  1.950935488978871
