In [1]:
import pandas as pd
import numpy as np
np.random.seed(104)
df = pd.read_csv('data/MDA_09_coffee_dataset.csv')
print(df.info())
print(df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2974 entries, 0 to 2973
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        2974 non-null   int64  
 1   age            2974 non-null   object 
 2   drinks_coffee  2974 non-null   bool   
 3   height         2974 non-null   float64
dtypes: bool(1), float64(1), int64(1), object(1)
memory usage: 72.7+ KB
None
   user_id   age  drinks_coffee     height
0     4509   <21          False  64.538179
1     1864  >=21           True  65.824249
2     2060   <21          False  71.319854
3     7875  >=21           True  68.569404
4     6254   <21           True  64.020226


In [2]:
# Randomly sample 200 samples from the population.
df_sample = df.sample(200)
print(df_sample.info())
print(df_sample.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 486 to 914
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        200 non-null    int64  
 1   age            200 non-null    object 
 2   drinks_coffee  200 non-null    bool   
 3   height         200 non-null    float64
dtypes: bool(1), float64(1), int64(1), object(1)
memory usage: 6.4+ KB
None
      user_id   age  drinks_coffee     height
486      4951   <21          False  68.539632
2297     5383  >=21           True  70.594166
911      3904   <21          False  61.114035
146      4351  >=21           True  68.952444
2255     3998   <21          False  65.493818


In [36]:
##### Confidence interval using bootstrap #####
# Let's repeat the bootstrap 10,000 times to find the 99.7% confidence interval for the height difference
# between people who do not drink coffee and people who drink coffee.
# 1. Average height difference between non-coffee drinkers and coffee drinkers
iterationNum = 1000
diffHeightList = []
for _ in range(iterationNum):
    bootSample = df_sample.sample(200, replace=True) # 복원 추출(뺀게 사라지지 않음)
    nonCoffeeHeightMean = bootSample[bootSample['drinks_coffee'] == False].height.mean()
    # Avg. height of people who drink coffee
    coffeeHeightMean = bootSample[bootSample['drinks_coffee'] == True].height.mean()
    diff = nonCoffeeHeightMean - coffeeHeightMean
    diffHeightList.append(diff)
print(diffHeightList)

[-1.8404976143249456, -1.6411363632151108, -2.1059029515546115, -1.8300809516594398, -2.235119989789837, -1.6170014288697416, -1.5282434198686303, -2.4673151809740546, -1.818587345812432, -1.4745163864223514, -1.539476022658988, -1.756440413352152, -2.5313773131382504, -1.8469100536094487, -2.488021548595526, -1.3072817161672106, -2.0593895415056522, -2.2447549165356833, -1.8863000827946053, -2.1961322819566647, -1.9207464402330032, -1.64800030551838, -2.504825805708407, -2.408836648093157, -2.8239245996408613, -2.695568375906163, -1.9504667269473686, -1.7804537598127013, -1.931696376790157, -1.9332127630997462, -1.660114730696307, -2.1343070053863613, -2.203425127453613, -2.261872331287435, -2.4948894328829567, -2.4557516077662314, -2.156070631868829, -1.4520995873126878, -2.866145760145585, -2.0634219569041363, -1.1566289422324871, -2.2892902918836313, -1.826696279577007, -1.9054013382759365, -2.118559726269865, -2.0569866098796865, -1.6487217621068027, -1.8041253846588319, -2.127237

In [27]:
print("mean of height diff:", np.mean(diffHeightList))
print("SE of Height diff:", np.std(diffHeightList))
print("Lowerbound(0.3):", np.percentile(diffHeightList, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightList, 99.7))

mean of height diff: -1.9731155933695128
SE of Height diff: 0.47030596985506634
Lowerbound(0.3): -3.2109730755508936
Uppperbound(99.7): -0.7021918266349381


실제 모집단이 잘 모사되었는지 확인

In [28]:
print("##### Height differences in the population #####")
# # 1. Average height difference between non-coffee drinkers and coffee drinkers
diffHeight = df[df['drinks_coffee'] == False].height.mean() - df[df['drinks_coffee'] == True].height.mean()
print("diffHeight : ",diffHeight)

##### Height differences in the population #####
diffHeight :  -1.9568024933368093


In [29]:
print("Lowerbound(0.3):", np.percentile(diffHeightList, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightList, 99.7))

Lowerbound(0.3): -3.2109730755508936
Uppperbound(99.7): -0.7021918266349381


심슨의 역설 테스트

In [37]:
# 2. Average height difference between people over 21 years old and under 21 years old
diffHeightListByAge = []
for _ in range(iterationNum):
    bootSample = df_sample.sample(200, replace=True) # sampling with replacement
    over21HeightMean = bootSample[bootSample['age'] == '>=21'].height.mean() # Avg.Height for over 21
    under21HeightMean = bootSample[bootSample['age'] == '<21'].height.mean() # Avg.Height for under 21
    diff = over21HeightMean - under21HeightMean
    diffHeightListByAge.append(diff)
# When the confidence level is 99%.7, the confidence interval for the average height difference
print("Lowerbound(0.3):", np.percentile(diffHeightListByAge, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightListByAge, 99.7))

Lowerbound(0.3): 3.146823373633989
Uppperbound(99.7): 4.978688298300155


In [38]:
# 3. Average height difference between non-coffee drinkers and coffee drinkers among people under 21 years of age
diffHeightListUnder21 = []
for _ in range(iterationNum):
    bootSample = df_sample.sample(200, replace=True) # sampling with replacement
    # Average height of people under 21 years of age who do not drink coffee
    nonCoffeeHeightMeanUnder21 = bootSample.query("age == '<21' and drinks_coffee == False").height.mean()
    # Average height of people under 21 years of age who drink coffee
    coffeeHeightMeanUnder21 = bootSample.query("age == '<21' and drinks_coffee == True").height.mean()
    diff = nonCoffeeHeightMeanUnder21 - coffeeHeightMeanUnder21
    diffHeightListUnder21.append(diff)
# When the confidence level is 99%.7, the confidence interval for the average height difference
print("Lowerbound(0.3):", np.percentile(diffHeightListUnder21, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightListUnder21, 99.7))

Lowerbound(0.3): 0.164372696924269
Uppperbound(99.7): 2.471336924390501


In [39]:
# 4. # Average height difference between non-coffee drinkers and coffee drinkers among people over 21 years of age
diffHeightListOver21 = []
for _ in range(iterationNum):
    bootSample = df_sample.sample(200, replace=True) # sampling with replacement
    # Average height of people over 21 years of age who do not drink coffee
    nonCoffeeHeightMeanOver21 = bootSample.query("age != '<21' and drinks_coffee == False").height.mean()
    # Average height of people over 21 years of age who drink coffee
    coffeeHeightMeanOver21 = bootSample.query("age != '<21' and drinks_coffee == True").height.mean()
    diff = nonCoffeeHeightMeanOver21 - coffeeHeightMeanOver21
    diffHeightListOver21.append(diff)
# When the confidence level is 99%.7, the confidence interval for the average height difference
print("Lowerbound(0.3):", np.percentile(diffHeightListOver21, 0.3))
print("Uppperbound(99.7):", np.percentile(diffHeightListOver21, 99.7))

Lowerbound(0.3): 0.42412937369253234
Uppperbound(99.7): 3.6414373788072902


In [40]:
print("##### Height differences in the population #####")
# # 1. Average height difference between non-coffee drinkers and coffee drinkers
diffHeight = df[df['drinks_coffee'] == False].height.mean() - df[df['drinks_coffee'] == True].height.mean()
print("1. diffHeight : ",diffHeight)
# 2. Average height difference between people over 21 years old and under 21 years old
diffHeightByAge = df[df['age'] == '>=21'].height.mean() - df[df['age'] == '<21'].height.mean()
print("2. diffHeight : ",diffHeightByAge)
# 3. Average height difference between non-coffee drinkers and coffee drinkers among people under 21 years of age
diffHeightUnder21 = df.query("age == '<21' and drinks_coffee == False").height.mean() - df.query("age == '<21' and drinks_coffee == True").height.mean()
print("3. diffHeight : ",diffHeightUnder21)
# 4. Average height difference between non-coffee drinkers and coffee drinkers among people over 21 years of age
diffHeightOver21 = df.query("age != '<21' and drinks_coffee == False").height.mean() - df.query("age != '<21' and drinks_coffee == True").height.mean()
print("4. diffHeight : ",diffHeightOver21)

##### Height differences in the population #####
1. diffHeight :  -1.9568024933368093
2. diffHeight :  3.882291249920982
3. diffHeight :  1.69939009355123
4. diffHeight :  1.950935488978871
