In [None]:
# Attach Google Drive to the Colab runtime so the dataset stored there can be read.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
from scipy.stats import ttest_ind, chisquare
from statsmodels.stats.proportion import proportions_ztest

## US Health Insurance Dataset [(source)](https://www.kaggle.com/datasets/teertha/ushealthinsurancedataset)

This dataset contains 1,338 rows of insurance data, where the insurance charges are recorded alongside the following attributes of the insured: age, sex, BMI, number of children, smoker status, and region. There are no missing or undefined values in the dataset.

For the following exercises, assume that the data was taken from a random sample of users.

In [None]:
# Location of the insurance dataset on the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/Refocus/Colab Notebooks/Home Assignments/5.9.4 HA For Submission/insurance.csv'

# Load the dataset and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Problem 1: Using the US Health Insurance Dataset, Test if the insurance charges are different for males vs. females.

1. Define the Null and Alternative Hypothesis
1. Implement the applicable hypothesis test and calculate the p-values
1. What is the conclusion at an alpha of 0.05?


In [None]:
# Summary statistics of charges for each sex, to eyeball the groups before testing.
df.groupby('sex')['charges'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,662.0,12569.578844,11128.703801,1607.5101,4885.1587,9412.9625,14454.691825,63770.42801
male,676.0,13956.751178,12971.025915,1121.8739,4619.134,9369.61575,18989.59025,62592.87309


In [None]:
# Charges split by sex; these two samples feed the two-sample t-test.
is_female = df['sex'] == 'female'
is_male = df['sex'] == 'male'
female = df.loc[is_female, 'charges']
male = df.loc[is_male, 'charges']

In [None]:
from scipy.stats import ttest_ind

In [None]:
alpha = 0.05    # significance level

# Welch's t-test (equal_var=False) does not assume the two groups share a
# variance. The alternative is two-sided because the question is whether the
# charges are *different*, not which group is higher.
t_stat, p_val = ttest_ind(female, male, equal_var=False, alternative='two-sided')
print ('Test statistic: ', t_stat)
# The label must match the test that was run: this is a two-sided p-value.
print ('p-value (two-sided): ', p_val)

Test statistic:  -2.1008878232359565
p-value (one-sided):  0.03584101495601664


In [None]:
# Hypotheses for Problem 1: a two-sided comparison of mean charges between sexes.
# The bare string is the cell's last expression, so the notebook echoes its repr.
"""
Null Hypothesis: H0: μ_male = μ_female
Alternative Hypothesis:Ha: μ_male ≠ μ_female
"""

'\nNull Hypothesis: H0: μ_male = μ_female\nAlternative Hypothesis:Ha: μ_male ≠ μ_female\n'

In [None]:
# Reject H0 only when the p-value falls below the significance level.
verdict = (
    f'Reject the Null Hypothesis (alpha = {alpha}).'
    if p_val < alpha
    else f'Fail to Reject the Null Hypothesis (alpha = {alpha}).'
)
print(verdict)

Reject the Null Hypothesis (alpha = 0.05).


## Problem 2: Test if the insurance charges are higher for those with children.

1. Define the Null and Alternative Hypothesis
1. Implement the applicable hypothesis test and calculate the p-values
1. What is the conclusion at an alpha of 0.05?


In [None]:
# Charges of policyholders with no children. Defined in this cell so it runs on
# a fresh kernel (Restart & Run All) instead of relying on a later cell having
# been executed first.
no_child = df.loc[df.children == 0, 'charges']
no_child.describe()

count      574.000000
mean     12365.975602
std      12023.293942
min       1121.873900
25%       2734.421150
50%       9856.951900
75%      14440.123825
max      63770.428010
Name: charges, dtype: float64

In [None]:
# Charges of policyholders with at least one child. Defined in this cell so it
# runs on a fresh kernel (Restart & Run All) instead of relying on a later cell
# having been executed first.
with_child = df.loc[df.children > 0, 'charges']
with_child.describe()

count      764.000000
mean     13949.941093
std      12138.305911
min       1711.026800
25%       5809.641625
50%       9223.829500
75%      18232.392400
max      60021.398970
Name: charges, dtype: float64

In [None]:
"""
Null Hypothesis: H0: μ_with_children ≤ μ_no_children (insurance charges for individuals with children are not higher than for individuals without children).
Alternative Hypothesis: Ha: μ_with_children > μ_no_children (insurance charges for individuals with children are higher than for individuals without children).
"""

# Two independent samples of charges: policyholders without and with children.
no_child = df.loc[df.children == 0, 'charges']
with_child = df.loc[df.children > 0, 'charges']

alpha = 0.05    # significance level

# One-sided test. With alternative='greater', ttest_ind tests whether the mean
# of the FIRST sample exceeds the mean of the SECOND, so with_child must be
# passed first to match Ha: μ_with_children > μ_no_children.
t_stat, p_val = ttest_ind(with_child, no_child, alternative='greater')
print ('Test statistic: ', t_stat)
print ('p-value: ', p_val)

# Reject H0 only when the p-value falls below the significance level.
if p_val < alpha:
    print (f'Reject the Null Hypothesis (alpha = {alpha}).')
else:
    print (f'Fail to Reject the Null Hypothesis (alpha = {alpha}).')

Test statistic:  -2.3720611301669337
p-value:  0.9910849126582172
Fail to Reject the Null Hypothesis (alpha = 0.05).


## Problem 3: Test if the proportion of smokers is more than 20%

1. Define the Null and Alternative Hypothesis
1. Implement the applicable hypothesis test and calculate the p-values
1. What is the conclusion at an alpha of 0.05?

In [None]:
# Number of rows for each smoker status.
df['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [None]:
# Partition the rows by smoker status; only the row counts are used downstream.
smoker_status = df['smoker']
smokers = df[smoker_status == 'yes']
non_smokers = df[smoker_status == 'no']

In [None]:
"""
Null Hypothesis: The proportion of smokers is equal to or less than 20%.
Alternative Hypothesis: The proportion of smokers is greater than 20%.
"""

alpha = .05    # significance level

# One-sample z-test for a proportion against the hypothesized value of 0.2.
# alternative='larger' matches Ha: the proportion of smokers is greater than 20%.
# count is the number of smokers; nobs is the total number of rows in both groups.
z_stat, p_val = proportions_ztest(count=len(smokers),
                                  nobs=len(smokers)+len(non_smokers),
                                  value=0.2,
                                  alternative='larger')
print('p-value:', p_val)

# Reject H0 only when the p-value falls below the significance level.
if p_val < alpha:
    print (f'Reject the Null Hypothesis (alpha = {alpha}).')
else:
    print (f'Fail to Reject the Null Hypothesis (alpha = {alpha}).')

p-value: 0.33229937909683
Fail to Reject the Null Hypothesis (alpha = 0.05).


## Problem 4: Test if the proportion of smokers from each region follows the table below:

|           | Non-Smoker | Smoker |
|-----------|------------|-----------|
| northeast     | 20%        | 5%        |
| northwest | 20%        | 5%       |
| southeast | 20%        | 5%       |
| southwest | 20%        | 5%       |

1. Define the Null and Alternative Hypothesis
1. Implement the applicable hypothesis test and calculate the p-values
1. What is the conclusion at an alpha of 0.05?

In [None]:
# Observed row count for every (region, smoker) combination, covering both
# smokers and non-smokers in each region.
region_smoker_groups = df.groupby(['region', 'smoker'])
obs = region_smoker_groups['charges'].count()

# Total number of rows in the dataset.
n_obs = df.shape[0]

obs

region     smoker
northeast  no        257
           yes        67
northwest  no        267
           yes        58
southeast  no        273
           yes        91
southwest  no        267
           yes        58
Name: charges, dtype: int64

In [None]:
# Echo the total number of observations as the cell output.
n_obs

1338

In [None]:
# Report the sample size with a readable label.
label = 'Total surveyed:'
print(label, n_obs)

Total surveyed: 1338


In [None]:
"""
Null Hypothesis: The proportion of smokers in the region is equal to the specified value in the table.
Alternative Hypothesis: The proportion of smokers in the region is not equal to the specified value in the table.
"""

# Observed counts in the order of the (region, smoker) index of `obs`:
# non-smokers first, then smokers, within each region.
f_obs = obs.values

# Expected counts under H0: 20% non-smokers and 5% smokers in each of the four
# regions, scaled by the total number of observations.
expected_shares = [0.2, 0.05, 0.2, 0.05, 0.2, 0.05, 0.2, 0.05]
f_exp = [share * n_obs for share in expected_shares]

alpha = .05

# Chi-square goodness-of-fit test of the observed counts against the expected ones.
chi_sq, p_val = chisquare(f_obs=f_obs, f_exp=f_exp)
print('p-value:', p_val)

# Reject H0 only when the p-value falls below the significance level.
verdict = (
    f'Reject the Null Hypothesis (alpha = {alpha}).'
    if p_val < alpha
    else f'Fail to Reject the Null Hypothesis (alpha = {alpha}).'
)
print(verdict)

p-value: 0.11519018608755745
Fail to Reject the Null Hypothesis (alpha = 0.05).
