In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway

In [8]:
# Published Google Sheets CSV export that backs every analysis in this notebook.
DATA_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQBN8DPW2rdiRrY34eEM53HAzakNGSRrw4ogI-j8HyCUrbqTB_z4CeIn2IvjLF-w_6sOe5pIlypJGAA/pub?output=csv'

# Fetch the sheet over the network and parse it into a DataFrame.
df = pd.read_csv(DATA_URL)

In [9]:
# Preview the first five rows to sanity-check the column names and value formats.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:


# Q1: Do smokers have higher insurance charges than non-smokers?
# Split the charges column into two independent samples by smoking status.
smokers_charges = df.loc[df['smoker'] == 'yes', 'charges']
non_smokers_charges = df.loc[df['smoker'] == 'no', 'charges']

# Define null and alternative hypotheses
# H0: The means of charges for smokers and non-smokers are equal
# H1: The mean charge for smokers is greater than the mean charge for non-smokers
#
# One-sided Welch's t-test (equal_var=False). The default pooled-variance test
# assumes both groups share the same variance, which is not verified anywhere in
# this notebook; Welch's version stays valid whether or not that holds.
t_stat, p_value = ttest_ind(
    smokers_charges,
    non_smokers_charges,
    equal_var=False,
    alternative='greater',
)

# Significance level; the Q2 and Q3 cells below read this same variable.
alpha = 0.05

# Compare p-value to the significance level
if p_value < alpha:
    print("Reject the null hypothesis. There is evidence that smokers have higher insurance charges than non-smokers.")
else:
    print("Fail to reject the null hypothesis. There is not enough evidence to conclude that smokers have higher insurance charges.")


Reject the null hypothesis. There is evidence that smokers have higher insurance charges than non-smokers.


In [12]:

# Q2: Are men more likely to smoke than women?
# Cross-tabulate counts of each (sex, smoker) combination.
contingency_table = pd.crosstab(df['sex'], df['smoker'])

# Define null and alternative hypotheses
# H0: There is no association between gender and smoking status
# H1: There is an association between gender and smoking status
#
# The chi-square test of independence is non-directional: a small p-value says
# the two variables are associated, not which sex smokes more.
chi2_stat, p_value_chi2, _, _ = chi2_contingency(contingency_table)

# Proportion of smokers within each sex, used to establish the direction of any
# association before making the "men are more likely" claim.
smoking_rate_by_sex = df['smoker'].eq('yes').groupby(df['sex']).mean()
male_smoking_rate = smoking_rate_by_sex.get('male', np.nan)
female_smoking_rate = smoking_rate_by_sex.get('female', np.nan)

# Compare p-value to the significance level (alpha is set in the Q1 cell)
if p_value_chi2 < alpha and male_smoking_rate > female_smoking_rate:
    print("Reject the null hypothesis. There is evidence that men are more likely to smoke than women.")
elif p_value_chi2 < alpha:
    # Significant association, but the observed rates do not support the directional claim.
    print("Reject the null hypothesis. There is an association between gender and smoking status, but the data do not show men smoking at a higher rate than women.")
else:
    print("Fail to reject the null hypothesis. There is not enough evidence to conclude a significant association between gender and smoking status.")

Reject the null hypothesis. There is evidence that men are more likely to smoke than women.


In [13]:


# Q3: Do different regions have different charges, on average?
# Build one sample of charges per distinct region, in order of first appearance.
regions = df['region'].unique()
region_groups = [
    df.loc[df['region'].eq(region_name), 'charges']
    for region_name in regions
]

# Hypotheses for the one-way ANOVA:
# H0: mean charges are the same in every region
# H1: mean charges differ for at least one region
f_stat, p_value_anova = f_oneway(*region_groups)

# Report the decision at the significance level alpha (set in the Q1 cell).
print(
    "Reject the null hypothesis. There is evidence that different regions have different charges, on average."
    if p_value_anova < alpha
    else "Fail to reject the null hypothesis. There is not enough evidence to conclude a significant difference in charges across different regions."
)

Reject the null hypothesis. There is evidence that different regions have different charges, on average.
