In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import scipy.stats as stats


%matplotlib inline

In [3]:
df = pd.read_csv("insurance.csv")

In [3]:
df.corr()

  df.corr()


Unnamed: 0,age,bmi,children,expenses
age,1.0,0.109341,0.042469,0.299008
bmi,0.109341,1.0,0.012645,0.198576
children,0.042469,0.012645,1.0,0.067998
expenses,0.299008,0.198576,0.067998,1.0


- **Assumptions** needs to be checked first before applying hypothesis testing like for TTEST
- **Independent** sample/random sample should ebe chosen from population
- The data should be **normally distributed**
**Homogenity of variance-** The variance should be equally distributed between each group

In [23]:
grouped_data = [group['expenses'] for name, group in df.groupby('smoker')]


for smoker, group in zip(df['smoker'].unique(), grouped_data):
    shapiro_stat, shapiro_p = stats.shapiro(group)
    print(f"Shapiro-Wilk test for 'expenses' in {smoker} smoker:")
    print("Test Statistic =", shapiro_stat)
    print("p-value =", shapiro_p)
    print("Is the data normally distributed? (p-value < 0.05):", shapiro_p < 0.05)
    print()

Shapiro-Wilk test for 'expenses' in yes smoker:
Test Statistic = 0.8728631138801575
p-value = 1.4456972664537127e-28
Is the data normally distributed? (p-value < 0.05): True

Shapiro-Wilk test for 'expenses' in no smoker:
Test Statistic = 0.9395526647567749
p-value = 3.6255991542333277e-09
Is the data normally distributed? (p-value < 0.05): True



In [20]:
grouped_data

[1        1725.55
 2        4449.46
 3       21984.47
 4        3866.86
 5        3756.62
           ...   
 1332    11411.69
 1333    10600.55
 1334     2205.98
 1335     1629.83
 1336     2007.95
 Name: expenses, Length: 1064, dtype: float64,
 0       16884.92
 11      27808.73
 14      39611.76
 19      36837.47
 23      37701.88
           ...   
 1313    36397.58
 1314    18765.88
 1321    28101.33
 1323    43896.38
 1337    29141.36
 Name: expenses, Length: 274, dtype: float64]

In [18]:
for name, group in df.groupby('smoker'):
  print(name)
  print(group)

no
      age     sex   bmi  children smoker     region  expenses
1      18    male  33.8         1     no  southeast   1725.55
2      28    male  33.0         3     no  southeast   4449.46
3      33    male  22.7         0     no  northwest  21984.47
4      32    male  28.9         0     no  northwest   3866.86
5      31  female  25.7         0     no  southeast   3756.62
...   ...     ...   ...       ...    ...        ...       ...
1332   52  female  44.7         3     no  southwest  11411.69
1333   50    male  31.0         3     no  northwest  10600.55
1334   18  female  31.9         0     no  northeast   2205.98
1335   18  female  36.9         0     no  southeast   1629.83
1336   21  female  25.8         0     no  southwest   2007.95

[1064 rows x 7 columns]
yes
      age     sex   bmi  children smoker     region  expenses
0      19  female  27.9         0    yes  southwest  16884.92
11     62  female  26.3         0    yes  southeast  27808.73
14     27    male  42.1         0    y

In [41]:
# Perform Levene's test for homogeneity of variances
levene_stat, levene_p = stats.levene(df['expenses'][df['smoker'] == 'no'],
                                     df['expenses'][df['smoker'] == 'yes'])

print("Levene's test for homogeneity of variances:")
print("Test Statistic =", levene_stat)
print("p-value =", levene_p)
print("Do groups have equal variances? (p-value >= 0.05):", levene_p >= 0.05)

Levene's test for homogeneity of variances:
Test Statistic = 332.6135230494157
p-value = 1.559324245141577e-66
Do groups have equal variances? (p-value >= 0.05): False


In [43]:
df['log_expenses'] = np.log(df['expenses'])

**Dataset is normally distributted but homogenity of variance is voilated**

In [44]:
# Perform Levene's test for homogeneity of variances
levene_stat, levene_p = stats.levene(df['log_expenses'][df['smoker'] == 'no'],
                                     df['log_expenses'][df['smoker'] == 'yes'])

print("Levene's test for homogeneity of variances:")
print("Test Statistic =", levene_stat)
print("p-value =", levene_p)
print("Do groups have equal variances? (p-value >= 0.05):", levene_p >= 0.05)

Levene's test for homogeneity of variances:
Test Statistic = 89.49619232212505
p-value = 1.3326123871073872e-20
Do groups have equal variances? (p-value >= 0.05): False


**As the data is not following homogenity of variance you can't apply TTEST_IND , use kruskal test**

In [15]:
import pandas as pd
import scipy.stats as stats


data = pd.read_csv('insurance.csv')

result = stats.kruskal(data['expenses'][data['smoker'] == 'no'],
                        data['expenses'][data['smoker'] == 'yes'])

print("Kruskal-Wallis Test:")
print("Test Statistic =", result.statistic)
print("p-value =", result.pvalue)
print("Are the medians significantly different? (p-value < 0.05):", result.pvalue < 0.05)

Kruskal-Wallis Test:
Test Statistic = 588.5196583657973
p-value = 5.259018267498522e-130
Are the medians significantly different? (p-value < 0.05): True


In [4]:
## code for ttest_ind which can't be applied on it
data_smokers= df[df['smoker']=='yes']['expenses']
data_non_smokers = df[df['smoker']=='no']['expenses']
_,p_value= ttest_ind(data_smokers , data_non_smokers)

print(p_value)

8.271449574495316e-283


In [6]:
# Null hypothesis-> expenses of smokers and non smokers are equal, there is no significant difference
#between them
if p_value<0.05:
  print("we reject the null hypothesis")
else:
  print("we failed to reject the null hypothesis")

we reject the null hypothesis


we reject that expenses of smokers and non smokers are equal, there is a significant difference between them

In [7]:
data_region = df['region'].value_counts()

In [8]:
data_region

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

- **CHI square** test to determine relation between two categorical variables

In [11]:
from scipy.stats import chi2_contingency
#H0= There is no association between region and smoker

contingency_table = pd.crosstab(df['region'], df['smoker'])
contingency_table

smoker,no,yes
region,Unnamed: 1_level_1,Unnamed: 2_level_1
northeast,257,67
northwest,267,58
southeast,273,91
southwest,267,58


In [12]:
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

In [14]:
print("Chi-Square Statistic:", chi2)
print("P-value:", p_value)

if p_value<0.05:
  print("we reject the null hypothesis")
else:
  print("we failed to reject the null hypothesis")

Chi-Square Statistic: 7.343477761407071
P-value: 0.06171954839170541
we failed to reject the null hypothesis


- **Assumptions** needs to be checked first before applying hypothesis testing like for Anova
- **Independent** sample/random sample should ebe chosen from population
- The data should be **normally distributed**
**Homogenity of variance-** The variance should be equally distributed between each group

In [27]:
import scipy.stats as stats
import matplotlib.pyplot as plt
# Assumption check for 'bmi' and 'expenses' columns
columns_to_check = ['bmi', 'expenses']

for column in columns_to_check:
    # Shapiro-Wilk test for normality
    shapiro_stat, shapiro_p = stats.shapiro(df[column])
    print(f"Shapiro-Wilk test for '{column}':")
    print("Test Statistic =", shapiro_stat)
    print("p-value =", shapiro_p)
    print("Is the data normally distributed? (p-value < 0.05):", shapiro_p < 0.05)
    print()


Shapiro-Wilk test for 'bmi':
Test Statistic = 0.9938240051269531
p-value = 2.3355698431259952e-05
Is the data normally distributed? (p-value < 0.05): True

Shapiro-Wilk test for 'expenses':
Test Statistic = 0.8146882057189941
p-value = 1.1505333015369624e-36
Is the data normally distributed? (p-value < 0.05): True



In [33]:
grouped_data = [group['expenses'] for name, group in df.groupby('region')]


for region, group in zip(df['region'].unique(), grouped_data):
    shapiro_stat, shapiro_p = stats.shapiro(group)
    print(f"Shapiro-Wilk test for 'expenses' in {region} region:")
    print("Test Statistic =", shapiro_stat)
    print("p-value =", shapiro_p)
    print("Is the data normally distributed? (p-value < 0.05):", shapiro_p < 0.05)
    print()

Shapiro-Wilk test for 'expenses' in southwest region:
Test Statistic = 0.8353429436683655
p-value = 6.552730011610829e-18
Is the data normally distributed? (p-value < 0.05): True

Shapiro-Wilk test for 'expenses' in southeast region:
Test Statistic = 0.8128034472465515
p-value = 4.268497271524033e-19
Is the data normally distributed? (p-value < 0.05): True

Shapiro-Wilk test for 'expenses' in northwest region:
Test Statistic = 0.8242297768592834
p-value = 1.2326628686159026e-19
Is the data normally distributed? (p-value < 0.05): True

Shapiro-Wilk test for 'expenses' in northeast region:
Test Statistic = 0.7842957377433777
p-value = 2.0237754014695492e-20
Is the data normally distributed? (p-value < 0.05): True



In [4]:
# Perform Levene's test for homogeneity of variances
levene_stat, levene_p = stats.levene(df['expenses'][df['region'] == 'northeast'],
                                     df['expenses'][df['region'] == 'northwest'],
                                     df['expenses'][df['region'] == 'southeast'],
                                     df['expenses'][df['region'] == 'southwest'])

print("Levene's test for homogeneity of variances:")
print("Test Statistic =", levene_stat)
print("p-value =", levene_p)
print("Do groups have equal variances? (p-value >= 0.05):", levene_p >= 0.05)

Levene's test for homogeneity of variances:
Test Statistic = 5.559967624103929
p-value = 0.0008610579801088541
Do groups have equal variances? (p-value >= 0.05): False


Homogenoty of variance is voilated so you cant apply f_oneway use kruskal test

In [14]:
import pandas as pd
import scipy.stats as stats


data = pd.read_csv('insurance.csv')

result = stats.kruskal(data['expenses'][data['region'] == 'northeast'],
                        data['expenses'][data['region'] == 'northwest'],
                        data['expenses'][data['region'] == 'southeast'],
                        data['expenses'][data['region'] == 'southwest'])

print("Kruskal-Wallis Test:")
print("Test Statistic =", result.statistic)
print("p-value =", result.pvalue)
print("Are the medians significantly different? (p-value < 0.05):", result.pvalue < 0.05)


Kruskal-Wallis Test:
Test Statistic = 4.734181215658743
p-value = 0.19232908072121002
Are the medians significantly different? (p-value < 0.05): False


**ANOVA TEST**

In [5]:
##one way anova can't be applied on it but to just to code we appled it
from scipy.stats import f_oneway

result = stats.f_oneway(df['expenses'][df['region'] == 'northeast'],
                        df['expenses'][df['region'] == 'northwest'],
                        df['expenses'][df['region'] == 'southeast'],
                        df['expenses'][df['region'] == 'southwest'])

print("Welch's ANOVA:")
print("Test Statistic =", result.statistic)
print("p-value =", result.pvalue)
print("Are the means significantly different? (p-value < 0.05):", result.pvalue < 0.05)

Welch's ANOVA:
Test Statistic = 2.9696265038563223
p-value = 0.030893363993447484
Are the means significantly different? (p-value < 0.05): True


Conclusion- All group means are not equal there is a different between means of each region based on expenses

In [6]:
 df.groupby('region')['expenses'].value_counts()

region     expenses
northeast  1694.80     1
           1702.46     1
           1704.57     1
           1704.70     1
           1705.62     1
                      ..
southwest  47928.03    1
           48173.36    1
           48824.45    1
           51194.56    1
           52590.83    1
Name: expenses, Length: 1337, dtype: int64

**correlation test between BMI and expenses**

In [7]:
# Calculate Pearson correlation coefficient
correlation_coefficient = df['bmi'].corr(df['expenses'])

print("Pearson Correlation Coefficient between 'bmi' and 'expenses':", correlation_coefficient)

Pearson Correlation Coefficient between 'bmi' and 'expenses': 0.19857625501893203


value between -1 and 1, where 1 indicates a perfect positive correlation, -1 indicates a perfect negative correlation, and 0 indicates no correlation.

**Spearman coefficient is calculated when the data is not linear or not normally distributed**

In [24]:
# Calculate Pearson correlation coefficient
correlation_coefficient = df['bmi'].corr(df['expenses'], method='spearman')

print("Pearson Correlation Coefficient between 'bmi' and 'expenses':", correlation_coefficient)

Pearson Correlation Coefficient between 'bmi' and 'expenses': 0.1194188536346528
