In [3]:
import json
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import scipy


In [2]:
## load the kiva_loans.csv. display info and head
df = pd.read_csv('Data/insurance - insurance.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Q1: Do smokers have higher insurance charges than non-smokers?

## 1. State the Hypothesis & Null Hypothesis

 Ho (Null Hypothesis):Everyone has the same insurance charges regardless if they smoke.
 
 Ha (Alternative Hypothesis): Those that smoke have higher insurance charges than those that don't smoke 

## 2. Determine the correct test to perform.

Type of Data? Numeric

How many groups/samples? 2 groups

Therefore, which test is appropriate? 2 sample T test

In [7]:
# creating a df with only the columns that I am interested in 
df1 = df[['smoker','charges']]
df1.head()

Unnamed: 0,smoker,charges
0,yes,16884.924
1,no,1725.5523
2,no,4449.462
3,no,21984.47061
4,no,3866.8552


In [9]:
# creating a group that has a discount and showing the counts

df1['is_smoker'] = df1['smoker'] == 'yes'
df1['is_smoker'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['is_smoker'] = df1['smoker'] == 'yes'


False    1064
True      274
Name: is_smoker, dtype: int64

In [10]:
## save list of columns needed for each group
needed_cols = ['charges','is_smoker']
df1[needed_cols]

Unnamed: 0,charges,is_smoker
0,16884.92400,True
1,1725.55230,False
2,4449.46200,False
3,21984.47061,False
4,3866.85520,False
...,...,...
1333,10600.54830,False
1334,2205.98080,False
1335,1629.83350,False
1336,2007.94500,False


In [13]:
# creating a non smoker df from non smoker group
non_df = df1.loc[df1['is_smoker']==False, needed_cols]
non_df

Unnamed: 0,charges,is_smoker
1,1725.55230,False
2,4449.46200,False
3,21984.47061,False
4,3866.85520,False
5,3756.62160,False
...,...,...
1332,11411.68500,False
1333,10600.54830,False
1334,2205.98080,False
1335,1629.83350,False


In [14]:
# creating a is smoker df 
smoke_df = df1.loc[df1['is_smoker']==True, needed_cols]
smoke_df

Unnamed: 0,charges,is_smoker
0,16884.92400,True
11,27808.72510,True
14,39611.75770,True
19,36837.46700,True
23,37701.87680,True
...,...,...
1313,36397.57600,True
1314,18765.87545,True
1321,28101.33305,True
1323,43896.37630,True


## Testing Assumptions

### No significant Outliers

In [15]:
## Saving the numeric col as final group variables

smoker = smoke_df['charges']
non_smoker = non_df['charges']

smoker

0       16884.92400
11      27808.72510
14      39611.75770
19      36837.46700
23      37701.87680
           ...     
1313    36397.57600
1314    18765.87545
1321    28101.33305
1323    43896.37630
1337    29141.36030
Name: charges, Length: 274, dtype: float64

In [16]:
# checking for outliers in smoker group

smoke_outliers = np.abs(stats.zscore(smoker)) > 3

# how many outliers in the has discounts group

smoke_outliers.value_counts()

False    274
Name: charges, dtype: int64

In [17]:
# checking for outliers in smoker group

non_smoke_outliers = np.abs(stats.zscore(non_smoker)) > 3

# how many outliers in the has discounts group

non_smoke_outliers.value_counts()

False    1040
True       24
Name: charges, dtype: int64

In [18]:
# removing the outliers for the non smoking group
non_smoker = non_smoker[~non_smoke_outliers]
non_smoker

1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
5        3756.62160
           ...     
1332    11411.68500
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
Name: charges, Length: 1040, dtype: float64

### Normality

According to the lesson plan, since both groups have n> 15, it is possible to safely ignore the assumption of normality. For that reason I will not be checking the normality of either group.

In [19]:
## Use Levene's test for equal variance
result= stats.levene(smoker, non_smoker)
print(result)
result.pvalue <0.05

LeveneResult(statistic=520.7468821724297, pvalue=2.4247238784347824e-97)


True

SUMMARY OF ASSUMPTIONS OBSERVATIONS:

The non smoking group had outliers which I was able to remove. For Normality i was able to ignore the test since both groups have more than 15 samples. Finally when looking for equal variance I found that the groups do NOT have equal variance. Because that assumption is not met, I will have to run the ttest using the equal_var=False. This will run the Welch's T-Test which takes into account groups of unequal variance.

In [20]:
result = stats.ttest_ind(smoker, non_smoker, equal_var=False)
print(result)

result.pvalue < 0.05

Ttest_indResult(statistic=33.732305987092516, pvalue=2.575322662587431e-104)


True

## Conclusion

The Welch's T-Test returned a p-value that was less then the chosen alpha of 5%. Because of this the null hypothesis can be rejected. This supports the alternative hypothesis that states:

Those that smoke have higher insurance charges than those that don't smoke

In [21]:
print(f"The average amount of charges that smokers were billed is {round(non_smoker.mean(), 2)}")
print(f"The average amount of charges that non-smokers were billed is {round(smoker.mean(), 2)}")

The average amount of charges that smokers were billed is 7939.86
The average amount of charges that non-smokers were billed is 32050.23


# Q2: Are men more likely to smoke than women?

## 1. State the Hypothesis & Null Hypothesis

 Ho (Null Hypothesis): Both men and women are equably as likely to smoke.
 Ha (Alternative Hypothesis): There is a higher likelihood that someone smoke based on their gender. 

## 2. Determine the correct test to perform.

Type of Data? Categorical 

How many groups/samples? 2 groups

Therefore, which test is appropriate? Chi-Square

# Q3: Do different regions have different charges, on average?

## 1. State the Hypothesis & Null Hypothesis

 Ho (Null Hypothesis): On average all regions have similar charges.
 Ha (Alternative Hypothesis): There is a big difference in charges based on which region a person is located.

## 2. Determine the correct test to perform.

Type of Data? Numerical

How many groups/samples? 2 groups

Therefore, which test is appropriate? 2 Sample T-Test