In [None]:
!unzip "/content/drive/MyDrive/Pacmann AI/Probability/insurance.zip" -d "/content/drive/MyDrive/Pacmann AI/Probability/data"

Archive:  /content/drive/MyDrive/Pacmann AI/Probability/insurance.zip
  inflating: /content/drive/MyDrive/Pacmann AI/Probability/data/insurance.csv  


### Import Library

In [1]:
import pandas as pd

In [2]:
# Load the extracted insurance dataset into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Pacmann AI/Probability/data/insurance.csv',
)

In [3]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Analysis

#### Descriptive Statistics

In [4]:
# Mean age over all rows, rounded to the nearest whole number.
df['age'].mean().round()

39.0

In [5]:
# Split the rows by smoking status ("perokok" is Indonesian for "smoker").
df_perokok = df.loc[df['smoker'] == 'yes']
df_non_perokok = df.loc[df['smoker'] == 'no']

In [6]:
# Mean BMI among smokers, rounded to the nearest whole number.
df_perokok['bmi'].mean().round()

31.0

In [7]:
# Mean age of smokers for each sex, rounded to the nearest whole number.
df_perokok.groupby(by='sex')['age'].agg('mean').round()

sex
female    39.0
male      38.0
Name: age, dtype: float64

In [8]:
# Mean charges for non-smokers versus smokers.
df.groupby(by='smoker')['charges'].agg('mean')

smoker
no      8434.268298
yes    32050.231832
Name: charges, dtype: float64

In [9]:
# Mean BMI for non-smokers versus smokers.
df.groupby(by='smoker')['bmi'].agg('mean')

smoker
no     30.651795
yes    30.708449
Name: bmi, dtype: float64

In [10]:
# Mean charges for each sex.
df.groupby(by='sex')['charges'].agg('mean')

sex
female    12569.578844
male      13956.751178
Name: charges, dtype: float64

In [11]:
# Share of rows in each smoker category: per-group count over the total count.
num_count = df['smoker'].groupby(df['smoker']).count()
num_count.div(num_count.sum())

smoker
no     0.795217
yes    0.204783
Name: smoker, dtype: float64

In [12]:
# Share of each sex among smokers: per-group count over the total count.
num_count = df_perokok['sex'].groupby(df_perokok['sex']).count()
num_count.div(num_count.sum())

sex
female    0.419708
male      0.580292
Name: sex, dtype: float64

In [13]:
# Share of rows in each region: per-group count over the total count.
num_count = df['region'].groupby(df['region']).count()
num_count.div(num_count.sum())

region
northeast    0.242152
northwest    0.242900
southeast    0.272048
southwest    0.242900
Name: region, dtype: float64

In [43]:
# Work on a copy of the data with two boolean indicator columns added:
# charges above 16700 and BMI above 25.
df_category = df.assign(**{
    'charges>16700': df['charges'] > 16700,
    'bmi>25': df['bmi'] > 25,
})

# Contingency table of the two indicators, with row/column totals ("All").
pd.crosstab(
    index=df_category['bmi>25'],
    columns=df_category['charges>16700'],
    margins=True,
)


charges>16700,False,True,All
bmi>25,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,196,51,247
True,808,283,1091
All,1004,334,1338


In [41]:
# P(charges > 16700 | bmi > 25), computed from the data instead of from counts
# typed in by hand. The mean of a boolean column is the fraction of True values,
# i.e. (rows with bmi > 25 and charges > 16700) / (rows with bmi > 25).
df_category.loc[df_category['bmi>25'], 'charges>16700'].mean()

0.25939505041246563

$P(B>16700 \mid A>25)$=$\frac{P(B>16700 \cap A>25)}{P(A>25)}$=$\frac{283}{1091}$=0.26, where $A$ is the BMI and $B$ is the charges.

In [40]:
# P(charges > 16700 | bmi <= 25), computed from the data instead of from counts
# typed in by hand. `~` selects the rows where the 'bmi>25' indicator is False;
# the mean of the boolean 'charges>16700' column is the fraction of True values.
df_category.loc[~df_category['bmi>25'], 'charges>16700'].mean()

0.20647773279352227

$P(B>16700 \mid A \le 25)$=$\frac{P(B>16700 \cap A \le 25)}{P(A \le 25)}$=$\frac{51}{247}$=0.21

In [47]:
# Keep only the rows with BMI above 25 (as an independent copy).
df_bmi_above_25 = df_category.loc[df_category['bmi>25']].copy()

# Contingency table of smoker status against the high-charges indicator,
# with row/column totals ("All").
pd.crosstab(
    index=df_bmi_above_25['smoker'],
    columns=df_bmi_above_25['charges>16700'],
    margins=True,
)

charges>16700,False,True,All
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,804,68,872
yes,4,215,219
All,808,283,1091


In [48]:
# P(charges > 16700 | bmi > 25 and smoker = yes), computed from the data instead
# of from counts typed in by hand. The mean of a boolean column is the fraction
# of True values within the selected rows.
df_bmi_above_25.loc[df_bmi_above_25['smoker'] == 'yes', 'charges>16700'].mean()

0.9817351598173516

$P(B>16700 \mid A>25 \cap C=Yes)$=$\frac{P(B>16700 \cap A>25 \cap C=Yes)}{P(A>25 \cap C=Yes)}$=$\frac{215}{219}$=0.98, where $C$ is the smoker status.

In [49]:
# P(charges > 16700 | bmi > 25 and smoker = no), computed from the data instead
# of from counts typed in by hand. The mean of a boolean column is the fraction
# of True values within the selected rows.
df_bmi_above_25.loc[df_bmi_above_25['smoker'] == 'no', 'charges>16700'].mean()

0.0779816513761468

$P(B>16700 \mid A>25 \cap C=No)$=$\frac{P(B>16700 \cap A>25 \cap C=No)}{P(A>25 \cap C=No)}$=$\frac{68}{872}$=0.08

### Analisis Korelasi

In [50]:
# Pairwise Pearson correlation between the numeric columns for smokers.
# numeric_only=True is required because the frame also holds string columns
# (sex, smoker, region): pandas >= 2.0 no longer drops them silently and
# raises instead.
df_perokok.corr(numeric_only=True)

  df_perokok.corr()


Unnamed: 0,age,bmi,children,charges
age,1.0,0.059674,0.081183,0.368224
bmi,0.059674,1.0,-0.012619,0.806481
children,0.081183,-0.012619,1.0,0.035945
charges,0.368224,0.806481,0.035945,1.0


In [51]:
# Pairwise Pearson correlation between the numeric columns for non-smokers.
# numeric_only=True is required because the frame also holds string columns
# (sex, smoker, region): pandas >= 2.0 no longer drops them silently and
# raises instead.
df_non_perokok.corr(numeric_only=True)

  df_non_perokok.corr()


Unnamed: 0,age,bmi,children,charges
age,1.0,0.122638,0.033395,0.627947
bmi,0.122638,1.0,0.019208,0.084037
children,0.033395,0.019208,1.0,0.138929
charges,0.627947,0.084037,0.138929,1.0


### Uji Hipotesis

Proporsi perokok laki-laki lebih besar dari pada perempuan

In [52]:
#import package
from statsmodels.stats.proportion import proportions_ztest
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [53]:
# Number of smokers of each sex.
df_perokok['sex'].groupby(df_perokok['sex']).count()

sex
female    115
male      159
Name: sex, dtype: int64

In [54]:
# Number of rows of each sex in the whole dataset.
df['sex'].groupby(df['sex']).count()

sex
female    662
male      676
Name: sex, dtype: int64

In [55]:
# Inputs for the two-sample proportion z-test, derived from the data instead of
# being copied by hand from printed output.
# Order is fixed to [male, female]: the first entry is the group whose
# proportion is tested as being the larger one.
# Successes = number of smokers per sex; sample sizes = number of rows per sex.
number_of_successes = df_perokok.groupby('sex')['sex'].count().loc[['male', 'female']].to_numpy()
total_sample_sizes = df.groupby('sex')['sex'].count().loc[['male', 'female']].to_numpy()

In [56]:
# Compute the test statistic and p-value of the one-sided two-sample
# proportion z-test (alternative: first proportion larger than the second).
test_stat, p_value = proportions_ztest(
    count=number_of_successes,
    nobs=total_sample_sizes,
    alternative='larger',
)
print("The computed Z test statistic is: ", test_stat)
print("The p-value is: ", p_value)

The computed Z test statistic is:  2.7867402154855503
The p-value is:  0.002662057082160266


Variansi tagihan perokok dan non perokok sama

In [57]:
import scipy.stats as stats
import math

In [58]:
# Summary statistics of the charges for smokers.
df_perokok.charges.describe()

count      274.000000
mean     32050.231832
std      11541.547176
min      12829.455100
25%      20826.244213
50%      34456.348450
75%      41019.207275
max      63770.428010
Name: charges, dtype: float64

In [59]:
# Summary statistics of the charges for non-smokers.
df_non_perokok.charges.describe()

count     1064.000000
mean      8434.268298
std       5993.781819
min       1121.873900
25%       3986.438700
50%       7345.405300
75%      11362.887050
max      36910.608030
Name: charges, dtype: float64

In [60]:
# group 1 = smokers
# group 2 = non-smokers
# All quantities are computed from the data rather than transcribed (rounded)
# from the describe() output, so they stay correct if the data changes.
rataan1 = df_perokok['charges'].mean()
rataan2 = df_non_perokok['charges'].mean()

# Sample variances. Series.var() uses ddof=1 by default, which matches the
# square of the `std` reported by describe().
s_x = df_perokok['charges'].var()
s_y = df_non_perokok['charges'].var()

# Sample sizes of the two groups (non-null observations, as in describe()).
sample1 = int(df_perokok['charges'].count())
sample2 = int(df_non_perokok['charges'].count())

In [61]:
# Test statistic: ratio of the group-1 (smokers) sample variance
# to the group-2 (non-smokers) sample variance.
f_test = s_x / s_y
print(f_test)

3.7078845392753403


In [62]:
# Critical value of the F distribution at alpha = 0.05, i.e. its 95th
# percentile, with (sample1 - 1) numerator and (sample2 - 1) denominator
# degrees of freedom.
# NOTE(review): this is a one-sided cutoff; if the test of equal variances is
# meant to be two-sided, alpha/2 would be used instead — confirm.
import scipy.stats as stats
f_crit = stats.f.ppf(
    1 - 0.05,
    dfn=sample1 - 1,
    dfd=sample2 - 1,
)
print(f_crit)

1.1662297531965873


Tagihan Kesehatan Perokok lebih besar dari pada non perokok

In [63]:
# Variance of charges for smokers and for non-smokers.
# np.var is called with its default ddof=0 (population variance), so these
# values are slightly smaller than the squared `std` shown by describe().
np.var(df_perokok['charges']), np.var(df_non_perokok['charges'])

(132721153.13625307, 35891656.00316426)

In [64]:
from scipy.stats import ttest_ind

# One-sided two-sample t-test: are smokers' mean charges greater than
# non-smokers'? equal_var=False (Welch's t-test) because the two groups'
# variances differ.
stat, p = ttest_ind(
    df_perokok['charges'],
    df_non_perokok['charges'],
    equal_var=False,
    alternative='greater',
)

# Report the result
print('Statistics = %.4f, p-value = %.4f' % (stat, p))

Statistics = 32.7519, p-value = 0.0000
