In [1]:
import pandas as pd
import scipy.stats

In [3]:
# Load the heart-failure clinical records and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='heart_failure_clinical_records_dataset.csv')
df.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [4]:
# Contingency table of sex x smoking, with row/column totals in the 'Total' margins.
# Labels are mapped by value (0/1 -> name) instead of being overwritten by position,
# so a change in category order or a missing category cannot silently mislabel a row.
# NOTE(review): assumes sex is coded 0 = Female, 1 = Male and smoking 0 = non-smoker,
# 1 = smoker (the coding implied by the original positional labels) - confirm.
source = (
    pd.crosstab(index=df['sex'], columns=df['smoking'], margins=True, margins_name='Total')
    .rename(index={0: 'Female', 1: 'Male'}, columns={0: 'Non Smoker', 1: 'Smoker'})
    .rename_axis(index=None, columns=None)  # drop the 'sex'/'smoking' axis names from the display
)

source

Unnamed: 0,Non Smoker,Smoker,Total
Female,101,4,105
Male,102,92,194
Total,203,96,299


In [5]:
# P(Male): share of male patients among all patients.
source.loc['Male', 'Total'] / source.loc['Total', 'Total']

0.6488294314381271

In [6]:
# P(Male and Non Smoker): joint probability, so the denominator is the grand total.
source.loc['Male', 'Non Smoker'] / source.loc['Total', 'Total']

0.3411371237458194

In [7]:
# P(Male | Smoker): conditional probability, so the denominator is the smoker sub-population.
source.loc['Male', 'Smoker'] / source.loc['Total', 'Smoker']

0.9583333333333334

In [8]:
# Is the probability of being a smoker unaffected by gender, i.e. are the two independent?
# A = Gender, B = Smoker. Compare the marginal P(B) with the conditionals P(B | A).
# Dividing the 'Smoker' column by the 'Total' column gives the smoker share of every row.
smoker_share = source['Smoker'] / source['Total']

# P(B): 'Total' row -> smokers among all patients
p_smoker = smoker_share.loc['Total']
# P(B | Female)
p_female_smoker = smoker_share.loc['Female']
# P(B | Male)
p_male_smoker = smoker_share.loc['Male']

In [9]:
# Marginal probability P(Smoker).
p_smoker

0.3210702341137124

In [10]:
# Conditional probability P(Smoker | Female).
p_female_smoker

0.0380952380952381

In [11]:
# Conditional probability P(Smoker | Male).
p_male_smoker

0.4742268041237113

**Interpretation**

- P(Smoker) = 32.1%
- P(Smoker | Female) = 3.8%
- P(Smoker | Male) = 47.4%

- Probabilitas SMOKER pada tiap jenis kelamin (Male/Female) tidak sama dengan probabilitas SMOKER secara keseluruhan: P(Smoker | Female) ≠ P(Smoker) ≠ P(Smoker | Male).
- Karena probabilitas bersyarat ini berbeda dari probabilitas marginalnya, SMOKER dan GENDER tidak independen.
- Individu berjenis kelamin Male lebih berpeluang menjadi SMOKER dibandingkan Female.

In [13]:
# Contingency table of DEATH_EVENT x diabetes, with row/column totals in the 'Total' margins.
# Labels are mapped by value (0/1 -> name) instead of being overwritten by position,
# so a change in category order or a missing category cannot silently mislabel a row.
# NOTE(review): assumes DEATH_EVENT is coded 0 = survived, 1 = death and diabetes
# 0 = no, 1 = yes (the coding implied by the original positional labels) - confirm.
source = (
    pd.crosstab(index=df['DEATH_EVENT'], columns=df['diabetes'], margins=True, margins_name='Total')
    .rename(index={0: 'Survived', 1: 'Death'}, columns={0: 'No Diabet', 1: 'Yes Diabet'})
    .rename_axis(index=None, columns=None)  # drop the axis names from the display
)

source

Unnamed: 0,No Diabet,Yes Diabet,Total
Survived,118,85,203
Death,56,40,96
Total,174,125,299


In [15]:
# If the patient has diabetes, what is the chance that the patient dies?
# P(Death | Diabetes): deaths among diabetic patients / all diabetic patients.
source.loc['Death', 'Yes Diabet'] / source.loc['Total', 'Yes Diabet']


0.32

In [19]:
# Suppose it is known that 5% of patients who undergo a certain therapy experience
# a reduced risk of death. If the therapy is given to the patients, what is the
# probability that it is effective in exactly 50 of them?  X ~ Binomial(n, p)
# (scipy.stats is imported in the notebook's import cell, not here.)

p = 0.05  # per-patient probability that the therapy is effective
k = 50    # number of patients in whom the therapy is effective
# NOTE(review): n counts only the rows with DEATH_EVENT == 0, although the question
# says the therapy is given to "all patients" - confirm whether len(df) was intended.
n = len(df.query('DEATH_EVENT==0'))

In [20]:
# Binomial PMF: probability of exactly k successes in n trials with success probability p.
scipy.stats.binom(n, p).pmf(k)

3.695418948575821e-21

Interpretation: Peluang terapi efektif pada tepat 50 pasien praktis 0% (≈ 3,7 × 10⁻²¹); dengan tingkat keberhasilan 5%, jumlah pasien yang diharapkan merespons hanya sekitar 10 dari 203 pasien.

In [21]:
# Assume serum_sodium is Gaussian and build a lookup table with, for every observed
# value x, its z-score and the cumulative probability P(X < x). The table holds lower-tail
# probabilities; an upper tail such as P(X > 140) is 1 - P(X < 140).
# The z-scores are computed once and the normal CDF is applied to the whole array at once.
# scipy.stats.zscore standardises with its default ddof=0 (population standard deviation).
sodium_z = scipy.stats.zscore(df.serum_sodium)
source = pd.DataFrame({'x': df.serum_sodium.values,
                       'zscore': sodium_z,
                       'P(X < x)': scipy.stats.norm.cdf(sodium_z)})

In [22]:
# Preview the first rows of the z-score / cumulative-probability table.
source.head()

Unnamed: 0,x,zscore,P(X < x)
0,130,-1.504036,0.066286
1,136,-0.141976,0.443549
2,129,-1.731046,0.041722
3,137,0.085034,0.533883
4,116,-4.682176,1e-06


In [28]:
# P(X < 140): cumulative probability taken from the first table row with x == 140,
# selected by column label rather than by position. This is the lower tail, NOT P(X > 140);
# it is kept under this name because the interval probability is derived from it.
p_val_140 = source.loc[source['x'] == 140, 'P(X < x)'].iloc[0]
# P(X > 140): the upper tail the question asks for is the complement of the CDF.
p_gt_140 = 1 - p_val_140

In [29]:
# Show P(X < 140) - the cumulative (lower-tail) probability, not the upper tail.
p_val_140

0.7781808383369373

In [27]:
# Rows of the table whose observed serum_sodium value is at most 125.
source.loc[source['x'].le(125)]

Unnamed: 0,x,zscore,P(X < x)
4,116,-4.682176,1.419228e-06
19,121,-3.547126,0.0001947291
126,124,-2.866096,0.00207784
199,113,-5.363206,4.087883e-08
225,125,-2.639086,0.004156493


In [31]:
# What is the probability of observing a value less than 120?
# 120 does not appear among the observed serum_sodium values, so it cannot be looked up
# in the z-score table; evaluate the standard normal CDF at its z-score instead.
# NOTE(review): Series.std() defaults to the sample standard deviation (ddof=1) while
# scipy.stats.zscore, used for the table, defaults to ddof=0 - confirm which is intended.
sodium_mean = df.serum_sodium.mean()
sodium_std = df.serum_sodium.std()
z_120 = (120 - sodium_mean) / sodium_std

p_val_120 = scipy.stats.norm.cdf(z_120)
p_val_120


8.233984330639427e-05

In [32]:
# What is the probability of observing a value greater than 120 and less than 140?
# P(120 < X < 140) = P(X < 140) - P(X < 120)
p_val_140 - p_val_120

0.7780984984936309