# Cardiovascular Disease Dataset Analysis
Before running this notebook, install the required libraries, e.g. `pip install numpy pandas scipy matplotlib seaborn jupyter`.

In [None]:
# Imports
import numpy as np
import pandas as pd
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter Notebook Setup
# Show the value of every expression statement in a cell rather than only the
# last one; later cells rely on this to display several results at once
# (e.g. both `df.head()` and `df.count()` in the data-exploration cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Data Exploration

In [None]:
# Load the clinical records and discard the `time` column right away,
# because it does not help with the analysis below.
df = pd.read_csv('./data/heart_failure_clinical_records_dataset.csv').drop(columns=['time'])

# Quick look at the first rows and the non-null count of every column.
df.head()
df.count()

### Conditional Probabilities

In [None]:
# Reading a crosstab of (a, b):
#   normalize='columns' -> each column sums to 1, i.e. P(a | b)
#   normalize='index'   -> each row sums to 1,    i.e. P(b | a)
# Every table below is row-normalised, so each row gives
# P(DEATH_EVENT | risk factor value).
cp_anaemia, cp_diabetes, cp_blood_pressure, cp_sex, cp_smoke = [
    pd.crosstab(df[factor], df['DEATH_EVENT'], normalize='index')
    for factor in ('anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking')
]

# One table per line group, in the same order as they were computed.
print(cp_anaemia, cp_diabetes, cp_blood_pressure, cp_sex, cp_smoke, sep='\n')

### T-Tests

In [8]:
# Helper Functions for T-Tests
def calculate_S_squared(m0, m1, v0, v1, n0, n1):
    """Return the pooled variance of two samples.

    Computes ((n0 - 1) * v0 + (n1 - 1) * v1) / (n0 + n1 - 2).

    Parameters
    ----------
    m0, m1 : sample means. Not used by this computation; accepted so that all
        t-test helpers share the same argument list.
    v0, v1 : sample variances of the two groups.
    n0, n1 : sample sizes of the two groups.
    """
    weighted_variance_sum = ((n0 - 1) * v0) + ((n1 - 1) * v1)
    degrees_of_freedom = n0 + n1 - 2
    return weighted_variance_sum / degrees_of_freedom


def calculate_test_statistic(m0, m1, v0, v1, n0, n1):
    """Return the pooled-variance two-sample t statistic.

    t0 = (m0 - m1) / (sp * sqrt(1/n0 + 1/n1)), where sp is the square root of
    the pooled variance returned by `calculate_S_squared`.

    Parameters
    ----------
    m0, m1 : sample means of the two groups.
    v0, v1 : sample variances of the two groups.
    n0, n1 : sample sizes of the two groups.
    """
    pooled_std = np.sqrt(calculate_S_squared(m0, m1, v0, v1, n0, n1))
    standard_error = pooled_std * np.sqrt((1 / n0) + (1 / n1))
    return (m0 - m1) / standard_error


def get_column_values(df_mean, df_var, df_count, key):
    """Collect the per-group statistics of one column.

    Looks up `key` in each of the three tables and reads the entries at
    index 0 and index 1 of the selected column.

    Returns
    -------
    tuple
        (m0, m1, v0, v1, n0, n1): mean, variance and count for entry 0 and
        entry 1, in the argument order expected by the t-test helpers.
    """
    means = df_mean[key]
    variances = df_var[key]
    counts = df_count[key]

    return means[0], means[1], variances[0], variances[1], counts[0], counts[1]


def calculate_all_test_statistics(df_mean, df_var, df_count):
    """Compute the pooled two-sample t statistic for every feature column.

    Parameters
    ----------
    df_mean, df_var, df_count : tables of per-group means, variances and
        counts. They are indexed by column name first and then by 0 / 1, as
        done in `get_column_values`. Columns are taken from `df_mean`.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per feature, holding that
        feature's t statistic. A column named 'DEATH_EVENT' is skipped.
    """
    cols = [c for c in df_mean.columns if c != 'DEATH_EVENT']
    data = [
        calculate_test_statistic(*get_column_values(df_mean, df_var, df_count, c))
        for c in cols
    ]

    # Let numpy infer the row length: the previous hard-coded reshape(1, 6)
    # raised ValueError whenever the input did not have exactly six features.
    data = np.array(data).reshape(1, -1)
    df_tstats = pd.DataFrame(data=data, columns=cols)
    return df_tstats

In [9]:
# Keep only the continuous features plus the outcome used for grouping.
continuous_df = df[[
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'DEATH_EVENT',
]]

# Per-outcome mean, variance and sample size of every continuous feature.
groupby_death_mean = continuous_df.groupby(['DEATH_EVENT']).mean()
groupby_death_var = continuous_df.groupby(['DEATH_EVENT']).var()
groupby_death_count = continuous_df.groupby(['DEATH_EVENT']).count()
print(groupby_death_mean, groupby_death_var, groupby_death_count, sep='\n')

# Pooled two-sample t statistic for each feature (DEATH_EVENT 0 vs. 1).
test_statistics = calculate_all_test_statistics(
    groupby_death_mean,
    groupby_death_var,
    groupby_death_count,
)
print(test_statistics)

                   age  creatinine_phosphokinase  ejection_fraction  \
DEATH_EVENT                                                           
0            58.761906                540.054187           40.26601   
1            65.215281                670.197917           33.46875   

                 platelets  serum_creatinine  serum_sodium  
DEATH_EVENT                                                 
0            266657.489901          1.184877    137.216749  
1            256381.044792          1.835833    135.375000  
                    age  creatinine_phosphokinase  ejection_fraction  \
DEATH_EVENT                                                            
0            113.164708              5.682138e+05         117.938789   
1            174.624481              1.733385e+06         156.883224   

                platelets  serum_creatinine  serum_sodium  
DEATH_EVENT                                                
0            9.512335e+09          0.427824     15.863678  
1 