# Analysis of heart disease data

In this notebook we analyse a classic dataset for machine learning, the heart disease dataset. The task is that of developing a machine learning model that "makes a diagnosis" of heart disease (with various degrees for how serious it is) given patients' data.

We will use this dataset to build an example of how to carry out a data science project.

The data is taken [from Kaggle](https://www.kaggle.com/danimal/heartdiseaseensembleclassifier/).

__Features:__

- `age` - age in years
- `sex` - sex (1 = male; 0 = female)
- `cp` - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)
- `trestbps` - resting blood pressure (in mm Hg on admission to the hospital)
- `chol` - serum cholesterol in mg/dl
- `fbs` - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
- `restecg` - resting electrocardiographic results (0 = normal; 1 = having ST-T; 2 = hypertrophy)
- `thalach` - maximum heart rate achieved
- `exang` - exercise induced angina (1 = yes; 0 = no)
- `oldpeak` - ST depression induced by exercise relative to rest
- `slope` - the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
- `ca` - number of major vessels (0-3) colored by fluoroscopy
- `thal` - 3 = normal; 6 = fixed defect; 7 = reversible defect
- `pred_attribute` - the predicted attribute, a parameter that indicates how bad the predicted heart disease is.

In [None]:
import pandas as pd

## Read the file containing the dataset

In [None]:
data_df.columns

In [None]:
data_df = pd.read_csv("../data/Heart_Disease_Data.csv")

In [None]:
# Preview the first rows of the raw dataset.
data_df.head()

In [None]:
# Summary statistics of the dataset's columns.
data_df.describe()

## Define features ($\vec{x}_i$) and target variables ($y_i$)

In [None]:
# The label is kept as a one-column DataFrame; every other column is a feature.
Y = data_df['pred_attribute'].to_frame()
X = data_df.drop(labels='pred_attribute', axis='columns')

In [None]:
# Preview the feature table.
X.head()

In [None]:
# Preview the target column.
Y.head()

## Plot distributions for some of the features

In [None]:
import numpy as np

from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
from plotly import tools

# Enable inline Plotly rendering for this notebook session.
# NOTE(review): connected=True presumably loads plotly.js from a CDN, so
# figures need network access to render - confirm.
init_notebook_mode(connected=True)

In [None]:
# Numeric features whose distributions are plotted as histograms below.
cols_to_plot = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

In [None]:
# One histogram per selected feature, laid out on a 2x3 grid that is filled
# row by row.
fig = tools.make_subplots(rows=2, cols=3, subplot_titles=tuple(cols_to_plot))

for position, feature in enumerate(cols_to_plot):
    grid_row, grid_col = divmod(position, 3)
    histogram = go.Histogram(x=X[feature], name=feature)
    # Plotly subplot coordinates are 1-based.
    fig.append_trace(histogram, grid_row + 1, grid_col + 1)

iplot(fig)

In [None]:
# Number of patients per sex (0 = female, 1 = male).
print(f"Number of female patients: {(X['sex'] == 0).sum()}")
print(f"Number of male patients: {(X['sex'] == 1).sum()}")

In [None]:
# Normalised age histograms, one per sex, drawn on the same axes.
trace_f = go.Histogram(
    x=X.loc[X['sex'] == 0, 'age'],
    histnorm='probability',
    name='female',
)
trace_m = go.Histogram(
    x=X.loc[X['sex'] == 1, 'age'],
    histnorm='probability',
    name='male',
)

layout = go.Layout(
    title='Age distribution by gender',
    xaxis={'title': 'age'},
    yaxis={'title': 'probability'},
)

# The male trace is listed first, so it comes first in the figure.
data = [trace_m, trace_f]
fig = go.Figure(data=data, layout=layout)

iplot(fig)

## Feature engineering: one-hot encoding of categorical variables

In [None]:
# One-hot encode the categorical columns. Called without `columns=`,
# get_dummies only converts object/string/category columns and leaves numeric
# columns untouched.
# NOTE(review): the prediction cells further down encode only 'ca' and 'thal',
# so presumably those are the non-numeric columns here - confirm with X.dtypes.
X = pd.get_dummies(X)

In [None]:
# Preview the feature table after the encoding step.
X.head()

In [None]:
# Column names of the feature table after the encoding step.
X.columns

In [None]:
# Report how many feature columns there are after the encoding step.
print(f"The dataset has {len(X.columns)} columns now.")

## Train a model to classify the samples and cross-validate

In [None]:
from sklearn.ensemble import RandomForestClassifier
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
# Baseline random forest. The fixed seed makes the fitted trees (and every
# score derived from them) reproducible across kernel restarts.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Hold out a test set (scikit-learn's default split size). The fixed seed
# keeps the split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y.values.ravel(), random_state=42
)

In [None]:
# Fit the forest on the training split.
rfc.fit(X_train, Y_train)

In [None]:
# Predicted classes for the first 10 held-out samples.
rfc.predict(X_test)[:10]

In [None]:
# 5-fold cross-validation of the random forest on the full dataset.
# cross_val_score fits a fresh clone of `rfc` on every fold, so the earlier
# fit on the training split is not reused here.
scores = cross_val_score(rfc, X, Y.values.ravel(), cv=5)

In [None]:
# Mean and standard deviation of the per-fold scores, rounded to two decimals.
print(f"Mean accuracy: {scores.mean().round(2)} ± {scores.std().round(2)}")

## Search for the best values of the hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Forest used as the base estimator of the grid search. class_weight='balanced'
# re-weights the classes inversely to their frequency; the fixed seed makes
# the search results reproducible.
rfc_cv = RandomForestClassifier(class_weight='balanced', random_state=42)

In [None]:
# Grid search over the number of trees, scored with 5-fold cross-validation.
# Train scores are kept as well so they appear in cv_results_.
gs = GridSearchCV(
    estimator=rfc_cv,
    param_grid={'n_estimators': [10, 25, 50, 100, 200, 500, 1000]},
    cv=5,
    return_train_score=True,
)

In [None]:
# Run the search on the full dataset: every n_estimators candidate is fitted
# on each of the 5 folds, which makes this the slowest cell of the notebook.
gs.fit(X, Y.values.ravel())

In [None]:
# Tabulate the search results, one row per n_estimators candidate.
pd.DataFrame(gs.cv_results_)

In [None]:
# Bar chart of the mean cross-validated test score for each n_estimators
# candidate; the error bars show the standard deviation of the test score.
trace = go.Bar(
    x=gs.cv_results_['param_n_estimators'],
    y=list(gs.cv_results_['mean_test_score']),
    error_y={
        'type': 'data',
        'array': list(gs.cv_results_['std_test_score']),
        'visible': True,
    },
    width=0.4,
)

# The x axis is categorical so the candidates are evenly spaced.
layout = go.Layout(
    title="Mean accuracies",
    xaxis={'title': 'n_estimators', 'type': 'category'},
    yaxis={'title': 'mean test score'},
)

data = [trace]

iplot(go.Figure(data=data, layout=layout))

## Export the best performing model

In [None]:
# joblib is a standalone package; the copy vendored as sklearn.externals.joblib
# was removed in scikit-learn 0.23. Fall back to it only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# joblib.dump(gs.best_estimator_, "../model/trained_model.pkl")

## Compute the probability of disease for males and females varying age

In [None]:
import sys
# Make the project's local helper modules importable from the notebook.
sys.path.insert(0, "../modules/")
# NOTE(review): cols_dummies is presumably the list of one-hot encoded feature
# columns the trained model expects - confirm in utils.
from utils import cols_dummies
import pickle

In [None]:
# Split the raw rows by sex (1 = male, 0 = female).
males_df = data_df.loc[data_df['sex'].eq(1)]
females_df = data_df.loc[data_df['sex'].eq(0)]

In [None]:
# Reference feature values used to build synthetic patients whose only varying
# feature is age.
# NOTE(review): presumably typical (rounded mean / most common) values taken
# from the dataset - confirm.
# NOTE(review): the key is 'slop' while the feature list at the top of the
# notebook says `slope`; it must match the column name the model was trained
# on - confirm.

# Sex-agnostic profile.
mean_values = {
    'cp': 3.0,
    'trestbps': 132.0,
    'chol': 247.0,
    'fbs': 0,
    'restecg': 1,
    'thalach': 150,
    'exang': 0,
    'oldpeak': 1.0,
    'slop': 2,
    'ca': 1,
    'thal': 3
}

# Per-sex profiles. `sex` is set explicitly: if it is left out, the missing
# column is later filled with 0 and the male profile is scored as female.
# `ca` and `thal` are strings so that get_dummies produces columns such as
# `ca_1` / `thal_6`.
mean_values_m = {
    'sex': 1,
    'cp': 3.0,
    'trestbps': 131.0,
    'chol': 239.0,
    'fbs': 0,
    'restecg': 1,
    'thalach': 148,
    'exang': 0,
    'oldpeak': 1.1,
    'slop': 2,
    'ca': '1',
    'thal': '6'
}

mean_values_f = {
    'sex': 0,
    'cp': 3.0,
    'trestbps': 133.0,
    'chol': 262.0,
    'fbs': 0,
    'restecg': 1,
    'thalach': 151,
    'exang': 0,
    'oldpeak': 0.9,
    'slop': 2,
    'ca': '1',
    'thal': '3'
}

In [None]:
def build_age_sweep(base_values, ages=range(20, 96)):
    """Build one feature row per age, keeping every other feature fixed.

    Parameters
    ----------
    base_values : dict
        Feature values shared by all rows. The dict is not modified.
    ages : iterable of int
        Ages to generate a row for.

    Returns
    -------
    pandas.DataFrame
        One row per age with `ca` and `thal` one-hot encoded. The columns are
        those of `cols_dummies` (in that order) followed by any encoded column
        that is not listed there; columns without a value are filled with 0.
    """
    # Build all rows at once instead of appending inside a loop:
    # DataFrame.append was removed in pandas 2.0 and is quadratic anyway.
    rows = pd.DataFrame([dict(base_values, age=age) for age in ages])
    encoded = pd.get_dummies(rows, columns=['ca', 'thal'])

    expected_cols = list(cols_dummies)
    extra_cols = [col for col in encoded.columns if col not in expected_cols]
    return encoded.reindex(columns=expected_cols + extra_cols).fillna(0)


X_m = build_age_sweep(mean_values_m)
X_f = build_age_sweep(mean_values_f)

In [None]:
# Estimator selected by the grid search.
best_model = gs.best_estimator_

In [None]:
# Predict on the feature columns only: dropping a previously added `proba`
# column keeps this cell re-runnable without feeding it back to the model.
features_m = X_m.drop('proba', axis=1, errors='ignore')
features_f = X_f.drop('proba', axis=1, errors='ignore')

# Column 2 of predict_proba is the probability of `best_model.classes_[2]`.
# NOTE(review): presumably that is pred_attribute == 2; if the overall
# probability of disease is wanted, 1 - P(pred_attribute == 0) may be what is
# intended - confirm.
X_m['proba'] = best_model.predict_proba(features_m)[:, 2]
X_f['proba'] = best_model.predict_proba(features_f)[:, 2]

In [None]:
# Predicted probability as a function of age, one line per sex.
trace_m = go.Scatter(x=X_m['age'], y=X_m['proba'], name='males')
trace_f = go.Scatter(x=X_f['age'], y=X_f['proba'], name='females')

layout = go.Layout(
    title='Probability of heart diseases as age varies',
    xaxis={'title': 'age'},
    yaxis={'title': 'probability'},
)

fig_proba = go.Figure(data=[trace_m, trace_f], layout=layout)

iplot(fig_proba)

In [None]:
# with open("../data/X_m.pkl", "wb") as f:
#     pickle.dump(X_m, f)
    
# with open("../data/X_f.pkl", "wb") as f:
#     pickle.dump(X_f, f)