<a href="https://colab.research.google.com/github/equitymarkets/health_project_group_1/blob/main/hyper_parameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [22]:
# reading data set 
df = pd.read_csv('/content/med_conditions.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr,heart_attack
0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0


In [75]:
df.shape

(14986, 10)

In [77]:
heart_attacks = (df["heart_attack"] == 1).sum()
heart_attacks

432

In [23]:
df.columns

Index(['Unnamed: 0', 'asthma', 'heart_failure', 'chronic_heart_disease',
       'angina_pectoris', 'stroke', 'thyroid', 'liver_disease',
       'gallbladder_pr', 'heart_attack'],
      dtype='object')

In [None]:
binary_df = df.copy()
columns = []


In [59]:
# features set
X = df.copy()
X.drop(['heart_attack', 'Unnamed: 0'], axis = 1, inplace = True)
X.head()


Unnamed: 0,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0


In [60]:
# target set
y = df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [61]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444, stratify = y)

In [79]:
sum = 0
for label in y_train:
  if label == 1:
    sum += 1

sum

324

In [80]:
sum = 0 
for label in y_test:
  if label == 1:
    sum += 1
sum

108

In [9]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [10]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [11]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [62]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# creating random grid
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

In [63]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid,
                               n_iter= 10, cv = 5, verbose = 2, random_state = 444, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [40]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}

In [64]:
best_random = rf_random.best_estimator_

In [65]:
predictions = best_random.predict(X_test)

In [66]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [67]:
display(cm)
print(acc_score)
print(classification_report(y_test, predictions))

array([[3639,    0],
       [  17,   91]])

0.9954630370963438
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3639
           1       1.00      0.84      0.91       108

    accuracy                           1.00      3747
   macro avg       1.00      0.92      0.96      3747
weighted avg       1.00      1.00      1.00      3747



In [68]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [60, 70, 80, 90],
    'max_features': ['auto'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [200, 300, 400, 1000]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           cv = 3, n_jobs=-1, verbose=2)

In [69]:
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 3 folds for each of 144 candidates, totalling 432 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


{'bootstrap': True,
 'max_depth': 60,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 200}

In [70]:
best_grid = grid_search.best_estimator_

grid_predictions = best_grid.predict(X_test)

cm = confusion_matrix(y_test, grid_predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, grid_predictions)

In [72]:
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3639,0
Actual 1,17,91


In [73]:

print(acc_score)
print(classification_report(y_test, predictions))

0.9954630370963438
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3639
           1       1.00      0.84      0.91       108

    accuracy                           1.00      3747
   macro avg       1.00      0.92      0.96      3747
weighted avg       1.00      1.00      1.00      3747



In [81]:
importances = best_grid.feature_importances_

importances_sorted = sorted(zip(best_grid.feature_importances_, X.columns), reverse = True)
importances_df = pd.DataFrame(importances_sorted)
importances_df

Unnamed: 0,0,1
0,0.40604,chronic_heart_disease
1,0.158327,heart_failure
2,0.11482,angina_pectoris
3,0.11015,stroke
4,0.091296,asthma
5,0.068038,gallbladder_pr
6,0.036545,thyroid
7,0.014785,liver_disease
