<a href="https://colab.research.google.com/github/equitymarkets/health_project_group_1/blob/main/hyper_parameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [22]:
# reading data set 
df = pd.read_csv('/content/med_conditions.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr,heart_attack
0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0


In [23]:
df.columns

Index(['Unnamed: 0', 'asthma', 'heart_failure', 'chronic_heart_disease',
       'angina_pectoris', 'stroke', 'thyroid', 'liver_disease',
       'gallbladder_pr', 'heart_attack'],
      dtype='object')

In [None]:
binary_df = df.copy()
columns = []


In [25]:
# features set
X = df.copy()
X.drop('heart_attack', axis = 1, inplace = True)
X.head()


Unnamed: 0.1,Unnamed: 0,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr
0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0


In [26]:
# target set
y = df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [27]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444)

In [9]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [10]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [11]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# creating random grid
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

In [39]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid,
                               n_iter= 10, cv = 5, verbose = 2, random_state = 444, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


In [40]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}

In [41]:
best_random = rf_random.best_estimator_

In [42]:
predictions = best_random.predict(X_test)

In [43]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [44]:
display(cm)
print(acc_score)
print(classification_report(y_test, predictions))

array([[3628,    0],
       [  35,   84]])

0.9906591940218842
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3628
           1       1.00      0.71      0.83       119

    accuracy                           0.99      3747
   macro avg       1.00      0.85      0.91      3747
weighted avg       0.99      0.99      0.99      3747



In [53]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [60, 70, 80, 90],
    'max_features': ['auto'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [200, 300, 400, 1000]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           cv = 3, n_jobs=-1, verbose=2)

In [54]:
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 3 folds for each of 144 candidates, totalling 432 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


{'bootstrap': True,
 'max_depth': 60,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 200}

In [55]:
best_grid = grid_search.best_estimator_

grid_predictions = best_grid.predict(X_test)

cm = confusion_matrix(y_test, grid_predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, grid_predictions)

In [56]:
display(cm)
print(acc_score)
print(classification_report(y_test, predictions))

array([[3628,    0],
       [  35,   84]])

0.9906591940218842
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3628
           1       1.00      0.71      0.83       119

    accuracy                           0.99      3747
   macro avg       1.00      0.85      0.91      3747
weighted avg       0.99      0.99      0.99      3747

