<a href="https://colab.research.google.com/github/equitymarkets/health_project_group_1/blob/main/hyper_parameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [None]:
# reading data set 
df = pd.read_csv('/content/med_conditions.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr,heart_attack
0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0


In [None]:
df.shape

(14986, 10)

In [None]:
heart_attacks = (df["heart_attack"] == 1).sum()
heart_attacks

432

In [None]:
df.columns

Index(['Unnamed: 0', 'asthma', 'heart_failure', 'chronic_heart_disease',
       'angina_pectoris', 'stroke', 'thyroid', 'liver_disease',
       'gallbladder_pr', 'heart_attack'],
      dtype='object')

In [None]:
binary_df = df.copy()
columns = []


In [None]:
# features set
X = df.copy()
X.drop(['heart_attack', 'Unnamed: 0'], axis = 1, inplace = True)
X.head()


Unnamed: 0,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0


In [None]:
# target set
y = df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444, stratify = y)

In [None]:
sum = 0
for label in y_train:
  if label == 1:
    sum += 1

sum

324

In [None]:
sum = 0 
for label in y_test:
  if label == 1:
    sum += 1
sum

108

In [None]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [None]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# creating random grid
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

In [None]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid,
                               n_iter= 100, cv = 5, verbose = 2, random_state = 444, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [None]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}

In [None]:
best_random = rf_random.best_estimator_

In [None]:
predictions = best_random.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [None]:
display(cm)
print(acc_score)
print(classification_report(y_test, predictions))

array([[3639,    0],
       [  17,   91]])

0.9954630370963438
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3639
           1       1.00      0.84      0.91       108

    accuracy                           1.00      3747
   macro avg       1.00      0.92      0.96      3747
weighted avg       1.00      1.00      1.00      3747



In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [60, 70, 80, 90],
    'max_features': ['auto'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [200, 300, 400, 1000]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           cv = 3, n_jobs=-1, verbose=2)

In [None]:
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 3 folds for each of 144 candidates, totalling 432 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


{'bootstrap': True,
 'max_depth': 60,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 200}

In [None]:
best_grid = grid_search.best_estimator_

grid_predictions = best_grid.predict(X_test)

cm = confusion_matrix(y_test, grid_predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, grid_predictions)

In [None]:
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3639,0
Actual 1,17,91


In [None]:

print(acc_score)
print(classification_report(y_test, predictions))

0.9954630370963438
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3639
           1       1.00      0.84      0.91       108

    accuracy                           1.00      3747
   macro avg       1.00      0.92      0.96      3747
weighted avg       1.00      1.00      1.00      3747



In [None]:
importances = best_grid.feature_importances_

importances_sorted = sorted(zip(best_grid.feature_importances_, X.columns), reverse = True)
importances_df = pd.DataFrame(importances_sorted)
importances_df

Unnamed: 0,0,1
0,0.40604,chronic_heart_disease
1,0.158327,heart_failure
2,0.11482,angina_pectoris
3,0.11015,stroke
4,0.091296,asthma
5,0.068038,gallbladder_pr
6,0.036545,thyroid
7,0.014785,liver_disease


Raw Data

In [33]:
med_conditions = pd.read_sas('/content/P_MCQ.XPT', format = 'xport')
med_conditions.head()

Unnamed: 0,SEQN,MCQ010,MCQ025,MCQ035,MCQ040,MCQ050,AGQ030,MCQ053,MCQ080,MCQ092,...,MCQ300A,MCQ366A,MCQ366B,MCQ366C,MCQ366D,MCQ371A,MCQ371B,MCQ371C,MCQ371D,OSQ230
0,109263.0,2.0,,,,,2.0,2.0,,,...,,,,,,,,,,
1,109264.0,2.0,,,,,,2.0,,2.0,...,,,,,,,,,,
2,109265.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
3,109266.0,2.0,,,,,2.0,2.0,1.0,9.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,
4,109267.0,2.0,,,,,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,


In [34]:
columns = ['SEQN', 'MCQ010', 'MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F', 'MCQ160M', 'MCQ500', 'MCQ560']
med_conditions_reduced = med_conditions[columns]
med_conditions_reduced.head()

Unnamed: 0,SEQN,MCQ010,MCQ160B,MCQ160C,MCQ160D,MCQ160E,MCQ160F,MCQ160M,MCQ500,MCQ560
0,109263.0,2.0,,,,,,,,
1,109264.0,2.0,,,,,,,2.0,
2,109265.0,2.0,,,,,,,,
3,109266.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0
4,109267.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0


In [92]:
mc_df = med_conditions_reduced.rename(columns = {'MCQ010': 'asthma', 'MCQ160B': 'heart_failure', 
                                                 'MCQ160C': 'heart_disease', 'MCQ160D': 'angina', 
                                                 'MCQ160E': 'heart_attack', 'MCQ160F': 'stroke', 
                                                 'MCQ160M': 'thyroid', 'MCQ500': 'liver', 
                                                 'MCQ560': 'gallbladder', 'SEQN': 'id'})
mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina,heart_attack,stroke,thyroid,liver,gallbladder
0,109263.0,2.0,,,,,,,,
1,109264.0,2.0,,,,,,,2.0,
2,109265.0,2.0,,,,,,,,
3,109266.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0
4,109267.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0


In [93]:
mc_df['asthma'] = mc_df['asthma'].fillna(0)
mc_df['heart_failure'] = mc_df['heart_failure'].fillna(0)
mc_df['heart_disease'] = mc_df['heart_disease'].fillna(0)
mc_df['angina'] = mc_df['angina'].fillna(0)
mc_df['heart_attack'] = mc_df['heart_attack'].fillna(0)
mc_df['stroke'] = mc_df['stroke'].fillna(0)
mc_df['thyroid'] = mc_df['thyroid'].fillna(0)
mc_df['liver'] = mc_df['liver'].fillna(0)
mc_df['gallbladder'] = mc_df['gallbladder'].fillna(0)

mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina,heart_attack,stroke,thyroid,liver,gallbladder
0,109263.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,109264.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
2,109265.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,109266.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0
4,109267.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0


In [94]:
mc_df['asthma'].nunique()

3

In [95]:
index_drop = list(mc_df[ (mc_df['asthma']==9) | (mc_df['heart_failure']==9) | (mc_df['heart_disease']==9) | (mc_df['angina']==9) | (mc_df['heart_attack']==9) | (mc_df['stroke']==9) | (mc_df['thyroid']==9) | (mc_df['liver']==9) | (mc_df['gallbladder']==9) ].index)


In [96]:
print(index_drop)

[309, 359, 421, 426, 798, 1103, 1122, 1168, 1490, 2192, 2248, 2302, 2348, 2361, 2710, 2982, 3015, 3101, 3211, 3241, 3353, 3592, 3593, 3729, 4091, 4099, 4125, 4321, 4446, 4530, 4545, 4655, 4837, 4911, 4989, 5020, 5277, 5306, 5312, 5351, 5568, 5608, 5612, 5677, 5695, 5799, 5855, 5859, 5929, 5932, 5950, 5963, 6202, 6362, 6399, 6592, 6606, 6608, 6834, 6943, 6944, 7232, 7590, 7633, 7659, 8009, 8080, 8303, 8385, 8602, 8843, 8856, 9134, 9433, 9438, 9591, 9609, 9681, 9782, 9831, 9976, 10026, 10032, 10116, 10220, 10303, 10420, 10523, 10601, 10628, 10673, 10774, 10803, 10955, 10967, 11110, 11132, 11159, 11166, 11429, 11657, 11676, 11753, 11889, 11919, 11930, 12112, 12232, 12387, 12566, 12583, 12949, 13020, 13074, 13081, 13256, 13481, 13491, 13533, 13628, 13688, 13693, 13722, 13723, 13775, 13781, 13824, 13920, 13981, 14231, 14296, 14339, 14385, 14466, 14540, 14770, 14892, 14921]


In [97]:
mc_df.drop(mc_df.index[index_drop], inplace = True)

In [98]:
print(mc_df)

             id  asthma  heart_failure  heart_disease  angina  heart_attack  \
0      109263.0     2.0            0.0            0.0     0.0           0.0   
1      109264.0     2.0            0.0            0.0     0.0           0.0   
2      109265.0     2.0            0.0            0.0     0.0           0.0   
3      109266.0     2.0            2.0            2.0     2.0           2.0   
4      109267.0     2.0            2.0            2.0     2.0           2.0   
...         ...     ...            ...            ...     ...           ...   
14981  124818.0     2.0            2.0            2.0     2.0           2.0   
14982  124819.0     2.0            0.0            0.0     0.0           0.0   
14983  124820.0     2.0            0.0            0.0     0.0           0.0   
14984  124821.0     1.0            2.0            2.0     2.0           2.0   
14985  124822.0     2.0            2.0            2.0     2.0           1.0   

       stroke  thyroid  liver  gallbladder  
0     

In [99]:
mc_df['asthma'] = mc_df['asthma'].replace({2: 0})
mc_df['heart_failure'] = mc_df['heart_failure'].replace({2: 0})
mc_df['heart_disease'] = mc_df['heart_disease'].replace({2: 0})
mc_df['angina'] = mc_df['angina'].replace({2: 0})
mc_df['heart_attack'] = mc_df['heart_attack'].replace({2: 0})
mc_df['stroke'] = mc_df['stroke'].replace({2: 0})
mc_df['thyroid'] = mc_df['thyroid'].replace({2: 0})
mc_df['liver'] = mc_df['liver'].replace({2: 0})
mc_df['gallbladder'] = mc_df['gallbladder'].replace({2: 0})
mc_df['heart_attack'] = mc_df['heart_attack'].replace({2: 0})

mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina,heart_attack,stroke,thyroid,liver,gallbladder
0,109263.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,109264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,109265.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,109266.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,109267.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
mc_df.shape

(14848, 10)

In [101]:
X = mc_df.copy()
X.drop(['heart_attack', 'id'], axis = 1, inplace = True)
X.head()

Unnamed: 0,asthma,heart_failure,heart_disease,angina,stroke,thyroid,liver,gallbladder
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
y = mc_df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444, stratify = y)

In [104]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# creating random grid
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

In [105]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid,
                               n_iter= 100, cv = 5, verbose = 2, random_state = 444, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


In [106]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 90,
 'bootstrap': True}

In [107]:
best_random = rf_random.best_estimator_
predictions = best_random.predict(X_test)

In [108]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [109]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3594,15
Actual 1,76,27


0.9754849137931034
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      3609
         1.0       0.64      0.26      0.37       103

    accuracy                           0.98      3712
   macro avg       0.81      0.63      0.68      3712
weighted avg       0.97      0.98      0.97      3712



In [110]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [90, 100, 110, 120],
    'max_features': ['auto'],
    'min_samples_leaf': [4, 5, 6],
    'min_samples_split': [1, 2, 3],
    'n_estimators': [200, 300, 400, 1000]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           cv = 5, n_jobs=-1, verbose=2)

In [111]:
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(


{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 6,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [112]:
best_grid = grid_search.best_estimator_

grid_predictions = best_grid.predict(X_test)

cm = confusion_matrix(y_test, grid_predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, grid_predictions)

In [113]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3598,11
Actual 1,83,20


0.974676724137931
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      3609
         1.0       0.64      0.26      0.37       103

    accuracy                           0.98      3712
   macro avg       0.81      0.63      0.68      3712
weighted avg       0.97      0.98      0.97      3712



In [114]:
importances = best_grid.feature_importances_

importances_sorted = sorted(zip(best_grid.feature_importances_, X.columns), reverse = True)
importances_df = pd.DataFrame(importances_sorted)
importances_df

Unnamed: 0,0,1
0,0.506627,heart_disease
1,0.245863,heart_failure
2,0.131601,angina
3,0.058889,stroke
4,0.02357,gallbladder
5,0.017192,thyroid
6,0.016255,asthma
7,3e-06,liver
