<a href="https://colab.research.google.com/github/equitymarkets/health_project_group_1/blob/main/hyper_parameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


Raw Data

In [3]:
med_conditions = pd.read_sas('/content/P_MCQ.XPT', format = 'xport')
med_conditions.head()

Unnamed: 0,SEQN,MCQ010,MCQ025,MCQ035,MCQ040,MCQ050,AGQ030,MCQ053,MCQ080,MCQ092,...,MCQ300A,MCQ366A,MCQ366B,MCQ366C,MCQ366D,MCQ371A,MCQ371B,MCQ371C,MCQ371D,OSQ230
0,109263.0,2.0,,,,,2.0,2.0,,,...,,,,,,,,,,
1,109264.0,2.0,,,,,,2.0,,2.0,...,,,,,,,,,,
2,109265.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
3,109266.0,2.0,,,,,2.0,2.0,1.0,9.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,
4,109267.0,2.0,,,,,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,


In [4]:
columns = ['SEQN', 'MCQ010', 'MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F', 'MCQ160M', 'MCQ500', 'MCQ560']
med_conditions_reduced = med_conditions[columns]
med_conditions_reduced.head()

Unnamed: 0,SEQN,MCQ010,MCQ160B,MCQ160C,MCQ160D,MCQ160E,MCQ160F,MCQ160M,MCQ500,MCQ560
0,109263.0,2.0,,,,,,,,
1,109264.0,2.0,,,,,,,2.0,
2,109265.0,2.0,,,,,,,,
3,109266.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0
4,109267.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0


In [5]:
mc_df = med_conditions_reduced.rename(columns = {'MCQ010': 'asthma', 'MCQ160B': 'heart_failure', 
                                                 'MCQ160C': 'heart_disease', 'MCQ160D': 'angina', 
                                                 'MCQ160E': 'heart_attack', 'MCQ160F': 'stroke', 
                                                 'MCQ160M': 'thyroid', 'MCQ500': 'liver', 
                                                 'MCQ560': 'gallbladder', 'SEQN': 'id'})
mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina,heart_attack,stroke,thyroid,liver,gallbladder
0,109263.0,2.0,,,,,,,,
1,109264.0,2.0,,,,,,,2.0,
2,109265.0,2.0,,,,,,,,
3,109266.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0
4,109267.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0


In [93]:
mc_df['asthma'] = mc_df['asthma'].fillna(0)
mc_df['heart_failure'] = mc_df['heart_failure'].fillna(0)
mc_df['heart_disease'] = mc_df['heart_disease'].fillna(0)
mc_df['angina'] = mc_df['angina'].fillna(0)
mc_df['heart_attack'] = mc_df['heart_attack'].fillna(0)
mc_df['stroke'] = mc_df['stroke'].fillna(0)
mc_df['thyroid'] = mc_df['thyroid'].fillna(0)
mc_df['liver'] = mc_df['liver'].fillna(0)
mc_df['gallbladder'] = mc_df['gallbladder'].fillna(0)

mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina,heart_attack,stroke,thyroid,liver,gallbladder
0,109263.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,109264.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
2,109265.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,109266.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0
4,109267.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0


In [94]:
mc_df['asthma'].nunique()

3

In [95]:
index_drop = list(mc_df[ (mc_df['asthma']==9) | (mc_df['heart_failure']==9) | (mc_df['heart_disease']==9) | (mc_df['angina']==9) | (mc_df['heart_attack']==9) | (mc_df['stroke']==9) | (mc_df['thyroid']==9) | (mc_df['liver']==9) | (mc_df['gallbladder']==9) ].index)


In [96]:
print(index_drop)

[309, 359, 421, 426, 798, 1103, 1122, 1168, 1490, 2192, 2248, 2302, 2348, 2361, 2710, 2982, 3015, 3101, 3211, 3241, 3353, 3592, 3593, 3729, 4091, 4099, 4125, 4321, 4446, 4530, 4545, 4655, 4837, 4911, 4989, 5020, 5277, 5306, 5312, 5351, 5568, 5608, 5612, 5677, 5695, 5799, 5855, 5859, 5929, 5932, 5950, 5963, 6202, 6362, 6399, 6592, 6606, 6608, 6834, 6943, 6944, 7232, 7590, 7633, 7659, 8009, 8080, 8303, 8385, 8602, 8843, 8856, 9134, 9433, 9438, 9591, 9609, 9681, 9782, 9831, 9976, 10026, 10032, 10116, 10220, 10303, 10420, 10523, 10601, 10628, 10673, 10774, 10803, 10955, 10967, 11110, 11132, 11159, 11166, 11429, 11657, 11676, 11753, 11889, 11919, 11930, 12112, 12232, 12387, 12566, 12583, 12949, 13020, 13074, 13081, 13256, 13481, 13491, 13533, 13628, 13688, 13693, 13722, 13723, 13775, 13781, 13824, 13920, 13981, 14231, 14296, 14339, 14385, 14466, 14540, 14770, 14892, 14921]


In [97]:
mc_df.drop(mc_df.index[index_drop], inplace = True)

In [98]:
print(mc_df)

             id  asthma  heart_failure  heart_disease  angina  heart_attack  \
0      109263.0     2.0            0.0            0.0     0.0           0.0   
1      109264.0     2.0            0.0            0.0     0.0           0.0   
2      109265.0     2.0            0.0            0.0     0.0           0.0   
3      109266.0     2.0            2.0            2.0     2.0           2.0   
4      109267.0     2.0            2.0            2.0     2.0           2.0   
...         ...     ...            ...            ...     ...           ...   
14981  124818.0     2.0            2.0            2.0     2.0           2.0   
14982  124819.0     2.0            0.0            0.0     0.0           0.0   
14983  124820.0     2.0            0.0            0.0     0.0           0.0   
14984  124821.0     1.0            2.0            2.0     2.0           2.0   
14985  124822.0     2.0            2.0            2.0     2.0           1.0   

       stroke  thyroid  liver  gallbladder  
0     

In [99]:
mc_df['asthma'] = mc_df['asthma'].replace({2: 0})
mc_df['heart_failure'] = mc_df['heart_failure'].replace({2: 0})
mc_df['heart_disease'] = mc_df['heart_disease'].replace({2: 0})
mc_df['angina'] = mc_df['angina'].replace({2: 0})
mc_df['heart_attack'] = mc_df['heart_attack'].replace({2: 0})
mc_df['stroke'] = mc_df['stroke'].replace({2: 0})
mc_df['thyroid'] = mc_df['thyroid'].replace({2: 0})
mc_df['liver'] = mc_df['liver'].replace({2: 0})
mc_df['gallbladder'] = mc_df['gallbladder'].replace({2: 0})
mc_df['heart_attack'] = mc_df['heart_attack'].replace({2: 0})

mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina,heart_attack,stroke,thyroid,liver,gallbladder
0,109263.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,109264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,109265.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,109266.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,109267.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
mc_df.shape

(14848, 10)

In [101]:
X = mc_df.copy()
X.drop(['heart_attack', 'id'], axis = 1, inplace = True)
X.head()

Unnamed: 0,asthma,heart_failure,heart_disease,angina,stroke,thyroid,liver,gallbladder
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
y = mc_df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444, stratify = y)

In [104]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# creating random grid
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

In [105]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid,
                               n_iter= 100, cv = 5, verbose = 2, random_state = 444, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


In [106]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 90,
 'bootstrap': True}

In [107]:
best_random = rf_random.best_estimator_
predictions = best_random.predict(X_test)

In [108]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [109]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3594,15
Actual 1,76,27


0.9754849137931034
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      3609
         1.0       0.64      0.26      0.37       103

    accuracy                           0.98      3712
   macro avg       0.81      0.63      0.68      3712
weighted avg       0.97      0.98      0.97      3712



In [110]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [90, 100, 110, 120],
    'max_features': ['auto'],
    'min_samples_leaf': [4, 5, 6],
    'min_samples_split': [1, 2, 3],
    'n_estimators': [200, 300, 400, 1000]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           cv = 5, n_jobs=-1, verbose=2)

In [111]:
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(


{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 6,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [112]:
best_grid = grid_search.best_estimator_

grid_predictions = best_grid.predict(X_test)

cm = confusion_matrix(y_test, grid_predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, grid_predictions)

In [113]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3598,11
Actual 1,83,20


0.974676724137931
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      3609
         1.0       0.64      0.26      0.37       103

    accuracy                           0.98      3712
   macro avg       0.81      0.63      0.68      3712
weighted avg       0.97      0.98      0.97      3712



In [114]:
importances = best_grid.feature_importances_

importances_sorted = sorted(zip(best_grid.feature_importances_, X.columns), reverse = True)
importances_df = pd.DataFrame(importances_sorted)
importances_df

Unnamed: 0,0,1
0,0.506627,heart_disease
1,0.245863,heart_failure
2,0.131601,angina
3,0.058889,stroke
4,0.02357,gallbladder
5,0.017192,thyroid
6,0.016255,asthma
7,3e-06,liver


baseline reandom forest


In [7]:
med_df = pd.read_csv('/content/med_conditions.csv')

In [8]:
combined_df = med_df
combined_df.head()

Unnamed: 0.1,Unnamed: 0,id,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,gallbladder,heart_attack
0,3,109266.0,0,0,0,0,0,0,0,0,0
1,4,109267.0,0,0,0,0,0,0,0,0,0
2,8,109271.0,1,0,0,0,0,0,0,0,0
3,9,109273.0,0,0,0,0,0,0,0,0,0
4,10,109274.0,0,0,0,0,0,0,0,1,0


In [9]:
combined_df.columns

Index(['Unnamed: 0', 'id', 'asthma', 'heart_failure', 'heart_disease',
       'angina_pectoris', 'stroke', 'thyroid', 'liver_condition',
       'gallbladder', 'heart_attack'],
      dtype='object')

In [10]:
combined_df.shape

(9088, 11)

In [11]:
X = combined_df.copy()
X.drop(['heart_attack', 'id', 'Unnamed: 0'], axis = 1, inplace = True)
X.head()

Unnamed: 0,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,gallbladder
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1


In [98]:
y = combined_df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444, stratify = y)

In [100]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)


  rf.fit(X_train, y_train)


In [101]:
predictions = rf.predict(X_test)

In [102]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [103]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2142,27
Actual 1,63,40


0.960387323943662
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2169
           1       0.60      0.39      0.47       103

    accuracy                           0.96      2272
   macro avg       0.78      0.69      0.73      2272
weighted avg       0.95      0.96      0.96      2272



Just Medical Conditions Dataset - did not check if before heart attack - down sampled

In [104]:
heart_attack = combined_df[combined_df['heart_attack'] == 1]
heart_attack.shape

(412, 11)

In [105]:
non_heart_attack = combined_df[combined_df['heart_attack'] == 0]
non_heart_attack.head()

Unnamed: 0.1,Unnamed: 0,id,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,gallbladder,heart_attack
0,3,109266.0,0,0,0,0,0,0,0,0,0
1,4,109267.0,0,0,0,0,0,0,0,0,0
2,8,109271.0,1,0,0,0,0,0,0,0,0
3,9,109273.0,0,0,0,0,0,0,0,0,0
4,10,109274.0,0,0,0,0,0,0,0,1,0


In [106]:
non_heart_attack = non_heart_attack.sample(n=824,random_state = 444)
non_heart_attack.head()
non_heart_attack.shape

(824, 11)

In [107]:
down_sampled_df = pd.concat([non_heart_attack, heart_attack])
down_sampled_df.head()
down_sampled_df.shape

(1236, 11)

In [109]:
X = down_sampled_df.copy()
X.drop(['heart_attack', 'id', 'Unnamed: 0'], axis = 1, inplace = True)
X.head()

Unnamed: 0,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,gallbladder
1092,1,0,0,1,0,1,0,0
5795,0,0,0,0,0,0,0,1
1195,0,0,0,0,0,0,1,0
3793,0,0,0,0,0,0,0,0
2507,0,0,0,0,0,0,0,0


In [110]:
y = down_sampled_df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444, stratify = y)

In [112]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# creating random grid
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

In [113]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid,
                               n_iter= 100, cv = 5, verbose = 2, random_state = 444, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


In [114]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 20,
 'bootstrap': True}

In [115]:
best_random = rf_random.best_estimator_
predictions = best_random.predict(X_test)

In [116]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [117]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,195,11
Actual 1,18,85


0.9061488673139159
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       206
           1       0.89      0.83      0.85       103

    accuracy                           0.91       309
   macro avg       0.90      0.89      0.89       309
weighted avg       0.91      0.91      0.91       309



In [121]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 30, 40, 50],
    'max_features': ['auto'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [200, 300, 400, 1000]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           cv = 5, n_jobs=-1, verbose=2)

In [122]:
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


{'bootstrap': True,
 'max_depth': 40,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 200}

In [123]:
best_grid = grid_search.best_estimator_

grid_predictions = best_grid.predict(X_test)

cm = confusion_matrix(y_test, grid_predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, grid_predictions)

In [124]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,195,11
Actual 1,18,85


0.9061488673139159
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       206
           1       0.89      0.83      0.85       103

    accuracy                           0.91       309
   macro avg       0.90      0.89      0.89       309
weighted avg       0.91      0.91      0.91       309



In [125]:
importances = best_grid.feature_importances_

importances_sorted = sorted(zip(best_grid.feature_importances_, X.columns), reverse = True)
importances_df = pd.DataFrame(importances_sorted)
importances_df

Unnamed: 0,0,1
0,0.438574,heart_disease
1,0.209631,heart_failure
2,0.16478,stroke
3,0.124223,angina_pectoris
4,0.024169,gallbladder
5,0.02079,thyroid
6,0.010438,asthma
7,0.007396,liver_condition


Down sample - bigger size

In [126]:
heart_attack = combined_df[combined_df['heart_attack'] == 1]
non_heart_attack  = combined_df[combined_df['heart_attack'] == 0]

In [127]:
non_heart_attack = non_heart_attack.sample(n=1236,random_state = 444)
non_heart_attack.shape

(1236, 11)

In [128]:
down_sampled_df = pd.concat([non_heart_attack, heart_attack])
down_sampled_df.head()
down_sampled_df.shape

(1648, 11)

In [130]:
X = down_sampled_df.copy()
X.drop(['heart_attack', 'id', 'Unnamed: 0'], axis = 1, inplace = True)
X.head()

Unnamed: 0,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,gallbladder
1092,1,0,0,1,0,1,0,0
5795,0,0,0,0,0,0,0,1
1195,0,0,0,0,0,0,1,0
3793,0,0,0,0,0,0,0,0
2507,0,0,0,0,0,0,0,0


In [131]:
y = down_sampled_df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444, stratify = y)

In [133]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# creating random grid
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

In [134]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid,
                               n_iter= 100, cv = 5, verbose = 2, random_state = 444, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [135]:
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [136]:
best_random = rf_random.best_estimator_
predictions = best_random.predict(X_test)

In [137]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [138]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,294,15
Actual 1,38,65


0.8713592233009708
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       309
           1       0.81      0.63      0.71       103

    accuracy                           0.87       412
   macro avg       0.85      0.79      0.81       412
weighted avg       0.87      0.87      0.87       412



In [139]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [False],
    'max_depth': [10, 20, 30, 40],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [1, 2, 3],
    'n_estimators': [1000, 2000, 2100, 1900]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           cv = 5, n_jobs=-1, verbose=2)

In [141]:
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(


{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1900}

In [142]:
best_grid = grid_search.best_estimator_

grid_predictions = best_grid.predict(X_test)

cm = confusion_matrix(y_test, grid_predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, grid_predictions)

In [143]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, grid_predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,294,15
Actual 1,38,65


0.8713592233009708
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       309
           1       0.81      0.63      0.71       103

    accuracy                           0.87       412
   macro avg       0.85      0.79      0.81       412
weighted avg       0.87      0.87      0.87       412



In [144]:
importances = best_grid.feature_importances_

importances_sorted = sorted(zip(best_grid.feature_importances_, X.columns), reverse = True)
importances_df = pd.DataFrame(importances_sorted)
importances_df

Unnamed: 0,0,1
0,0.437494,heart_disease
1,0.231948,heart_failure
2,0.122702,angina_pectoris
3,0.115698,stroke
4,0.02873,gallbladder
5,0.026161,thyroid
6,0.023554,asthma
7,0.013712,liver_condition


Applying Model to Whole Data to check for biases

In [146]:
X = combined_df.copy()
X.drop(['heart_attack', 'id', 'Unnamed: 0'], axis = 1, inplace = True)
X.head()

Unnamed: 0,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,gallbladder
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1


In [147]:
y = combined_df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [148]:
total_predictions = best_grid.predict(X)

In [149]:
cm = confusion_matrix(y, total_predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y, total_predictions)

In [150]:
display(cm_df)
print(acc_score)
print(classification_report(y, total_predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8283,393
Actual 1,123,289


0.9432218309859155
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      8676
           1       0.42      0.70      0.53       412

    accuracy                           0.94      9088
   macro avg       0.70      0.83      0.75      9088
weighted avg       0.96      0.94      0.95      9088



No gallbladder

In [12]:
combined_df.head()

Unnamed: 0.1,Unnamed: 0,id,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,gallbladder,heart_attack
0,3,109266.0,0,0,0,0,0,0,0,0,0
1,4,109267.0,0,0,0,0,0,0,0,0,0
2,8,109271.0,1,0,0,0,0,0,0,0,0
3,9,109273.0,0,0,0,0,0,0,0,0,0
4,10,109274.0,0,0,0,0,0,0,0,1,0


In [13]:
heart_attack = combined_df[combined_df['heart_attack'] == 1]
non_heart_attack  = combined_df[combined_df['heart_attack'] == 0]

In [14]:
non_heart_attack.drop('gallbladder', axis = 1, inplace = True)
non_heart_attack = non_heart_attack.sample(n=1236,random_state = 444)
non_heart_attack.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_heart_attack.drop('gallbladder', axis = 1, inplace = True)


(1236, 10)

In [15]:
heart_attack.drop('gallbladder', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_attack.drop('gallbladder', axis = 1, inplace = True)


In [17]:
down_sampled_df = pd.concat([non_heart_attack, heart_attack])
down_sampled_df.head()
down_sampled_df.shape

(1648, 10)

In [18]:
X = down_sampled_df.copy()
X.drop(['heart_attack', 'id', 'Unnamed: 0'], axis = 1, inplace = True)
X.head()

Unnamed: 0,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition
1092,1,0,0,1,0,1,0
5795,0,0,0,0,0,0,0
1195,0,0,0,0,0,0,1
3793,0,0,0,0,0,0,0
2507,0,0,0,0,0,0,0


In [19]:
y = down_sampled_df['heart_attack'].values.reshape(-1,1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=444, stratify = y)

In [21]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# creating random grid
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

In [22]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid,
                               n_iter= 100, cv = 5, verbose = 2, random_state = 444, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


In [23]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 30,
 'bootstrap': False}

In [24]:
best_random = rf_random.best_estimator_
predictions = best_random.predict(X_test)

In [25]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [26]:
display(cm_df)
print(acc_score)
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,296,13
Actual 1,36,67


0.8810679611650486
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       309
           1       0.84      0.65      0.73       103

    accuracy                           0.88       412
   macro avg       0.86      0.80      0.83       412
weighted avg       0.88      0.88      0.88       412



Testing on 2015-2016 Data 

In [152]:
med_conditions_2015_2016 = pd.read_sas('/content/MCQ_I_2015_2016.XPT', format = 'xport')
med_conditions_2015_2016.head()

Unnamed: 0,SEQN,MCQ010,MCQ025,MCQ035,MCQ040,MCQ050,AGQ030,MCQ053,MCQ080,MCQ092,...,MCQ300C,MCQ365A,MCQ365B,MCQ365C,MCQ365D,MCQ370A,MCQ370B,MCQ370C,MCQ370D,OSQ230
0,83732.0,2.0,,,,,,2.0,1.0,2.0,...,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0
1,83733.0,2.0,,,,,,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,83734.0,1.0,60.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,...,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
3,83735.0,2.0,,,,,,2.0,1.0,2.0,...,9.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0
4,83736.0,1.0,10.0,2.0,,,,2.0,2.0,2.0,...,9.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [155]:
med_columns = ['SEQN', 'MCQ010', 'MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160F',
           'MCQ160M', 'MCQ160L', 'MCQ160E']

In [156]:
mc_df = med_conditions_2015_2016[med_columns]
mc_df.head()

Unnamed: 0,SEQN,MCQ010,MCQ160B,MCQ160C,MCQ160D,MCQ160F,MCQ160M,MCQ160L,MCQ160E
0,83732.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
1,83733.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,83734.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0
3,83735.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,83736.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [157]:
mc_df = mc_df.rename(columns = {'SEQN': 'id', 'MCQ010': 'asthma', 'MCQ160B': 'heart_failure',
                                 'MCQ160C': 'heart_disease', 'MCQ160D': 'angina_pectoris', 
                                 'MCQ160F': 'stroke', 'MCQ160M': 'thyroid', 'MCQ160L': 'liver_condition', 
                                 'MCQ560': 'gallbladder', 'MCQ160E': 'heart_attack'})
mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,heart_attack
0,83732.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
1,83733.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,83734.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0
3,83735.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,83736.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [158]:
mc_df = mc_df.dropna()
mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,heart_attack
0,83732.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
1,83733.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,83734.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0
3,83735.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,83736.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [159]:
mc_df.shape

(5719, 9)

In [161]:
mc_df = mc_df[ (mc_df['asthma']!=9) & (mc_df['heart_failure']!=9) & (mc_df['heart_disease']!=9) & (mc_df['angina_pectoris']!=9) & (mc_df['heart_attack']!=9) & (mc_df['stroke']!=9) & (mc_df['thyroid']!=9) & (mc_df['liver_condition']!=9) ]

In [163]:
mc_df = mc_df[ (mc_df['asthma']!=7) & (mc_df['heart_failure']!=7) & (mc_df['heart_disease']!=7) & (mc_df['angina_pectoris']!=7) & (mc_df['heart_attack']!=7) & (mc_df['stroke']!=7) & (mc_df['thyroid']!=7) & (mc_df['liver_condition']!=7) ]

In [164]:
mc_df.shape

(5641, 9)

In [166]:
mc_df['asthma'] = np.where(mc_df['asthma'] == 1, 1, 0)
mc_df['heart_failure'] = np.where(mc_df['heart_failure'] == 1, 1, 0)
mc_df['heart_disease'] = np.where(mc_df['heart_disease'] == 1, 1, 0)
mc_df['angina_pectoris'] = np.where(mc_df['angina_pectoris'] == 1, 1, 0)
mc_df['stroke'] = np.where(mc_df['stroke'] == 1, 1, 0)
mc_df['thyroid'] = np.where(mc_df['thyroid'] == 1, 1, 0)
mc_df['liver_condition'] = np.where(mc_df['liver_condition'] == 1, 1, 0)
#mc_df['gallbladder'] = np.where(mc_df['gallbladder'] == 1, 1, 0)
mc_df['heart_attack'] = np.where(mc_df['heart_attack'] == 1, 1, 0)
mc_df.head()

Unnamed: 0,id,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition,heart_attack
0,83732.0,0,0,0,0,0,0,0,0
1,83733.0,0,0,0,0,0,0,0,0
2,83734.0,1,0,0,0,0,1,1,1
3,83735.0,0,0,0,0,0,0,0,0
4,83736.0,1,0,0,0,0,0,0,0


In [167]:
X = mc_df.copy()
X.drop(['heart_attack', 'id'], axis = 1, inplace = True)
X.head()

Unnamed: 0,asthma,heart_failure,heart_disease,angina_pectoris,stroke,thyroid,liver_condition
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,1,0,0,0,0,1,1
3,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [168]:
y = mc_df['heart_attack'].values.reshape(-1,1)

In [169]:
predictions_1516 = best_grid.predict(X)

ValueError: ignored

In [None]:
cm = confusion_matrix(y, predictions_1516)
cm_df = pd.DataFrame(cm, 
                     index = ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y, predictions_1516)

In [None]:
display(cm_df)
print(acc_score)
print(classification_report(y, total_predictions))