# Hyperparameter tuning

In [11]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score

from pprint import pprint


In [2]:
pd.set_option('display.max_rows', None, 'display.max_columns', None)

df = pd.read_csv('../data/results/results_basemodels.csv')

In [3]:
df.sort_values(by=['test_matthews_corrcoef', 'test_recall'], ascending=False).set_index(keys=['data type', 'model'])

Unnamed: 0_level_0,Unnamed: 1_level_0,test_balanced_accuracy,test_precision,test_recall,test_roc_auc,test_accuracy,test_matthews_corrcoef
data type,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
no corr -- simple,rf,0.869643,0.923711,1.0,0.955115,0.93629,0.82403
no corr -- smote,rf,0.885714,0.943473,0.95,0.956619,0.917339,0.776762
no corr -- simple,knn,0.869281,0.935386,0.949275,0.963781,0.910282,0.75708
raw data -- smote,grad_b,0.875595,0.943532,0.933333,0.948059,0.904032,0.753823
pca -- smote,rf,0.864881,0.934909,0.933333,0.960902,0.897782,0.737143
pca transformed -- simple,knn,0.864519,0.935231,0.932609,0.968255,0.897581,0.736737
raw data -- simple,knn,0.863509,0.93388,0.941304,0.959103,0.903831,0.736027
raw data -- smote,knn,0.898913,0.981304,0.847826,0.94922,0.871573,0.724394
raw data -- smote,rf,0.872852,0.949532,0.899275,0.938571,0.884677,0.721077
pca transformed -- simple,grad_b,0.81131,0.894389,0.983333,0.91591,0.897581,0.715631


From all the results of the basemodels, we can see that the `Random Forest` without correlated data performs the best. It has a recall of `1.0`, which is very important, as we do not want to have false negatives. The `Random Forest` model also has the highest quality (MCC).

Unfortunately due to stochastic nature, performance can very. It is possible to set the random_state variable. However, this is not adviced.

## Tuning
Lets try to tune the model even more.

In [7]:
X_train, y_train = pd.read_pickle('../data/X_train.pkl'), pd.read_pickle('../data/y_train.pkl')
X_test, y_test = pd.read_pickle('../data/X_test.pkl'), pd.read_pickle('../data/y_test.pkl')

In [8]:
# get the data without correlation
no_corr_cols = ['MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'RPDE', 'DFA', 'spread2','D2', 'PPE']
X_no_corr_train = X_train[no_corr_cols]
X_no_corr_test = X_test[no_corr_cols]

In [14]:
# Create the baseline model
rf = make_pipeline(StandardScaler(), RandomForestClassifier())

cv = StratifiedKFold(n_splits=5)

# somehow matthews_corrcoef is not valid and it seems jupyter notebook uses another version python version?
scoring = ['balanced_accuracy', 'precision', 'recall', 'roc_auc', 'accuracy']

scores = cross_validate(rf, X_no_corr_train, y_train, cv=cv, scoring=scoring)

In [17]:
for test, score in scores.items():
    print(f'{test}:\t\t\t{score.mean():.3f}')

fit_time:			0.111
score_time:			0.022
test_balanced_accuracy:			0.840
test_precision:			0.908
test_recall:			0.992
test_roc_auc:			0.954
test_accuracy:			0.917


In [None]:
# Another test
rf.fit(X_no_corr_train, y_train)

In [29]:
y_pred = rf.predict(X_no_corr_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 8  2]
 [ 1 28]]
              precision    recall  f1-score   support

           0       0.89      0.80      0.84        10
           1       0.93      0.97      0.95        29

    accuracy                           0.92        39
   macro avg       0.91      0.88      0.90        39
weighted avg       0.92      0.92      0.92        39



In [32]:
pprint(rf.get_params())

{'memory': None,
 'randomforestclassifier': RandomForestClassifier(),
 'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__ccp_alpha': 0.0,
 'randomforestclassifier__class_weight': None,
 'randomforestclassifier__criterion': 'gini',
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__max_features': 'auto',
 'randomforestclassifier__max_leaf_nodes': None,
 'randomforestclassifier__max_samples': None,
 'randomforestclassifier__min_impurity_decrease': 0.0,
 'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__min_weight_fraction_leaf': 0.0,
 'randomforestclassifier__n_estimators': 100,
 'randomforestclassifier__n_jobs': None,
 'randomforestclassifier__oob_score': False,
 'randomforestclassifier__random_state': None,
 'randomforestclassifier__verbose': 0,
 'randomforestclassifier__warm_start': False,
 'standardscaler': StandardScaler(),
 'standardscaler__copy': True,
 'standardscaler__wi

## Creating a random grid for random search
It is import to start with random search, to narrow down posible parameters for grid search.

In [41]:
n_estimators = np.arange(0, 2000, 100)
max_features = ['sqrt', 'log2', None]
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['sqrt', 'log2', None],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': array([   0,  100,  200,  300,  400,  500,  600,  700,  800,  900, 1000,
       1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900])}


In [42]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, scoring='recall', n_iter=100, cv=5, verbose=1, n_jobs=-1)

In [43]:
rf_random.fit(X_no_corr_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


25 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/commons/conda/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/commons/conda/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 383, in fit
    self._validate_estimator()
  File "/commons/conda/lib/python3.8/site-packages/sklearn/ensemble/_base.py", line 138, in _validate_estimator
    raise ValueError(
ValueError: n_estimators must be greater than zero, got 0.

 0.99166667 0.99166667 0.99166667        nan 0.99166667 0.88188406
 0.96630435 0.88188406 0.

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['sqrt', 'log2', None],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': array([   0,  100,  200,  300,  400,  500,  600,  700,  800,  900, 1000,
       1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900])},
                   scoring='recall', verbose=1)

In [44]:
rf_random.best_params_

{'n_estimators': 1700,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 30,
 'bootstrap': False}

In [4]:
import pickle
loaded_model = pickle.load(open('../data/model_saves/best_model.sav', 'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [45]:
best_random = rf_random.best_estimator_
best_random_ypred = best_random.predict(X_no_corr_test)

print(confusion_matrix(y_test, best_random_ypred))
print(classification_report(y_test, best_random_ypred))

[[ 8  2]
 [ 0 29]]
              precision    recall  f1-score   support

           0       1.00      0.80      0.89        10
           1       0.94      1.00      0.97        29

    accuracy                           0.95        39
   macro avg       0.97      0.90      0.93        39
weighted avg       0.95      0.95      0.95        39



In [10]:
roc_auc_score(y_test, best_random_ypred)

0.9


In [12]:
recall_score(y_test, best_random_ypred)

1.0

In [49]:
accuracy_score(y_test, best_random_ypred)

0.9487179487179487

## Conclusion
The results are great and there is no need to further optimize it for now. healthy patients (0) are not always classified correctly. However, patients with Parkinson's disease are not classified as false negatives. Therefore it is a great score. The accuracy is around 0.95 which is great and the goal is accomplished (recall and accuracy > 0.9). Using an ensemble method clearly improves the perofrmance.

In [46]:
rf_random.best_score_

1.0