In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, fbeta_score, make_scorer
import matplotlib.pyplot as plt
import sys
from sklearn.model_selection import  cross_validate, StratifiedShuffleSplit
from sklearn import tree
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from joblib import dump

# Project files

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Add the project root to the import path so the `src` package below resolves.
# NOTE(review): absolute path, presumably a Colab Google Drive mount — adjust when running elsewhere.
sys.path.append('/content/drive/MyDrive/TFG/implementations/machine_learning_tfg/')
# Project-local helpers used further down for hold-out metrics, the
# cross-validation report, and the scoring passed to cross_validate / GridSearchCV.
from src.utils.model_metrics_generator import ModelMetricsGenerator
from src.utils.cross_validation_utils import CrossValidationMetricsResultPrinter
from src.utils.my_metrics import accuracy_precision_recall_specifity_f2_score

# Load data

In [None]:
# Read the prepared dataset from the Drive copy of the project.
# When running outside that environment, read
# './../data/prepared/prepared_ICU_Prediction.xlsx' instead.
input_data = pd.read_excel(
    '/content/drive/MyDrive/TFG/implementations/machine_learning_tfg/data/prepared/prepared_ICU_Prediction.xlsx'
)

# 'ICU' is the target column; every remaining column is kept as a feature.
ground_truth = input_data['ICU']
sample_data = input_data.drop(columns='ICU')

# Shuffled 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified on 'ICU', whereas the
# cross-validation in this notebook uses StratifiedShuffleSplit — confirm
# that the differing class balance between train and test is intended.
train_data, test_data, train_truth, test_truth = train_test_split(
    sample_data,
    ground_truth,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Functions

In [None]:
def random_forest_model(n_estimators=100, max_features='sqrt', random_state=None):
  """Cross-validate a random forest on the training split and report hold-out metrics.

  Relies on the notebook-level ``train_data``, ``train_truth``, ``test_data``
  and ``test_truth`` variables, so the data-loading cell must have run first.
  Both reports are printed; the function returns nothing.

  Args:
    n_estimators: Number of trees passed to ``RandomForestClassifier``.
    max_features: Forwarded unchanged to ``RandomForestClassifier``.
    random_state: Seed forwarded to the forest (``None`` leaves it unseeded).
  """
  forest = RandomForestClassifier(
      n_estimators=n_estimators,
      max_features=max_features,
      random_state=random_state,
  )

  # Cross-validation on the training split. The splitter seed is fixed, so
  # every call to this function is scored on the same folds.
  cv_splitter = StratifiedShuffleSplit(random_state=1)
  cv_results = cross_validate(
      forest,
      train_data,
      train_truth,
      cv=cv_splitter,
      scoring=accuracy_precision_recall_specifity_f2_score(),
      n_jobs=-1,
  )
  CrossValidationMetricsResultPrinter().print_metrics_report(cv_results)

  # Hold-out evaluation: the project helper receives the estimator together
  # with the test labels, is given the training split and the test features,
  # and then prints its metrics report.
  evaluator = ModelMetricsGenerator(forest, test_truth)
  evaluator.fit_and_predict_model(train_data, train_truth, test_data)
  evaluator.print_results()
  

# First approach to random forest models

In [None]:
# Baseline: function defaults (100 trees, max_features='sqrt') with the forest seed fixed to 1.
random_forest_model(random_state=1)


Valores medios:
	Fit time: 0.7688
	Test time: 0.0248
	Accuracy: 91.36
	Precision: 93.83
	Recall: 78.0
	Specificity: 97.6
	F2 score: 80.69


0.7688
0.0248
91.36
93.83
78.0
97.6
80.69

 Indicadores rendimiento:
Fit time: 0.5915
Predict time: 0.0141
Accuracy: 92.73
Precision: 89.52
Recall: 91.26
Specificity: 93.6
F2-score: 90.91


0.5915
0.0141
92.73
89.52
91.26
93.6
90.91


In [None]:
# 500 trees, default max_features, forest seed fixed to 1.
random_forest_model(n_estimators=500, random_state=1)


Valores medios:
	Fit time: 3.8334
	Test time: 0.084
	Accuracy: 92.18
	Precision: 93.2
	Recall: 81.43
	Specificity: 97.2
	F2 score: 83.5


3.8334
0.084
92.18
93.2
81.43
97.2
83.5

 Indicadores rendimiento:
Fit time: 3.0876
Predict time: 0.0606
Accuracy: 93.82
Precision: 93.0
Recall: 90.29
Specificity: 95.93
F2-score: 90.82


3.0876
0.0606
93.82
93.0
90.29
95.93
90.82


In [None]:
# 2000 trees, default max_features, forest seed fixed to 1.
random_forest_model(n_estimators=2000, random_state=1)


Valores medios:
	Fit time: 15.1752
	Test time: 0.3288
	Accuracy: 92.18
	Precision: 93.2
	Recall: 81.43
	Specificity: 97.2
	F2 score: 83.5


15.1752
0.3288
92.18
93.2
81.43
97.2
83.5

 Indicadores rendimiento:
Fit time: 11.5838
Predict time: 0.254
Accuracy: 93.09
Precision: 91.18
Recall: 90.29
Specificity: 94.77
F2-score: 90.47


11.5838
0.254
93.09
91.18
90.29
94.77
90.47


## Grid search

In [None]:
n_estimators = [500, 1000, 1500, 2000, 2500, 3000]
max_features = ["sqrt", "log2"]
#n_estimators = [500]
#max_features = ["sqrt"]
param_grid = dict(n_estimators=n_estimators, max_features=max_features)

rf = RandomForestClassifier()
sskfold = StratifiedShuffleSplit(random_state=1)
scoring = accuracy_precision_recall_specifity_f2_score()
grid = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1, cv=sskfold, scoring=scoring, refit='recall')
grid_result = %time grid.fit(train_data, train_truth)

print(f'El mejor resultado :{grid_result.best_score_} se consigue con {grid_result.best_params_}')

model_metrics_generator = ModelMetricsGenerator(grid_result, test_truth)
model_metrics_generator.predict_model(test_data)
model_metrics_generator.print_results()

CPU times: user 10.3 s, sys: 510 ms, total: 10.8 s
Wall time: 11min 55s
El mejor resultado :0.8114285714285714 se consigue con {'max_features': 'sqrt', 'n_estimators': 1000}

 Indicadores rendimiento:
Fit time: 0
Predict time: 0.1381
Accuracy: 94.18
Precision: 93.07
Recall: 91.26
Specificity: 95.93
F2-score: 91.62


0
0.1381
94.18
93.07
91.26
95.93
91.62


We retrain a model with the best hyperparameters found by the grid search in order to measure its training time.

In [None]:
# Hyperparameters selected by the grid search (1000 trees; max_features stays at its 'sqrt' default), forest seed fixed to 1.
random_forest_model(n_estimators=1000, random_state=1)


Valores medios:
	Fit time: 7.9332
	Test time: 0.1644
	Accuracy: 92.09
	Precision: 93.17
	Recall: 81.14
	Specificity: 97.2
	F2 score: 83.26


7.9332
0.1644
92.09
93.17
81.14
97.2
83.26

 Indicadores rendimiento:
Fit time: 5.9595
Predict time: 0.1279
Accuracy: 93.09
Precision: 91.18
Recall: 90.29
Specificity: 94.77
F2-score: 90.47


5.9595
0.1279
93.09
91.18
90.29
94.77
90.47


## Save model

In [None]:
# Persisting the tuned forest is opt-in: set the flag to True to write the
# best estimator found by the grid search to the project's best_model folder.
save_model = False
if save_model:
  dump(
      grid_result.best_estimator_,
      '/content/drive/MyDrive/TFG/implementations/machine_learning_tfg/notebooks/best_model/random_forest.joblib',
  )