In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, fbeta_score, make_scorer
import matplotlib.pyplot as plt
import sys
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
import time

# Project imports

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
sys.path.append('drive/MyDrive/TFG/implementations/machine_learning_tfg')
from src.utils.model_metrics_generator import ModelMetricsGenerator
from src.utils.cross_validation_utils import CrossValidationMetricsResultPrinter
from src.utils.my_metrics import accuracy_precision_recall_specifity_f2_score

# Load data

In [29]:
input_data = pd.read_excel('/content/drive/MyDrive/TFG/implementations/machine_learning_tfg/data/prepared/prepared_ICU_Prediction.xlsx')
#input_data = pd.read_excel('./../data/prepared/prepared_ICU_Prediction.xlsx')
ground_truth = input_data['ICU']
sample_data = input_data.drop('ICU', axis=1)
train_data, test_data, train_truth, test_truth = train_test_split(sample_data, ground_truth, test_size=0.2, shuffle=True, random_state=42)

# Model implementation

In [31]:
model_metrics_generator = ModelMetricsGenerator(test_data, test_truth)

model = LogisticRegression(n_jobs=-1)
model_metrics_generator.generate_metrics(model, train_data, train_truth)

model_metrics_generator.print_results()


Fit time: 1.1221s.
Predict time: 0.0026s.
Accuracy: 78.18%.
Precision: 73.63%.
Recall: 65.05%.
Specificity: 86.05%.
F2-score: 66.6%.


In [None]:
def logit_hyperparam_grid_search():
  """ Perform Grid search for logit model

  """
  
  #hyperparameter values
  solver = ['liblinear', 'newton-cg','sag', 'lbfgs']
  penalty = ['l2', 'l1']
  C = [1.0, 0.75, 0.5, 0.25]
  class_weight = [{0:1, 1:1}, {0:1, 1:5}, {0:1, 1:10}]
  param_grid = dict(solver=solver,C=C, penalty=penalty, class_weight=class_weight)
  
  #configuring grid serach
  model = LogisticRegression(max_iter=4000)
  sskfold = StratifiedShuffleSplit(n_splits=10, random_state=1)
  grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=sskfold, verbose=1)
  
  #perform
  init_time = time.time()
  grid_result = grid.fit(sample_data, ground_truth)
  time_elapsed = time.time() - init_time

  #results
  print(f'Tiempo de ejecución: {time_elapsed} segundos.')
  print(f'El mejor resultado :{grid_result.best_score_} se consigue con {grid_result.best_params_}')


logit_hyperparam_grid_search()

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 775 tasks      | elapsed: 11.9min


Tiempo de ejecución: 912.1901445388794 segundos.
El mejor resultado :0.8347826086956521 se consigue con {'C': 1.0, 'class_weight': {0: 1, 1: 1}, 'penalty': 'l2', 'solver': 'liblinear'}


[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed: 15.2min finished


## Model implemented using best hypeparameter values obtained with grid search

In [32]:
metrics = accuracy_precision_recall_specifity_f2_score()

model2 = LogisticRegression(solver='liblinear')
sskfold = StratifiedShuffleSplit(random_state=1)
results = cross_validate(model2, sample_data , ground_truth , cv=sskfold, scoring = metrics)

printer = CrossValidationMetricsResultPrinter()
printer.print_metrics_report(results)


Valores medios:
	Fit time: 0.0755s.
	Test time: 0.0082s
	Accuracy: 83.48%.
	Precision: 80.5%.
	Recall: 66.74%.
	Specificity: 91.85%.
	F2 score: 69.06%.


0.0755
0.0082
83.48
80.5
66.74
91.85
69.06
