# Otimizando os parâmetros de regressão Logistica

In [1]:
# Importando os módulos
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import cross_validate
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV 

In [2]:
# Importando o dataset
heart_failure = pd.read_csv('heart_failure_clinical_records_dataset.csv')

In [3]:
# fazendo as separação dos atributos e do target
x = heart_failure.drop(['DEATH_EVENT'],axis=1).values
y = heart_failure.DEATH_EVENT.values

# Normalizando os dados
x_std = StandardScaler().fit_transform(x)

In [4]:
# usando o metos undersampling para tornar os dados menos enviesados
nm = RandomUnderSampler(random_state = 10)

x_nm, y_nm = nm.fit_resample(x_std, y)

## Aplicando o gridsearch para otimizar os hiperparametros

In [12]:
parametros = {'penalty':['l1', 'l2', 'elasticnet',None],'dual':[True,False],'fit_intercept':[True,False],
              'max_iter':np.arange(21,31,1),'multi_class':['auto','ovr','multinomial'],}

In [13]:
grid_search = GridSearchCV(estimator=LogisticRegression(),param_grid=parametros)
grid_search.fit(x_nm,y_nm)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_



2100 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
600 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

---------------------------------

In [14]:
print(melhores_parametros)
print(melhor_resultado)

{'dual': False, 'fit_intercept': True, 'max_iter': 21, 'multi_class': 'auto', 'penalty': 'l2'}
0.7280701754385965


de 1 a 10 em max iter temos o seguinte resultado:

{'dual': False, 'fit_intercept': True, 'max_iter': 5, 'multi_class': 'auto', 'penalty': 'l2'}
 com score de 0.7333333333333333

 que foi o melhor em que foi gerado.

In [15]:
nome_metricas = ['accuracy', 'precision_macro', 'recall_macro']

lg = LogisticRegression(dual=False,fit_intercept=True,max_iter=5,multi_class='auto',penalty='l2')

metricas_ran = cross_validate(lg,x_nm, y_nm, cv=7, scoring=nome_metricas)
for met in metricas_ran:
  print(f'-{met}')
  print(f"-- {metricas_ran[met]}")
  media = np.mean(metricas_ran[met])
  desvio = np.std(metricas_ran[met])
  print(f'Média do {met}: {media}')
  print(f'Desvio {desvio}')
  print(f'Intervalo [{(media-(2*desvio)):.3f},{(media+(2*desvio)):.3f}]')
  print('-*-'*20)

-fit_time
-- [0.00714111 0.00447822 0.00398111 0.00284123 0.00264049 0.00260234
 0.00261903]
Média do fit_time: 0.0037576471056256977
Desvio 0.0015462421251022577
Intervalo [0.001,0.007]
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-score_time
-- [0.00313282 0.00287366 0.00191998 0.00181985 0.00168538 0.00177002
 0.0018146 ]
Média do score_time: 0.002145188195364816
Desvio 0.000550857665173927
Intervalo [0.001,0.003]
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-test_accuracy
-- [0.85714286 0.85714286 0.82142857 0.81481481 0.85185185 0.81481481
 0.33333333]
Média do test_accuracy: 0.7643613000755858
Desvio 0.17687942914655935
Intervalo [0.411,1.118]
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-test_precision_macro
-- [0.85714286 0.88888889 0.8368984  0.82954545 0.85833333 0.81666667
 0.27380952]
Média do test_precision_macro: 0.7658978743012356
Desvio 0.20206691046369302
Intervalo [0.362,1.170]
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Podemos comparar e temos que:

- regressão logística *sem otimização*:
  - acuracia: 0,753
  - precisão: 0,758
  - recall: 0,755
- regressão logística *utilizando grid search*:
  - acuracia: 0,764
  - precisão: 0,765
  - recall: 0,766


Podemos ver que nesse caso tivemos uma pequena melhora no desempenho.

## Aplicando o random search na otimização de parametros

In [19]:
parametros2 = {'penalty':['l1', 'l2', 'elasticnet','none'],'dual':[True,False],'fit_intercept':[True,False],
              'max_iter':np.arange(1,101,1),'multi_class':['auto','ovr','multinomial'],}

In [20]:
random_search = RandomizedSearchCV(estimator=LogisticRegression(),param_distributions=parametros2)
random_search.fit(x_nm,y_nm)
melhores_parametros2 = random_search.best_params_
melhor_resultado2 = random_search.best_score_



40 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------------

In [21]:
print(melhores_parametros2)
print(melhor_resultado2)

{'penalty': 'none', 'multi_class': 'ovr', 'max_iter': 52, 'fit_intercept': True, 'dual': False}
0.7333333333333333


In [22]:
lg2 = LogisticRegression(penalty='none',multi_class='ovr',max_iter=52,fit_intercept=True,
                         dual=False,)

metricas_ran2 = cross_validate(lg2,x_nm, y_nm, cv=7, scoring=nome_metricas)
for met in metricas_ran:
  print(f'-{met}')
  print(f"-- {metricas_ran2[met]}")
  media = np.mean(metricas_ran2[met])
  desvio = np.std(metricas_ran2[met])
  print(f'Média do {met}: {media}')
  print(f'Desvio {desvio}')
  print(f'Intervalo [{(media-(2*desvio)):.3f},{(media+(2*desvio)):.3f}]')
  print('-*-'*20)

-fit_time
-- [0.00647926 0.00335574 0.00322914 0.00325394 0.00394559 0.0031538
 0.00396419]
Média do fit_time: 0.003911665507725307
Desvio 0.0010940308351234163
Intervalo [0.002,0.006]
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-score_time
-- [0.00318694 0.00182843 0.00173235 0.00169969 0.00189066 0.00173831
 0.00174189]
Média do score_time: 0.0019740377153669086
Desvio 0.0004989176492378942
Intervalo [0.001,0.003]
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-test_accuracy
-- [0.85714286 0.89285714 0.82142857 0.81481481 0.85185185 0.81481481
 0.2962963 ]
Média do test_accuracy: 0.7641723356009071
Desvio 0.19280137120242297
Intervalo [0.379,1.150]
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-test_precision_macro
-- [0.85714286 0.91176471 0.8368984  0.82954545 0.85833333 0.81666667
 0.24642857]
Média do test_precision_macro: 0.7652542835315943
Desvio 0.21369608241960797
Intervalo [0.338,1.193]
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--

Compararndo os resultados, temos que:

- regressão logística *sem otimização*:
  - acuracia: 0,753
  - precisão: 0,758
  - recall: 0,755
- regressão logística *utilizando grid search*:
  - acuracia: 0,764
  - precisão: 0,765
  - recall: 0,766
- regressão logística *utilizando random search*:
  - acuracia: 0,764
  - precisão: 0,765
  - recall: 0,766


Podemos ver que aplicando a otimização teve uma melhora, entretanto, com o grid search e o random search tiveram praticamente os mesmos resultados.