<a href="https://colab.research.google.com/github/devluz2023/AnaliseCombinatoraia/blob/main/Projeto_otimiza%C3%A7%C3%A3o_de_modelos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Conhecendo os dados

In [6]:
!git clone https://github.com/devluz2023/classificacao_otimizacao.git

fatal: destination path 'classificacao_otimizacao' already exists and is not an empty directory.


In [7]:

import pandas as pd

In [8]:
# Read the CSV that lives inside the cloned repository folder and preview its first rows.
caminho_csv = 'classificacao_otimizacao/dados_inadimplencia.csv'
dados = pd.read_csv(caminho_csv)
dados.head()

Unnamed: 0,receita_cliente,anuidade_emprestimo,anos_casa_propria,telefone_trab,avaliacao_cidade,score_1,score_2,score_3,score_social,troca_telefone,inadimplente
0,16855.246324,2997.0,12.157324,0,2.0,0.501213,0.003109,0.513171,0.117428,243.0,1
1,13500.0,2776.05,12.157324,0,2.0,0.501213,0.26973,0.513171,0.0979,617.0,0
2,11250.0,2722.188351,12.157324,0,3.0,0.701396,0.518625,0.700184,0.1186,9.0,0
3,27000.0,6750.0,3.0,0,2.0,0.501213,0.649571,0.513171,0.0474,300.0,0
4,22500.0,3097.8,12.157324,0,2.0,0.440744,0.509677,0.513171,0.0144,2913.0,1


In [9]:
# Summarize the DataFrame structure: column names, non-null counts, dtypes and memory usage.
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14578 entries, 0 to 14577
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   receita_cliente      14578 non-null  float64
 1   anuidade_emprestimo  14578 non-null  float64
 2   anos_casa_propria    14578 non-null  float64
 3   telefone_trab        14578 non-null  int64  
 4   avaliacao_cidade     14578 non-null  float64
 5   score_1              14578 non-null  float64
 6   score_2              14578 non-null  float64
 7   score_3              14578 non-null  float64
 8   score_social         14578 non-null  float64
 9   troca_telefone       14578 non-null  float64
 10  inadimplente         14578 non-null  int64  
dtypes: float64(9), int64(2)
memory usage: 1.2 MB


In [10]:
# Share of rows in each 'inadimplente' class, as a percentage rounded to two decimals.
dados['inadimplente'].value_counts(normalize=True).mul(100).round(2)

0    67.65
1    32.35
Name: inadimplente, dtype: float64

In [11]:
# Separate the target column ('inadimplente') from the remaining feature columns.
y = dados['inadimplente']
x = dados.drop(columns='inadimplente')

In [12]:
from sklearn.model_selection import train_test_split

# Seed passed as random_state so the split below is reproducible.
RANDOM_STATE = 42

# Stratified hold-out split: 33% of the rows are reserved for testing.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    x,
    y,
    test_size=0.33,
    stratify=y,
    random_state=RANDOM_STATE,
)

## Construindo os modelos

### Decision tree classifier

In [13]:
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Baseline model: a decision tree limited to depth 3, fitted on the training split.
modelo_dt = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=3)
modelo_dt.fit(X=x_treino, y=y_treino)

In [15]:
from sklearn.metrics import recall_score

# Recall of the baseline tree, measured on the held-out test split.
previsoes_dt = modelo_dt.predict(x_teste)
recall_dt = recall_score(y_true=y_teste, y_pred=previsoes_dt)
print(f"recall do DT = {recall_dt:.3f}")

recall do DT = 0.143


### Regressão logística

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [17]:
# Baseline pipeline: standardize the features, then fit a logistic regression.
escalonador = StandardScaler()
classificador_lr = LogisticRegression()
logistic_pipeline = make_pipeline(escalonador, classificador_lr)
logistic_pipeline.fit(X=x_treino, y=y_treino)

In [18]:
# Recall of the logistic-regression pipeline, measured on the held-out test split.
previsoes_lr = logistic_pipeline.predict(x_teste)
recall_lr = recall_score(y_true=y_teste, y_pred=previsoes_lr)
print(f"recall do LR = {recall_lr:.3f}")

recall do LR = 0.253


## Realizando a busca em grade

### DecisionTreeClassifier

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
import numpy as np

# Candidate values for the decision tree grid search.
profundidades = np.arange(6, 13, 2)       # 6, 8, 10, 12
tamanhos_minimos = np.arange(5, 21, 5)    # 5, 10, 15, 20

param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=profundidades,
    min_samples_split=tamanhos_minimos,
    min_samples_leaf=tamanhos_minimos.copy(),
    max_features=['sqrt', 'log2'],
    splitter=['best', 'random'],
)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified folds with a fixed seed for the cross-validation.
cv = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True)

# Exhaustive search over param_grid_dt, scoring each candidate by cross-validated recall.
arvore_base = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_grid_search = GridSearchCV(
    arvore_base,
    param_grid_dt,
    cv=cv,
    scoring="recall",
    n_jobs=-1,
)

dt_grid_search.fit(x_treino, y_treino)

### Obtendo informações da busca em grade

In [None]:
# Hyperparameter combination selected by the decision tree grid search.
dt_grid_search.best_params_

In [None]:
# Tabulate the cross-validation results of the tree search and preview the first three rows.
df_cv_results_dt = pd.DataFrame.from_dict(dt_grid_search.cv_results_)
df_cv_results_dt.iloc[:3]

In [None]:
# Show the results row of the best candidate as a one-row DataFrame.
# The frame keeps its default 0..n-1 index, so the position equals the label.
df_cv_results_dt.iloc[[dt_grid_search.best_index_]]

In [None]:
import matplotlib.pyplot as plt

# Scatter of each candidate's mean cross-validated recall against its max_depth.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    df_cv_results_dt['param_max_depth'],
    df_cv_results_dt['mean_test_score'],
)
ax.set_title('max_depth vs. mean_test_score')
ax.set_xlabel('max_depth')
ax.set_ylabel('recall')
plt.show()

In [None]:
# Scatter of each candidate's mean cross-validated recall against its min_samples_leaf.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    df_cv_results_dt['param_min_samples_leaf'],
    df_cv_results_dt['mean_test_score'],
)
ax.set_title('min_samples_leaf vs. mean_test_score')
ax.set_xlabel('min_samples_leaf')
ax.set_ylabel('recall')
plt.show()

### Busca em grade para o LogisticRegression

In [None]:
# Candidate values shared by both sub-grids of the logistic regression search.
max_iter = np.arange(100, 301, 50)    # 100, 150, 200, 250, 300
c = [0.001, 0.01, 0.1, 1, 10]

eixos_comuns = {
    'logisticregression__max_iter': max_iter,
    'logisticregression__C': c,
}

# Two sub-grids: newton-cg/lbfgs are searched with the l2 penalty only,
# while liblinear is searched with both l1 and l2.
param_grid_lr = [
    {
        'logisticregression__solver': ['newton-cg', 'lbfgs'],
        'logisticregression__penalty': ['l2'],
        **eixos_comuns,
    },
    {
        'logisticregression__solver': ['liblinear'],
        'logisticregression__penalty': ['l1', 'l2'],
        **eixos_comuns,
    },
]

In [None]:
# Grid search over the scaler + logistic regression pipeline, scored by cross-validated recall.
pipeline_lr = make_pipeline(StandardScaler(), LogisticRegression())
lr_grid_search = GridSearchCV(
    pipeline_lr,
    param_grid_lr,
    cv=cv,
    scoring="recall",
    n_jobs=-1,
)

lr_grid_search.fit(x_treino, y_treino)

In [None]:
# Hyperparameter combination selected by the logistic regression grid search.
lr_grid_search.best_params_

In [None]:
# Tabulate the cross-validation results of the logistic regression search and preview three rows.
df_cv_results_lr = pd.DataFrame.from_dict(lr_grid_search.cv_results_)
df_cv_results_lr.iloc[:3]

In [None]:
# Show the results row of the best candidate as a one-row DataFrame.
# The frame keeps its default 0..n-1 index, so the position equals the label.
df_cv_results_lr.iloc[[lr_grid_search.best_index_]]

In [None]:
# Scatter of each candidate's mean cross-validated recall against its max_iter.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    df_cv_results_lr['param_logisticregression__max_iter'],
    df_cv_results_lr['mean_test_score'],
)
ax.set_title('max_iter vs. mean_test_score')
ax.set_xlabel('max_iter')
ax.set_ylabel('recall')
plt.show()