# 4 - Modeling

In [74]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [75]:
import pandas as pd
import numpy as np

In [76]:
train = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Datarisk/data/processed_train.csv', low_memory=False)

In [77]:
train

Unnamed: 0,ID_CLIENTE,SAFRA_REF,VALOR_A_PAGAR,TAXA,FLAG_PF,SEGMENTO_INDUSTRIAL,DOMINIO_EMAIL,PORTE,CEP_2_DIG,RENDA_MES_ANTERIOR,...,DIASEMANA_EMISSAO,DIASEMANA_PAGAMENTO,DIASEMANA_VENCIMENTO,DIASEMANA_CADASTRO,DIASEMANA_SAFRA_REF,ANO_EMISSAO_DOCUMENTO,ANO_PAGAMENTO,ANO_VENCIMENTO,ANO_CADASTRO,ANO_SAFRA_REF
0,1661240395903230676,2018-09-01,35516.41,6.99,PJ,Serviços,YAHOO,PEQUENO,65,290074.138889,...,Friday,Thursday,Thursday,Thursday,Saturday,2018,2018,2018,2013,2018
1,1661240395903230676,2018-09-01,17758.21,6.99,PJ,Serviços,YAHOO,PEQUENO,65,290074.138889,...,Sunday,Tuesday,Monday,Thursday,Saturday,2018,2018,2018,2013,2018
2,1661240395903230676,2018-09-01,17431.96,6.99,PJ,Serviços,YAHOO,PEQUENO,65,290074.138889,...,Sunday,Tuesday,Monday,Thursday,Saturday,2018,2018,2018,2013,2018
3,1661240395903230676,2018-09-01,1341.00,6.99,PJ,Serviços,YAHOO,PEQUENO,65,290074.138889,...,Thursday,Thursday,Friday,Thursday,Saturday,2018,2018,2018,2013,2018
4,1661240395903230676,2018-09-01,21309.85,6.99,PJ,Serviços,YAHOO,PEQUENO,65,290074.138889,...,Friday,Thursday,Thursday,Thursday,Saturday,2018,2018,2018,2013,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89546,3431426889924624821,2021-11-01,56879.10,6.99,PJ,Serviços,HOTMAIL,MEDIO,69,227342.000000,...,Friday,Monday,Monday,Thursday,Monday,2021,2021,2021,2020,2021
89547,5288503299611498087,2021-11-01,156725.15,5.99,PJ,Comércio,YAHOO,PEQUENO,13,352642.000000,...,Sunday,Tuesday,Wednesday,Tuesday,Monday,2021,2021,2021,2020,2021
89548,957773253650890560,2021-11-01,266.08,5.99,PJ,Comércio,GMAIL,MEDIO,20,433808.000000,...,Sunday,Monday,Monday,Monday,Monday,2021,2021,2021,2021,2021
89549,6094038865287329652,2021-11-01,301.49,8.99,PJ,Serviços,GMAIL,GRANDE,48,532236.000000,...,Sunday,Monday,Monday,Monday,Monday,2021,2021,2021,2021,2021


In [78]:
# Criando X e y sem variáveis que possuem relação direta com DATA_PAGAMENTO
X = train.drop(columns={'CEP_2_DIG','VALOR_A_PAGAR','INADIMPLENTE', 'PAGTO_DIFF_EMISSAO','PAGTO_DIFF_VENC','DIASEMANA_PAGAMENTO','ANO_PAGAMENTO', 'MES_PAGAMENTO'})
y = train['INADIMPLENTE']

In [79]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89551 entries, 0 to 89550
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID_CLIENTE             89551 non-null  int64  
 1   SAFRA_REF              89551 non-null  object 
 2   TAXA                   89551 non-null  float64
 3   FLAG_PF                89551 non-null  object 
 4   SEGMENTO_INDUSTRIAL    89551 non-null  object 
 5   DOMINIO_EMAIL          89551 non-null  object 
 6   PORTE                  89551 non-null  object 
 7   RENDA_MES_ANTERIOR     89551 non-null  float64
 8   NO_FUNCIONARIOS        89551 non-null  float64
 9   PRAZO                  89551 non-null  int64  
 10  ZONA_POSTAL            89551 non-null  object 
 11  LIFETIME_CLIENTE_DIAS  89551 non-null  int64  
 12  MES_EMISSAO_DOCUMENTO  89551 non-null  int64  
 13  MES_VENCIMENTO         89551 non-null  int64  
 14  MES_CADASTRO           89551 non-null  int64  
 15  ME

## 4.1 Criando Pipeline para Feature Importance

In [80]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler

numeric_values = list(X.select_dtypes(exclude=['object']).columns)
string_values = list(X.select_dtypes(include=['object']).columns)

preprocessor = ColumnTransformer(transformers=[  
    ('cat', OrdinalEncoder(), string_values),
    ('num', MinMaxScaler(), numeric_values)
])


## 4.2 Feature importance

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

rfc = RandomForestClassifier(random_state=22)

pipe_fi = Pipeline(steps=[('preprocessor', preprocessor), ('clf', rfc)])
pipe_fi.fit(X, y)

feature_importances = rfc.feature_importances_

# Criando lista de features para printar
feature_names = X.columns.tolist()

# Pareando nome de features com importância e fazendo sort
sorted_features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

# Printando features e sua importância
for name, importance in sorted_features:
    print(f"{name}: {importance}")


MES_SAFRA_REF: 0.12640064810610652
MES_CADASTRO: 0.12300306923596732
MES_EMISSAO_DOCUMENTO: 0.10655348543340219
MES_VENCIMENTO: 0.059956547734632416
ZONA_POSTAL: 0.05422810642040342
ID_CLIENTE: 0.04747058581975358
RENDA_MES_ANTERIOR: 0.04440498223736957
DIASEMANA_EMISSAO: 0.041610477655126626
PORTE: 0.041235320011547616
DIASEMANA_VENCIMENTO: 0.04081470697242074
DOMINIO_EMAIL: 0.03476092255920726
DIASEMANA_CADASTRO: 0.03337888290408662
DIASEMANA_SAFRA_REF: 0.032175568056838716
ANO_CADASTRO: 0.02866112538612856
LIFETIME_CLIENTE_DIAS: 0.02723124307430479
PRAZO: 0.02713492501037245
SEGMENTO_INDUSTRIAL: 0.026764482773723154
FLAG_PF: 0.02291030754746226
NO_FUNCIONARIOS: 0.019949548261289238
TAXA: 0.0164609316595489
ANO_VENCIMENTO: 0.015556080824866559
ANO_SAFRA_REF: 0.014397846713930785
ANO_EMISSAO_DOCUMENTO: 0.014254597067137338
SAFRA_REF: 0.0006856085343733751


## 4.3 Cross-Validation de modelos

Realizarei um loop em diversos modelos e fazendo over-sampling para comparar sua performance também com a aplicação de dois scalers: MinMaxScaler e StandardScaler para identificar qual se adequa melhor.

In [82]:
pip install category-encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [83]:
pip install xgboost --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [84]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, log_loss
from xgboost import XGBClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier
)
from imblearn.over_sampling import SMOTE
from category_encoders import TargetEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [85]:
rs = 22

cl_models = [
    QuadraticDiscriminantAnalysis(),
    GaussianNB(),
    XGBClassifier(random_state=rs),
    RandomForestClassifier(random_state=rs),
    GradientBoostingClassifier(random_state=rs),
    ExtraTreesClassifier(random_state=rs),
]

scalers = [
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler()
]


In [87]:
for model in cl_models:
  for scaler in scalers:
    pipe = make_pipeline(
        TargetEncoder(),
        scaler,
        SMOTE(n_jobs=-1, random_state=rs),
        model
    )

    '''5-Fold Cross-Validation usando precision como 
    métrica de comparação pois o dataset é desbalanceado'''

    skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

    scores = cross_val_score(pipe, X, y, cv=skf, scoring='precision', n_jobs=-1)

    # Scores
    print(f'Model: {model}. Scaler: {scaler}')
    print('Cross-validation scores:', scores)
    print('Mean cross-validation score:', scores.mean())
    print('-----' * 10)

Model: QuadraticDiscriminantAnalysis(). Scaler: StandardScaler()
Cross-validation scores: [0.27975048 0.27407076 0.27117031 0.28781413 0.27316141]
Mean cross-validation score: 0.27719341882889176
--------------------------------------------------
Model: QuadraticDiscriminantAnalysis(). Scaler: MinMaxScaler()
Cross-validation scores: [0.28187919 0.27592345 0.27398568 0.29140959 0.27767176]
Mean cross-validation score: 0.28017393422623826
--------------------------------------------------
Model: QuadraticDiscriminantAnalysis(). Scaler: RobustScaler()
Cross-validation scores: [0.27459207 0.27018769 0.2674954  0.2854512  0.27158189]
Mean cross-validation score: 0.27386165061978485
--------------------------------------------------
Model: GaussianNB(). Scaler: StandardScaler()
Cross-validation scores: [0.23350923 0.21917263 0.20468068 0.22612245 0.22061329]
Mean cross-validation score: 0.22081965744708146
--------------------------------------------------
Model: GaussianNB(). Scaler: MinMax

O modelo que performa melhor no dataframe inteiro usando cross-validation é o  RandomForestClassifier com StandardScaler (Precisão de 72,4%)

## 4.4 Hyperparameter Fine-Tuning de melhor modelo

In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=rs)

In [89]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Hyperparameters (Como leva muito tempo, coloquei apenas 1 com pequeno range)
param_grid = {
    'randomforestclassifier__min_samples_leaf': range(3,7)
}

rfc = RandomForestClassifier(random_state=rs, n_jobs=-1)

# Novo pipeline
pipeRFC = make_pipeline(
            TargetEncoder(),
            StandardScaler(),
            rfc
        )

# GridSearch
grid_search = GridSearchCV(pipeRFC, param_grid, cv=skf, scoring='precision', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

# Printando melhores hyperparameters
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best hyperparameters: {'randomforestclassifier__min_samples_leaf': 5}
Best score: 0.8232178644395598


In [90]:
# Fit do pipeline com melhores hyperparameters
best_pipeRFC = grid_search.best_estimator_
best_pipeRFC.fit(X_train, y_train)
y_pred = best_pipeRFC.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     25100
           1       0.82      0.43      0.56      1766

    accuracy                           0.96     26866
   macro avg       0.89      0.71      0.77     26866
weighted avg       0.95      0.96      0.95     26866



In [91]:
import pickle

# Salvando o modelo em um arquivo pickle
with open('melhor_modelo.pickle', 'wb') as f:
    pickle.dump(best_pipeRFC, f)