In [80]:
import pandas as pd

df = pd.read_csv("datasets/credit_01/train.gz", compression="gzip")

df.head(2)

Unnamed: 0,REF_DATE,TARGET,VAR2,IDADE,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,...,VAR141,VAR142,VAR143,VAR144,VAR145,VAR146,VAR147,VAR148,VAR149,ID
0,2017-06-01 00:00:00+00:00,0,M,34.137,,RO,-8.808779,-63.87847,D,E,...,2680.289259,D,,,,,102,EMAIL INEXISTENTE#@#NOME INEXISTENTE#@#CEP INE...,2.6.1,181755
1,2017-08-18 00:00:00+00:00,0,M,40.447,,PB,-7.146537,-34.92608,E,E,...,1777.725469,E,,,,,102,EMAIL INEXISTENTE#@#NOME INEXISTENTE#@#CEP INE...,2.6.1,287633


In [92]:
# convertendo colunas object pra category e análises básicas
df = df.astype({col: "category" for col in df.select_dtypes("object").columns})

df.info()

print()

print(df['TARGET'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120750 entries, 0 to 120749
Columns: 151 entries, REF_DATE to ID
dtypes: category(114), float64(34), int64(3)
memory usage: 47.4 MB

TARGET
0    91163
1    29587
Name: count, dtype: int64


In [82]:
# aplicando Oversample na classe minoritária

from sklearn.utils import resample

def oversample_target(df, target_col='TARGET'):
    # Separa as classes
    classe_majoritaria = df[df[target_col] == df[target_col].value_counts().idxmax()]
    classe_minoritaria = df[df[target_col] != df[target_col].value_counts().idxmax()]
    
    classe_minoritaria_upsampled = resample(
        classe_minoritaria,
        replace=True,
        n_samples=len(classe_majoritaria),
        random_state=42
    )
    
    # Junta as duas classes
    df_balanceado = pd.concat([classe_majoritaria, classe_minoritaria_upsampled])
    return df_balanceado.sample(frac=1, random_state=42).reset_index(drop=True)

# Exemplo de uso:
df_balanceado = oversample_target(df)
print(df_balanceado['TARGET'].value_counts())

TARGET
0    91163
1    91163
Name: count, dtype: int64


In [83]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df_balanceado.drop(columns=['TARGET'])
y = df_balanceado['TARGET']

# Convertendo variáveis categóricas para numéricas
X = pd.get_dummies(X, drop_first=True)

# Preenchendo valores nulos
X = X.fillna(0)

# divindindo entre treino e teste

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Treinando o modelo
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [84]:
# fazendo grid_search pra encontrar melhores parâmetros
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [10, 15],
    'min_samples_split': [2, 5],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Melhores parâmetros: {grid_search.best_params_}")

Melhores parâmetros: {'max_depth': 15, 'min_samples_split': 2, 'n_estimators': 150}


In [85]:
# validação com curva auc 
from sklearn.metrics import roc_auc_score

y_pred_train = grid_search.predict_proba(X_train)[:, 1]
y_pred_test = grid_search.predict_proba(X_test)[:, 1]

roc_auc_train = roc_auc_score(y_train, y_pred_train)
roc_auc_test = roc_auc_score(y_test, y_pred_test)

print(f"Curva AUC (treino): {roc_auc_train}")
print(f"Curva AUC (teste): {roc_auc_test}")

Curva AUC (treino): 0.8812988010663865
Curva AUC (teste): 0.8081075554740872


In [86]:
#classification report
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92     18233
           1       0.92      0.93      0.93     18233

    accuracy                           0.92     36466
   macro avg       0.92      0.92      0.92     36466
weighted avg       0.92      0.92      0.92     36466



In [87]:
# exportando modelo
import pickle

with open("monitoring/my_model.pkl", "wb") as f:
    pickle.dump(model, f)