# 1. Bibliotecas

In [4]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# 2. Carregar Dados

In [5]:
df = pd.read_csv('../data/train.csv')

# 3. Descrição dos Dados

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [6]:
df.drop('Unnamed: 0', axis=1, inplace=True)

## 3.1 Dimensão dos Dados

In [7]:
df.shape

(150000, 11)

## 3.2 Tipos dos Dados

In [8]:
df.dtypes

target                                          int64
TaxaDeUtilizacaoDeLinhasNaoGarantidas         float64
Idade                                           int64
NumeroDeVezes30-59DiasAtrasoNaoPior             int64
TaxaDeEndividamento                           float64
RendaMensal                                   float64
NumeroDeLinhasDeCreditoEEmprestimosAbertos      int64
NumeroDeVezes90DiasAtraso                       int64
NumeroDeEmprestimosOuLinhasImobiliarias         int64
NumeroDeVezes60-89DiasAtrasoNaoPior             int64
NumeroDeDependentes                           float64
dtype: object

## 3.3. Verificar Nulos

In [10]:
df.isna().sum()

target                                            0
TaxaDeUtilizacaoDeLinhasNaoGarantidas             0
Idade                                             0
NumeroDeVezes30-59DiasAtrasoNaoPior               0
TaxaDeEndividamento                               0
RendaMensal                                   29731
NumeroDeLinhasDeCreditoEEmprestimosAbertos        0
NumeroDeVezes90DiasAtraso                         0
NumeroDeEmprestimosOuLinhasImobiliarias           0
NumeroDeVezes60-89DiasAtrasoNaoPior               0
NumeroDeDependentes                            3924
dtype: int64

## 3.4. Estatisticas

In [11]:
df.describe()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


## 3.5. Variavel Alvo

In [12]:
df['target'].value_counts(normalize=True)

target
0    0.93316
1    0.06684
Name: proportion, dtype: float64

# 4. Separa Dados Treino e Validação

In [13]:
X = df.drop('target', axis=1)
y = df['target']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y)

# 5. Tratamento dos Dados

In [23]:
imp_mean = SimpleImputer(strategy='mean')
cols_to_impute = ['RendaMensal', 'NumeroDeDependentes']
imp_mean.fit(X_train[cols_to_impute])

In [24]:
X_train[cols_to_impute] = imp_mean.transform(X_train[cols_to_impute])
X_valid[cols_to_impute] = imp_mean.transform(X_valid[cols_to_impute])

# 6. Selecionar Features

In [27]:
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
# transform training set
X_train_selected = select.transform(X_train)
print(f"X_train.shape: {X_train.shape}")
print(f"X_train_selected.shape: {X_train_selected.shape}")

X_train.shape: (120000, 10)
X_train_selected.shape: (120000, 5)


In [47]:
# valores_selecionados = [X_train.columns[i] for i in range(len(X_train.columns[select.get_support()]))]
# valores_selecionados

X_train.columns[select.get_support()].tolist()

['Idade',
 'NumeroDeVezes30-59DiasAtrasoNaoPior',
 'NumeroDeVezes90DiasAtraso',
 'NumeroDeVezes60-89DiasAtrasoNaoPior',
 'NumeroDeDependentes']

# 7. Treinar o modelo - Baseline

In [49]:
X_valid_selected = select.transform(X_valid)

In [51]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_preds = lr.predict_proba(X_valid)[:, 1]

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
print(f'Score com todas as features: {roc_auc_score(y_valid, y_preds)}')

Score com todas as features: 0.6956519756155245


In [53]:
lr.fit(X_train_selected, y_train)
y_preds = lr.predict_proba(X_valid_selected)[:, 1]

In [54]:
print(f'Score com as features selecionadas: {roc_auc_score(y_valid, y_preds)}')

Score com as features selecionadas: 0.6938185523866705


# 8. Tuning dos Hiperparametros

In [55]:
clf = [
    LogisticRegression(solver='newton-cg', penalty=None, max_iter=1000),
    LogisticRegression(solver='lbfgs', penalty=None, max_iter=1000),
    LogisticRegression(solver='sag', penalty=None, max_iter=1000),
    LogisticRegression(solver='saga', penalty=None, max_iter=1000)
]

clf_columns = []
clf_compare = pd.DataFrame(columns=clf_columns)

row_index = 0
for lrs in clf:
    y_pred = lrs.fit(X_train, y_train).predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_preds)
    clf_name = lrs.__class__.__name__
    clf_compare.loc[row_index, 'Modelo'] = clf_name
    clf_compare.loc[row_index, 'max_iter'] = lrs.max_iter
    clf_compare.loc[row_index, 'solver'] = lrs.solver
    clf_compare.loc[row_index, 'penalty'] = lrs.penalty
    clf_compare.loc[row_index, 'class_weight'] = lrs.class_weight
    clf_compare.loc[row_index, 'AUC'] = auc

    row_index += 1

clf_compare.sort_values(by=['AUC'], ascending=False, inplace=True)
clf_compare

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Modelo,max_iter,solver,penalty,class_weight,AUC
0,LogisticRegression,1000.0,newton-cg,,,0.693819
1,LogisticRegression,1000.0,lbfgs,,,0.693819
2,LogisticRegression,1000.0,sag,,,0.693819
3,LogisticRegression,1000.0,saga,,,0.693819


# 9. Deploy do Modelo

In [56]:
modelo_final = LogisticRegression(solver='lbfgs', penalty=None, max_iter=1000, class_weight=None)
modelo_final.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
pickle.dump(modelo_final, open('modelo_final.pkl', 'wb'))