# Introdução

In [3]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.12.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     ---------------------------------------- 60.4/60.4 kB 1.6 MB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.3.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   -- ------------------------------------- 0.5/10.6 MB 11.3 MB/s eta 0:00:01
   ---- ----------------------------------- 1.3/10.6 MB 13.5 MB/s eta 0:00:01
   -------- ------------------------------- 2.2/10.6 MB 15.3 MB/s eta 0:00:01
   ------------ -------

# Desenvolvimento do Projeto

## 0.0 Carrega Bibliotecas

In [144]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pickle

## 1.0 Carregar os dados

In [4]:
df = pd.read_csv('train.csv')

### Descrição dos dados

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [7]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

### Dimensão dos dados

In [8]:
df.shape

(150000, 11)

### Colunas

In [13]:
print(df.columns)

Index(['target', 'TaxaDeUtilizacaoDeLinhasNaoGarantidas', 'Idade',
       'NumeroDeVezes30-59DiasAtrasoNaoPior', 'TaxaDeEndividamento',
       'RendaMensal', 'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
       'NumeroDeVezes90DiasAtraso', 'NumeroDeEmprestimosOuLinhasImobiliarias',
       'NumeroDeVezes60-89DiasAtrasoNaoPior', 'NumeroDeDependentes'],
      dtype='object')


In [15]:
def to_snake_case(column_name):
    """
    Converte um nome de coluna para snake_case.
    
    Argumentos:
    column_name (str): O nome da coluna a ser convertido.
    
    Retorna:
    str: O nome da coluna convertido para snake_case.
    """
    # Converte letras maiúsculas para minúsculas e adiciona um underscore antes delas
    snake_case_name = re.sub(r'(?<!^)(?=[A-Z])', '_', column_name).lower()
    return snake_case_name

df.rename(columns=lambda x: to_snake_case(x), inplace=True)

In [18]:
print(df.columns)

Index(['target', 'taxa_de_utilizacao_de_linhas_nao_garantidas', 'idade',
       'numero_de_vezes30-59_dias_atraso_nao_pior', 'taxa_de_endividamento',
       'renda_mensal', 'numero_de_linhas_de_credito_e_emprestimos_abertos',
       'numero_de_vezes90_dias_atraso',
       'numero_de_emprestimos_ou_linhas_imobiliarias',
       'numero_de_vezes60-89_dias_atraso_nao_pior', 'numero_de_dependentes'],
      dtype='object')


### Tipos dos dados

In [19]:
df.dtypes

target                                                 int64
taxa_de_utilizacao_de_linhas_nao_garantidas          float64
idade                                                  int64
numero_de_vezes30-59_dias_atraso_nao_pior              int64
taxa_de_endividamento                                float64
renda_mensal                                         float64
numero_de_linhas_de_credito_e_emprestimos_abertos      int64
numero_de_vezes90_dias_atraso                          int64
numero_de_emprestimos_ou_linhas_imobiliarias           int64
numero_de_vezes60-89_dias_atraso_nao_pior              int64
numero_de_dependentes                                float64
dtype: object

### Verificar NaN

In [33]:
print('Quantidade de valores nulos por coluna')
display(df.isna().sum())

Quantidade de valores nulos por coluna


target                                                   0
taxa_de_utilizacao_de_linhas_nao_garantidas              0
idade                                                    0
numero_de_vezes30-59_dias_atraso_nao_pior                0
taxa_de_endividamento                                    0
renda_mensal                                         29731
numero_de_linhas_de_credito_e_emprestimos_abertos        0
numero_de_vezes90_dias_atraso                            0
numero_de_emprestimos_ou_linhas_imobiliarias             0
numero_de_vezes60-89_dias_atraso_nao_pior                0
numero_de_dependentes                                 3924
dtype: int64

In [35]:
print('Porcentagem dos valores nulos na base total (%)')
display(df.isna().sum()/len(df)*100)

Porcentagem dos valores nulos na base total (%)


target                                                0.000000
taxa_de_utilizacao_de_linhas_nao_garantidas           0.000000
idade                                                 0.000000
numero_de_vezes30-59_dias_atraso_nao_pior             0.000000
taxa_de_endividamento                                 0.000000
renda_mensal                                         19.820667
numero_de_linhas_de_credito_e_emprestimos_abertos     0.000000
numero_de_vezes90_dias_atraso                         0.000000
numero_de_emprestimos_ou_linhas_imobiliarias          0.000000
numero_de_vezes60-89_dias_atraso_nao_pior             0.000000
numero_de_dependentes                                 2.616000
dtype: float64

### Estatísticas

In [36]:
df.describe()

Unnamed: 0,target,taxa_de_utilizacao_de_linhas_nao_garantidas,idade,numero_de_vezes30-59_dias_atraso_nao_pior,taxa_de_endividamento,renda_mensal,numero_de_linhas_de_credito_e_emprestimos_abertos,numero_de_vezes90_dias_atraso,numero_de_emprestimos_ou_linhas_imobiliarias,numero_de_vezes60-89_dias_atraso_nao_pior,numero_de_dependentes
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [38]:
df.target.value_counts(normalize=True)

target
0    0.93316
1    0.06684
Name: proportion, dtype: float64

## 2.0 Feature Engineering

In [108]:
df2 = df.copy()

In [109]:
def feature_engineering(dataset:pd.DataFrame) -> pd.DataFrame : 

    # definindo uma faixa para a renda mensal
    dataset['faixa_renda_mensal'] = dataset['renda_mensal'].apply(lambda x: 'não especificado' if np.isnan(x) else
                                                                    '0-1k' if x >= 0 and x < 1000 else 
                                                                    '1k-5k' if x>=1000 and x < 5000 else
                                                                    '5k-20k' if x >= 5000 and x < 20000 else
                                                                    '20k-100k' if x>=20000 and x < 100000 else
                                                                    '100k-1mi' if x>=100000 and x < 1000000 else
                                                                    '1mi+'
                                                                    )

    # definindo faixa para numero de dependentes
    dataset['faixa_numero_dependentes'] = dataset['numero_de_dependentes'].apply(lambda x: 'não especificado' if np.isnan(x) else
                                                                                '0-2' if x>=0 and x<2 else
                                                                                '2-5' if x>=2 and x<5 else
                                                                                '5-10' if x>=5 and x<10 else
                                                                                '10+')
    
    dataset.drop(columns=['renda_mensal','numero_de_dependentes'], inplace = True)
    
    return dataset


In [110]:
df_featured = feature_engineering(df2)

## 3.0 Train Test Split

Separação dos dados em treino e validação

In [111]:
df3 = df_featured.copy()

In [112]:
def train_validation_dataset(dataset: pd.DataFrame)->pd.DataFrame:

    X = dataset.drop('target', axis = 1)
    y = dataset.target

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, stratify=y)

    return X_train, X_valid, y_train, y_valid

In [113]:
X_train, X_valid, y_train, y_valid = train_validation_dataset(df3)

## 4.0 Transformação do dados

In [114]:
def transform_data(X_train: pd.DataFrame, X_valid: pd.DataFrame)->pd.DataFrame:

    X_train['faixa_renda_mensal'] = X_train['faixa_renda_mensal'].map(X_train['faixa_renda_mensal'].value_counts(normalize = True).to_dict())
    X_train['faixa_numero_dependentes'] = X_train['faixa_numero_dependentes'].map(X_train['faixa_numero_dependentes'].value_counts(normalize = True).to_dict())

    X_valid['faixa_renda_mensal'] = X_valid['faixa_renda_mensal'].map(X_valid['faixa_renda_mensal'].value_counts(normalize = True).to_dict())
    X_valid['faixa_numero_dependentes'] = X_valid['faixa_numero_dependentes'].map(X_valid['faixa_numero_dependentes'].value_counts(normalize = True).to_dict())

    return X_train, X_valid

In [115]:
X_train_transformed, X_valid_transformed = transform_data(X_train, X_valid)

## 5.0 Seleção de Features

In [125]:
def feature_selection(X_train_transformed: pd.DataFrame, X_valid_transformed: pd.DataFrame, y_train: pd.Series, y_valid: pd.Series) -> pd.DataFrame:

    select = SelectPercentile(percentile=50)
    select.fit(X_train_transformed, y_train)
    X_train_selected = select.transform(X_train_transformed)
    X_valid_selected = select.transform(X_valid_transformed)

    print(f"X_train.shape {X_train.shape}")
    print(f"X_train_selected.shape {X_train_selected.shape}")

    return X_train_selected, X_valid_selected

In [126]:
X_train_selected, X_valid_selected = feature_selection(X_train_transformed, X_valid_transformed, y_train, y_valid)

X_train.shape (120000, 10)
X_train_selected.shape (120000, 5)


## 6.0 Treinar o modelo - Baseline

In [134]:
lr = LogisticRegression(max_iter = 1000)
lr.fit(X_train_transformed, y_train)
y_pred = lr.predict_proba(X_valid_transformed)[:,1]

In [136]:
print('Score AUC com todas as features', roc_auc_score(y_valid, y_pred))

Score AUC com todas as features 0.6987192137534357


In [137]:
lr = LogisticRegression(max_iter = 1000)
lr.fit(X_train_selected, y_train)
y_pred = lr.predict_proba(X_valid_selected)[:,1]

In [138]:
print('Score AUC com features selecionadas', roc_auc_score(y_valid, y_pred))

Score AUC com features selecionadas 0.6982872876747228


## 7.0 Tunning dos hiperparâmetros

In [139]:
clf = [
    LogisticRegression(solver='newton-cg', penalty=None,
    max_iter=1000),
    LogisticRegression(solver='lbfgs', penalty=None,
    max_iter=1000),
    LogisticRegression(solver='sag', penalty=None,
    max_iter=1000),
    LogisticRegression(solver='saga', penalty=None,
    max_iter=1000)
]
clf_columns =[]
clf_compare = pd.DataFrame(columns = clf_columns)

row_index = 0
for lrs in clf:

    y_pred = lrs.fit(X_train_transformed, y_train).predict_proba(X_valid_transformed)[:,1]
    auc = roc_auc_score(y_valid, y_pred)
    clf_name = lrs.__class__.__name__
    clf_compare.loc[row_index, 'Modelo'] = clf_name
    clf_compare.loc[row_index, 'max_iter'] = lrs.max_iter
    clf_compare.loc[row_index, 'solver'] = lrs.solver
    clf_compare.loc[row_index, 'penalty'] = lrs.penalty
    clf_compare.loc[row_index, 'class_weight'] = lrs.class_weight
    clf_compare.loc[row_index, 'AUC'] = auc

    row_index+=1

clf_compare.sort_values(by='AUC', ascending = False, inplace = True)

clf_compare




Unnamed: 0,Modelo,max_iter,solver,penalty,class_weight,AUC
0,LogisticRegression,1000.0,newton-cg,,,0.698868
1,LogisticRegression,1000.0,lbfgs,,,0.69868
2,LogisticRegression,1000.0,sag,,,0.62557
3,LogisticRegression,1000.0,saga,,,0.617488


## 8.0 Deploy Modelo Final

In [143]:
modelo_final = LogisticRegression(solver='newton-cg', 
                                  penalty=None, 
                                  max_iter=1000,
                                  class_weight = None)
modelo_final.fit(X_train_selected, y_train)

In [145]:
pickle.dump(modelo_final, 
            open('modelo_final.pkl','wb'))