# Projeto

## Bibliotecas

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score

from lightgbm import LGBMClassifier

## Arquivos

In [2]:
caminho = '../../machine_learning_i/projeto/dados'
arquivo_principal = 'application_train.csv'
arquivo_oculto = 'application_test_student.csv'
arquivo_metadados = 'HomeCredit_columns_description.csv'

## Input

In [3]:
df = pd.read_csv(f'{caminho}/{arquivo_principal}')

df_oculto = pd.read_csv(f'{caminho}/{arquivo_oculto}')

df_metadados = pd.read_csv(f'{caminho}/{arquivo_metadados}', encoding = 'Windows-1252')

## Variáveis

In [26]:
var_expl = [
    'CNT_CHILDREN', 
    'AMT_INCOME_TOTAL', 
    'DAYS_BIRTH', 
    'DAYS_EMPLOYED',
    'NAME_EDUCATION_TYPE',
    'FLAG_OWN_CAR', 
    'FLAG_OWN_REALTY'
]

var_resp = 'TARGET'

## Treino e Teste

In [27]:
df_treino, df_teste = train_test_split(df, test_size = 0.15, random_state = 1)

In [28]:
x_treino = df_treino[var_expl].copy()
y_treino = df_treino[var_resp].copy()

x_teste = df_teste[var_expl].copy()
y_teste = df_teste[var_resp].copy()

x_oculto = df_oculto[var_expl].copy()

## Pipeline

**União do pré-processamento com o modelo**

Consequentemente tiramos as métricas aqui também.

In [30]:
%%time

parametros = {
    'modelo__num_leaves': [15],
    'modelo__max_depth': [5, 8],
    'modelo__n_estimators': [250, 500],
    'modelo__learning_rate': [0.05, 0.01],
}

modelo = LGBMClassifier(
    colsample_bytree = 0.5, 
    subsample = 0.5,
    subsample_freq = 5,
    random_state = 1
)


lista_ordenada = [
    'Lower secondary',
    'Secondary / secondary special', 
    'Incomplete higher',
    'Higher education', 
    'Academic degree', 
]

pipe_education = Pipeline(steps = [
    ('trata_education', OrdinalEncoder(categories = [lista_ordenada]))
])

pipe_flag = Pipeline(steps = [
    ('trata_flag', OrdinalEncoder())
])

pipe_selecao = ColumnTransformer(transformers = [
    ('pipe_education', pipe_education, ['NAME_EDUCATION_TYPE']),
    ('pipe_flag', pipe_flag, ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']),
    ('outras_colunas', 'passthrough', ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED']),
])

pipeline = Pipeline(steps = [
    ('preproc', pipe_selecao),
    ('modelo', modelo)
])

gscv = GridSearchCV(
    estimator = pipeline,
    param_grid = parametros,
    scoring = 'roc_auc',
    refit = True,
    cv = 3
)

gscv.fit(x_treino, y_treino)
y_pred_treino = gscv.predict_proba(x_treino)[:, 1]
y_pred_teste = gscv.predict_proba(x_teste)[:, 1]

CPU times: user 5min 56s, sys: 34.7 s, total: 6min 31s
Wall time: 56.2 s


In [31]:
pipe_selecao.fit_transform(x_treino)

array([[ 2.00000e+00,  1.00000e+00,  1.00000e+00, ...,  1.57500e+05,
        -1.62180e+04, -2.40000e+02],
       [ 1.00000e+00,  1.00000e+00,  1.00000e+00, ...,  1.35000e+05,
        -2.00400e+04, -9.14000e+02],
       [ 1.00000e+00,  0.00000e+00,  1.00000e+00, ...,  9.00000e+04,
        -2.01660e+04,  3.65243e+05],
       ...,
       [ 3.00000e+00,  0.00000e+00,  0.00000e+00, ...,  9.00000e+04,
        -1.28420e+04, -2.29700e+03],
       [ 3.00000e+00,  0.00000e+00,  1.00000e+00, ...,  1.80000e+05,
        -2.12400e+04,  3.65243e+05],
       [ 1.00000e+00,  1.00000e+00,  1.00000e+00, ...,  1.17000e+05,
        -9.57200e+03, -1.11900e+03]])

In [32]:
roc_auc_score(y_treino, y_pred_treino)

0.6435477594555178

In [33]:
roc_auc_score(y_teste, y_pred_teste)

0.6251038564944535

## Previsão

In [20]:
y_pred_oculto = pipeline.predict_proba(x_oculto)[:, 1]
df_oculto['Y_PRED'] = y_pred_oculto
df_oculto[['SK_ID_CURR', 'Y_PRED']].head()

Unnamed: 0,SK_ID_CURR,Y_PRED
0,149741,0.060691
1,363290,0.056175
2,436006,0.048052
3,377703,0.057595
4,188624,0.065784
