# Projeto

## Bibliotecas

In [27]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

from lightgbm import LGBMClassifier

## Arquivos

In [2]:
# Base directory of the project data, followed by the three CSV file names
# that the input cell reads from it.
caminho = "../../machine_learning_i/projeto/dados"
arquivo_principal = "application_train.csv"        # read into `df` (contains TARGET)
arquivo_oculto = "application_test_student.csv"    # read into `df_oculto` (scored at the end)
arquivo_metadados = "HomeCredit_columns_description.csv"  # read into `df_metadados`

## Input

In [3]:
# Main and hidden datasets are read the same way, so load them in one pass.
df, df_oculto = (
    pd.read_csv(f'{caminho}/{nome_arquivo}')
    for nome_arquivo in (arquivo_principal, arquivo_oculto)
)

# The column-description file is read with an explicit Windows-1252 encoding.
df_metadados = pd.read_csv(
    f'{caminho}/{arquivo_metadados}',
    encoding='Windows-1252',
)

## Variáveis

In [4]:
# Response column predicted by the model.
var_resp = "TARGET"

# Explanatory columns fed to the model; NAME_EDUCATION_TYPE is the only one
# that goes through the ordinal encoder in the pre-processing cell.
var_expl = [
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "NAME_EDUCATION_TYPE",
]

## Treino e Teste

In [5]:
# Hold out 15% of the rows of `df` for testing; the fixed seed makes the split repeatable.
df_treino, df_teste = train_test_split(
    df,
    test_size=0.15,
    random_state=1,
)

In [6]:
# Feature matrices for the train, test and hidden sets. Explicit copies keep
# later in-place edits of x_* from touching the source frames.
x_treino, x_teste, x_oculto = (
    origem.loc[:, var_expl].copy()
    for origem in (df_treino, df_teste, df_oculto)
)

# Response vectors for the train and test sets (x_oculto has no y_* counterpart here).
y_treino, y_teste = (
    origem.loc[:, var_resp].copy()
    for origem in (df_treino, df_teste)
)

## Pré-Processamento

In [7]:
# Education levels in ascending order. OrdinalEncoder maps each label to its
# position in this list (0.0 for the first entry, 4.0 for the last).
lista_ordenada = [
    'Lower secondary',
    'Secondary / secondary special',
    'Incomplete higher',
    'Higher education',
    'Academic degree',
]

oe = OrdinalEncoder(categories = [lista_ordenada])

# Fit on the training rows only, then apply the same mapping to every set.
#
# The raw strings are read from the untouched df_* frames rather than from the
# x_* column that is being overwritten. This keeps the cell idempotent: after
# the first run x_*['NAME_EDUCATION_TYPE'] holds numbers, which no longer match
# `lista_ordenada`, so re-running the original version of this cell would fail.
# x_* were created as column subsets of df_* and share their row order, so the
# encoded values written here are the same as before on a first run.
oe.fit(df_treino[['NAME_EDUCATION_TYPE']])
x_treino[['NAME_EDUCATION_TYPE']] = oe.transform(df_treino[['NAME_EDUCATION_TYPE']])
x_teste[['NAME_EDUCATION_TYPE']] = oe.transform(df_teste[['NAME_EDUCATION_TYPE']])
x_oculto[['NAME_EDUCATION_TYPE']] = oe.transform(df_oculto[['NAME_EDUCATION_TYPE']])

## Modelo e Métricas

**Otimização acontece aqui**

In [36]:
# Scratch estimate of the number of model fits: four hyperparameters with four
# candidate values each, times 2.
# NOTE(review): the factor 2 is presumably a fold count considered at the time — confirm.
4**4 * 2

512

In [None]:
# Rule of three for the run-time estimate (21.2 is presumably seconds — confirm):
#   32 fits   ->  21.2
#   512 fits  ->  x

In [None]:
# Solve the proportion  32 * x = 21.2 * 512  for x: the estimated time for
# 512 fits, given that 32 fits took 21.2 (presumably seconds — confirm).
# The algebra step is kept as a comment because `32 x = ...` is not valid Python
# and would stop a Restart & Run All.
x = 21.2 * 512 / 32

In [40]:
# Estimated duration of the full search: 1280 fits (256 grid combinations x 5
# folds) scaled from 21.2 per 32 fits, then divided by 60.
# NOTE(review): 21.2 is presumably seconds, making the result minutes — confirm.
(21.2 * 1280 / 32) / 60

14.133333333333333

In [41]:
%%time

# Search space: four candidate values for each of four hyperparameters
# (4 * 4 * 4 * 4 = 256 combinations).
parametros = dict(
    num_leaves=[5, 15, 25, 50],
    max_depth=[2, 3, 5, 8],
    n_estimators=[10, 50, 250, 500],
    learning_rate=[0.1, 0.05, 0.01, 0.005],
)

# Base estimator. The sampling settings are fixed here and are not searched.
modelo = LGBMClassifier(
    random_state=1,
    subsample=0.5,         # similar to random forest; important to set
    subsample_freq=5,      # similar to random forest; important to set
    colsample_bytree=0.5,  # similar to random forest; important to set
)

# Exhaustive search with 5-fold cross-validation, scored by ROC AUC.
# refit=True lets `gscv` be used directly for prediction afterwards.
gscv = GridSearchCV(modelo, parametros, scoring='roc_auc', refit=True, cv=5)
gscv.fit(x_treino, y_treino)

# Keep column 1 of predict_proba (the score of the second class) for each set.
y_pred_treino, y_pred_teste = (
    gscv.predict_proba(amostra)[:, 1]
    for amostra in (x_treino, x_teste)
)

CPU times: user 2h 17min 58s, sys: 10min 41s, total: 2h 28min 39s
Wall time: 19min 38s


In [42]:
# ROC AUC on the training set for the model selected by the grid search.
roc_auc_score(y_true=y_treino, y_score=y_pred_treino)

0.6422611914457849

In [43]:
# ROC AUC on the held-out test set for the model selected by the grid search.
roc_auc_score(y_true=y_teste, y_score=y_pred_teste)

0.6218092144251098

In [46]:
# Hyperparameter combination that GridSearchCV selected in the search above.
gscv.best_params_

{'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 500, 'num_leaves': 15}

In [50]:
%%time

# Narrower search space: 3 * 2 * 2 * 1 = 12 combinations, with the learning
# rate held at a single value.
parametros = dict(
    num_leaves=[15, 20, 25],
    max_depth=[8, 10],
    n_estimators=[500, 1000],
    learning_rate=[0.01],
)

# Base estimator. The sampling settings are fixed here and are not searched.
modelo = LGBMClassifier(
    random_state=1,
    subsample=0.5,         # similar to random forest; important to set
    subsample_freq=5,      # similar to random forest; important to set
    colsample_bytree=0.5,  # similar to random forest; important to set
)

# Exhaustive search with 3-fold cross-validation, scored by ROC AUC.
# refit=True lets `gscv` be used directly for prediction afterwards.
gscv = GridSearchCV(modelo, parametros, scoring='roc_auc', refit=True, cv=3)
gscv.fit(x_treino, y_treino)

# Keep column 1 of predict_proba (the score of the second class) for each set.
y_pred_treino, y_pred_teste = (
    gscv.predict_proba(amostra)[:, 1]
    for amostra in (x_treino, x_teste)
)

CPU times: user 17min, sys: 1min 35s, total: 18min 35s
Wall time: 2min 41s


In [58]:
# Rough time per fit for the narrower search: 160 divided by the 36 fits
# (3 * 2 * 2 grid combinations x 3 folds).
# NOTE(review): 160 is presumably the wall time in seconds (about 2min 41s) — confirm.
160 / ((3 * 2 * 2) * 3)

4.444444444444445

In [51]:
# ROC AUC on the training set for the model selected by the grid search.
roc_auc_score(y_true=y_treino, y_score=y_pred_treino)

0.6424046805073195

In [52]:
# ROC AUC on the held-out test set for the model selected by the grid search.
roc_auc_score(y_true=y_teste, y_score=y_pred_teste)

0.6218638463511945

In [53]:
# Hyperparameter combination that GridSearchCV selected in the search above.
gscv.best_params_

{'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'num_leaves': 15}

## Previsão

In [55]:
# Score the hidden set with the fitted search object, keeping column 1 of
# predict_proba as in the train/test evaluation.
y_pred_oculto = gscv.predict_proba(x_oculto)[:, 1]

# Store the scores next to the original rows and preview them with the ids.
df_oculto['Y_PRED'] = y_pred_oculto
df_oculto.head()[['SK_ID_CURR', 'Y_PRED']]

Unnamed: 0,SK_ID_CURR,Y_PRED
0,149741,0.062229
1,363290,0.047734
2,436006,0.032861
3,377703,0.075128
4,188624,0.081328
