# Projeto

## Bibliotecas

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

## Arquivos

In [2]:
# Data directory (relative to this notebook) and the CSV files read from it.
caminho = "../../machine_learning_i/projeto/dados"
arquivo_principal = "application_train.csv"  # labelled rows used for training/evaluation
arquivo_oculto = "application_test_student.csv"  # rows scored at the end of the notebook
arquivo_metadados = "HomeCredit_columns_description.csv"  # column descriptions

## Input

In [3]:
def _caminho_completo(nome_arquivo):
    """Return the path of `nome_arquivo` inside the data directory `caminho`."""
    return f'{caminho}/{nome_arquivo}'


# Labelled data, the hidden set to be scored, and the column-description table.
df = pd.read_csv(_caminho_completo(arquivo_principal))
df_oculto = pd.read_csv(_caminho_completo(arquivo_oculto))
# The metadata file is read with an explicit Windows-1252 encoding.
df_metadados = pd.read_csv(
    _caminho_completo(arquivo_metadados),
    encoding='Windows-1252',
)

## Variáveis

In [4]:
# Response (target) column.
var_resp = 'TARGET'

# Explanatory columns fed to the model; NAME_EDUCATION_TYPE is categorical
# and is ordinal-encoded in the pre-processing section.
var_expl = [
    'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
    'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'NAME_EDUCATION_TYPE',
]

## Treino e Teste

In [5]:
# Hold out 15% of the labelled rows as a test partition; the fixed seed keeps the split reproducible.
df_treino, df_teste = train_test_split(
    df,
    test_size=0.15,
    random_state=1,
)

In [6]:
# Independent copies of the feature columns / target column for each partition,
# so later in-place encoding does not touch the source frames.
x_treino = df_treino.loc[:, var_expl].copy()
y_treino = df_treino.loc[:, var_resp].copy()

x_teste = df_teste.loc[:, var_expl].copy()
y_teste = df_teste.loc[:, var_resp].copy()

# The hidden set only provides features here; no target column is read from it.
x_oculto = df_oculto.loc[:, var_expl].copy()

## Pré-Processamento

In [7]:
coluna_educacao = 'NAME_EDUCATION_TYPE'

# The position of each label in this list is the code the encoder assigns to it
# (first entry -> 0.0, last entry -> 4.0).
lista_ordenada = [
    'Lower secondary',
    'Secondary / secondary special',
    'Incomplete higher',
    'Higher education',
    'Academic degree',
]

oe = OrdinalEncoder(categories=[lista_ordenada])

# Fit on the training partition only, then apply the same mapping to every partition.
oe.fit(x_treino[[coluna_educacao]])
for particao in (x_treino, x_teste, x_oculto):
    particao[[coluna_educacao]] = oe.transform(particao[[coluna_educacao]])

## Modelo e Métricas

In [11]:
%%time

# Hyperparameter grid: 4 values of n_estimators x 2 learning rates = 8 candidates.
parametros = dict(
    n_estimators=[50, 100, 250, 500],
    learning_rate=[0.1, 0.01],
)

modelo = AdaBoostClassifier(random_state=1)

# 2-fold cross-validation ranked by ROC AUC. With refit=True the best candidate is
# retrained on the full training partition, so `gscv` can predict directly afterwards.
gscv = GridSearchCV(
    estimator=modelo,
    param_grid=parametros,
    scoring='roc_auc',
    refit=True,
    cv=2,
)
gscv.fit(x_treino, y_treino)

# Keep only the second predict_proba column (presumably P(TARGET == 1) — confirm via gscv.classes_).
y_pred_treino = gscv.predict_proba(x_treino)[:, 1]
y_pred_teste = gscv.predict_proba(x_teste)[:, 1]

CPU times: user 3min 1s, sys: 815 ms, total: 3min 2s
Wall time: 3min 3s


In [None]:
# Show the parameter combination selected by the grid search (ranked by ROC AUC, per `scoring`).
gscv.best_params_

In [14]:
# Rough seconds per model fit. NOTE(review): presumably ~180 s is the grid-search wall time
# and 16 = 8 parameter combinations x 2 CV folds (the final refit is not counted) — confirm.
180/(16)

11.25

In [17]:
# Projected run time in hours: 11.25 s per fit x 1280 fits, converted from seconds via / 60 / 60.
# NOTE(review): 1280 is presumably the fit count of a larger planned grid — confirm its origin.
11.25 * 1280 / 60 / 60

4.0

In [12]:
# ROC AUC on the training partition (scores come from the refit best estimator).
roc_auc_score(y_true=y_treino, y_score=y_pred_treino)

0.6322235803076812

In [13]:
# ROC AUC on the held-out test partition, for comparison with the training score.
roc_auc_score(y_true=y_teste, y_score=y_pred_teste)

0.6218738343268031

## Previsão

In [None]:
# Score the hidden set with the tuned model.
# GridSearchCV fits clones of `modelo`, so `modelo` itself is never fitted and calling
# predict_proba on it raises NotFittedError. `gscv` (refit=True) delegates to the best
# estimator retrained on the whole training partition, matching how the train/test
# scores were produced.
y_pred_oculto = gscv.predict_proba(x_oculto)[:, 1]

# Attach the predicted scores to the hidden frame and preview id + score.
df_oculto['Y_PRED'] = y_pred_oculto
df_oculto[['SK_ID_CURR', 'Y_PRED']].head()