# Deep Mailing - XGBoost Model

In [None]:
import gc
import logging
import os
import pickle as pickle
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost

# Directory that receives the per-run log file.
log_location = "../logs/"

# Locations of the NumPy files holding the train/test features (x) and
# targets (y); they are written and later re-read by this notebook.
arquivo_df_pickled_norm_train_x = "../intermediate/df.norm.train.x.pickle.npy"
arquivo_df_pickled_norm_train_y = "../intermediate/df.norm.train.y.pickle.npy"
arquivo_df_pickled_norm_test_x = "../intermediate/df.norm.test.x.pickle.npy"
arquivo_df_pickled_norm_test_y = "../intermediate/df.norm.test.y.pickle.npy"

# Each run logs to its own file, named after the start timestamp
# (down to microseconds).
log_timestamp = datetime.now().strftime("%Y%m%d%H%M%S.%f")
log_filename = 'log.' + log_timestamp + '.log'
logging.basicConfig(
    filename=os.path.join(log_location, log_filename),
    level=logging.DEBUG,
    format="%(asctime)-15s %(message)s",
)
logger = logging.getLogger()

In [None]:
# Load the normalized calls DataFrame that the following cells split into
# train and test sets. Requires pandas to be imported as `pd`.
# NOTE(review): `arquivo_df_pickled_norm` is not defined anywhere in the
# visible notebook (only the four train/test .npy paths are) -- it must be
# set to the normalized pickle's path before this cell runs; confirm the path.
logging.debug("Carregando Pickling normalizado:{}".format(arquivo_df_pickled_norm))
chamadas = pd.read_pickle(arquivo_df_pickled_norm)

In [None]:
# Date range covered by the loaded data; the most recent date is used by the
# next cell as the train/test cut-off.
data_minima_mailing = chamadas["DATA_MAILING"].min()
data_maxima_mailing = chamadas["DATA_MAILING"].max()

print("Max:{} Min:{}".format(data_maxima_mailing, data_minima_mailing))

In [None]:
# Time-based hold-out: rows dated on the most recent mailing date form the
# test set, every earlier row goes to the training set.
print("Criando Pickling de train e teste...")
chamadas_train = chamadas.loc[chamadas["DATA_MAILING"] < data_maxima_mailing]
chamadas_test = chamadas.loc[chamadas["DATA_MAILING"] >= data_maxima_mailing]

# The full frame is no longer needed once both subsets exist.
del chamadas

In [None]:

def create_column_reference(header_chamadas_x,arquivo_df_pickled_norm_train_x):
    """Write a text file mapping feature-matrix column positions to names.

    One line is written per column of ``header_chamadas_x`` in the form
    ``<position>-<column name>``, with positions starting at 0. The file is
    created at ``arquivo_df_pickled_norm_train_x + ".txt"`` and overwrites
    any existing file at that path.

    Args:
        header_chamadas_x: pandas DataFrame whose ``columns`` are listed;
            its rows are not read.
        arquivo_df_pickled_norm_train_x: base path of the reference file;
            ``".txt"`` is appended to it.
    """
    print("Criando Arquivo de referencia de colunas...")
    lista_header = list(header_chamadas_x.columns.values)
    with open(arquivo_df_pickled_norm_train_x+".txt","w") as f:
        # The position written here is the column's index in the saved matrix.
        f.writelines(
            "{}-{}\n".format(counter, header)
            for counter, header in enumerate(lista_header)
        )

# Split the train and test frames into feature matrices (X) and target
# vectors (Y), persist them as .npy files, then free the in-memory copies.
print("Separando colunas em X e Y...")

# Only rows with NORM_TENTATIVAS > 0 are kept.
chamadas_train = chamadas_train[(chamadas_train.NORM_TENTATIVAS > 0)]

# Features are the label-based column slice running from the third column
# (position 2) through 'NORM_63'; the target is the NORM_CUP column.
chamadas_train_features = chamadas_train.loc[:, chamadas_train.columns.values[2]:'NORM_63']
create_column_reference(chamadas_train_features.head(1),
                        arquivo_df_pickled_norm_train_x)

# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# both return the underlying NumPy array.
chamadas_train_x = chamadas_train_features.values
chamadas_train_y = chamadas_train.NORM_CUP.values
del chamadas_train_features

chamadas_test = chamadas_test[(chamadas_test.NORM_TENTATIVAS > 0)]
chamadas_test_x = chamadas_test.loc[:, chamadas_test.columns.values[2]:'NORM_63'].values
chamadas_test_y = chamadas_test.NORM_CUP.values

print("Criando arquivos finais em formato NUMPY para consumo pelo algoritmo...")
np.save(arquivo_df_pickled_norm_train_x,chamadas_train_x)
np.save(arquivo_df_pickled_norm_train_y,chamadas_train_y)
np.save(arquivo_df_pickled_norm_test_x,chamadas_test_x)
np.save(arquivo_df_pickled_norm_test_y,chamadas_test_y)

# The arrays are reloaded from disk by the next cell, so drop the frames and
# matrices here to release memory before training.
print("Removendo objetos desnecessarios")
del chamadas_train_x
del chamadas_train_y
del chamadas_train
del chamadas_test
del chamadas_test_x
del chamadas_test_y

gc.collect()

In [None]:
# Read the four persisted arrays back from disk, in the order
# train X, train Y, test X, test Y.
print("Carregando objetos numpy")
train_x, train_y, test_x, test_y = [
    np.load(caminho_npy)
    for caminho_npy in (
        arquivo_df_pickled_norm_train_x,
        arquivo_df_pickled_norm_train_y,
        arquivo_df_pickled_norm_test_x,
        arquivo_df_pickled_norm_test_y,
    )
]

In [None]:
# Report how many targets are positive (> 0) out of the total, for both the
# training and the test vectors.
train_positivos = sum(1 for alvo in train_y if alvo > 0)
test_positivos = sum(1 for alvo in test_y if alvo > 0)

msg1 = "Train - CUPS Detectados {} num universo de {}".format(train_positivos, len(train_y))
msg2 = "Test - CUPS Detectados {} num universo de {}".format(test_positivos, len(test_y))

# Emit both messages to stdout first, then both to the log file.
for emitir in (print, logging.debug):
    for mensagem in (msg1, msg2):
        emitir(mensagem)


In [None]:
from sklearn.metrics import accuracy_score

# Tree depths to train; one booster is trained, evaluated on the held-out
# test set and saved for each depth in this list.
model_depths_to_train = [10]

# Booster parameters shared by every depth.
# NOTE(review): 'silent' and 'gpu_hist' are kept exactly as written; newer
# XGBoost releases replace them ('verbosity', device='cuda') -- confirm
# against the installed version.
param = {}
param['objective'] = 'binary:logitraw'
param['eval_metric'] = 'error'
param['tree_method'] = 'gpu_hist'
param['silent'] = 1
num_round = 1000

gc.collect()
dtrain = xgboost.DMatrix(train_x, train_y)
# Evaluation must use the held-out test data, not the training data.
dtest = xgboost.DMatrix(test_x, test_y)

# accuracy_score needs class labels. Targets are binarized with the same
# "y > 0 is positive" rule this notebook uses when counting positives.
test_y_labels = (test_y > 0).astype(int)

for depth in model_depths_to_train:
    gc.collect()
    logging.debug("Starting model for depth:{}".format(depth))

    # The depth has to be part of the booster parameters to take effect.
    param['max_depth'] = depth
    gpu_res = {}
    booster = xgboost.train(param, dtrain, num_round, evals=[], evals_result=gpu_res)

    # 'binary:logitraw' yields raw margins (scores before the logistic
    # transform), so a margin above 0 corresponds to probability above 0.5.
    test_y_pred = booster.predict(dtest)
    test_predictions = (test_y_pred > 0).astype(int)
    accuracy = accuracy_score(test_y_labels, test_predictions)

    logging.debug("Accuracy:{}".format(accuracy))

    # One model file per depth, stamped with the save time.
    save_file = "../output/{}.{}.model".format(datetime.now().strftime("%Y%m%d.%H%M%S"),depth)
    with open(save_file, 'wb') as fp:
        pickle.dump(booster, fp)
    logging.debug("Model saved as {}".format(save_file))
    