In [None]:
import xgboost
import numpy as np
import os
import sys
import logging
import gc
import pickle as pickle
import pandas as pd
import dateutil.parser as parser
import os.path
import math
from sklearn.metrics import accuracy_score,precision_score,recall_score, confusion_matrix
from datetime import datetime
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_tree
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from pandas_ml import ConfusionMatrix

In [None]:
# Fractions of the shuffled dataset assigned to the training and test splits.
proporcao_train = 0.8
proporcao_test = 0.2

# Directory that receives the timestamped log file created by the logging setup.
log_location = "../logs/"
# Pickled, normalized DataFrame loaded as the model input.
arquivo_pagantes_norm = "../data/intermediate/Herval.normalized.pickle"
# Base path; ".txt" is appended to it for the column-reference file.
arquivo_pagantes_norm_train_x = "../data/intermediate/Herval.normalized.train.x.pickle"

# Destination of the text dump of the trained booster.
txt_dump_model = "../data/intermediate/model_generated.txt"
# NOTE(review): not referenced in the visible cells - presumably a feature-map path; confirm.
txt_feat_map_model = "../data/intermediate/feat_map_generated.txt"
# NOTE(review): not referenced in the visible cells.
output_folder= ""

In [None]:
# Route root-logger output (DEBUG and above) to a timestamped file under log_location.
# The directory is created first because basicConfig opens the log file immediately
# and raises FileNotFoundError when the folder does not exist (e.g. a fresh checkout).
if log_location:
    os.makedirs(log_location, exist_ok=True)

logger = logging.getLogger()
logging.basicConfig(format="%(asctime)-15s %(message)s",
                    level=logging.DEBUG,
                    filename=os.path.join(log_location,'xgboost.log.' + datetime.now().strftime("%Y%m%d%H%M%S.%f") + '.log'))

In [None]:
def print_log(msg):
    """Record ``msg`` at DEBUG level on the root logger and echo it to stdout."""
    logging.log(logging.DEBUG, msg)
    print(msg)

In [None]:
print_log("Carregando Pickling normalizado:{}".format(arquivo_pagantes_norm))
# Load the normalized frame and keep only rows whose normalized delay is below 0.15.
pagantes = (
    pd.read_pickle(arquivo_pagantes_norm)
    .query('NORM_CONTRATO_ATRASO < 0.15')
)

In [None]:
# Histogram of the normalized contract delay on a 10x10 figure.
rcParams['figure.figsize'] = (10, 10)

df4 = pagantes['NORM_CONTRATO_ATRASO']
df4.plot.hist(alpha=0.5)

In [None]:
# Preview the first ten rows of the filtered frame.
pagantes.iloc[:10]

In [None]:
# Number of rows that survived the delay filter.
total_pagantes = pagantes.shape[0]
print_log("Total pagantes:{}".format(total_pagantes))

In [None]:
# Keep the three feature columns followed by the target; later cells slice by this column order.
pagantes = pagantes.loc[
    :,
    ['NORM_IDADE', 'NORM_VALOR_DIVIDA', 'NORM_CONTRATO_ATRASO', 'PAGOU'],
]

In [None]:
def create_column_reference(header_chamadas_x,arquivo_df_pickled_norm_train_x):
    """Write a plain-text index of the columns of ``header_chamadas_x``.

    The file is created at ``arquivo_df_pickled_norm_train_x + ".txt"`` and holds
    one ``<position>-<column name>`` line per column, numbered from zero.
    """
    print_log("Criando Arquivo de referencia de colunas...")
    with open(arquivo_df_pickled_norm_train_x + ".txt", "w") as reference_file:
        column_names = list(header_chamadas_x.columns.values)
        for position, column_name in enumerate(column_names):
            reference_file.write("{}-{}\n".format(position, column_name))

In [None]:
# Preview the first ten rows after the column selection.
pagantes.iloc[:10]

In [None]:
print_log("Criando dataframes de train e teste...")
# Shuffle every row with a fixed seed so the split (and therefore the metrics and
# the saved model) is reproducible across Restart & Run All.
pagantes = pagantes.sample(frac=1, random_state=42)
# Test rows come from the head and training rows from the tail of the shuffled
# frame; both sizes are truncated, so the two slices never overlap.
pagantes_train = pagantes.tail(int(len(pagantes.index) * proporcao_train))
pagantes_test = pagantes.head(int(len(pagantes.index) * proporcao_test))
# Free the full frame; only the two splits are used from here on.
del pagantes

In [None]:
# One feature-only row is enough: only the column labels are written to the reference file.
linha_referencia = pagantes_train.loc[:, :'NORM_CONTRATO_ATRASO'].head(1)
create_column_reference(linha_referencia, arquivo_pagantes_norm_train_x)

In [None]:
# Features are every column up to and including NORM_CONTRATO_ATRASO; the target
# is sliced as 'PAGOU':'PAGOU' so it stays a single-column frame.
pagantes_train_x = pagantes_train.loc[:, :'NORM_CONTRATO_ATRASO']
pagantes_train_y = pagantes_train.loc[:, 'PAGOU':'PAGOU']

pagantes_test_x = pagantes_test.loc[:, :'NORM_CONTRATO_ATRASO']
pagantes_test_y = pagantes_test.loc[:, 'PAGOU':'PAGOU']

# Capture the column labels before the frames are replaced by bare arrays.
colunas_x = pagantes_train_x.columns.values
colunas_y = pagantes_train_y.columns.values

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same ndarray and exists in every pandas release.
pagantes_train_x = pagantes_train_x.values
pagantes_train_y = pagantes_train_y.values

pagantes_test_x = pagantes_test_x.values
pagantes_test_y = pagantes_test_y.values

# Plain Python list of feature names, as passed to xgb.DMatrix(feature_names=...).
colunas_x = list(colunas_x)

In [None]:
# Count the rows with a positive label in each split and report them against the split size.
positivos_train = np.count_nonzero(pagantes_train_y > 0)
positivos_test = np.count_nonzero(pagantes_test_y > 0)

msg1 = "Train - Pagantes Detectados {} num universo de {}".format(positivos_train, len(pagantes_train_y))
msg2 = "Test - Pagantes Detectados {} num universo de {}".format(positivos_test, len(pagantes_test_y))
for mensagem in (msg1, msg2):
    print_log(mensagem)

In [None]:
# XGBoost booster parameters; key order is kept because the dict is logged verbatim.
param = {
    'eta': 0.3,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'exact',
    'silent': 0,
    'max_depth': 10,
}
# Number of boosting rounds passed to xgb.train.
num_round = 100

In [None]:
gc.collect()
print_log("Starting model for params:{}".format(param))
# Wrap both splits in DMatrix objects that share the same feature names.
dtrain = xgb.DMatrix(
    data=pagantes_train_x,
    label=pagantes_train_y,
    feature_names=colunas_x,
)
dtest = xgb.DMatrix(
    data=pagantes_test_x,
    label=pagantes_test_y,
    feature_names=colunas_x,
)

In [None]:
# Negative-to-positive label ratio of the training set, used as scale_pos_weight.
train_labels = dtrain.get_label()
total_negativos = float(np.sum(train_labels == 0))
total_positivos = np.sum(train_labels == 1)
ratio = total_negativos / total_positivos
param['scale_pos_weight'] = ratio
print_log("ratio:{}".format(ratio))

In [None]:
# Train the booster for num_round rounds.
# NOTE(review): evals is empty, so gpu_res presumably stays empty - confirm whether
# evaluation sets were meant to be passed here.
gpu_res = {}
booster = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=[],
    evals_result=gpu_res,
)

In [None]:
# Write a text dump of the trained booster to txt_dump_model.
booster.dump_model(fout=txt_dump_model)

In [None]:
%%bash -s "$txt_dump_model"

#cat $1 

In [None]:
%matplotlib inline

# Feature-importance chart of the trained booster on a 20x20 figure.
rcParams['figure.figsize'] = (20, 20)
xgb.plot_importance(booster)
plt.show()

In [None]:
# Predictions for the training split, copied into a standalone array.
train_y_pred = booster.predict(dtrain)
train_predictions = np.array(train_y_pred)

In [None]:
# Round the predictions once and reuse the result for all three metrics.
rotulos_previstos_train = train_predictions.round()
accuracy = accuracy_score(pagantes_train_y, rotulos_previstos_train)
precision = precision_score(pagantes_train_y, rotulos_previstos_train)
recall = recall_score(pagantes_train_y, rotulos_previstos_train)

# Report sizes, predicted/actual positive counts (threshold 0.5) and the metrics, in this order.
relatorio_train = (
    ("(Base Train)Clientes Total:{}", len(train_predictions)),
    ("(Base Train)PAGANTES Previstos:{}", np.count_nonzero(train_predictions > 0.5)),
    ("(Base Train)PAGANTES na Base Train:{}", np.count_nonzero(pagantes_train_y > 0.5)),
    ("(Base Train)Accuracy Total:{}", accuracy),
    ("(Base Train)Precision:{}", precision),
    ("(Base Train)Recall:{}", recall),
)
for modelo_msg, valor in relatorio_train:
    print_log(modelo_msg.format(valor))

In [None]:
print(pagantes_train_y.shape, train_predictions.shape)
print(pagantes_train_y.dtype, train_predictions.dtype)
# Bring labels and rounded predictions to the same dtype before comparing them.
pagantes_train_y = pagantes_train_y.astype('float32')
train_predictions = train_predictions.astype('float32').round()

# labels=[0, 1] pins the matrix to 2x2 even when only one class shows up in the
# labels/predictions; without it .ravel() yields a single value and the
# four-way unpacking below raises ValueError.
tn, fp, fn, tp = confusion_matrix(
    np.squeeze(pagantes_train_y),
    np.squeeze(train_predictions),
    labels=[0, 1],
).ravel()
print(tn, fp, fn, tp)
print("True Positive:{}".format(tp))
print("True Negative:{}".format(tn))
print("False Positive:{}".format(fp))
print("False Negative:{}".format(fn))


In [None]:
# Predictions for the held-out test split, copied into a standalone array.
test_y_pred = booster.predict(dtest)
test_predictions = np.array(test_y_pred)

In [None]:
# Round the predictions once and reuse the result for all three metrics.
rotulos_previstos_test = test_predictions.round()
accuracy = accuracy_score(pagantes_test_y, rotulos_previstos_test)
precision = precision_score(pagantes_test_y, rotulos_previstos_test)
recall = recall_score(pagantes_test_y, rotulos_previstos_test)

# Report sizes, predicted/actual positive counts (threshold 0.5) and the metrics, in this order.
relatorio_test = (
    ("(Base Test)Clientes Total:{}", len(test_predictions)),
    ("(Base Test)PAGANTES Previstos:{}", np.count_nonzero(test_predictions > 0.5)),
    ("(Base Test)PAGANTES na Base Teste:{}", np.count_nonzero(pagantes_test_y > 0.5)),
    ("(Base Test)Accuracy Total:{}", accuracy),
    ("(Base Test)Precision:{}", precision),
    ("(Base Test)Recall:{}", recall),
)
for modelo_msg, valor in relatorio_test:
    print_log(modelo_msg.format(valor))

In [None]:
# Persist the trained booster under a timestamped name.
save_file = "../data/output/{}.model".format(datetime.now().strftime("%Y%m%d.%H%M%S"))
# Create the output folder on demand so a fresh checkout does not fail after training.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
# The handle is not called `fp`: that name holds the false-positive count produced
# by the confusion-matrix cell and must not be overwritten with a file object.
with open(save_file, 'wb') as model_file:
    pickle.dump(booster, model_file)
print_log("Model saved as {}".format(save_file))