In [None]:
import xgboost
import numpy as np
import os
import sys
import logging
import gc
import pickle as pickle
import pandas as pd
import dateutil.parser as parser
import os.path
import math
from sklearn.metrics import accuracy_score,precision_score,recall_score, confusion_matrix
from datetime import datetime
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_tree
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from pandas_ml import ConfusionMatrix

In [None]:
# Fractions of the shuffled dataset used for the train split (read from the
# tail of the frame) and the test split (read from the head of the frame).
proporcao_train = 0.5
proporcao_test = 0.5

# Directory that receives the timestamped xgboost log file.
log_location = "../../logs/"
# Pickled, normalized DataFrame loaded as the modelling dataset.
arquivo_pagantes_norm = "../../data/batch03/intermediate/Herval.normalized.pickle"
# Base path of the column-reference file; ".txt" is appended when it is written.
arquivo_pagantes_norm_train_x = "../../data/batch03/intermediate/Herval.normalized.train.x.pickle"

# NOTE(review): these two paths are not referenced in the visible part of the
# notebook; presumably targets for a text model dump and a feature map — confirm.
txt_dump_model = "../../data/batch03/intermediate/model_generated.txt"
txt_feat_map_model = "../../data/batch03/intermediate/feat_map_generated.txt"

# Folder where the per-run tree dumps (.txt) and pickled boosters (.model) are written.
output_folder= "../../data/batch03/model/"

In [None]:
# logging.basicConfig(filename=...) opens the log file immediately, so a
# missing log directory (e.g. on a fresh checkout) would abort the notebook
# before anything runs. Create it up front.
if not os.path.isdir(log_location):
    os.makedirs(log_location)

# Root logger writing DEBUG and above to a new, timestamped file per kernel run.
logger = logging.getLogger()
logging.basicConfig(format="%(asctime)-15s %(message)s",
                    level=logging.DEBUG,
                    filename=os.path.join(log_location,'xgboost.log.' + datetime.now().strftime("%Y%m%d%H%M%S.%f") + '.log'))

In [None]:
def print_log(msg):
    """Record ``msg`` in the log at DEBUG level, then echo it to standard output.

    Args:
        msg: Any object; it is handed to the logging module as the message
            and to ``print`` unchanged.
    """
    logging.log(logging.DEBUG, msg)
    print(msg)
    
def log(msg):
    """Record ``msg`` in the log at DEBUG level without printing it.

    Args:
        msg: Any object; it is handed to the logging module as the message.
    """
    logging.log(logging.DEBUG, msg)
    

In [None]:
# Load the normalized dataset from its pickle file into `pagantes`.
# NOTE: pd.read_pickle unpickles arbitrary objects, which can execute code —
# only point this at files produced by a trusted pipeline.
print_log("Carregando Pickling normalizado:{}".format(arquivo_pagantes_norm))    
pagantes = pd.read_pickle(arquivo_pagantes_norm)

In [None]:
# Show the first 10 rows as a quick sanity check of the loaded frame.
pagantes.head(10)

In [None]:
# Row count of the full dataset, taken before it is shuffled and split.
total_pagantes = len(pagantes.index)
print_log("Total pagantes:{}".format(total_pagantes))

In [None]:
def create_column_reference(header_chamadas_x,arquivo_df_pickled_norm_train_x):
    """Write a text file that maps each column position to its column name.

    The file is created at ``arquivo_df_pickled_norm_train_x + ".txt"`` and
    holds one ``<position>-<name>`` line per column, positions starting at 0.

    Args:
        header_chamadas_x: Object exposing ``.columns.values`` (a DataFrame at
            the call site); only the column labels are read.
        arquivo_df_pickled_norm_train_x: Base path; ``".txt"`` is appended.
    """
    print_log("Criando Arquivo de referencia de colunas...")
    with open(arquivo_df_pickled_norm_train_x+".txt","w") as arquivo_saida:
        nomes_colunas = list(header_chamadas_x.columns.values)
        arquivo_saida.writelines(
            "{}-{}\n".format(posicao, nome)
            for posicao, nome in enumerate(nomes_colunas)
        )

In [None]:
# Shuffle the whole dataset once, then carve the train split from the tail and
# the test split from the head of the shuffled frame.

# Fixed seed so the shuffle — and therefore every metric computed from these
# splits — is reproducible across kernel restarts.
SPLIT_SEED = 42

# Both splits are slices of the same frame, so fractions adding up to more
# than 1 would place the same rows in train and test (label leakage).
# The small tolerance absorbs floating-point noise in the sum.
if proporcao_train + proporcao_test > 1 + 1e-9:
    raise ValueError(
        "proporcao_train + proporcao_test must not exceed 1 (got {} + {})".format(
            proporcao_train, proporcao_test))

print_log("Criando dataframes de train e teste...")
pagantes = pagantes.sample(len(pagantes.index), random_state=SPLIT_SEED)
total_linhas = len(pagantes.index)
pagantes_train = pagantes.tail(int(total_linhas * proporcao_train))
pagantes_test = pagantes.head(int(total_linhas * proporcao_test))
# Drop the reference to the full frame; only the two splits are used from here on.
del pagantes

In [None]:
# Write the position -> column-name reference for the feature columns.
# Only the column labels are read, so a one-row slice is enough.
create_column_reference(pagantes_train.loc[:, :'NORM_RENDA_PRESUMIDA'].head(1), arquivo_pagantes_norm_train_x)

In [None]:
# Separate features from labels for both splits.
# Features: every column up to and including NORM_RENDA_PRESUMIDA.
# Labels: the PAGOU column, sliced as 'PAGOU':'PAGOU' so the result stays a
# one-column 2-D frame (later cells squeeze it where a 1-D array is needed).
pagantes_train_x = pagantes_train.loc[:, :'NORM_RENDA_PRESUMIDA']
pagantes_train_y = pagantes_train.loc[:, 'PAGOU':'PAGOU']

pagantes_test_x = pagantes_test.loc[:, :'NORM_RENDA_PRESUMIDA']
pagantes_test_y = pagantes_test.loc[:, 'PAGOU':'PAGOU']

# Capture the column labels before the frames are turned into bare arrays.
# colunas_x is a plain list because it is passed to xgboost as feature_names.
colunas_x = list(pagantes_train_x.columns.values)
colunas_y = pagantes_train_y.columns.values

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same ndarray and exists in every pandas version.
pagantes_train_x = pagantes_train_x.values
pagantes_train_y = pagantes_train_y.values

pagantes_test_x = pagantes_test_x.values
pagantes_test_y = pagantes_test_y.values

In [None]:
# Report, for each split, how many rows have a positive label (> 0) out of
# the total number of rows in that split.
msg1 = "Train - Pagantes Detectados {} num universo de {}".format(
    sum(1 for rotulo in pagantes_train_y if rotulo > 0),
    len(pagantes_train_y),
)
msg2 = "Test - Pagantes Detectados {} num universo de {}".format(
    sum(1 for rotulo in pagantes_test_y if rotulo > 0),
    len(pagantes_test_y),
)
print_log(msg1)
print_log(msg2)

In [None]:
# (positive-label count, total rows) for the train split.
# This rebinds the name `pagantes`, which earlier held the full DataFrame
# (deleted after the train/test split), to a plain tuple.
pagantes = (len([y for y in pagantes_train_y if y >0]),len(pagantes_train_y))
print(pagantes)
# Total rows divided by positive rows; raises ZeroDivisionError when the
# train split has no positive label.
# NOTE(review): the training cell recomputes `ratio` as negatives / positives
# before using it, so this value is not what reaches the model — confirm
# whether this assignment is still needed.
ratio = pagantes[1] / pagantes[0]


In [None]:
# Train one booster per number of boosting rounds, report train/test confusion
# counts for each, and persist every model (text dump + pickle) under
# output_folder using a shared batch timestamp.

def _confusion_counts(y_true, y_pred):
    """Return (tn, fp, fn, tp) for binary 0/1 labels and predictions.

    ``labels=[0, 1]`` forces a 2x2 matrix even when only one class occurs in
    both arrays; without it ``confusion_matrix`` returns a 1x1 matrix and the
    four-way unpacking at the call site fails. Inputs are cast to int so the
    integer label list matches regardless of the arrays' float dtype.
    """
    matriz = confusion_matrix(np.squeeze(y_true).astype(int),
                              np.squeeze(y_pred).astype(int),
                              labels=[0, 1])
    return matriz.ravel()


min_trees = 1
max_trees = 20  # exclusive upper bound: rounds min_trees .. max_trees - 1 are trained
model_batch = datetime.now().strftime("%Y%m%d.%H%M%S")

# Probability above which a TEST sample is predicted positive.
# NOTE(review): TRAIN predictions are rounded instead (an effective 0.5
# cut-off), so train and test counts are not directly comparable — confirm
# that the asymmetry is intended.
TEST_THRESHOLD = 0.30

# Debug switch: when True, the false negatives of the last model are logged.
DUMP_FALSE_NEGATIVES = False

# dump_model / open(..., 'wb') do not create missing directories.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

# Everything below is identical for every model, so it is built once instead
# of once per iteration.
pagantes_train_y = pagantes_train_y.astype('float32')
pagantes_test_y = pagantes_test_y.astype('float32')
dtrain = xgb.DMatrix(pagantes_train_x, pagantes_train_y, feature_names = colunas_x)
dtest = xgb.DMatrix(pagantes_test_x, pagantes_test_y, feature_names = colunas_x)
train_labels = dtrain.get_label()
# Negatives per positive in the train split, used to re-weight the positive class.
ratio = float(np.sum(train_labels == 0)) / np.sum(train_labels == 1)

for eta in [0.3]:
    for depth in [4]:
        # `x` is the number of boosting rounds; the name is kept because it is
        # a notebook-level variable that other cells may read.
        for x in range(min_trees, max_trees):
            num_round = x
            param = {
                'booster': 'gbtree',
                'eta': eta,
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'tree_method': 'auto',
                'silent': 0,
                'max_depth': depth,
                'subsample': 0.5,
                'scale_pos_weight': ratio,
            }
            gc.collect()
            #print_log("Starting model for params:{}".format(param))
            #print_log("ratio:{}".format(ratio))
            # No evaluation sets are passed, so gpu_res stays empty.
            gpu_res = {}
            booster = xgb.train(param, dtrain, num_round, evals_result=gpu_res, evals = [])

            booster.dump_model(os.path.join(output_folder,"{}.{}.txt".format(model_batch,x)))
            # Feature scores, highest first.
            print_log(sorted( ((v,k) for k,v in booster.get_score().items()), reverse=True))

            # --- train metrics -------------------------------------------
            train_y_pred = booster.predict(dtrain)
            train_predictions = train_y_pred.astype('float32').round()
            tn, fp, fn, tp = _confusion_counts(pagantes_train_y, train_predictions)
            # Accuracy = correct predictions / all predictions. The previous
            # formula, 1 - (fp+fn)/(tp+tn), divided errors by correct
            # predictions, which is not a percentage of hits and could even
            # go negative.
            acerto_train = 100.0 * (tp + tn) / (tp + tn + fp + fn)
            msg = "(TRAIN)Number of Trees:{}, eta:{}, depth:{} ".format(x, eta, depth) + "True Positive:{} True Negative:{} False Positive:{} False Negative:{},  {}% de acerto no train".format(tp, tn, fp, fn, acerto_train)
            print_log(msg)

            # --- test metrics --------------------------------------------
            test_y_pred = booster.predict(dtest)
            # Vectorised threshold; also avoids a comprehension variable
            # named `x` shadowing the loop variable.
            test_predictions = (test_y_pred > TEST_THRESHOLD).astype('float32')
            tn, fp, fn, tp = _confusion_counts(pagantes_test_y, test_predictions)
            # Share of false negatives among ALL test samples (not fn / (fn + tp)).
            msg = "(TEST)Number of Trees:{}, eta:{}, depth:{} ".format(x, eta, depth) + "True Positive:{} True Negative:{} False Positive:{} False Negative:{},  {}% de falso negativo no test".format(tp, tn, fp, fn,  (fn / (tp + tn + fp + fn) * 100))
            print_log(msg)

            if DUMP_FALSE_NEGATIVES and x == (max_trees - 1):
                mascara_fn = (np.squeeze(pagantes_test_y) == 1) & (np.squeeze(test_predictions) == 0)
                falso_negativos = pagantes_test_x[mascara_fn]

                print_log(len(falso_negativos))
                for cur_falso_negativo in falso_negativos:
                    print_log(cur_falso_negativo)
                # NOTE(review): as in the original debug branch, this skips
                # pickling the last model — confirm that is intended.
                break

            save_file = os.path.join(output_folder,"{}.{}.model".format(model_batch,x))
            # Handle is named model_file so it no longer overwrites `fp`, the
            # false-positive count.
            with open(save_file, 'wb') as model_file:
                pickle.dump(booster, model_file)
            