In [None]:
import xgboost
import numpy as np
import os
import sys
import logging
import gc
import pickle as pickle
import pandas as pd
import dateutil.parser as parser
import os.path
import math
from sklearn.metrics import accuracy_score,precision_score,recall_score, confusion_matrix
from datetime import datetime
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_tree
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from pandas_ml import ConfusionMatrix

In [None]:
# ---------------------------------------------------------------------------
# Experiment scope
# ---------------------------------------------------------------------------
# Value matched against the CLUSTER column when the data set is filtered.
cluster = 2

# Fractions of the shuffled rows taken for training (tail) and testing (head).
proporcao_train, proporcao_test = 0.7, 0.3

# ---------------------------------------------------------------------------
# Hyper-parameter grid
# ---------------------------------------------------------------------------
# Bounds handed to range(min_trees, max_trees) for the number of boosting rounds.
min_trees, max_trees = 5, 20
learning_rates_to_run = [0.3]
depth_to_run = [3, 5, 10, 20]

# ---------------------------------------------------------------------------
# Filesystem locations (relative to the notebook's working directory)
# ---------------------------------------------------------------------------
log_location = "../../logs/"
arquivo_pagantes_norm = "../../data/batch03/intermediate/Herval.normalized.pickle"
# Prefix of the "<prefix>.txt" column-reference file.
arquivo_pagantes_norm_train_x = "../../data/batch03/intermediate/Herval.normalized.train.x.pickle"

# NOTE(review): the two paths below are not referenced by the visible cells.
txt_dump_model = "../../data/batch03/intermediate/model_generated.txt"
txt_feat_map_model = "../../data/batch03/intermediate/feat_map_generated.txt"

# Destination of model dumps, saved boosters and the results spreadsheet.
output_folder= "../../data/batch03/model/"

In [None]:
# Route DEBUG-and-above records of the root logger to a fresh, time-stamped
# file inside `log_location` (one file per notebook run).
log_timestamp = datetime.now().strftime("%Y%m%d%H%M%S.%f")
log_file_name = 'xgboost.log.' + log_timestamp + '.log'
logging.basicConfig(
    filename=os.path.join(log_location, log_file_name),
    level=logging.DEBUG,
    format="%(asctime)-15s %(message)s",
)
logger = logging.getLogger()

In [None]:
def print_log(msg):
    """Record `msg` at DEBUG level through the logging module and echo it to stdout."""
    logging.log(logging.DEBUG, msg)
    print(msg)
    
def log(msg):
    """Record `msg` at DEBUG level through the logging module without printing it."""
    logging.log(logging.DEBUG, msg)
    

In [None]:
# Load the normalised data set and keep only the rows of the configured cluster.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
print_log("Carregando Pickling normalizado:{}".format(arquivo_pagantes_norm))
filtro_cluster = "CLUSTER == {}".format(cluster)
pagantes = pd.read_pickle(arquivo_pagantes_norm).query(filtro_cluster)

In [None]:
# Number of rows left after the cluster filter.
total_pagantes = pagantes.shape[0]
print_log("Total pagantes:{}".format(total_pagantes))

In [None]:
def create_column_reference(header_chamadas_x,arquivo_df_pickled_norm_train_x):
    """Write a text file mapping each column position to its column name.

    Parameters
    ----------
    header_chamadas_x : object exposing ``.columns.values``
        Frame whose column labels are listed; its rows are not read.
    arquivo_df_pickled_norm_train_x : str
        Path prefix; the reference is written to ``<prefix>.txt`` (overwritten
        if it already exists).

    The file holds one ``<position>-<column name>`` line per column, with
    positions starting at 0.
    """
    print_log("Criando Arquivo de referencia de colunas...")
    reference_path = arquivo_df_pickled_norm_train_x + ".txt"
    with open(reference_path, "w") as reference_file:
        column_names = list(header_chamadas_x.columns.values)
        for position, column_name in enumerate(column_names):
            reference_file.write("{}-{}\n".format(position, column_name))

In [None]:
# Shuffle all rows once, then carve the training set from the tail and the
# test set from the head so the two never overlap.
print_log("Criando dataframes de train e teste...")

# Fixed seed: without it every run produces a different split, so saved models
# and reported metrics could not be reproduced or compared between runs.
split_random_state = 42
pagantes = pagantes.sample(frac=1, random_state=split_random_state)

total_linhas = len(pagantes.index)
# int() truncates, so at most a couple of rows in the middle may end up in
# neither set; the two slices can never share a row.
pagantes_train = pagantes.tail(int(total_linhas * proporcao_train))
pagantes_test = pagantes.head(int(total_linhas * proporcao_test))

# Release the full frame; only the two splits are used from here on.
del pagantes

In [None]:
# Persist the position-to-name mapping of the feature columns; a single row is
# enough because only the column labels are read.
colunas_referencia = pagantes_train.loc[:, 'NORM_CLASSE_SOCIAL_A1':'NORM_RENDA_PRESUMIDA'].head(1)
create_column_reference(colunas_referencia, arquivo_pagantes_norm_train_x)

In [None]:
# Features are the label slice NORM_CLASSE_SOCIAL_A1 .. NORM_RENDA_PRESUMIDA
# (both ends inclusive); the target is the single column PAGOU. Slicing
# 'PAGOU':'PAGOU' (instead of selecting the Series) keeps the target
# two-dimensional with one column.
pagantes_train_x = pagantes_train.loc[:, 'NORM_CLASSE_SOCIAL_A1':'NORM_RENDA_PRESUMIDA']
pagantes_train_y = pagantes_train.loc[:, 'PAGOU':'PAGOU']

pagantes_test_x = pagantes_test.loc[:, 'NORM_CLASSE_SOCIAL_A1':'NORM_RENDA_PRESUMIDA']
pagantes_test_y = pagantes_test.loc[:, 'PAGOU':'PAGOU']

# Capture the column labels before the frames are replaced by plain arrays.
colunas_x = list(pagantes_train_x.columns.values)
colunas_y = pagantes_train_y.columns.values

# `.values` instead of `.as_matrix()`: as_matrix was deprecated and later
# removed from pandas, while `.values` returns the same ndarray in every version.
pagantes_train_x = pagantes_train_x.values
pagantes_train_y = pagantes_train_y.values

pagantes_test_x = pagantes_test_x.values
pagantes_test_y = pagantes_test_y.values

In [None]:
#msg1 = "Train - Pagantes Detectados {} num universo de {}".format(len([y for y in pagantes_train_y if y >0]),len(pagantes_train_y))
#msg2 = "Test - Pagantes Detectados {} num universo de {}".format(len([y for y in pagantes_test_y if y >0]),len(pagantes_test_y))
#print_log(msg1)
#print_log(msg2)

In [None]:
#pagantes = (len([y for y in pagantes_train_y if y >0]),len(pagantes_train_y))
#print(pagantes)
#ratio = pagantes[1] / pagantes[0]

In [None]:
# Identifier shared by every artefact written during this run (date.time).
batch_started_at = datetime.now()
model_batch = batch_started_at.strftime("%Y%m%d.%H%M%S")

# One dict per (model, threshold) evaluation is appended here by the search loop.
tested_hyper_parameters = list()

def create_booster(eta,depth,num_trees):
    """Train one xgboost booster on the module-level training matrices.

    Parameters
    ----------
    eta : learning rate, passed as the ``eta`` parameter.
    depth : passed as the ``max_depth`` parameter.
    num_trees : number of boosting rounds handed to ``xgb.train``.

    Returns
    -------
    (booster, dtrain, dtest)
        The trained booster plus the DMatrix objects built from
        ``pagantes_train_x/y`` and ``pagantes_test_x/y``, both labelled with
        ``colunas_x`` as feature names.
    """
    dtrain = xgb.DMatrix(pagantes_train_x, pagantes_train_y, feature_names = colunas_x)
    dtest = xgb.DMatrix(pagantes_test_x, pagantes_test_y, feature_names = colunas_x)

    # Class-imbalance weight: number of 0-labels divided by number of 1-labels
    # in the training set.
    labels = dtrain.get_label()
    negatives = float(np.sum(labels == 0))
    positives = np.sum(labels == 1)

    param = {
        'booster': 'gbtree',
        'eta': eta,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'auto',
        'silent': 0,
        'max_depth': depth,
        'subsample': 0.5,
        'scale_pos_weight': negatives / positives,
    }

    # No evaluation sets are passed (evals=[]); the results dict is local and
    # is not returned.
    evals_result = {}
    booster = xgb.train(param, dtrain, num_trees, evals_result=evals_result, evals = [])
    return booster, dtrain, dtest

# Grid search over (eta, depth, num_trees). Each trained booster is dumped and
# saved, then evaluated on the train and test sets at 30 decision thresholds
# (0.50 down to 0.21 in steps of 0.01). One summary dict per (model, threshold)
# is appended to `tested_hyper_parameters`.

_METRICS_MSG = (
    "({})Number of Trees:{}, eta:{}, depth:{} threshold:{:5.2f} "
    "True Positive:{} True Negative:{} False Positive:{} False Negative:{},  "
    "{}% de pagantes perdidos no {}"
)


def _confusion_counts(y_true, scores, threshold):
    """Return (tn, fp, fn, tp) for `scores` binarised at `threshold`.

    A score strictly greater than `threshold` is a positive prediction.
    `labels=[0, 1]` forces a 2x2 matrix even if only one class is present, so
    the four-way unpacking done by callers cannot fail.
    """
    predictions = (np.asarray(scores, dtype='float64') > threshold).astype('int64')
    return confusion_matrix(y_true, predictions, labels=[0, 1]).ravel()


# Ground-truth labels do not change during the search: flatten and count once.
train_y_true = np.asarray(pagantes_train_y).reshape(-1).astype('int64')
test_y_true = np.asarray(pagantes_test_y).reshape(-1).astype('int64')
total_pagantes_train = int(np.sum(train_y_true == 1))
total_pagantes_test = int(np.sum(test_y_true == 1))

for eta in learning_rates_to_run:
    for depth in depth_to_run:
        # NOTE(review): range() stops before max_trees - confirm that excluding
        # max_trees itself is intended.
        for num_trees in range(min_trees, max_trees):
            gc.collect()
            booster, dtrain, dtest = create_booster(eta,depth,num_trees)
            # NOTE(review): file names do not include eta; with more than one
            # learning rate, later models would overwrite earlier ones.
            booster.dump_model(os.path.join(output_folder,"{}.{}.{}.txt".format(model_batch,depth,num_trees)))
            save_file = os.path.join(output_folder,"{}.{}.{}.xgboost.model".format(model_batch,depth,num_trees))
            relevant_features = sorted( ((v,k) for k,v in booster.get_score().items()), reverse=True)

            # Scores depend only on the booster, not on the threshold, so they
            # are predicted once per model instead of once per threshold.
            train_scores = booster.predict(dtrain)
            test_scores = booster.predict(dtest)

            for current_threshold in range(0, 30):
                threshold = 0.5 - (current_threshold / 100.0)

                # --- train set (written to the log file only) ---
                tn, fp, fn, tp = _confusion_counts(train_y_true, train_scores, threshold)
                pagantes_perdidos_train = (fn / float(total_pagantes_train)) * 100
                log(_METRICS_MSG.format("TRAIN", num_trees, eta, depth, threshold,
                                        tp, tn, fp, fn, pagantes_perdidos_train, "train"))

                # --- test set ---
                tn, fp, fn, tp = _confusion_counts(test_y_true, test_scores, threshold)
                # Share of real payers in the TEST set that the model misses;
                # the denominator must be the test-set payer count.
                pagantes_perdidos_test = (fn / float(total_pagantes_test)) * 100
                log(_METRICS_MSG.format("TEST", num_trees, eta, depth, threshold,
                                        tp, tn, fp, fn, pagantes_perdidos_test, "test"))

                # Recall on payers vs. fraction of the base flagged as positive.
                porcentagem_pagamentos = tp / float(tp + fn)
                base_para_trabalhar = (tp + fp) / float(tp + tn + fn + fp)
                # Numeric key prefixes keep the spreadsheet columns in this
                # order when the column labels are sorted.
                current_info = {
                    "0cluster" : cluster,
                    "1model" : save_file,
                    "2num_trees" : num_trees,
                    "3eta" : eta,
                    "4depth" : depth,
                    "5relevante_features" : str(relevant_features),
                    "6threshold" : threshold,
                    "7test0_total_pagantes" : total_pagantes_test,
                    "7test1_true_negative" : tn,
                    "7test2_true_positive" : tp,
                    "7test3_false_positive" : fp,
                    "7test4_false_negative" : fn,
                    "7test5_pagantes_perdidos" : pagantes_perdidos_test,
                    "8%_pagamentos" : porcentagem_pagamentos,
                    "8%_base_para_trabalhar" : base_para_trabalhar,
                    "9delta" : porcentagem_pagamentos - base_para_trabalhar
                }
                tested_hyper_parameters.append(current_info)
            booster.save_model(save_file)


In [None]:
# Export every recorded (model, threshold) evaluation as one spreadsheet row.
resultados = pd.DataFrame(tested_hyper_parameters)
arquivo_resultados = os.path.join(output_folder, "{}.model.xls".format(model_batch))
resultados.to_excel(arquivo_resultados)