# Deep Mailing - XGBoost Model

In [1]:
import xgboost
import numpy as np
import os
import sys
import logging
import gc
import pickle as pickle
from datetime import datetime

# --- Configuration ----------------------------------------------------------
# Directory that receives a new timestamped log file each time this cell runs.
log_location = "../logs/"
# Paths of the .npy arrays holding the train and test splits (x and y parts).
arquivo_df_pickled_norm_train_x = "../intermediate/df.norm.train.x.pickle.npy"
arquivo_df_pickled_norm_train_y = "../intermediate/df.norm.train.y.pickle.npy"
arquivo_df_pickled_norm_test_x = "../intermediate/df.norm.test.x.pickle.npy"
arquivo_df_pickled_norm_test_y = "../intermediate/df.norm.test.y.pickle.npy"

# --- Logging ----------------------------------------------------------------
# basicConfig(filename=...) opens the log file right away and raises
# FileNotFoundError when the directory is missing (e.g. on a fresh checkout),
# so make sure it exists first.
os.makedirs(log_location, exist_ok=True)

logger = logging.getLogger()
# The file name carries a microsecond-resolution timestamp so successive runs
# do not write to the same file.
logging.basicConfig(format="%(asctime)-15s %(message)s",
                    level=logging.DEBUG,
                    filename=os.path.join(log_location,'log.' + datetime.now().strftime("%Y%m%d%H%M%S.%f") + '.log'))

In [2]:
# Read the train and test arrays from the .npy paths set in the configuration
# cell; the files are loaded in the order train x, train y, test x, test y.
train_x, train_y = (
    np.load(arquivo_df_pickled_norm_train_x),
    np.load(arquivo_df_pickled_norm_train_y),
)
test_x, test_y = (
    np.load(arquivo_df_pickled_norm_test_x),
    np.load(arquivo_df_pickled_norm_test_y),
)

In [None]:
# Report how many rows of each split carry a positive label (value > 0).
# np.count_nonzero on the boolean mask counts in a single vectorised pass
# instead of visiting every array element in Python and building a throwaway
# list just to take its length.
# NOTE(review): assumes one label per row (1-D or a single column) -- confirm.
train_positives = int(np.count_nonzero(train_y > 0))
test_positives = int(np.count_nonzero(test_y > 0))

msg1 = "Train - CUPS Detectados {} num universo de {}".format(train_positives, len(train_y))
msg2 = "Test - CUPS Detectados {} num universo de {}".format(test_positives, len(test_y))

# Show the summary in the notebook and keep a copy in the log file.
print(msg1)
print(msg2)

logging.debug(msg1)
logging.debug(msg2)


Train - CUPS Detectados 2318 num universo de 967378
Test - CUPS Detectados 106 num universo de 73189


In [None]:
from sklearn.metrics import accuracy_score

# Tree depths to sweep; one classifier is trained and saved per value.
model_depths_to_train = [1,2,5,10,20,50]

# Create the output directory before any training starts: otherwise a missing
# folder is only discovered at open() below, after a model has already been
# fitted, and that model is lost.
output_location = "../output/"
os.makedirs(output_location, exist_ok=True)

for depth in model_depths_to_train:
    # Release memory held by the previous iteration's model before fitting again.
    gc.collect()
    logging.debug("Starting model for depth:{}".format(depth))
    model = xgboost.XGBClassifier(silent=True, max_depth=depth)
    # ravel() hands fit() a flat label vector.
    model.fit(train_x, train_y.ravel())
    # predict() already returns an array; no element-by-element copy is needed.
    test_predictions = model.predict(test_x)
    # NOTE(review): the recorded run shows 106 positives in 73189 test rows, so
    # a model that always predicts the negative class already scores ~99.86%
    # accuracy. Consider also logging precision/recall before comparing depths.
    accuracy = accuracy_score(test_y, test_predictions)
    logging.debug("Rows on test:{} - {}".format(len(test_predictions), len(test_y)))
    logging.debug("Accuracy on test : %.8f%%" % (accuracy * 100.0))
    # File name: <timestamp>.<depth>.<accuracy in whole percent, zero-padded to 4>.model
    save_file = "../output/{}.{}.{}.model".format(datetime.now().strftime("%Y%m%d.%H%M%S"),depth,str(int(accuracy*100.0)).zfill(4))
    with open(save_file, 'wb') as fp:
        pickle.dump(model, fp)
    logging.debug("Model saved as {}".format(save_file))
    