In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import xgboost as xgb

In [2]:
from sklearn.metrics import mean_absolute_error

In [3]:
from datetime import date

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

In [5]:
# set the seed of random number generator, which is useful for creating simulations 
# or random objects that can be reproduced.
import random
SEED=3
random.seed(SEED)
np.random.seed(SEED)

In [6]:
# Load Train Data
train = pd.read_pickle('../data/processed/train_nochanel_uniqueidpos_x_envios_feateng.pkl')

In [7]:
train.shape

(11810, 94)

In [8]:
train['fecha_venta_norm'] = pd.to_datetime(train['fecha_venta_norm'])

In [9]:
train['fecha_venta_norm'] = train['fecha_venta_norm'].dt.date

In [10]:
predictors = [
    'id_pos', 
    #'canal',
    'competidores',
    'ingreso_mediana',
    'ingreso_promedio',
    'densidad_poblacional',
    'pct_0a5',
    'pct_5a9',
    'pct_10a14',
    'pct_15a19',
    'pct_20a24',
    'pct_25a29',
    'pct_30a34',
    'pct_35a39',
    'pct_40a44',
    'pct_45a49',
    'pct_50a54',
    'pct_55a59',
    'pct_60a64',
    'pct_65a69',
    'pct_70a74',
    'pct_75a79',
    'pct_80a84',
    'pct_85ainf',
    'pct_bachelors',
    'pct_doctorados',
    'pct_secundario',
    'pct_master',
    'pct_bicicleta',
    'pct_omnibus',
    'pct_subtes',
    'pct_taxi',
    'pct_caminata',
    'mediana_valor_hogar',
    #'unidades_despachadas_sum', 
    #'unidades_despachadas_max',
    #'unidades_despachadas_min', 
    #'unidades_despachadas_avg',
    #'cantidad_envios_max', 
    #'cantidad_envios_min', 
    #'cantidad_envios_avg',
    #'num_cantidad_envios', 
    #'unidades_despachadas_sum_acum', 
    #'unidades_despachadas_sum_acum_3p',
    #'unidades_despachadas_sum_acum_6p', 
    #'unidades_despachadas_max_acum', 
    #'unidades_despachadas_min_acum', 
    #'num_cantidad_envios_acum', 
    #'num_cantidad_envios_acum_3per',
    #'num_cantidad_envios_acum_6per', 
    #'diff_dtventa_dtenvio',
    'unidades_before', 
    'num_ventas_before',
    'rel_unidades_num_ventas', 
    'unidades_acum', 
    'num_ventas_acum',
    'countacum', 'unidades_mean', 
    'num_ventas_mean',
    'unidades_2time_before', 
    'unidades_diff', 
    'month',
    'diff_dtventa_dtventa_before', 
    'unidades_pend',
    ]

In [11]:
train = train[predictors]

#### encode catvars

In [12]:
le = preprocessing.LabelEncoder()

In [13]:
classes = train['canal'].unique()

In [14]:
classes = [i for i in classes]

In [15]:
classes.append('NAN')

In [16]:
le.fit(classes)

LabelEncoder()

In [17]:
np.save('../models/canal_le.npy', le.classes_)

In [18]:
train['canal'] = le.transform(train['canal'].values)

In [12]:
X, y = train.iloc[:,:-1],train.iloc[:,-1]


#### Building final model

In [13]:
model = xgb.XGBRegressor(seed = SEED)

In [14]:
model.set_params(objective = 'reg:squarederror')
model.set_params(gpu_id = 0)
model.set_params(max_bin= 16)
model.set_params(tree_method='gpu_hist')
model.set_params(learning_rate = 0.01)
model.set_params(n_estimators = 273)
model.set_params(max_depth = 4)
model.set_params(min_child_weight = 5)
model.set_params(gamma = 0.0)
model.set_params(colsample_bytree = 0.9)
model.set_params(subsample = 0.8)
model.set_params(reg_alpha = 1)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.9, gamma=0.0, gpu_id=0,
       importance_type='gain', learning_rate=0.01, max_bin=16,
       max_delta_step=0, max_depth=4, min_child_weight=5, missing=None,
       n_estimators=273, n_jobs=1, nthread=None,
       objective='reg:squarederror', random_state=0, reg_alpha=1,
       reg_lambda=1, scale_pos_weight=1, seed=3, silent=None,
       subsample=0.8, tree_method='gpu_hist', verbosity=1)

In [15]:
model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.9, gamma=0.0, gpu_id=0,
       importance_type='gain', learning_rate=0.01, max_bin=16,
       max_delta_step=0, max_depth=4, min_child_weight=5, missing=None,
       n_estimators=273, n_jobs=1, nthread=None,
       objective='reg:squarederror', random_state=0, reg_alpha=1,
       reg_lambda=1, scale_pos_weight=1, seed=3, silent=None,
       subsample=0.8, tree_method='gpu_hist', verbosity=1)

In [16]:
y_pred = model.predict(X)

In [17]:
print("MAE unidades: ",mean_absolute_error(y, y_pred))

MAE unidades:  18.158011187190326


In [18]:
print("median unidades: ", np.median(y))

median unidades:  12.0


In [19]:
print("median unidades pred: ", np.median(y_pred))

median unidades pred:  16.386812


In [20]:
import pickle
#save model
pickle.dump(model, open("../models/xgboost_013.pkl","wb")) 