In [28]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)



In [29]:
import pandas as pd
import numpy as np
import datetime

#### Datos normalizados

In [30]:
df = pd.read_feather('train_normalized_data.fth')
df_test = pd.read_feather('test_normalized_data.fth')

#### Separo categoricas y continuas

In [31]:
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen', 'Promo2Weeks', 
            'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State', 
            'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_bool_fw', 'StateHoliday_bool_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw']

In [32]:
contin_vars = ['CompetitionDistance', 
   'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC', 'Precipitationmm',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday_bool', 'BeforeStateHoliday_bool', 'Promo', 'SchoolHoliday', 'StateHoliday_bool']

In [33]:
y_out_columns = ['Sales']

### LightGBM

#### Split de datos para entrenar

In [34]:
# split_type = 'random'
# split_type = 'no_split'
split_type = 'last_week'

In [35]:
if split_type == 'no_split':
    df_train = df
elif split_type == 'last_week':
    # Esto divide en train y val
    df_train = df[df.Date < datetime.datetime(2015, 7, 1)]  
    df_val = df[df.Date >= datetime.datetime(2015, 7, 1)]
    print(f'Cantidad en val: {len(df_val)}, porcentaje: {len(df_val)/(len(df_train) + len(df_val))}')
elif split_type == 'random':
    # Splitting aleatorio
    np.random.seed(42)
    indexes = np.arange(len(df))
    np.random.shuffle(indexes)
    N = len(df)//5
    df_train = df[N:]
    df_val = df[:N]
    print(f'Cantidad en val: {len(df_val)}, porcentaje: {len(df_val)/(len(df_train) + len(df_val))}')

Cantidad en val: 30188, porcentaje: 0.035753454185409164


In [36]:
X_train = df_train[cat_vars + contin_vars]
if split_type != 'no_split':
    X_val = df_val[cat_vars + contin_vars]
X_test = df_test[cat_vars + contin_vars]

#### Normalización de los datos

In [37]:
log_output = False
    
if log_output:
    # Escala logaritmica
    max_log_y = np.max(np.log(df[y_out_columns])).values
    y_train = np.log(df_train[y_out_columns].values)/max_log_y
    if split_type != 'no_split':
        y_val = np.log(df_val[y_out_columns].values)/max_log_y
else:
    # Normalización
    y_mean = df_train[y_out_columns].mean().values
    y_std = df_train[y_out_columns].std().values
    y_train = (df_train[y_out_columns].values - y_mean)/y_std
    if split_type != 'no_split':
        y_val = (df_val[y_out_columns].values - y_mean)/y_std

#### Armado del modelo

In [38]:
from sklearn.model_selection import cross_val_score
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from lightgbm import LGBMRegressor

In [39]:
def calculate_RMSE(X, y, log_output=True):
    y_preds = np.exp(model.predict(X, verbose=1)*max_log_y)
    return np.sqrt((((y - y_preds)/y)**2).sum()/len(y_preds))

In [40]:
min_child_samples=5
n_estimators=2000
learning_rate=0.25
model = LGBMRegressor(min_child_samples=min_child_samples, n_estimators=n_estimators, learning_rate=learning_rate )

In [41]:
fit_params={"early_stopping_rounds":100, 
            "eval_metric" : 'l2', 
            "eval_set" : [(X_val, y_val.reshape(-1))],
            'eval_names': ['valid'],
            'verbose': 100,
            'feature_name': 'auto', # that's actually the default
            'categorical_feature': cat_vars
           }

In [42]:
model.fit(X_train, y_train.reshape(-1), **fit_params)

New categorical_feature is ['Assortment', 'CompetitionMonthsOpen', 'CompetitionOpenSinceYear', 'Day', 'DayOfWeek', 'Events', 'Month', 'Promo2SinceYear', 'Promo2Weeks', 'PromoInterval', 'Promo_bw', 'Promo_fw', 'SchoolHoliday_bw', 'SchoolHoliday_fw', 'State', 'StateHoliday', 'StateHoliday_bool_bw', 'StateHoliday_bool_fw', 'Store', 'StoreType', 'Week', 'Year']


Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 0.0990793
[200]	valid's l2: 0.104567
Early stopping, best iteration is:
[161]	valid's l2: 0.092899


LGBMRegressor(learning_rate=0.25, min_child_samples=5, n_estimators=2000)

#### Análisis de la métrica

In [43]:
model.score(X_val, y_val)

0.8958604498002849

In [44]:
if log_output:
    y_pred_train = np.exp(model.predict(X_train, verbose=1)*max_log_y)
    y_pred = np.exp(model.predict(X_val, verbose=1)*max_log_y)
    y_pred_test = np.exp(model.predict(X_test, verbose=1)*max_log_y)
else:
    y_pred_train = model.predict(X_train, verbose=1)*y_std + y_mean
    y_pred = model.predict(X_val, verbose=1)*y_std + y_mean
    y_pred_test = model.predict(X_test, verbose=1)*y_std + y_mean

In [45]:
# Train
np.sqrt((((df_train['Sales'].values - y_pred_train)/df_train['Sales'].values)**2).sum()/len(y_pred_train))

0.17750948033554367

In [46]:
# Validación
np.sqrt((((df_val['Sales'].values - y_pred)/df_val['Sales'].values)**2).sum()/len(y_pred))

0.15102391658926115

#### LightGBM con Hyperopt

In [54]:
def predicctionDesNormalizacion(model, soloVal=False):
  

  if log_output:
      if soloVal:
        y_pred = np.exp(model.predict(X_val, verbose=1)*max_log_y)
      else:
        y_pred_train = np.exp(model.predict(X_train, verbose=1)*max_log_y)
        y_pred = np.exp(model.predict(X_val, verbose=1)*max_log_y)
        y_pred_test = np.exp(model.predict(X_test, verbose=1)*max_log_y)
  else:
      if soloVal:
        y_pred = model.predict(X_val, verbose=1)*y_std + y_mean
      else:
        y_pred_train = model.predict(X_train, verbose=1)*y_std + y_mean
        y_pred = model.predict(X_val, verbose=1)*y_std + y_mean
        y_pred_test = model.predict(X_test, verbose=1)*y_std + y_mean
  if soloVal:
    return y_pred
  return y_pred_train, y_pred, y_pred_test

In [49]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt import Trials
import hyperopt

In [52]:
search_HyperOptSearch = {
    'learning_rate': hp.loguniform(low=0.01, high=1, label='learning_rate'),
    'max_depth':  hp.choice('max_depth', list(np.arange(0, 500))),
    'num_leaves': hp.choice('num_leaves', list(np.arange(2, 150))), #max number of leaves in one tree        
    'min_child_samples': hp.choice('min_child_samples', list(np.arange(0, 200))), # minimal number of data in one leaf
    'reg_lambda': hp.loguniform('reg_lambda',1e-9, 1000), # L2 regularization
    'reg_alpha': hp.loguniform('reg_alpha',1e-9, 1.0), # L1 regularization
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0), # enabler of bagging fraction
    'min_child_weight': hp.choice('min_child_weight', list(np.arange(0, 10,))), # minimal number of data in one leaf.
    'n_estimators': hp.choice('n_estimators', list(np.arange(10, 1000))) # cant. de estimadores secuenciales (se pone alto stopea earlystopping)    
}
max_iterSearch = 500

In [53]:
def objective(params):
        
    clf = LGBMRegressor(**params)    
    score = -cross_val_score(clf, X_train.values, y_train.reshape(-1), cv=4, scoring='neg_root_mean_squared_error', verbose=1).mean()
    return score

tpe_trials = Trials()    


%time hyperOptLightGBM = fmin(fn=objective, space=search_HyperOptSearch, verbose=2, algo=hyperopt.tpe.suggest, max_evals=max_iterSearch)

  0%|                                                                          | 0/500 [00:00<?, ?trial/s, best loss=?]

  return np.exp(draw)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.



  0%|                                                                          | 0/500 [00:08<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
print('best parameters:', hyperOptLightGBM)

In [None]:
model = LGBMRegressor(**hyperOptLightGBM)
model.fit(X_train, y_train.reshape(-1), **fit_params)
y_pred_train, y_pred, y_pred_test = predicctionDesNormalizacion(model)

print('RMSE en train:',np.sqrt((((df_train['Sales'].values - y_pred_train)/df_train['Sales'].values)**2).sum()/len(y_pred_train)))
print('RMSE en val:', np.sqrt((((df_val['Sales'].values - y_pred)/df_val['Sales'].values)**2).sum()/len(y_pred)))