In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import math

In [None]:
import sys
sys.path.append('../src/main')

In [None]:
from sklearn.model_selection import train_test_split
from dsbase.ModelDSBase import ModelDSBaseWrapper

In [None]:
from dsbase.models.regression.LightGradientBoostingRegressionDSBase import LightGradientBoostingRegressionDSBaseModel
from dsbase.models.regression.LightGradientBoostingRegressionDSBase import LightGradientBoostingRegressionDSBaseModelParamsToMap

# Data Loading 

In [None]:
# Load the 'dataset1A' CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/predict-sales/dataset1A.csv')

In [None]:
# 'Unnamed: 0' is the name pandas assigns to a CSV column without a header
# (typically an index written by a previous to_csv()). Rebinding `df` with
# errors='ignore' instead of an in-place drop keeps this cell idempotent:
# re-running it no longer raises KeyError once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Data Cleaning 

In [None]:
def cleanDataset(df):
    """Filter rows without evidence and impute the nullable feature/target columns.

    Steps:
      1. Keep only rows whose 'date_block_num' is not NaN.
      2. For every nullable feature column create '<col>_imputed' (NaN -> 0)
         and '<col>_na_indicator' (1 where the original value was NaN, else 0),
         then drop the original column.
      3. Replace 'target' with 'target_imputed' (NaN -> 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing 'date_block_num', 'target' and the six nullable
        feature columns listed in ``feature_pairs`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is not modified.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    # Explicit copy: the column assignments below must write to an independent
    # frame, not to a slice of `df` (avoids SettingWithCopyWarning).
    df_cleaned = df[~df['date_block_num'].isna()].copy()

    # Columns are processed pair by pair so that the resulting column order
    # (imputed, imputed, indicator, indicator) stays the same as before and
    # the feature matrix derived from this frame keeps its layout.
    feature_pairs = [
        ('STD(sales.item_cnt_day)', 'STD(sales.item_price)'),
        ('SKEW(sales.item_cnt_day)', 'SKEW(sales.item_price)'),
        ('skew_shop_cat_day', 'skew_shop_cat_item_price'),
    ]
    for pair in feature_pairs:
        # Each derived column is computed from its OWN source column; the
        # previous version derived the 'SKEW(sales.item_price)' columns from
        # 'SKEW(sales.item_cnt_day)' by mistake.
        for col in pair:
            df_cleaned[col + '_imputed'] = df_cleaned[col].fillna(0)
        for col in pair:
            df_cleaned[col + '_na_indicator'] = df_cleaned[col].isna().astype('int64')

    original_columns = [col for pair in feature_pairs for col in pair]
    df_cleaned = df_cleaned.drop(columns=original_columns)

    # Imputing target
    df_cleaned['target_imputed'] = df_cleaned['target'].fillna(0)
    df_cleaned = df_cleaned.drop(columns=['target'])

    return df_cleaned

In [None]:
# Run the cleaning/imputation step on the loaded frame.
df_cleaned = cleanDataset(df)

# Preparing data for ML 

In [None]:
# Target vector: the imputed target column as a NumPy array.
y = df_cleaned['target_imputed'].values
# Feature matrix: every remaining column, in frame order, as a NumPy array.
X = df_cleaned.drop(columns=['target_imputed']).values

# Simple case (Just first test)

In [None]:
# Hold out 33% of the rows as the test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Hyper-parameters used for this first run.
params = LightGradientBoostingRegressionDSBaseModelParamsToMap(max_depth=31, n_estimators=100, learning_rate=0.1,num_leaves=31, subsample_for_bin=200000, reg_alpha=0, reg_lambda=0)
# Alternative configuration with non-zero reg_alpha/reg_lambda, kept for reference (disabled).
#params = LightGradientBoostingRegressionDSBaseModelParamsToMap(max_depth=20, n_estimators=70, learning_rate=0.1, num_leaves=31, subsample_for_bin=1000000, reg_alpha=100000, reg_lambda=100)
# Disabled variant that passes a list from 20 to 100 in steps of 5 instead of [100].
# NOTE(review): presumably these are training-set percentages for the learning curves - confirm in ModelDSBaseWrapper.
#lgbr = ModelDSBaseWrapper('LGBR',X_train,y_train,X_test, y_test,[20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100],LightGradientBoostingRegressionDSBaseModel,params)
lgbr = ModelDSBaseWrapper('LGBR',X_train,y_train,X_test, y_test,[100],LightGradientBoostingRegressionDSBaseModel,params)

In [None]:
# Train through the wrapper (the training logic lives in ModelDSBaseWrapper, not shown here).
lgbr.train()

In [None]:
# Keep the learning curves returned by the wrapper; they are plotted in the next cell.
lclgdbr=lgbr.getLearningCurves()

In [None]:
# Plot row 0 of the learning curves in blue and row 1 in red.
# NOTE(review): presumably train vs. test score - confirm against getLearningCurves() before labelling the axes.
plt.plot(lclgdbr[0,:],'b',lclgdbr[1,:],'r')

In [None]:
# Display the score reported by the wrapper (the metric is defined in ModelDSBaseWrapper, not shown here).
lgbr.getScore()

# Cross-Validation Optimization Method for each model

Time required for a 5-fold cross-validation process: 2 min

In [None]:
from dsbase.SearchOptimumParams import evaluateParams, randomElement, showSearchOptimumHyperParametersReport

In [None]:
# Candidate values per hyper-parameter; the random search below samples from these lists.
max_depth=[10,20,30,40,50,60,70,80,90,100]
n_estimators=[70,80,90,100,110,120,130,140,150]
learning_rate=[0.01,0.03,0.1,0.3,1]
subsample_for_bin=[75000,100000,150000,200000,500000,1000000,1300000,2000000,2300000]
num_leaves=[7,15,31,63,127]
reg_alpha=[0,50000,75000,90000,100000,120000]
reg_lambda=[0,20,50,100,150,200]

In [None]:
# Number of random hyper-parameter combinations to generate and evaluate.
num_tries = 50

In [None]:
# Draw `num_tries` random hyper-parameter combinations from the candidate lists.
# num_leaves is now sampled too: its candidate list is declared in the search
# space but was never passed to the params builder, so those candidates were
# ignored by the search.
params = [
    LightGradientBoostingRegressionDSBaseModelParamsToMap(
        max_depth=randomElement(max_depth),
        n_estimators=randomElement(n_estimators),
        learning_rate=randomElement(learning_rate),
        num_leaves=randomElement(num_leaves),
        subsample_for_bin=randomElement(subsample_for_bin),
        reg_alpha=randomElement(reg_alpha),
        reg_lambda=randomElement(reg_lambda),
    )
    for _ in range(num_tries)
]

In [None]:
# Evaluate every sampled combination on the full X/y.
# NOTE(review): the third argument (5) presumably is the number of CV folds, matching the "5-fold" note above - confirm in evaluateParams.
tries = evaluateParams(X, y, 5, LightGradientBoostingRegressionDSBaseModel, 'LGBR', params, num_tries)

In [None]:
# Show the report for the evaluated combinations (format defined in dsbase.SearchOptimumParams).
showSearchOptimumHyperParametersReport(tries)

# Ensembling

In [None]:
def processModel(databaseName, modelName):
    """Load one predict-sales CSV, clean it, train a wrapped regressor and print its score.

    Parameters
    ----------
    databaseName : str
        File name (without the '.csv' extension) inside '../datasets/predict-sales/'.
    modelName : str
        Name handed to ModelDSBaseWrapper and shown in the printed score line.

    Returns
    -------
    ModelDSBaseWrapper
        The wrapper after train(), getLearningCurves() and getScore() have been called.
    """
    csv_path = '../datasets/predict-sales/' + databaseName + '.csv'
    raw_frame = pd.read_csv(csv_path)
    raw_frame = raw_frame.drop(columns=['Unnamed: 0'])
    cleaned_frame = cleanDataset(raw_frame)

    # Features are every column except the imputed target.
    targets = cleaned_frame['target_imputed'].values
    features = cleaned_frame.drop(columns=['target_imputed']).values
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.33, random_state=42
    )

    hyper_params = LightGradientBoostingRegressionDSBaseModelParamsToMap(
        max_depth=31,
        n_estimators=100,
        learning_rate=0.1,
        num_leaves=31,
        subsample_for_bin=200000,
        reg_alpha=0,
        reg_lambda=0,
    )
    wrapper = ModelDSBaseWrapper(
        modelName,
        X_train,
        y_train,
        X_test,
        y_test,
        [100],
        LightGradientBoostingRegressionDSBaseModel,
        hyper_params,
    )
    wrapper.train()
    # The curves themselves are not used here; the call is kept as in the original flow.
    wrapper.getLearningCurves()
    print('Score for', modelName, wrapper.getScore())
    return wrapper

## Phase 1 (LightGBM for each 1/2 dataset)

In [None]:
# Train on dataset1A, then call the wrapper's save() (defined in ModelDSBaseWrapper, not shown here).
model1 = processModel('dataset1A','model1')
model1.save()

In [None]:
# Train on dataset2A, then call the wrapper's save() (defined in ModelDSBaseWrapper, not shown here).
model2 = processModel('dataset2A','model2')
model2.save()

In [None]:
# Train on dataset3A, then call the wrapper's save() (defined in ModelDSBaseWrapper, not shown here).
model3 = processModel('dataset3A','model3')
model3.save()

In [None]:
# Train on dataset4A, then call the wrapper's save() (defined in ModelDSBaseWrapper, not shown here).
model4 = processModel('dataset4A','model4')
model4.save()

In [None]:
# Train on dataset5A, then call the wrapper's save() (defined in ModelDSBaseWrapper, not shown here).
model5 = processModel('dataset5A','model5')
model5.save()

In [None]:
# Train on dataset6A, then call the wrapper's save() (defined in ModelDSBaseWrapper, not shown here).
model6 = processModel('dataset6A','model6')
model6.save()

## Phase 2 (NN for the 1/2 processed dataset)

### Generating the 2nd dataset by processing datasetXB with the previous models

### Training dataset 

# End of Case! 