In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import math
import gc

In [None]:
import sys
sys.path.append('../src/main')

In [None]:
from sklearn.model_selection import train_test_split
from dsbase.ModelDSBase import ModelDSBaseWrapper

In [None]:
from dsbase.models.regression.LightGradientBoostingRegressionDSBase import LightGradientBoostingRegressionDSBaseModel
from dsbase.models.regression.LightGradientBoostingRegressionDSBase import LightGradientBoostingRegressionDSBaseModelParamsToMap

# Ensembling - Stacking

## Phase 1 (LightGBM on the first half of each dataset, `datasetXA`)

In [None]:
def getXy(databaseName):
    """Load one cleaned sales dataset and split it into train/test arrays.

    Reads ``../datasets/predict-sales/<databaseName>_cleaned.csv``, discards
    the ``Unnamed: 0``, ``Unnamed: 0.1`` and ``slot`` columns, indexes the
    frame by ``ID_pair`` and takes ``target_imputed`` as the target; every
    remaining column becomes a feature.

    Parameters
    ----------
    databaseName : str
        File-name stem of the dataset, e.g. ``'dataset1A'``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from ``train_test_split`` with
        ``test_size=0.33`` and ``random_state=42``.
    """
    csv_path = '../datasets/predict-sales/' + databaseName + '_cleaned.csv'
    frame = (
        pd.read_csv(csv_path)
        .drop(['Unnamed: 0', 'Unnamed: 0.1', 'slot'], axis=1)
        .set_index(keys='ID_pair')
    )

    features = frame.drop(labels=['target_imputed'], axis=1).values
    target = frame['target_imputed'].values

    # Fixed seed: the same rows end up in the hold-out part on every run.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.33, random_state=42
    )
    return features_train, features_test, target_train, target_test

In [None]:
# params: [reg_alpha, reg_lambda]
def processModel(X_train, X_test, y_train, y_test, modelName, params, perc=None):
    """Train one LightGBM regression wrapper and report its score.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and targets, forwarded unchanged to
        ``ModelDSBaseWrapper``.
    modelName : str
        Name given to the wrapper; also used in the progress messages.
    params : sequence
        ``[reg_alpha, reg_lambda]``. All other hyper-parameters are fixed
        in the call below.
    perc : list, optional
        Forwarded to ``ModelDSBaseWrapper``; defaults to ``[100]``.
        NOTE(review): presumably the training-set percentages used for the
        learning curves -- confirm against the wrapper.

    Returns
    -------
    tuple
        ``(wrapper, overfitting_rate, score, learning_curves)`` where
        ``overfitting_rate`` is the last entry of learning-curve row 1
        divided by the last entry of row 0.
    """
    # A fresh list per call: the default is handed to an external wrapper,
    # so a shared mutable default could leak state between calls.
    if perc is None:
        perc = [100]

    print('generating model',modelName,'...')

    # Keep the caller's [reg_alpha, reg_lambda] pair and the generated
    # parameter map under different names instead of rebinding `params`.
    model_params = LightGradientBoostingRegressionDSBaseModelParamsToMap(
        max_depth=31,
        n_estimators=100,
        learning_rate=0.1,
        num_leaves=31,
        subsample_for_bin=200000,
        reg_alpha=params[0],
        reg_lambda=params[1],
    )
    lgbr = ModelDSBaseWrapper(modelName,X_train,y_train,X_test, y_test,perc,LightGradientBoostingRegressionDSBaseModel,model_params)
    lgbr.train()
    learning_curves = lgbr.getLearningCurves()

    # NOTE(review): rows 0 and 1 look like the train/test curves -- confirm
    # which is which in ModelDSBaseWrapper.getLearningCurves().
    overfitting_rate = learning_curves[1,-1]/learning_curves[0,-1]

    # Query the score once and reuse it for both the log line and the result.
    score = lgbr.getScore()
    print('Score for',modelName,score,'(',overfitting_rate,')')
    return lgbr, overfitting_rate, score, learning_curves

Let's train the models:

In [None]:
# Full run: one model per "A" dataset, numbered 1 to 6.
dataset = ['dataset%dA' % n for n in range(1, 7)]
model = ['model%dA' % n for n in range(1, 7)]

# One [reg_alpha, reg_lambda] pair per model, in the same order as above.
params = [
    [3000, 25],
    [2500, 25],
    [2000, 15],
    [1000, 15],
    [100, 10],
    [0, 10],
]

# Results recorded from an earlier run, as "score (overfitting rate)":
#   model1A: 0.7114320203335709 (0.9036347645739408)
#   model2A: 0.7017113534310703 (0.8691804015123732)
#   model3A: 0.6440095888038557 (0.8363979013208283)
#   model4A: 0.5942881438165505 (0.7507641992157429)
#   model5A: 0.5763562599461289 (0.8655557217963179)
#   model6A: 0.5480502465067804 (0.9972860257205527)
perc = [100]

# Alternative configuration used while searching hyper-parameters for a
# single model; uncomment to override the full run above.
#dataset = ['dataset4A']
#model = ['model4A']
#params = [[5000,50]]
#perc=[20,60,100]

In [None]:
# Train one wrapper per (dataset, model name, [reg_alpha, reg_lambda]) triple
# and collect the wrappers, overfitting rates, scores and learning curves.
models = []
ofrs = []
scores = []
lcs = []
for d, m, p in zip(dataset, model, params):
    X_train, X_test, y_train, y_test = getXy(d)
    # The wrapper gets its own name. Binding it to `model` would overwrite
    # the list of model names, and re-running this cell would then fail.
    trained_model, ofr, score, lc = processModel(X_train, X_test, y_train, y_test, m, p, perc)
    gc.collect()
    models.append(trained_model)
    ofrs.append(ofr)
    scores.append(score)
    lcs.append(lc)

In [None]:
#plt.plot(lc[0,:],'b',lc[1,:],'r')

Let's store them: 

In [None]:
# Persist every trained wrapper through its own save() method.
for wrapper in models:
    wrapper.save()

## Phase 2 (NN on the second half of each dataset, `datasetXB`, after processing it with the Phase 1 models)

### Generating the second-stage dataset by running each datasetXB through the Phase 1 models

In [None]:
def getPreStackingDataset(dataset, model):
    """Predict one "B" dataset with a stored model and pair truth with prediction.

    Reads ``../datasets/predict-sales/<dataset>B_cleaned.csv``, loads the
    model stored under the name ``<model>0`` and predicts the target for
    every row.

    Parameters
    ----------
    dataset : str
        Dataset stem without the ``B`` suffix, e.g. ``'dataset1'``.
    model : str
        Model name without the trailing ``'0'``, e.g. ``'model1A'``.
        NOTE(review): the ``'0'`` suffix presumably matches how the wrapper
        names the model it saves -- confirm against ModelDSBaseWrapper.save().

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(ID_pair, slot)`` with the columns ``target_imputed``
        and ``target_predicted``.
    """
    df = pd.read_csv('../datasets/predict-sales/' + dataset + 'B_cleaned.csv')
    df = df.drop(['Unnamed: 0','Unnamed: 0.1'], axis=1).set_index(keys=['ID_pair'])

    # 'slot' stays in df because it is part of the result index, but it must
    # not be a feature: getXy drops it before training, so leaving it in
    # would give the model a different column layout than it was fitted on.
    X = df.drop(labels=['target_imputed', 'slot'], axis=1).values

    # Separate name for the loaded object so the `model` argument (a string)
    # is not rebound to something of a different kind.
    stored_model = LightGradientBoostingRegressionDSBaseModel(model + '0')
    stored_model.load()
    y_pred = stored_model.predict(X)

    # Both frames carry a default RangeIndex here, so concat aligns row i of
    # the input with prediction i.
    df_pred = pd.DataFrame(data=y_pred,columns=['target_predicted'])
    df_pre_stacking = pd.concat(objs=[df.reset_index(),df_pred], axis=1)
    df_pre_stacking_filtered = df_pre_stacking.set_index(keys=['ID_pair','slot'])[['target_imputed','target_predicted']]
    return df_pre_stacking_filtered

In [None]:
# Collect the distinct ID_pair values present in dataset1B; these are the
# pairs to evaluate.
df = pd.read_csv('../datasets/predict-sales/dataset1B_cleaned.csv')
ids = df.loc[:, 'ID_pair'].unique()

In [None]:
# Predictions of the stored model 'model1A' on dataset1B_cleaned.csv.
df1 = getPreStackingDataset('dataset1','model1A')

In [None]:
# Predictions of the stored model 'model2A' on dataset2B_cleaned.csv.
df2 = getPreStackingDataset('dataset2','model2A')

In [None]:
# Predictions of the stored model 'model3A' on dataset3B_cleaned.csv.
df3 = getPreStackingDataset('dataset3','model3A')

In [None]:
# Predictions of the stored model 'model4A' on dataset4B_cleaned.csv.
df4 = getPreStackingDataset('dataset4','model4A')

In [None]:
# Predictions of the stored model 'model5A' on dataset5B_cleaned.csv.
df5 = getPreStackingDataset('dataset5','model5A')

In [None]:
# Predictions of the stored model 'model6A' on dataset6B_cleaned.csv.
df6 = getPreStackingDataset('dataset6','model6A')

Now, let's build the actual stacking dataset:

In [None]:
# Slot numbers 2..32 inclusive (np.arange excludes the stop value 33).
slots = np.arange(2,33)

In [None]:
# Walks every (ID_pair, slot) combination and only prints it; no stacking
# rows are built here yet.
# TODO(review): this prints len(ids) * len(slots) lines -- looks like a
# placeholder for the actual stacking-dataset construction.
for i in ids:
    for slot in slots:
        print(i,'-',slot)

### Training dataset 

# End of Case! 