In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [38]:
from utils.PredictSalesUtils import getTestEnriched
from utils.PredictSalesUtils import cleanEnsembleDataset, cleanStackingDataset
from utils.PredictSalesUtils import generateFeaturesForEvaluation

In [39]:
import math

In [None]:
import sys
sys.path.append('../src/main')

In [4]:
from dsbase.models.regression.LightGradientBoostingRegressionDSBase import LightGradientBoostingRegressionDSBaseModel
from dsbase.models.regression.DNNRegressionKerasDSBase import DNNRegressionKerasDSBaseModel

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [5]:
from sklearn.preprocessing import MinMaxScaler

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone `joblib` package; fall back to the vendored copy only on
# old environments where the standalone package is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Test Data Loading 

In [26]:
test_df_enriched = getTestEnriched('../datasets/predict-sales/test.csv','../datasets/predict-sales/items.csv')

In [27]:
test_df_enriched.head()

Unnamed: 0,ID,shop_id,item_id,item_category_id,ID_pair,ID_CAT_pair
0,0,5,5037,19,5-5037,5-19
1,1,5,5320,55,5-5320,5-55
2,2,5,5233,19,5-5233,5-19
3,3,5,5232,23,5-5232,5-23
4,4,5,5268,20,5-5268,5-20


# Sales Direct Load

In [None]:
sales_df = pd.read_csv('../datasets/predict-sales/sales_train_enriched.csv')

In [None]:
# Drop the leftover 'Unnamed: 0' column (presumably a row index saved into the CSV).
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
sales_df = sales_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
sales_df.head()

# Evaluation Processing 

## Ensemble Process

In [None]:
# Number of distinct `date_block_num` values present in the sales history.
slots = sales_df['date_block_num'].nunique()
# Candidate slot indices 1 .. slots-1; `processFold` takes the tail of this array as the
# months of each fold.
# NOTE(review): the sequence starts at 1, so slot 0 is never selected — presumably
# intentional; confirm against how `date_block_num` is numbered.
slot_seq = np.arange(1,slots)

In [None]:
# Look-back size per ensemble fold: `processFold(i)` uses the last `windows[i-1]`
# entries of `slot_seq`, so fold 1 sees 3 slots and fold 6 sees 32.
windows = [3,6,12,18,25,32]

In [None]:
def processFold(i):
    """Score the evaluation set with the persisted model of ensemble fold ``i``.

    The fold takes the last ``windows[i-1]`` entries of ``slot_seq`` as its
    months, builds the evaluation features with ``generateFeaturesForEvaluation``,
    cleans them with ``cleanEnsembleDataset`` and predicts with the stored
    LightGBM model named ``'model<i>A0'``.

    Relies on the notebook-level names ``slot_seq``, ``windows``, ``sales_df``
    and ``test_df_enriched``.

    Parameters
    ----------
    i : int
        1-based fold number, ``1 .. len(windows)``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``ID_pair`` and ``target_prediction``.

    Raises
    ------
    IndexError
        If ``i`` is outside ``1 .. len(windows)``.
    """
    # Without this guard, i=0 (or a negative i) would silently select another
    # fold's window through Python's negative indexing.
    if not 1 <= i <= len(windows):
        raise IndexError('fold index {} out of range 1..{}'.format(i, len(windows)))

    months = slot_seq[-windows[i-1]:]
    fold_df = generateFeaturesForEvaluation(sales_df, months, test_df_enriched)

    fold_df_cleaned = cleanEnsembleDataset(fold_df, imputeTarget=False)
    # Move ID_pair into the index so `.values` contains only the feature columns.
    fold_df_cleaned = fold_df_cleaned.set_index(keys=['ID_pair'])

    # Prediction with the persisted model of this fold.
    X = fold_df_cleaned.values
    model = LightGradientBoostingRegressionDSBaseModel('model' + str(i) + 'A0')
    model.load()
    y_pred = model.predict(X)

    # reset_index() restores ID_pair as a column and yields a 0..n-1 index, so the
    # prediction frame (default index) lines up with it row by row.
    column_name = 'target_prediction'
    fold_df_cleaned_completed = pd.concat(
        objs=[fold_df_cleaned.reset_index(), pd.DataFrame(y_pred, columns=[column_name])],
        axis=1,
    )
    return fold_df_cleaned_completed[['ID_pair', column_name]]

### Fold 1

In [None]:
df1_target = processFold(1)

### Fold 2

In [None]:
df2_target = processFold(2)

### Fold 3

In [None]:
df3_target = processFold(3)

### Fold 4

In [None]:
df4_target = processFold(4)

### Fold 5

In [None]:
df5_target = processFold(5)

### Fold 6

In [None]:
df6_target = processFold(6)

Now, let's join the per-fold predictions into a single dataset for the stacking stage:

In [None]:
# Give every fold's prediction column its final, explicit name before merging.
# Relying on merge suffixes only works while folds are merged in colliding pairs,
# and would silently produce wrong column names if a fold were added or removed.
fold_targets = [df1_target, df2_target, df3_target, df4_target, df5_target, df6_target]
renamed_targets = [
    fold_target.rename(columns={'target_prediction': 'target_prediction_' + str(fold_no)})
    for fold_no, fold_target in enumerate(fold_targets, start=1)
]

# Outer joins keep an ID_pair even when some folds produced no row for it.
df_for_stacking = renamed_targets[0]
for renamed_target in renamed_targets[1:]:
    df_for_stacking = df_for_stacking.merge(right=renamed_target, on=['ID_pair'], how='outer')

In [None]:
df_for_stacking_cleaned = cleanStackingDataset(df_for_stacking,['target_prediction_1','target_prediction_2','target_prediction_3','target_prediction_4','target_prediction_5','target_prediction_6'])

In [None]:
# Persist the stacking input so the stacking stage can start from this file.
# `to_csv` writes the row index by default, which is why the file comes back with an
# 'Unnamed: 0' column when it is re-read with `pd.read_csv`.
df_for_stacking_cleaned.to_csv('../datasets/predict-sales/eval_dataset_for_stacking_cleaned.csv')

## Stacking Process 

In [6]:
df_for_stacking_cleaned = pd.read_csv('../datasets/predict-sales/eval_dataset_for_stacking_cleaned.csv')

In [8]:
# Remove the row index that `to_csv` stored as the 'Unnamed: 0' column.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df_for_stacking_cleaned = df_for_stacking_cleaned.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
df_for_stacking_cleaned.set_index(keys=['ID_pair'], inplace=True)

In [10]:
X = df_for_stacking_cleaned.values

In [11]:
# Load the persisted scalers: `scalerX` is applied to the model input and `scalery`
# is used to invert the model output.
# NOTE(security): joblib.load unpickles the file and can therefore execute arbitrary
# code — only load artifacts from a trusted source.
scalerX = joblib.load('model_persistance/dnn_scalerX.sav')
scalery = joblib.load('model_persistance/dnn_scalery.sav')

In [12]:
model = DNNRegressionKerasDSBaseModel('DNN3')
model.load()

initiating empty model DNN3. DNNRegressionKeras
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [13]:
# Scale the stacked fold predictions with the loaded input scaler before predicting;
# the result is still in the scaled target space until `scalery` inverts it.
y_s_pred = model.predict(scalerX.transform(X))

In [14]:
# Map the scaled model output back to the original target scale.
y_pred = scalery.inverse_transform(y_s_pred)

In [24]:
# Attach the predictions column-wise. reset_index() turns ID_pair back into a column
# and gives a 0..n-1 index, so both frames align row by row on their default index.
predictions = pd.concat(objs=[df_for_stacking_cleaned.reset_index(),pd.DataFrame(y_pred, columns=['predictions'])], axis=1)

In [30]:
# Left join keeps every test row; rows whose ID_pair has no prediction get NaN in
# 'predictions' (later mapped to 0 by `finalFitting`).
test_df_enriched_predictions = test_df_enriched.merge(right=predictions, on='ID_pair', how='left')

In [31]:
pre_submission = test_df_enriched_predictions[['ID','predictions']]

In [40]:
def finalFitting(x):
    """Return a submission-safe value: 0 for NaN or negative input, otherwise ``x`` unchanged."""
    is_invalid = math.isnan(x) or x < 0
    return 0 if is_invalid else x

In [47]:
# Clamp NaN and negative predictions to 0 via `finalFitting`.
# `assign` returns a new frame instead of writing into `pre_submission`, which was
# created by column selection from `test_df_enriched_predictions`; the original
# item assignment triggers pandas' SettingWithCopyWarning.
pre_submission = pre_submission.assign(
    item_cnt_month=pre_submission['predictions'].apply(finalFitting)
)

In [48]:
submission = pre_submission[['ID','item_cnt_month']]

In [51]:
submission.to_csv('../datasets/predict-sales/submission.csv', index=False)

# End of Case! 