In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from utils.PredictSalesUtils import getTrainEnriched
from utils.PredictSalesUtils import setPair
from utils.PredictSalesUtils import getTargetAgg, generateFeaturesForTraining

# Data Loading 

In [None]:
# WARNING: this step takes a while to run.
# NOTE(review): the two path arguments use different roots ('datasets/' vs
# '../datasets/predict-sales/') — confirm both locations are intended.
sales_df = getTrainEnriched(
    'datasets/sales_train.csv',
    '../datasets/predict-sales/items.csv'
)

In [None]:
# Persist the enriched frame as CSV. The index is written as well (to_csv
# default), so it comes back as an extra unnamed column when the file is reloaded.
sales_df.to_csv(path_or_buf='datasets/sales_train_enriched.csv')

## Direct Load

In [None]:
# Reload the previously saved enriched sales data from CSV.
sales_df = pd.read_csv(filepath_or_buffer='datasets/sales_train_enriched.csv')

In [None]:
# Remove the 'Unnamed: 0' column from the reloaded frame.
# errors='ignore' keeps this cell safe to re-run (or to run when the column is
# absent): the original in-place drop raised KeyError the second time.
sales_df = sales_df.drop(labels=['Unnamed: 0'], axis=1, errors='ignore')

In [None]:
# Preview the first five rows.
sales_df.head(n=5)

In [None]:
# Category-id threshold separating the two subsets:
# item_category_id >= key_th -> "ensemble", item_category_id < key_th -> "stacking".
key_th = 40
# Use key_th in both filters so the split cannot drift from the declared threshold.
sales_df_ensemble = sales_df[sales_df['item_category_id'] >= key_th]
sales_df_stacking = sales_df[sales_df['item_category_id'] < key_th]
print('ensemble size:', sales_df_ensemble.shape)
print('stacking size:', sales_df_stacking.shape)

# Visualizing Data 

Let's visualize the sales time series of a few randomly sampled shop–item pairs from the ensemble subset of the training data:

In [None]:
def drawTimeSerie(df, sample, n):
    """Plot the summed sales per ``date_block_num`` for the first ``n`` pairs of ``sample``.

    For each of the first ``n`` rows of ``sample``, the rows of ``df`` with the
    same ``shop_id`` and ``item_id`` are selected, ``item_cnt_day`` is summed per
    ``date_block_num`` and the series is drawn in its own subplot. All subplots
    share one figure of size ``[10, n * 5]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``shop_id``, ``item_id``, ``date_block_num`` and
        ``item_cnt_day``.
    sample : pandas.DataFrame
        Rows naming the pairs to plot; needs ``shop_id`` and ``item_id`` and at
        least ``n`` rows (an ``IndexError`` is raised otherwise).
    n : int
        Number of pairs (subplots) to draw; nothing is drawn when ``n <= 0``.
    """
    if n <= 0:
        return

    # Create the figure once. Creating it inside the loop produced n separate
    # figures, each n*5 tall with a single populated subplot slot.
    plt.figure(figsize=[10, n * 5])
    for i in range(n):
        # Read the ids from their columns (not from the row) so they keep the
        # column dtype instead of being upcast together with the rest of the row.
        shop_id = sample['shop_id'].iloc[i]
        item_id = sample['item_id'].iloc[i]
        pair_rows = df[(df['shop_id'] == shop_id) & (df['item_id'] == item_id)]
        per_block = (
            pair_rows[['date_block_num', 'item_cnt_day']]
            .groupby('date_block_num')
            .sum()
            .reset_index()
        )
        plt.subplot(n, 1, i + 1)
        plt.plot(per_block['date_block_num'], per_block['item_cnt_day'], '*-')
        plt.xlabel('date_block_num')
        plt.ylabel('item_cnt_day')
        plt.title(str(shop_id) + '-' + str(item_id))

In [None]:
# Number of rows to draw for the visual inspection.
n_samples = 10
# Fixed seed so a fresh "Restart & Run All" draws (and plots) the same rows.
sample = sales_df_ensemble.sample(n=n_samples, axis=0, random_state=42)

In [None]:
# Draw one time series per sampled row.
drawTimeSerie(df=sales_df_ensemble, sample=sample, n=n_samples)

# Generating the DataSet 

In [None]:
def slidingWindow(sales_df, size, slots):
    """Build a dataset by sliding a fixed-size window over the time slots.

    For every target slot ``index`` in ``range(size, slots)``, the ``size``
    slots right before it (``index - size`` .. ``index - 1``) are passed to
    ``generateFeaturesForTraining`` together with ``index``. The per-window
    results are stacked row-wise.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        Sales data forwarded unchanged to ``generateFeaturesForTraining``.
    size : int
        Number of consecutive slots in each window.
    slots : int
        Exclusive upper bound of the target slot indices.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all window results (assumes the helper
        returns DataFrames — TODO confirm); an empty DataFrame when
        ``size >= slots`` leaves no window to build.
    """
    # Collect every window first and concatenate once: concatenating onto a
    # growing frame inside the loop re-copies all previous rows each iteration.
    frames = [
        generateFeaturesForTraining(sales_df, np.arange(index - size, index), index)
        for index in range(size, slots)
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

## Ensemble Dataset

In [None]:
# Number of distinct date_block_num values; passed to slidingWindow as the
# exclusive upper bound of the target slots.
# NOTE(review): this equals the last slot id + 1 only if the ids are contiguous
# and start at 0 — confirm.
slots = sales_df.loc[:, 'date_block_num'].nunique()

In [None]:
# Window sizes, one per dataset generated below (windows[0] -> Dataset 1, ...).
windows = [
    3,
    6,
    12,
    18,
    25,
    32,
]

### Dataset 1

In [None]:
# Dataset 1: window size windows[0], built for the ensemble (A) and stacking (B) subsets.
dataset1A = slidingWindow(sales_df=sales_df_ensemble, size=windows[0], slots=slots)
dataset1B = slidingWindow(sales_df=sales_df_stacking, size=windows[0], slots=slots)

In [None]:
# Write both Dataset 1 frames to CSV.
dataset1A.to_csv(path_or_buf='datasets/dataset1A.csv')
dataset1B.to_csv(path_or_buf='datasets/dataset1B.csv')

### Dataset 2

In [None]:
# Dataset 2: window size windows[1], built for the ensemble (A) and stacking (B) subsets.
dataset2A = slidingWindow(sales_df=sales_df_ensemble, size=windows[1], slots=slots)
dataset2B = slidingWindow(sales_df=sales_df_stacking, size=windows[1], slots=slots)

In [None]:
# Write both Dataset 2 frames to CSV.
dataset2A.to_csv(path_or_buf='datasets/dataset2A.csv')
dataset2B.to_csv(path_or_buf='datasets/dataset2B.csv')

### Dataset 3

In [None]:
# Dataset 3: window size windows[2], built for the ensemble (A) and stacking (B) subsets.
dataset3A = slidingWindow(sales_df=sales_df_ensemble, size=windows[2], slots=slots)
dataset3B = slidingWindow(sales_df=sales_df_stacking, size=windows[2], slots=slots)

In [None]:
# Write both Dataset 3 frames to CSV.
# NOTE(review): datasets 1, 2, 5 and 6 are written under 'datasets/', while this
# one goes to '../datasets/predict-sales/' — confirm the target directory.
dataset3A.to_csv(path_or_buf='../datasets/predict-sales/dataset3A.csv')
dataset3B.to_csv(path_or_buf='../datasets/predict-sales/dataset3B.csv')

### Dataset 4

In [None]:
# Dataset 4: window size windows[3], built for the ensemble (A) and stacking (B) subsets.
dataset4A = slidingWindow(sales_df=sales_df_ensemble, size=windows[3], slots=slots)
dataset4B = slidingWindow(sales_df=sales_df_stacking, size=windows[3], slots=slots)

In [None]:
# Write both Dataset 4 frames to CSV.
# NOTE(review): datasets 1, 2, 5 and 6 are written under 'datasets/', while this
# one goes to '../datasets/predict-sales/' — confirm the target directory.
dataset4A.to_csv(path_or_buf='../datasets/predict-sales/dataset4A.csv')
dataset4B.to_csv(path_or_buf='../datasets/predict-sales/dataset4B.csv')

### Dataset 5

In [None]:
# Dataset 5: window size windows[4], built for the ensemble (A) and stacking (B) subsets.
dataset5A = slidingWindow(sales_df=sales_df_ensemble, size=windows[4], slots=slots)
dataset5B = slidingWindow(sales_df=sales_df_stacking, size=windows[4], slots=slots)

In [None]:
# Write both Dataset 5 frames to CSV.
dataset5A.to_csv(path_or_buf='datasets/dataset5A.csv')
dataset5B.to_csv(path_or_buf='datasets/dataset5B.csv')

### Dataset 6

In [None]:
# Dataset 6: window size windows[5], built for the ensemble (A) and stacking (B) subsets.
dataset6A = slidingWindow(sales_df=sales_df_ensemble, size=windows[5], slots=slots)
dataset6B = slidingWindow(sales_df=sales_df_stacking, size=windows[5], slots=slots)

In [None]:
# Write both Dataset 6 frames to CSV.
dataset6A.to_csv(path_or_buf='datasets/dataset6A.csv')
dataset6B.to_csv(path_or_buf='datasets/dataset6B.csv')

## Stacking Dataset 

The stacking dataset is built in the following notebooks.

# End of Case! 