In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data Loading 

In [None]:
# Directory holding the competition CSV files; change it here if the data moves.
DATA_DIR = '../../datasets/predict-sales'

sales_df = pd.read_csv(DATA_DIR + '/sales_train.csv')  # sales records used for training
items_df = pd.read_csv(DATA_DIR + '/items.csv')        # per-item attributes, merged into sales_df below
test_df = pd.read_csv(DATA_DIR + '/test.csv')          # rows sampled later for plotting

In [None]:
# Discard the columns that are not used further on, and turn the current row
# index into an explicit 'index' column (used later as the entity index).
sales_df = sales_df.drop(columns=['date']).reset_index()
items_df = items_df.drop(columns=['item_name'])

In [None]:
# Key identifying a (shop, item) pair, formatted as "<shop_id>-<item_id>".
# Built with vectorised string operations: a row-wise apply is far slower on a
# large frame, and positional x[0] / x[1] access on a label-indexed row is
# deprecated in recent pandas releases.
sales_df['ID_pair'] = sales_df['shop_id'].astype(str) + '-' + sales_df['item_id'].astype(str)

In [None]:
# Inner join on the columns the two frames have in common, which attaches the
# remaining item attributes to every sale.
sales_df = pd.merge(sales_df, items_df)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the merged frame.
sales_df.info()

In [None]:
# Preview the first five rows of the merged frame.
sales_df.head(n=5)

# Visualizing Data 

Let's visualize the monthly sales history of a few shop/item pairs sampled from the test set:

In [None]:
def drawTimeSerie(df, sample, n):
    """Plot the per-month sales of the first ``n`` shop/item pairs in ``sample``.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales records with at least the columns ``shop_id``, ``item_id``,
        ``date_block_num`` and ``item_cnt_day``.
    sample : pandas.DataFrame
        Rows providing the ``shop_id`` / ``item_id`` pairs to draw.
    n : int
        Number of pairs (rows of ``sample``) to plot; capped at ``len(sample)``.

    Returns
    -------
    None
        The panels are drawn on a single matplotlib figure, one row per pair.
    """
    # Never ask for more panels than there are rows to read.
    n = min(n, len(sample))
    if n <= 0:
        return

    # Create the figure once. Calling plt.figure() inside the loop opened a new
    # figure per pair, leaving each with one panel and n-1 empty slots.
    fig, axes = plt.subplots(n, 1, figsize=[10, n * 5], squeeze=False)
    for i in range(n):
        shop_id = sample.iloc[i]['shop_id']
        item_id = sample.iloc[i]['item_id']
        pair_sales = df[(df['shop_id'] == shop_id) & (df['item_id'] == item_id)]
        # Total units sold per month block for this pair.
        monthly = pair_sales[['date_block_num', 'item_cnt_day']].groupby('date_block_num').sum().reset_index()
        ax = axes[i, 0]
        ax.plot(monthly['date_block_num'], monthly['item_cnt_day'], '*-')
        ax.set_title(str(shop_id) + '-' + str(item_id))
        ax.set_xlabel('date_block_num')
        ax.set_ylabel('item_cnt_day')

In [None]:
n_samples = 10
# Fixed seed so that a fresh "Restart & Run All" always draws the same pairs
# and therefore reproduces the same plots.
sample = test_df.sample(n=n_samples, random_state=42, axis=0)

In [None]:
# Draw the sales history of every sampled shop/item pair.
drawTimeSerie(df=sales_df, sample=sample, n=n_samples)

# Preparing Data with FeatureTools (testing)

In [None]:
import featuretools as ft

Let's evaluate the FeatureTools (FT) process on a single month of data first.

## Pre-processing

In [None]:
# date_block_num value of the single month used for the FeatureTools trial below.
month_id = 9

In [None]:
# Take an independent copy of the month's rows. The following cells add and
# drop columns on this frame; doing that on a filtered slice of sales_df
# raises SettingWithCopyWarning and may not behave as intended.
sales_month_df = sales_df[sales_df['date_block_num'] == month_id].copy()

In [None]:
#sales_month_df['ID_pair'] = sales_month_df[['shop_id','item_id']].apply(lambda x: str(x[0]) + '-' + str(x[1]), axis=1)

In [None]:
#sales_month_df = sales_month_df.merge(items_df)

In [None]:
# Preview the first five rows of the selected month.
sales_month_df.head(n=5)

In [None]:
# Module-level memo for getResidualCategorySales, keyed by "<shop_id>-<item_id>".
res_cat_sales = dict()

In [None]:
def getResidualCategorySales(x, sales_slot_df, cache=None):
    """Summarise the sales made in the same shop by other items of other categories.

    Parameters
    ----------
    x : sequence or pandas.Series
        ``(shop_id, item_id, item_category_id)`` in that order; only the first
        three values are read.
    sales_slot_df : pandas.DataFrame
        Sales rows to aggregate. Must contain the columns ``shop_id``,
        ``item_id``, ``item_category_id``, ``item_price`` and ``item_cnt_day``.
    cache : dict, optional
        Memo keyed by ``"<shop_id>-<item_id>"``. Defaults to the module-level
        ``res_cat_sales`` dictionary. The key does not include the frame, so
        use a fresh (or cleared) cache whenever ``sales_slot_df`` changes.

    Returns
    -------
    tuple
        Eleven values: ``(count, mean_price, mean_cnt, std_price, std_cnt,
        max_price, max_cnt, min_price, min_cnt, skew_price, skew_cnt)``.
        When no row matches, ``count`` is 0 and every statistic is -1.
    """
    if cache is None:
        cache = res_cat_sales

    # Unpack by position without integer indexing, so both plain lists and
    # label-indexed row Series are accepted.
    shop_id, item_id, category_id = list(x)[:3]
    key = str(shop_id) + '-' + str(item_id)

    # Explicit membership test instead of a bare ``except``, which also hid
    # unrelated errors such as a missing column or an undefined cache.
    if key in cache:
        return cache[key]

    # NOTE(review): rows are kept when the category DIFFERS from the given one,
    # which makes the item_id test redundant if every item has a single
    # category. If the intent was "other items of the SAME category" the
    # category comparison should be ``==`` - confirm before changing.
    mask = ((sales_slot_df['shop_id'] == shop_id) &
            (sales_slot_df['item_id'] != item_id) &
            (sales_slot_df['item_category_id'] != category_id))
    res_cat_info = sales_slot_df.loc[mask, ['item_price', 'item_cnt_day']]

    # Number of non-null prices among the matching rows.
    count = int(res_cat_info['item_price'].count())
    if count > 0:
        values = [count]
        for stat in (res_cat_info.mean(), res_cat_info.std(), res_cat_info.max(),
                     res_cat_info.min(), res_cat_info.skew()):
            values.append(stat['item_price'])
            values.append(stat['item_cnt_day'])
        result = tuple(values)
    else:
        # Same length as the populated tuple, so callers can index positions
        # 0..10 regardless of which branch produced the result.
        result = (0,) + (-1,) * 10

    cache[key] = result
    return result

In [None]:
# Testing function
#getResidualCategorySales([54,18394,57])

In [None]:
# For every sale, store the statistics tuple that getResidualCategorySales
# returns for its (shop, item, category) triple, computed over the same month.
key_columns = ['shop_id', 'item_id', 'item_category_id']
sales_month_df['res_cat_sales'] = sales_month_df[key_columns].apply(
    getResidualCategorySales, axis=1, args=(sales_month_df,))

In [None]:
# Column names in the order of the tuple stored in 'res_cat_sales':
# count first, then (item_price, item_cnt_day) for mean, std, max, min, skew.
# Deriving the position from this list fixes the earlier hand-written indices,
# where res_cat_max_item_price read position 3 (the price std), every later
# column was shifted by one, and the last tuple element was never read.
res_cat_stat_columns = [
    'res_cat_count',
    'res_cat_mean_item_price', 'res_cat_mean_item_cnt_day',
    'res_cat_std_item_price', 'res_cat_std_item_cnt_day',
    'res_cat_max_item_price', 'res_cat_max_item_cnt_day',
    'res_cat_min_item_price', 'res_cat_min_item_cnt_day',
    'res_cat_skew_item_price', 'res_cat_skew_item_cnt_day',
]
for position, column in enumerate(res_cat_stat_columns):
    # Bind the position as a default argument so each lambda keeps its own index.
    sales_month_df[column] = sales_month_df['res_cat_sales'].apply(lambda x, p=position: x[p])
sales_month_df = sales_month_df.drop(columns=['res_cat_sales'])

In [None]:
# The raw identifiers are no longer needed: ID_pair already encodes shop and item.
sales_month_df = sales_month_df.drop(columns=['shop_id', 'item_id', 'item_category_id'])

In [None]:
# Preview the first five rows after adding the residual-category columns.
sales_month_df.head(n=5)

## EntitySet Processing (Recommended by FeatureTools)

In [None]:
# Start an empty FeatureTools entity set that will hold the sales data.
es = ft.EntitySet(id="prediction_sales")

In [None]:
# Register the single-month frame as the 'sales' entity; its 'index' column
# (created earlier by reset_index) identifies each row.
es = es.entity_from_dataframe(entity_id='sales',dataframe=sales_month_df, index='index')

In [None]:
# Variables handed to the new 'ids' entity alongside its ID_pair index.
ids_variables = [
    "date_block_num",
    "res_cat_count",
    "res_cat_mean_item_price",
    "res_cat_mean_item_cnt_day",
    "res_cat_std_item_price",
    "res_cat_std_item_cnt_day",
    "res_cat_max_item_price",
    "res_cat_max_item_cnt_day",
    "res_cat_min_item_price",
    "res_cat_min_item_cnt_day",
    "res_cat_skew_item_price",
    "res_cat_skew_item_cnt_day",
]

# Derive an 'ids' entity (one instance per ID_pair) from the 'sales' entity.
es = es.normalize_entity(
    base_entity_id='sales',
    new_entity_id='ids',
    index='ID_pair',
    additional_variables=ids_variables,
)

In [None]:
# Run Deep Feature Synthesis with 'ids' as the target entity; returns the
# feature matrix together with the definitions of the generated features.
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='ids')

In [None]:
#feature_matrix.info()

In [None]:
#feature_matrix.iloc[0]

# Generating the DataSet 

In [None]:
# Number of distinct month blocks in the sales data; used below as the
# exclusive upper bound of the target month.
# NOTE(review): this assumes date_block_num runs 0..slots-1 without gaps - confirm.
slots = sales_df['date_block_num'].nunique()

In [None]:
def generateFeatures(sales_df, months_feature, month_target):
    """Build the feature matrix for one window of months plus its target column.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        Sales records containing ``index``, ``date_block_num``, ``shop_id``,
        ``item_id``, ``item_category_id``, ``item_price``, ``item_cnt_day``
        and ``ID_pair``.
    months_feature : array-like of int
        ``date_block_num`` values whose rows are used to compute the features.
    month_target : int
        ``date_block_num`` value whose summed ``item_cnt_day`` is the target.

    Returns
    -------
    pandas.DataFrame
        The FeatureTools feature matrix for the 'ids' entity, concatenated
        column-wise with a ``target`` column. ``pd.concat`` aligns on the row
        labels with an outer join, so a pair present on only one side gets NaN
        in the other side's columns.
    """
    # getResidualCategorySales memoises into the module-level dictionary,
    # keyed by shop and item only. Rebind that global here: assigning a local
    # variable of the same name left the shared cache untouched, so statistics
    # computed for a previous window were silently reused.
    global res_cat_sales

    print('features window:',months_feature,', target:',month_target)

    # Step 1: extract features from the window of previous months.
    # Copy the slice because columns are added to and dropped from it below.
    sales_window_df = sales_df[sales_df['date_block_num'].isin(months_feature)].copy()
    res_cat_sales = {}
    sales_window_df['res_cat_sales'] = sales_window_df[['shop_id','item_id','item_category_id']].apply(
        getResidualCategorySales, args=(sales_window_df,), axis=1)

    # Column names in the order of the tuple returned by
    # getResidualCategorySales: count, then (item_price, item_cnt_day) for
    # mean, std, max, min and skew. Deriving positions from this list fixes the
    # hand-written indices, where the max price column read the std position
    # and every later column was off by one.
    stat_columns = [
        'res_cat_count',
        'res_cat_mean_item_price', 'res_cat_mean_item_cnt_day',
        'res_cat_std_item_price', 'res_cat_std_item_cnt_day',
        'res_cat_max_item_price', 'res_cat_max_item_cnt_day',
        'res_cat_min_item_price', 'res_cat_min_item_cnt_day',
        'res_cat_skew_item_price', 'res_cat_skew_item_cnt_day',
    ]
    for position, column in enumerate(stat_columns):
        # Bind the position as a default so each lambda keeps its own index.
        sales_window_df[column] = sales_window_df['res_cat_sales'].apply(lambda x, p=position: x[p])

    sales_window_df = sales_window_df.drop(
        columns=['res_cat_sales', 'shop_id', 'item_id', 'item_category_id'])

    es = ft.EntitySet(id="prediction_sales")
    es = es.entity_from_dataframe(entity_id='sales', dataframe=sales_window_df, index='index')
    es = es.normalize_entity(base_entity_id='sales',
                             new_entity_id='ids',
                             index='ID_pair',
                             additional_variables=['date_block_num'] + stat_columns)

    feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='ids')

    # Step 2: extract the target, i.e. units sold per pair in the target month.
    target = sales_df[sales_df['date_block_num'] == month_target][['ID_pair','item_cnt_day']].groupby('ID_pair').sum()
    target.columns = ['target']

    return pd.concat([feature_matrix, target], axis=1, sort=False)


In [None]:
def slidingWindow(sales_df, size, slots):
    """Stack the feature/target frames of every sliding window of ``size`` months.

    For each target month ``index`` in ``range(size, slots)`` the features are
    generated from months ``index - size`` to ``index - 1``.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        Sales records, passed through to ``generateFeatures``.
    size : int
        Number of consecutive months in each feature window.
    slots : int
        Exclusive upper bound of the target month.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-window frames; an empty frame when
        ``size >= slots`` leaves no window to generate.
    """
    # Collect the windows first and concatenate once: growing a DataFrame with
    # concat inside the loop copies all accumulated rows on every iteration.
    windows = [
        generateFeatures(sales_df, np.arange(index - size, index), index)
        for index in range(size, slots)
    ]
    if not windows:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(windows, axis=0)

## Dataset Test

In [None]:
# Smoke test: one-month windows with target months 1 and 2.
dstest = slidingWindow(sales_df, size=1, slots=3)

In [None]:
# Display the smoke-test dataset.
dstest
# Disabled check: rows whose mean daily count over the window is zero.
#dstest[dstest['MEAN(sales.item_cnt_day)'] == 0]

## Dataset 1

In [None]:
# Windows of 2 feature months. slidingWindow takes the sales frame as its
# first argument; calling it with only (size, slots) raises a TypeError.
dataset_1 = slidingWindow(sales_df, 2, slots)
dataset_1.head()

## Dataset 2

In [None]:
# Windows of 3 feature months. slidingWindow takes the sales frame as its
# first argument; calling it with only (size, slots) raises a TypeError.
dataset_2 = slidingWindow(sales_df, 3, slots)
dataset_2.head()

## Dataset 3

In [None]:
# Windows of 4 feature months. slidingWindow takes the sales frame as its
# first argument; calling it with only (size, slots) raises a TypeError.
dataset_3 = slidingWindow(sales_df, 4, slots)
dataset_3.head()

## Dataset 4

In [None]:
# Windows of 5 feature months. slidingWindow takes the sales frame as its
# first argument; calling it with only (size, slots) raises a TypeError.
dataset_4 = slidingWindow(sales_df, 5, slots)
dataset_4.head()

## Dataset 5

In [None]:
# Windows of 6 feature months. slidingWindow takes the sales frame as its
# first argument; calling it with only (size, slots) raises a TypeError.
dataset_5 = slidingWindow(sales_df, 6, slots)
dataset_5.head()

## Dataset 6

In [None]:
# Windows of 7 feature months. slidingWindow takes the sales frame as its
# first argument; calling it with only (size, slots) raises a TypeError.
dataset_6 = slidingWindow(sales_df, 7, slots)
dataset_6.head()

# End of Case! 