In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data Loading 

In [50]:
sales_df = pd.read_csv('../../datasets/predict-sales/sales_train.csv')
items_df = pd.read_csv('../../datasets/predict-sales/items.csv')
test_df = pd.read_csv('../../datasets/predict-sales/test.csv')

In [51]:
sales_df.drop(labels=['date'],inplace=True,axis=1)
sales_df = sales_df.reset_index()
items_df.drop(labels=['item_name'],inplace=True,axis=1)

In [53]:
sales_df['ID_pair'] = sales_df[['shop_id','item_id']].apply(lambda x: str(x[0]) + '-' + str(x[1]), axis=1)

In [57]:
sales_df = sales_df.merge(items_df)

In [65]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2935849 entries, 0 to 2935848
Data columns (total 8 columns):
index               int64
date_block_num      int64
shop_id             int64
item_id             int64
item_price          float64
item_cnt_day        float64
ID_pair             object
item_category_id    int64
dtypes: float64(2), int64(5), object(1)
memory usage: 201.6+ MB


In [66]:
sales_df.head()

Unnamed: 0,index,date_block_num,shop_id,item_id,item_price,item_cnt_day,ID_pair,item_category_id
0,0,0,59,22154,999.0,1.0,59-22154,37
1,3270,0,24,22154,999.0,1.0,24-22154,37
2,17081,0,27,22154,999.0,1.0,27-22154,37
3,25918,0,25,22154,999.0,1.0,25-22154,37
4,25919,0,25,22154,999.0,1.0,25-22154,37


# Visualizing Data 

Let's visualize some data from the test set:

In [None]:
def drawTimeSerie(df, sample, n):
    for i in range(n):
        shop_id = sample.iloc[i]['shop_id']
        item_id = sample.iloc[i]['item_id']
        id_df = df[(df['shop_id'] == shop_id) & (df['item_id'] == item_id)]
        id_df_grouped = id_df[['date_block_num','item_cnt_day']].groupby('date_block_num').sum().reset_index()
        plt.figure(figsize=[10,n*5])
        plt.subplot(n,1,i+1)
        plt.plot(id_df_grouped['date_block_num'], id_df_grouped['item_cnt_day'],'*-',)
        plt.title(str(shop_id) + '-' + str(item_id))

In [None]:
n_samples = 10
sample = test_df.sample(n=n_samples, axis=0)

In [None]:
drawTimeSerie(sales_df, sample, n_samples)

# Preparing Data with FeatureTools (testing)

In [6]:
import featuretools as ft

Let's evaluate the FT process 

## Pre-proccesing

In [7]:
month_id = 15

In [8]:
sales_month_df = sales_df[sales_df['date_block_num'] == month_id]

In [9]:
#sales_month_df['ID_pair'] = sales_month_df[['shop_id','item_id']].apply(lambda x: str(x[0]) + '-' + str(x[1]), axis=1)

In [10]:
#sales_month_df = sales_month_df.merge(items_df)

In [11]:
sales_month_df.head()

Unnamed: 0,index,date_block_num,shop_id,item_id,item_price,item_cnt_day,ID_pair,item_category_id
0,1549474,15,54,18394,299.0,1.0,54-18394,57
1,1549475,15,54,18394,299.0,1.0,54-18394,57
2,1549482,15,54,18394,299.0,1.0,54-18394,57
3,1562915,15,57,18394,299.0,1.0,57-18394,57
4,1564393,15,24,18394,299.0,1.0,24-18394,57


In [12]:
res_cat_sales = {}

In [13]:
def getResidualCategorySales(x):
    i = str(x[0]) + '-' + str(x[1])
    try:
        return res_cat_sales[i]
    except:
        res_cat_info = sales_month_df[(sales_month_df['shop_id'] == x[0]) &
                             (sales_month_df['item_id'] != x[1]) &
                             (sales_month_df['item_category_id'] != x[2])][['item_price','item_cnt_day']]
        count = res_cat_info.count()[0]
        mean = res_cat_info.mean()
        std = res_cat_info.std()
        maxi = res_cat_info.max()
        mini = res_cat_info.min()        
        skew = res_cat_info.skew()
        if (count > 0):
            result = (count,mean[0],mean[1],std[0],std[1],maxi[0],maxi[1],mini[0],mini[1],skew[0],skew[1])
        else:
            result = (0,-1,-1,-1,-1,-1,-1)
        res_cat_sales[i] = result
        return result

In [15]:
# Testing function
#getResidualCategorySales([54,18394,57])

In [16]:
sales_month_df['res_cat_sales'] = sales_month_df[['shop_id','item_id','item_category_id']].apply(getResidualCategorySales, axis=1)

In [17]:
sales_month_df['res_cat_count'] = sales_month_df['res_cat_sales'].apply(lambda x: x[0])
sales_month_df['res_cat_mean_item_price'] = sales_month_df['res_cat_sales'].apply(lambda x: x[1])
sales_month_df['res_cat_mean_item_cnt_day'] = sales_month_df['res_cat_sales'].apply(lambda x: x[2])
sales_month_df['res_cat_std_item_price'] = sales_month_df['res_cat_sales'].apply(lambda x: x[3])
sales_month_df['res_cat_std_item_cnt_day'] = sales_month_df['res_cat_sales'].apply(lambda x: x[4])
sales_month_df['res_cat_max_item_price'] = sales_month_df['res_cat_sales'].apply(lambda x: x[3])
sales_month_df['res_cat_max_item_cnt_day'] = sales_month_df['res_cat_sales'].apply(lambda x: x[5])
sales_month_df['res_cat_min_item_price'] = sales_month_df['res_cat_sales'].apply(lambda x: x[6])
sales_month_df['res_cat_min_item_cnt_day'] = sales_month_df['res_cat_sales'].apply(lambda x: x[7])
sales_month_df['res_cat_skew_item_price'] = sales_month_df['res_cat_sales'].apply(lambda x: x[8])
sales_month_df['res_cat_skew_item_cnt_day'] = sales_month_df['res_cat_sales'].apply(lambda x: x[9])
sales_month_df.drop(labels=['res_cat_sales'], inplace=True, axis=1)

In [18]:
sales_month_df.drop(labels=['shop_id','item_id','item_category_id'], inplace=True, axis=1)

In [19]:
sales_month_df.head()

Unnamed: 0,index,date_block_num,item_price,item_cnt_day,ID_pair,res_cat_count,res_cat_mean_item_price,res_cat_mean_item_cnt_day,res_cat_std_item_price,res_cat_std_item_cnt_day,res_cat_max_item_price,res_cat_max_item_cnt_day,res_cat_min_item_price,res_cat_min_item_cnt_day,res_cat_skew_item_price,res_cat_skew_item_cnt_day
0,1549474,15,299.0,1.0,54-18394,4048,759.170188,1.358696,1337.13421,4.963847,1337.13421,22990.0,300.0,5.0,-1.0,9.849766
1,1549475,15,299.0,1.0,54-18394,4048,759.170188,1.358696,1337.13421,4.963847,1337.13421,22990.0,300.0,5.0,-1.0,9.849766
2,1549482,15,299.0,1.0,54-18394,4048,759.170188,1.358696,1337.13421,4.963847,1337.13421,22990.0,300.0,5.0,-1.0,9.849766
3,1562915,15,299.0,1.0,57-18394,3470,751.79267,1.251009,1703.931685,1.109768,1703.931685,23990.0,21.0,5.0,-1.0,9.666128
4,1564393,15,299.0,1.0,24-18394,1429,992.354084,1.172848,1806.535835,1.011941,1806.535835,22990.0,31.0,5.0,-1.0,8.068218


## EntitySet Processing (Recommended by FeatureTools)

In [20]:
es = ft.EntitySet(id="prediction_sales")

In [21]:
es = es.entity_from_dataframe(entity_id='sales',dataframe=sales_month_df, index='index')

In [22]:
es = es.normalize_entity(base_entity_id='sales',
                         new_entity_id='ids',
                         index='ID_pair',
                         additional_variables=["date_block_num",
                                               "res_cat_count",
                                               "res_cat_mean_item_price",
                                               "res_cat_mean_item_cnt_day",
                                               "res_cat_std_item_price",
                                               "res_cat_std_item_cnt_day",
                                               "res_cat_max_item_price",
                                               "res_cat_max_item_cnt_day",
                                               "res_cat_min_item_price",
                                               "res_cat_min_item_cnt_day",
                                               "res_cat_skew_item_price",
                                               "res_cat_skew_item_cnt_day"]
                        )

In [23]:
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='ids')

In [40]:
#feature_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44740 entries, 10-10089 to 7-9929
Data columns (total 25 columns):
date_block_num               44740 non-null int64
res_cat_count                44740 non-null int64
res_cat_mean_item_price      44740 non-null float64
res_cat_mean_item_cnt_day    44740 non-null float64
res_cat_std_item_price       44740 non-null float64
res_cat_std_item_cnt_day     44740 non-null float64
res_cat_max_item_price       44740 non-null float64
res_cat_max_item_cnt_day     44740 non-null float64
res_cat_min_item_price       44740 non-null float64
res_cat_min_item_cnt_day     44740 non-null float64
res_cat_skew_item_price      44740 non-null float64
res_cat_skew_item_cnt_day    44740 non-null float64
SUM(sales.item_price)        44740 non-null float64
SUM(sales.item_cnt_day)      44740 non-null float64
STD(sales.item_price)        13913 non-null float64
STD(sales.item_cnt_day)      13913 non-null float64
MAX(sales.item_price)        44740 non-null float64
MAX(sale

In [41]:
#feature_matrix.iloc[0]

# Generating the DataSet 

In [78]:
slots = sales_df['date_block_num'].nunique()

In [110]:
def generateFeatures(months_feature, month_target):
    print('features window:',months_feature,', target:',month_target)
    # TODO Extract from the previous point
    pass

In [111]:
def slidingWindow(size, slots):
    for index in range(size,slots):
        dataset_block = generateFeatures(np.arange(index-size,index),index)
        # TODO stack dataset
        # TODO return completed dataset

## Dataset 1 (5)

In [118]:
slidingWindow(5,slots)

features window: [0 1 2 3 4] , target: 5
features window: [1 2 3 4 5] , target: 6
features window: [2 3 4 5 6] , target: 7
features window: [3 4 5 6 7] , target: 8
features window: [4 5 6 7 8] , target: 9
features window: [5 6 7 8 9] , target: 10
features window: [ 6  7  8  9 10] , target: 11
features window: [ 7  8  9 10 11] , target: 12
features window: [ 8  9 10 11 12] , target: 13
features window: [ 9 10 11 12 13] , target: 14
features window: [10 11 12 13 14] , target: 15
features window: [11 12 13 14 15] , target: 16
features window: [12 13 14 15 16] , target: 17
features window: [13 14 15 16 17] , target: 18
features window: [14 15 16 17 18] , target: 19
features window: [15 16 17 18 19] , target: 20
features window: [16 17 18 19 20] , target: 21
features window: [17 18 19 20 21] , target: 22
features window: [18 19 20 21 22] , target: 23
features window: [19 20 21 22 23] , target: 24
features window: [20 21 22 23 24] , target: 25
features window: [21 22 23 24 25] , target: 26
f

## Dataset 2 (10)

In [122]:
slidingWindow(10,slots)

features window: [0 1 2 3 4 5 6 7 8 9] , target: 10
features window: [ 1  2  3  4  5  6  7  8  9 10] , target: 11
features window: [ 2  3  4  5  6  7  8  9 10 11] , target: 12
features window: [ 3  4  5  6  7  8  9 10 11 12] , target: 13
features window: [ 4  5  6  7  8  9 10 11 12 13] , target: 14
features window: [ 5  6  7  8  9 10 11 12 13 14] , target: 15
features window: [ 6  7  8  9 10 11 12 13 14 15] , target: 16
features window: [ 7  8  9 10 11 12 13 14 15 16] , target: 17
features window: [ 8  9 10 11 12 13 14 15 16 17] , target: 18
features window: [ 9 10 11 12 13 14 15 16 17 18] , target: 19
features window: [10 11 12 13 14 15 16 17 18 19] , target: 20
features window: [11 12 13 14 15 16 17 18 19 20] , target: 21
features window: [12 13 14 15 16 17 18 19 20 21] , target: 22
features window: [13 14 15 16 17 18 19 20 21 22] , target: 23
features window: [14 15 16 17 18 19 20 21 22 23] , target: 24
features window: [15 16 17 18 19 20 21 22 23 24] , target: 25
features window: [

## Dataset 3 (15)

In [123]:
slidingWindow(15,slots)

features window: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] , target: 15
features window: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] , target: 16
features window: [ 2  3  4  5  6  7  8  9 10 11 12 13 14 15 16] , target: 17
features window: [ 3  4  5  6  7  8  9 10 11 12 13 14 15 16 17] , target: 18
features window: [ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18] , target: 19
features window: [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] , target: 20
features window: [ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20] , target: 21
features window: [ 7  8  9 10 11 12 13 14 15 16 17 18 19 20 21] , target: 22
features window: [ 8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] , target: 23
features window: [ 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] , target: 24
features window: [10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] , target: 25
features window: [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] , target: 26
features window: [12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] , target: 27

## Dataset 4 (20)

In [124]:
slidingWindow(20,slots)

features window: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] , target: 20
features window: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20] , target: 21
features window: [ 2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21] , target: 22
features window: [ 3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] , target: 23
features window: [ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] , target: 24
features window: [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] , target: 25
features window: [ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] , target: 26
features window: [ 7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] , target: 27
features window: [ 8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27] , target: 28
features window: [ 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28] , target: 29
features window: [10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29] ,

## Dataset 5 (25)

In [127]:
slidingWindow(25,slots)

features window: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24] , target: 25
features window: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25] , target: 26
features window: [ 2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
 26] , target: 27
features window: [ 3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
 27] , target: 28
features window: [ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 28] , target: 29
features window: [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
 29] , target: 30
features window: [ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 30] , target: 31
features window: [ 7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 31] , target: 32
features window: [ 8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
 32] , target: 33


## Dataset 6 (30) 

In [128]:
slidingWindow(30,slots)

features window: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29] , target: 30
features window: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30] , target: 31
features window: [ 2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
 26 27 28 29 30 31] , target: 32
features window: [ 3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
 27 28 29 30 31 32] , target: 33


# End of Case! 