In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data Loading 

In [None]:
sales_df = pd.read_csv('../datasets/predict-sales/sales_train.csv')
items_df = pd.read_csv('../datasets/predict-sales/items.csv')
test_df = pd.read_csv('../datasets/predict-sales/test.csv')

In [None]:
sales_df.drop(labels=['date'],inplace=True,axis=1)
sales_df = sales_df.reset_index()
items_df.drop(labels=['item_name'],inplace=True,axis=1)

In [None]:
dict_aux = {}

In [None]:
def setPair(x, d):
    """Build the ``'<first>-<second>'`` key for a pair of ids, memoised in ``d``.

    Parameters
    ----------
    x : iterable
        Row-like object whose first two values are the ids to join, e.g. the
        row passed in by ``DataFrame.apply(..., axis=1)`` or a tuple. Any
        further values are ignored.
    d : dict
        Cache mapping each key string to itself. It is updated in place, so
        repeated pairs get back the same string object.

    Returns
    -------
    str
        The cached key, e.g. ``'59-22154'``.
    """
    # Unpack by position instead of x[0] / x[1]: integer keys on a
    # label-indexed Series row are a deprecated positional fallback in pandas.
    first, second, *_ = x
    key = str(first) + '-' + str(second)
    # setdefault replaces the try / bare-except lookup, which would have
    # treated any exception (not only a missing key) as a cache miss.
    return d.setdefault(key, key)

In [None]:
# Vectorised '<shop_id>-<item_id>' key: same strings as calling setPair on every
# row, but without a Python-level function call per row.
sales_df['ID_pair'] = sales_df['shop_id'].astype(str) + '-' + sales_df['item_id'].astype(str)

In [None]:
sales_df = sales_df.merge(items_df)

In [None]:
dict_aux = {}

In [None]:
# Vectorised '<shop_id>-<item_category_id>' key: same strings as calling setPair
# on every row, but without a Python-level function call per row.
sales_df['ID_CAT_pair'] = sales_df['shop_id'].astype(str) + '-' + sales_df['item_category_id'].astype(str)

In [None]:
sales_df.info()

In [None]:
sales_df.head()

In [None]:
sales_df.to_csv('../datasets/predict-sales/sales_train_enriched.csv')

## Direct Load

In [2]:
sales_df = pd.read_csv('../datasets/predict-sales/sales_train_enriched.csv')

In [3]:
# 'Unnamed: 0' is the row index that to_csv wrote into the enriched file.
# errors='ignore' plus reassignment keeps this cell safe to re-run once the
# column is already gone (the in-place drop raised KeyError on a second run).
sales_df = sales_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
sales_df.head()

Unnamed: 0,index,date_block_num,shop_id,item_id,item_price,item_cnt_day,ID_pair,item_category_id,ID_CAT_pair
0,0,0,59,22154,999.0,1.0,59-22154,37,59-37
1,3270,0,24,22154,999.0,1.0,24-22154,37,24-37
2,17081,0,27,22154,999.0,1.0,27-22154,37,27-37
3,25918,0,25,22154,999.0,1.0,25-22154,37,25-37
4,25919,0,25,22154,999.0,1.0,25-22154,37,25-37


# Visualizing Data 

Let's visualize some data from the test set:

In [None]:
def drawTimeSerie(df, sample, n):
    """Plot the per-month sales of the first ``n`` shop/item pairs in ``sample``.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales records with 'shop_id', 'item_id', 'date_block_num' and
        'item_cnt_day' columns.
    sample : pandas.DataFrame
        Rows providing the 'shop_id' and 'item_id' of each pair to plot; only
        the first ``n`` rows are read.
    n : int
        Number of pairs to draw, one stacked panel per pair.

    Returns
    -------
    None
        The panels are drawn on a single new matplotlib figure.
    """
    if n <= 0:
        # Nothing to draw; avoid opening a zero-height figure.
        return
    # Create the figure once. Calling plt.figure inside the loop opened a new
    # n*5-inch-tall figure per pair, each holding a single panel.
    plt.figure(figsize=[10,n*5])
    for i in range(n):
        pair = sample.iloc[i]
        shop_id = pair['shop_id']
        item_id = pair['item_id']
        id_df = df[(df['shop_id'] == shop_id) & (df['item_id'] == item_id)]
        # Sum the daily counts within each month block to get one point per month.
        id_df_grouped = id_df[['date_block_num','item_cnt_day']].groupby('date_block_num').sum().reset_index()
        plt.subplot(n,1,i+1)
        plt.plot(id_df_grouped['date_block_num'], id_df_grouped['item_cnt_day'],'*-',)
        plt.title(str(shop_id) + '-' + str(item_id))

In [None]:
n_samples = 10
# Fixed seed so the same shop/item pairs are drawn (and plotted) on every run.
sample = test_df.sample(n=n_samples, axis=0, random_state=42)

In [None]:
drawTimeSerie(sales_df, sample, n_samples)

# Preparing Data with FeatureTools (testing)

In [9]:
import featuretools as ft

Let's evaluate the FT process 

## Pre-processing

In [None]:
month_id = 9

In [None]:
# .copy() makes this an independent frame, so the in-place column drop applied
# to it in a later cell does not trigger SettingWithCopyWarning.
sales_month_df = sales_df[sales_df['date_block_num'] == month_id].copy()

In [None]:
sales_month_df.head()

In [None]:
sales_month_df.drop(labels=['shop_id','item_id','item_category_id'], inplace=True, axis=1)

In [None]:
sales_month_df.head()

## EntitySet Processing (Recommended by FeatureTools)

In [None]:
es = ft.EntitySet(id="prediction_sales")

In [None]:
es = es.entity_from_dataframe(entity_id='sales',dataframe=sales_month_df, index='index')

In [None]:
es = es.normalize_entity(base_entity_id='sales',
                         new_entity_id='idsCat',
                         index='ID_CAT_pair')

In [None]:
feature_matrix_idsCat, feature_defs_idsCat = ft.dfs(entityset=es, target_entity='idsCat')

In [None]:
feature_matrix_idsCat.info()

In [None]:
idsCat = feature_matrix_idsCat.reset_index()

In [None]:
idsCat_agg = idsCat[['ID_CAT_pair','SUM(sales.item_cnt_day)',
                     'MEAN(sales.item_cnt_day)','MEAN(sales.item_price)',
                     'STD(sales.item_cnt_day)','STD(sales.item_price)',
                     'MAX(sales.item_cnt_day)','MAX(sales.item_price)',
                     'MIN(sales.item_cnt_day)','MIN(sales.item_price)',
                     'SKEW(sales.item_cnt_day)','SKEW(sales.item_price)'
                    ]]

In [None]:
idsCat_agg.columns = ['ID_CAT_pair','sum_shop_cat_sales',
                      'mean_shop_cat_day','mean_shop_cat_item_price',
                      'std_shop_cat_day','std_shop_cat_item_price',
                      'max_shop_cat_day','max_shop_cat_item_price',
                      'min_shop_cat_day','min_shop_cat_item_price',
                      'skew_shop_cat_day','skew_shop_cat_item_price',
                     ]

In [None]:
idsCat_agg.head() 

Re-create the EntitySet:

In [None]:
es = ft.EntitySet(id="prediction_sales")

In [None]:
sales_month_df = sales_month_df.merge(idsCat_agg)

In [None]:
sales_month_df.drop(labels=['ID_CAT_pair'], inplace=True, axis=1)

In [None]:
es = es.entity_from_dataframe(entity_id='sales',dataframe=sales_month_df, index='index')

In [None]:
es = es.normalize_entity(base_entity_id='sales',
                         new_entity_id='ids',
                         index='ID_pair',
                         additional_variables=['date_block_num',
                                               'sum_shop_cat_sales',
                                               'mean_shop_cat_day',
                                               'mean_shop_cat_item_price',
                                               'std_shop_cat_day',
                                               'std_shop_cat_item_price',
                                               'max_shop_cat_day',
                                               'max_shop_cat_item_price',
                                               'min_shop_cat_day',
                                               'min_shop_cat_item_price',
                                               'skew_shop_cat_day',
                                               'skew_shop_cat_item_price'
                                              ]
                        )

In [None]:
feature_matrix_ids, feature_defs_ids = ft.dfs(entityset=es, target_entity='ids')

In [None]:
feature_matrix_ids.info()

In [None]:
feature_matrix_ids.head()

# Generating the DataSet 

In [5]:
slots = sales_df['date_block_num'].nunique()

In [6]:
def generateFeatures(sales_df, months_feature, month_target):
    """Build featuretools features over a window of months plus next-month target.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        Enriched sales table. The columns read here are 'index',
        'date_block_num', 'shop_id', 'item_id', 'item_category_id',
        'item_price', 'item_cnt_day', 'ID_pair' and 'ID_CAT_pair'.
        It is not modified.
    months_feature : iterable of int
        ``date_block_num`` values that make up the feature window.
    month_target : int
        ``date_block_num`` whose summed ``item_cnt_day`` per ``ID_pair``
        becomes the 'target' column.

    Returns
    -------
    pandas.DataFrame
        The deep-feature-synthesis matrix for the 'ids' entity with a 'target'
        column appended by a column-wise ``pd.concat`` (an outer alignment on
        the index, so rows present on only one side carry NaN on the other).
        Presumably indexed by ``ID_pair`` -- confirm against the featuretools
        version in use.
    """
    print('features window:',months_feature,', target:',month_target)

    # Featuretools aggregate column -> feature name at shop/category level.
    # ID_CAT_pair comes first and maps to itself so it survives as merge key.
    shop_cat_names = {
        'ID_CAT_pair': 'ID_CAT_pair',
        'SUM(sales.item_cnt_day)': 'sum_shop_cat_sales',
        'MEAN(sales.item_cnt_day)': 'mean_shop_cat_day',
        'MEAN(sales.item_price)': 'mean_shop_cat_item_price',
        'STD(sales.item_cnt_day)': 'std_shop_cat_day',
        'STD(sales.item_price)': 'std_shop_cat_item_price',
        'MAX(sales.item_cnt_day)': 'max_shop_cat_day',
        'MAX(sales.item_price)': 'max_shop_cat_item_price',
        'MIN(sales.item_cnt_day)': 'min_shop_cat_day',
        'MIN(sales.item_price)': 'min_shop_cat_item_price',
        'SKEW(sales.item_cnt_day)': 'skew_shop_cat_day',
        'SKEW(sales.item_price)': 'skew_shop_cat_item_price',
    }

    # Step 1: Extract features from the months in the window.
    # drop() returns a new frame; the previous in-place drop acted on a
    # filtered slice of sales_df and triggered SettingWithCopyWarning.
    in_window = sales_df['date_block_num'].isin(months_feature)
    sales_window_df = sales_df[in_window].drop(columns=['shop_id','item_id','item_category_id'])

    # First pass: aggregate the sales per shop/category pair.
    es = ft.EntitySet(id="prediction_sales")
    es = es.entity_from_dataframe(entity_id='sales',dataframe=sales_window_df, index='index')
    es = es.normalize_entity(base_entity_id='sales',
                             new_entity_id='idsCat',
                             index='ID_CAT_pair')
    feature_matrix_idsCat, _ = ft.dfs(entityset=es, target_entity='idsCat')
    idsCat_agg = (feature_matrix_idsCat
                  .reset_index()[list(shop_cat_names)]
                  .rename(columns=shop_cat_names))

    # Attach the shop/category aggregates to every sale; the key is explicit
    # rather than inferred from whichever column names happen to overlap.
    sales_window_df = (sales_window_df
                       .merge(idsCat_agg, on='ID_CAT_pair')
                       .drop(columns=['ID_CAT_pair']))

    # Second pass: rebuild the EntitySet and aggregate per shop/item pair,
    # carrying the shop/category aggregates over to the 'ids' entity.
    # NOTE(review): date_block_num can differ between the rows of one ID_pair
    # when the window spans several months -- confirm which value
    # normalize_entity keeps.
    shop_cat_features = [name for name in shop_cat_names.values() if name != 'ID_CAT_pair']
    es = ft.EntitySet(id="prediction_sales")
    es = es.entity_from_dataframe(entity_id='sales',dataframe=sales_window_df, index='index')
    es = es.normalize_entity(base_entity_id='sales',
                             new_entity_id='ids',
                             index='ID_pair',
                             additional_variables=['date_block_num'] + shop_cat_features)
    feature_matrix_ids, _ = ft.dfs(entityset=es, target_entity='ids')

    # Step 2: Extract target (units sold per pair in the target month).
    is_target_month = sales_df['date_block_num'] == month_target
    target = sales_df.loc[is_target_month, ['ID_pair','item_cnt_day']].groupby('ID_pair').sum()
    target.columns = ['target']

    return pd.concat([feature_matrix_ids, target], axis=1, sort=False)
    

In [7]:
def slidingWindow(sales_df, size, slots):
    """Stack the feature/target frames of every sliding window into one frame.

    For each target month ``index`` in ``range(size, slots)`` the ``size``
    months immediately before it form the feature window passed to
    ``generateFeatures``.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        Sales table forwarded unchanged to ``generateFeatures``.
    size : int
        Number of consecutive months in each feature window.
    slots : int
        Exclusive upper bound for the target month.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all per-window frames, each keeping its own
        index; an empty DataFrame when ``size >= slots`` leaves no window.
    """
    # Collect first and concatenate once: growing a DataFrame with pd.concat
    # inside the loop re-copies all accumulated rows on every iteration.
    windows = [
        generateFeatures(sales_df, np.arange(index-size,index), index)
        for index in range(size,slots)
    ]
    if not windows:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(windows, axis=0)

## Dataset Test

In [None]:
dstest = slidingWindow(sales_df, 3,4)

In [11]:
def fragment(df):
    """Split ``df`` row-wise into a first and a second half.

    The cut point is half the row count, rounded down, so with an odd number
    of rows the second half receives the extra row.

    Returns
    -------
    tuple
        ``(first_half, second_half)`` obtained by slicing ``df``.
    """
    total = df.shape[0]
    half = int(total / 2)
    return df[:half], df[half:total]

## Dataset 1

In [10]:
dataset1 = slidingWindow(sales_df, 3,slots)
dataset1A, dataset1B = fragment(dataset1)

features window: [0 1 2] , target: 3
features window: [1 2 3] , target: 4
features window: [2 3 4] , target: 5
features window: [3 4 5] , target: 6
features window: [4 5 6] , target: 7
features window: [5 6 7] , target: 8
features window: [6 7 8] , target: 9
features window: [7 8 9] , target: 10
features window: [ 8  9 10] , target: 11
features window: [ 9 10 11] , target: 12
features window: [10 11 12] , target: 13
features window: [11 12 13] , target: 14
features window: [12 13 14] , target: 15
features window: [13 14 15] , target: 16
features window: [14 15 16] , target: 17
features window: [15 16 17] , target: 18
features window: [16 17 18] , target: 19
features window: [17 18 19] , target: 20
features window: [18 19 20] , target: 21
features window: [19 20 21] , target: 22
features window: [20 21 22] , target: 23
features window: [21 22 23] , target: 24
features window: [22 23 24] , target: 25
features window: [23 24 25] , target: 26
features window: [24 25 26] , target: 27
featur

NameError: name 'fragment' is not defined

In [13]:
dataset1A.to_csv('../datasets/predict-sales/dataset1A.csv')
dataset1B.to_csv('../datasets/predict-sales/dataset1B.csv')

## Dataset 2

In [None]:
dataset2 = slidingWindow(sales_df, 6,slots)
dataset2A, dataset2B = fragment(dataset2)

features window: [0 1 2 3 4 5] , target: 6
features window: [1 2 3 4 5 6] , target: 7
features window: [2 3 4 5 6 7] , target: 8
features window: [3 4 5 6 7 8] , target: 9
features window: [4 5 6 7 8 9] , target: 10
features window: [ 5  6  7  8  9 10] , target: 11
features window: [ 6  7  8  9 10 11] , target: 12
features window: [ 7  8  9 10 11 12] , target: 13
features window: [ 8  9 10 11 12 13] , target: 14
features window: [ 9 10 11 12 13 14] , target: 15
features window: [10 11 12 13 14 15] , target: 16
features window: [11 12 13 14 15 16] , target: 17
features window: [12 13 14 15 16 17] , target: 18


In [None]:
dataset2A.to_csv('../datasets/predict-sales/dataset2A.csv')
dataset2B.to_csv('../datasets/predict-sales/dataset2B.csv')

## Dataset 3

In [None]:
dataset3 = slidingWindow(sales_df, 9,slots)
dataset3A, dataset3B = fragment(dataset3)

In [None]:
dataset3A.to_csv('../datasets/predict-sales/dataset3A.csv')
dataset3B.to_csv('../datasets/predict-sales/dataset3B.csv')

## Dataset 4

In [None]:
dataset4 = slidingWindow(sales_df, 12,slots)
dataset4A, dataset4B = fragment(dataset4)

In [None]:
dataset4A.to_csv('../datasets/predict-sales/dataset4A.csv')
dataset4B.to_csv('../datasets/predict-sales/dataset4B.csv')

## Dataset 5

In [None]:
dataset5 = slidingWindow(sales_df, 15,slots)
dataset5A, dataset5B = fragment(dataset5)

In [None]:
dataset5A.to_csv('../datasets/predict-sales/dataset5A.csv')
dataset5B.to_csv('../datasets/predict-sales/dataset5B.csv')

## Dataset 6

In [None]:
dataset6 = slidingWindow(sales_df, 18,slots)
dataset6A, dataset6B = fragment(dataset6)

In [None]:
dataset6A.to_csv('../datasets/predict-sales/dataset6A.csv')
dataset6B.to_csv('../datasets/predict-sales/dataset6B.csv')

# End of Case! 