In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data Loading 

In [None]:
# Read the three competition CSVs (daily sales, item metadata, test pairs) in
# one pass; the tuple order matches the names on the left-hand side.
sales_df, items_df, test_df = map(pd.read_csv, (
    '../../datasets/predict-sales/sales_train.csv',
    '../../datasets/predict-sales/items.csv',
    '../../datasets/predict-sales/test.csv',
))

In [None]:
# Remove the columns that are not used below. reset_index() turns the row
# index into an explicit 'index' column, which the featuretools 'sales'
# entity later uses as its index.
sales_df = sales_df.drop(columns=['date']).reset_index()
items_df = items_df.drop(columns=['item_name'])

In [None]:
# Column dtypes, non-null counts and memory footprint of the sales frame.
sales_df.info()

In [None]:
# Preview the first rows of the sales frame after the column clean-up.
sales_df.head()

# Visualizing Data 

Let's plot the monthly sales history (taken from the training data) for a random sample of shop–item pairs drawn from the test set:

In [None]:
def drawTimeSerie(df, sample, n):
    """Plot the sales series of the first ``n`` shop/item pairs in ``sample``.

    For each pair, the rows of ``df`` matching that ``shop_id`` and
    ``item_id`` are summed per ``date_block_num`` and drawn as one panel.
    All panels are stacked vertically in a single figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales records with ``shop_id``, ``item_id``, ``date_block_num`` and
        ``item_cnt_day`` columns.
    sample : pandas.DataFrame
        Rows with ``shop_id`` and ``item_id`` columns; read positionally.
    n : int
        Number of pairs to plot. Clamped to ``len(sample)``; nothing is
        drawn when the result is not positive.

    Returns
    -------
    None
        The figure is left to the active matplotlib backend to display.
    """
    # Never index past the end of `sample`, and do not open an empty figure.
    n = min(n, len(sample))
    if n <= 0:
        return

    # Create the figure ONCE. Calling plt.figure() inside the loop would open
    # a new, mostly empty figure of height n*5 for every pair.
    _, axes = plt.subplots(n, 1, figsize=(10, n * 5), squeeze=False)

    for i in range(n):
        row = sample.iloc[i]
        shop_id, item_id = row['shop_id'], row['item_id']

        pair_rows = df[(df['shop_id'] == shop_id) & (df['item_id'] == item_id)]
        per_block = (
            pair_rows[['date_block_num', 'item_cnt_day']]
            .groupby('date_block_num')
            .sum()
            .reset_index()
        )

        ax = axes[i, 0]
        ax.plot(per_block['date_block_num'], per_block['item_cnt_day'], '*-')
        ax.set_title(str(shop_id) + '-' + str(item_id))

In [None]:
n_samples = 10
# Fixed random_state so that Restart & Run All always draws (and therefore
# plots) the same shop/item pairs.
sample = test_df.sample(n=n_samples, random_state=42, axis=0)

In [None]:
# Plot the per-date_block_num sales totals for each sampled shop/item pair.
drawTimeSerie(sales_df, sample, n_samples)

# Structure of Data

In [None]:
import featuretools as ft

Let's go through two processing stages: pre-processing the sales data, and building a featuretools EntitySet from it.

## Pre-processing

In [None]:
# date_block_num value of the single month that the cells below analyse.
month_id = 15

In [None]:
# Keep only the rows of the selected month. .copy() makes the result an
# independent frame: the following cells add and drop columns on it, which on
# a bare boolean-mask slice triggers pandas' SettingWithCopyWarning.
sales_month_df = sales_df[sales_df['date_block_num'] == month_id].copy()

In [None]:
# Build the '<shop_id>-<item_id>' key with vectorised string concatenation
# instead of a row-wise apply. The apply version relied on positional x[0]/x[1]
# lookups on a label-indexed Series, which pandas 2.x deprecates.
# assign() returns a new frame, so no slice of sales_df is written to.
sales_month_df = sales_month_df.assign(
    ID_pair=sales_month_df['shop_id'].astype(str)
    + '-'
    + sales_month_df['item_id'].astype(str)
)

In [None]:
# Remove the shop_id column from the month frame.
sales_month_df = sales_month_df.drop(columns=['shop_id'])

In [None]:
# Preview the month frame: ID_pair added, shop_id removed.
sales_month_df.head()

## EntitySet Processing (Recommended by FeatureTools)

In [None]:
# Create the featuretools EntitySet that the following cells populate.
# NOTE(review): the entity_from_dataframe / variable_types calls used below
# look like the pre-1.0 featuretools API — confirm the pinned version.
es = ft.EntitySet(id="prediction_sales")

In [None]:
# Register the selected month's sales rows as the 'sales' entity, using the
# 'index' column (created by reset_index() on sales_df) as its index.
es = es.entity_from_dataframe(entity_id='sales',dataframe=sales_month_df, index='index')

In [None]:
# Register the item table as the 'items' entity, indexed by item_id, with
# item_category_id explicitly typed as Categorical.
es = es.entity_from_dataframe(entity_id='items',dataframe=items_df, index='item_id',variable_types={'item_category_id': ft.variable_types.Categorical})

In [None]:
# Link items.item_id to sales.item_id.
# NOTE(review): presumably the first argument is the parent variable and the
# second the child — confirm against the installed featuretools version.
new_relationship = ft.Relationship(es['items']["item_id"], es['sales']['item_id'])

In [None]:
# Attach the items -> sales relationship to the EntitySet.
es = es.add_relationship(new_relationship)

In [None]:
# Derive a new 'ids' entity from 'sales', indexed by the ID_pair key built
# during pre-processing.
es = es.normalize_entity(base_entity_id='sales',
                         new_entity_id='ids',
                         index='ID_pair'
                        )

In [None]:
# Run Deep Feature Synthesis with 'ids' as the target entity; returns the
# computed feature matrix and the list of feature definitions.
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='ids')

In [None]:
# Preview the rows whose COUNT(sales) feature is greater than 1.
feature_matrix[feature_matrix['COUNT(sales)'] > 1].head()

In [None]:
# Distribution of the COUNT(sales) feature.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot — switch once the seaborn version in use is confirmed.
sns.distplot(feature_matrix['COUNT(sales)'])

# End of Case! 