## Provided Datasets

### Cross-reference files

In [8]:
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

shops = pd.read_csv('data/provided/shops.csv', index_col=1)
print('number of shops:', len(shops))
shops.head()

number of shops: 60


Unnamed: 0_level_0,shop_name
shop_id,Unnamed: 1_level_1
0,"!Якутск Орджоникидзе, 56 фран"
1,"!Якутск ТЦ ""Центральный"" фран"
2,"Адыгея ТЦ ""Мега"""
3,"Балашиха ТРК ""Октябрь-Киномир"""
4,"Волжский ТЦ ""Волга Молл"""


In [7]:
categs = pd.read_csv('data/provided/item_categories.csv', index_col=1)
print('number of categories:', len(categs))
categs.head()

number of categories: 84


Unnamed: 0_level_0,item_category_name
item_category_id,Unnamed: 1_level_1
0,PC - Гарнитуры/Наушники
1,Аксессуары - PS2
2,Аксессуары - PS3
3,Аксессуары - PS4
4,Аксессуары - PSP


In [9]:
items = pd.read_csv('data/provided/items.csv', index_col=1)
print('number of items:', len(items))
items.head()

number of items: 22170


Unnamed: 0_level_0,item_name,item_category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40
1,!ABBYY FineReader 12 Professional Edition Full...,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40
4,***КОРОБКА (СТЕКЛО) D,40


### Transactions file

In [13]:
transactions = pd.read_csv('data/provided/sales_train_v2.csv')
print('number of records:', len(transactions))
print('max date_block_num:', transactions['date_block_num'].max())
transactions.head()

shape: (2935849, 6)
max date_block_num: 33


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


### Test file

In [14]:
test_data = pd.read_csv('data/provided/test.csv', index_col=0)
print("number of predictions to make:", len(test_data))
test_data.head()

(214200, 2)


Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


## Data Analysis
Let's aggregate the daily records so we can have a look at the distribution of the monthly counts for each item and shop pairing.

In [15]:
monthly_totals = transactions.groupby(['date_block_num', 'shop_id', 'item_id'],as_index=False)[['item_cnt_day']].sum()
monthly_totals['item_cnt_day'].describe()

count    1.609124e+06
mean     2.267200e+00
std      8.649882e+00
min     -2.200000e+01
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      2.253000e+03
Name: item_cnt_day, dtype: float64

How many montlhy records have a negative value in the item_cnt_day field?

In [28]:
negative_counts = monthly_totals.loc[monthly_totals['item_cnt_day'] < 0]['item_cnt_day']
negative_counts.describe()

count    915.000000
mean      -1.080874
std        0.852346
min      -22.000000
25%       -1.000000
50%       -1.000000
75%       -1.000000
max       -1.000000
Name: item_cnt_day, dtype: float64

In [18]:
shop_item_totals_by_month = pd.pivot_table(monthly_totals, values=['item_cnt_day'], index=['shop_id','item_id'], columns=['date_block_num'], aggfunc=np.sum)
#shop_item_totals_by_month = mothly_totals.pivot(index=['shop_id','item_id'], columns='date_block_num', values='item_cnt_day')
shop_item_totals_by_month.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
Unnamed: 0_level_1,date_block_num,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
shop_id,item_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,30,,31.0,,,,,,,,,...,,,,,,,,,,
0,31,,11.0,,,,,,,,,...,,,,,,,,,,
0,32,6.0,10.0,,,,,,,,,...,,,,,,,,,,
0,33,3.0,3.0,,,,,,,,,...,,,,,,,,,,
0,35,1.0,14.0,,,,,,,,,...,,,,,,,,,,


In [9]:
shop_item_totals_by_month.fillna(0.0, inplace=True)

### Correlation between sales among months

In [19]:
def get_shifted_monthly_totals(df, month_idx):
    totals = df.copy()
    totals['date_block_num'] = totals['date_block_num'] - month_idx
    return totals

In [20]:
item_sales_data = monthly_totals.copy()
item_sales_data.rename(index=str, columns={"item_cnt_day":"t"}, inplace=True)
item_sales_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t
0,0,0,32,6.0
1,0,0,33,3.0
2,0,0,35,1.0
3,0,0,43,1.0
4,0,0,51,2.0


In [21]:
prior_totals = get_shifted_monthly_totals(monthly_totals, -1)
item_sales_data = item_sales_data.merge(prior_totals, on=['date_block_num', 'shop_id', 'item_id'], how='left')
item_sales_data.rename(index=str, columns={"item_cnt_day":"tm1"}, inplace=True)

prior_totals = get_shifted_monthly_totals(monthly_totals, -2)
item_sales_data = item_sales_data.merge(prior_totals, on=['date_block_num', 'shop_id', 'item_id'], how='left')
item_sales_data.rename(index=str, columns={"item_cnt_day":"tm2"}, inplace=True)

prior_totals = get_shifted_monthly_totals(monthly_totals, -3)
item_sales_data = item_sales_data.merge(prior_totals, on=['date_block_num', 'shop_id', 'item_id'], how='left')
item_sales_data.rename(index=str, columns={"item_cnt_day":"tm3"}, inplace=True)

prior_totals = get_shifted_monthly_totals(monthly_totals, -6)
item_sales_data = item_sales_data.merge(prior_totals, on=['date_block_num', 'shop_id', 'item_id'], how='left')
item_sales_data.rename(index=str, columns={"item_cnt_day":"tm6"}, inplace=True)

prior_totals = get_shifted_monthly_totals(monthly_totals, -12)
item_sales_data = item_sales_data.merge(prior_totals, on=['date_block_num', 'shop_id', 'item_id'], how='left')
item_sales_data.rename(index=str, columns={"item_cnt_day":"tm12"}, inplace=True)

item_sales_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm3,tm6,tm12
0,0,0,32,6.0,,,,,
1,0,0,33,3.0,,,,,
2,0,0,35,1.0,,,,,
3,0,0,43,1.0,,,,,
4,0,0,51,2.0,,,,,


In [23]:
item_sales_data = item_sales_data.loc[(item_sales_data['date_block_num'] > 11)]
item_sales_data.drop(['date_block_num','shop_id','item_id'], axis=1, inplace=True)
item_sales_data.fillna(0.0, inplace=True)
item_sales_data.head()

Unnamed: 0,t,tm1,tm2,tm3,tm6,tm12
687724,1.0,0.0,0.0,0.0,0.0,0.0
687725,1.0,1.0,2.0,0.0,0.0,1.0
687726,1.0,0.0,0.0,0.0,0.0,0.0
687727,2.0,1.0,2.0,1.0,1.0,1.0
687728,1.0,1.0,0.0,4.0,2.0,0.0


In [24]:
pd.plotting.scatter_matrix(item_sales_data, alpha = 0.3, figsize = (14,8), diagonal = 'kde');

  """Entry point for launching an IPython kernel.


KeyboardInterrupt: 