In [1]:
import numpy as np
import pandas as pd

#read the provided sales transactions file and preview the first rows
transactions = pd.read_csv(filepath_or_buffer='data/provided/sales_train_v2.csv')
transactions.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [2]:
#sum item_cnt_day within each (date_block_num, shop_id, item_id) group;
#as_index=False keeps the three keys as ordinary columns
monthly_totals = (
    transactions
    .groupby(by=['date_block_num', 'shop_id', 'item_id'], as_index=False)[['item_cnt_day']]
    .sum()
)
#keep a copy of the aggregate on disk
monthly_totals.to_csv(path_or_buf='data/date_shop_item_totals.csv', index=False)
monthly_totals.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day
0,0,0,32,6.0
1,0,0,33,3.0
2,0,0,35,1.0
3,0,0,43,1.0
4,0,0,51,2.0


In [3]:
def get_shifted_monthly_totals(df, month_idx):
    """Return a copy of ``df`` whose ``date_block_num`` is reduced by ``month_idx``.

    The input frame is left untouched. A negative ``month_idx`` moves every
    row to a later block (``-1`` relabels block 0 as block 1), so joining the
    result back on ``date_block_num`` lines up earlier totals with later rows.
    A positive ``month_idx`` does the opposite.
    """
    shifted_blocks = df['date_block_num'] - month_idx
    return df.assign(date_block_num=shifted_blocks)

#quick look: with month_idx=-1 every total is relabelled one block later
df = get_shifted_monthly_totals(df=monthly_totals, month_idx=-1)
df.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day
0,1,0,32,6.0
1,1,0,33,3.0
2,1,0,35,1.0
3,1,0,43,1.0
4,1,0,51,2.0


In [4]:
#start the training table from the item-level monthly totals; "t" is the
#total for the row's own date_block_num.
#rename() returns a new frame, so monthly_totals itself is not modified, and
#only the column is renamed: the row index is left as integers instead of
#being converted label by label to strings
training_data = monthly_totals.rename(columns={"item_cnt_day": "t"})
training_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t
0,0,0,32,6.0
1,0,0,33,3.0
2,0,0,35,1.0
3,0,0,43,1.0
4,0,0,51,2.0


In [5]:
#add previous month's totals to training data
#a shift of -1 relabels block b as b+1, so the left join attaches the total
#from block b-1 to each row of block b (NaN when no such record exists).
#the shifted column is named "tm1" before the join; the row index is not touched
prior_totals = get_shifted_monthly_totals(monthly_totals, -1)
training_data = training_data.merge(
    prior_totals.rename(columns={"item_cnt_day": "tm1"}),
    on=['date_block_num', 'shop_id', 'item_id'],
    how='left',
)
training_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1
0,0,0,32,6.0,
1,0,0,33,3.0,
2,0,0,35,1.0,
3,0,0,43,1.0,
4,0,0,51,2.0,


In [6]:
#add the totals from two blocks earlier as "tm2"
#(shift of -2: block b is relabelled b+2, so block b receives the total of b-2).
#the shifted column is named before the join; the row index is not touched
prior_totals = get_shifted_monthly_totals(monthly_totals, -2)
training_data = training_data.merge(
    prior_totals.rename(columns={"item_cnt_day": "tm2"}),
    on=['date_block_num', 'shop_id', 'item_id'],
    how='left',
)
training_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2
0,0,0,32,6.0,,
1,0,0,33,3.0,,
2,0,0,35,1.0,,
3,0,0,43,1.0,,
4,0,0,51,2.0,,


In [7]:
#add the totals from eleven blocks earlier as "tm11"
#(shift of -11: block b receives the total of b-11, which is twelve blocks
# before b+1, the block that the "tp1" column is taken from).
#the shifted column is named before the join; the row index is not touched
prior_totals = get_shifted_monthly_totals(monthly_totals, -11)
training_data = training_data.merge(
    prior_totals.rename(columns={"item_cnt_day": "tm11"}),
    on=['date_block_num', 'shop_id', 'item_id'],
    how='left',
)
training_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm11
0,0,0,32,6.0,,,
1,0,0,33,3.0,,,
2,0,0,35,1.0,,,
3,0,0,43,1.0,,,
4,0,0,51,2.0,,,


In [8]:
#add the following block's totals as "tp1"
#(shift of +1: block b is relabelled b-1, so block b receives the total of
# b+1; NaN when the same shop/item has no record in the following block).
#the shifted column is named before the join; the row index is not touched
next_totals = get_shifted_monthly_totals(monthly_totals, 1)
training_data = training_data.merge(
    next_totals.rename(columns={"item_cnt_day": "tp1"}),
    on=['date_block_num', 'shop_id', 'item_id'],
    how='left',
)
training_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm11,tp1
0,0,0,32,6.0,,,,10.0
1,0,0,33,3.0,,,,3.0
2,0,0,35,1.0,,,,14.0
3,0,0,43,1.0,,,,
4,0,0,51,2.0,,,,3.0


In [9]:
#report the table's dimensions, then save it to file (row index not written)
print(training_data.shape)
training_data.to_csv(path_or_buf='data/train_item_prior_y.csv', index=False)

(1609124, 8)


In [10]:
#only records from 2014 onwards (date_block_num >= 12) can be used for training
#replace NaN with 0.0 in the same expression: fillna() then returns a fresh
#frame rather than modifying the result of a filter in place
training_data = training_data.loc[training_data['date_block_num'] >= 12].fillna(0.0)
print(training_data.shape)

(921400, 8)


### Categories

In [11]:
#add in item_category_id
items = pd.read_csv(filepath_or_buffer='data/provided/items.csv')
#left join on item_id, then discard the item_name column that comes with it
monthly_categ_totals = (
    monthly_totals
    .merge(items, how='left', on=['item_id'])
    .drop(columns='item_name')
)
monthly_categ_totals.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day,item_category_id
0,0,0,32,6.0,40
1,0,0,33,3.0,37
2,0,0,35,1.0,40
3,0,0,43,1.0,40
4,0,0,51,2.0,57


In [12]:
#sum item_cnt_day within each (date_block_num, shop_id, item_category_id)
#group, keeping the keys as ordinary columns
monthly_categ_totals = (
    monthly_categ_totals
    .groupby(by=['date_block_num', 'shop_id', 'item_category_id'], as_index=False)[['item_cnt_day']]
    .sum()
)
monthly_categ_totals.to_csv(path_or_buf='data/date_shop_categ_totals.csv', index=False)
monthly_categ_totals.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_category_id,item_cnt_day
0,0,0,2,53.0
1,0,0,3,28.0
2,0,0,4,16.0
3,0,0,5,28.0
4,0,0,6,65.0


In [13]:
#start the category feature table; "categ_t" is the category total for the
#row's own date_block_num.
#rename() returns a new frame, so monthly_categ_totals is not modified, and
#only the column is renamed: the row index is left as integers instead of
#being converted to strings
categ_data = monthly_categ_totals.rename(columns={"item_cnt_day": "categ_t"})
categ_data.head()

Unnamed: 0,date_block_num,shop_id,item_category_id,categ_t
0,0,0,2,53.0
1,0,0,3,28.0
2,0,0,4,16.0
3,0,0,5,28.0
4,0,0,6,65.0


In [14]:
#add previous totals to the category table: for each lag, relabel the totals
#that many blocks later and left-join them on under the lag's column name
categ_keys = ['date_block_num', 'shop_id', 'item_category_id']
for lag, lag_column in [(1, 'categ_tm1'), (2, 'categ_tm2'), (11, 'categ_tm11')]:
    prior_totals = get_shifted_monthly_totals(monthly_categ_totals, -lag)
    categ_data = categ_data.merge(
        prior_totals.rename(columns={"item_cnt_day": lag_column}),
        on=categ_keys,
        how='left',
    )

#keep blocks 12 and later, and replace NaN with 0.0 in the same expression so
#the fill is not performed in place on the result of a filter
categ_data = categ_data.loc[categ_data['date_block_num'] >= 12].fillna(0.0)
print(categ_data.shape)
categ_data.head()

(42899, 7)


Unnamed: 0,date_block_num,shop_id,item_category_id,categ_t,categ_tm1,categ_tm2,categ_tm11
22290,12,2,2,23.0,35.0,11.0,21.0
22291,12,2,3,15.0,50.0,16.0,6.0
22292,12,2,6,18.0,24.0,7.0,8.0
22293,12,2,11,5.0,17.0,4.0,2.0
22294,12,2,15,2.0,5.0,5.0,2.0


In [15]:
#look up each item's category; item_name comes along with the join and is dropped
training_data = (
    training_data
    .merge(items, how='left', on=['item_id'])
    .drop(columns='item_name')
)
training_data.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm11,tp1,item_category_id
0,12,2,32,1.0,0.0,0.0,0.0,0.0,40
1,12,2,33,1.0,1.0,2.0,0.0,0.0,37
2,12,2,99,1.0,0.0,0.0,0.0,0.0,37
3,12,2,482,2.0,1.0,2.0,1.0,1.0,73
4,12,2,485,1.0,1.0,0.0,0.0,1.0,73


In [16]:
#attach the category-level total and lag columns to every item row
training_data = pd.merge(
    training_data,
    categ_data,
    how='left',
    on=['date_block_num', 'shop_id', 'item_category_id'],
)
training_data.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm11,tp1,item_category_id,categ_t,categ_tm1,categ_tm2,categ_tm11
0,12,2,32,1.0,0.0,0.0,0.0,0.0,40,76.0,93.0,49.0,40.0
1,12,2,33,1.0,1.0,2.0,0.0,0.0,37,44.0,55.0,42.0,21.0
2,12,2,99,1.0,0.0,0.0,0.0,0.0,37,44.0,55.0,42.0,21.0
3,12,2,482,2.0,1.0,2.0,1.0,1.0,73,4.0,3.0,3.0,10.0
4,12,2,485,1.0,1.0,0.0,0.0,1.0,73,4.0,3.0,3.0,10.0


### Shops

In [17]:
#aggregate records based on date_block, shop
#(the per-category totals are summed again, one row per block and shop)
monthly_shop_totals = (
    monthly_categ_totals
    .groupby(by=['date_block_num', 'shop_id'], as_index=False)[['item_cnt_day']]
    .sum()
)
monthly_shop_totals.to_csv(path_or_buf='data/date_shop_totals.csv', index=False)
monthly_shop_totals.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_cnt_day
0,0,0,5578.0
1,0,1,2947.0
2,0,2,1146.0
3,0,3,767.0
4,0,4,2114.0


In [18]:
#start the shop feature table; "shop_t" is the shop total for the row's own
#date_block_num.
#rename() returns a new frame, so monthly_shop_totals is not modified, and
#only the column is renamed: the row index is left as integers instead of
#being converted to strings
shop_data = monthly_shop_totals.rename(columns={"item_cnt_day": "shop_t"})
shop_data.head()

Unnamed: 0,date_block_num,shop_id,shop_t
0,0,0,5578.0
1,0,1,2947.0
2,0,2,1146.0
3,0,3,767.0
4,0,4,2114.0


In [19]:
#add previous totals to the shop table: for each lag, relabel the totals that
#many blocks later and left-join them on under the lag's column name
shop_keys = ['date_block_num', 'shop_id']
for lag, lag_column in [(1, 'shop_tm1'), (2, 'shop_tm2'), (11, 'shop_tm11')]:
    prior_totals = get_shifted_monthly_totals(monthly_shop_totals, -lag)
    shop_data = shop_data.merge(
        prior_totals.rename(columns={"item_cnt_day": lag_column}),
        on=shop_keys,
        how='left',
    )

#keep blocks 12 and later, and replace NaN with 0.0 in the same expression so
#the fill is not performed in place on the result of a filter
shop_data = shop_data.loc[shop_data['date_block_num'] >= 12].fillna(0.0)
print(shop_data.shape)
shop_data.head()

(1039, 6)


Unnamed: 0,date_block_num,shop_id,shop_t,shop_tm1,shop_tm2,shop_tm11
547,12,2,890.0,1322.0,862.0,488.0
548,12,3,968.0,1134.0,970.0,798.0
549,12,4,1430.0,2248.0,1486.0,2025.0
550,12,5,1639.0,2223.0,1390.0,877.0
551,12,6,3024.0,5467.0,3938.0,4007.0


In [20]:
#attach the shop-level total and lag columns to every item row
training_data = pd.merge(
    training_data,
    shop_data,
    how='left',
    on=['date_block_num', 'shop_id'],
)
training_data.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm11,tp1,item_category_id,categ_t,categ_tm1,categ_tm2,categ_tm11,shop_t,shop_tm1,shop_tm2,shop_tm11
0,12,2,32,1.0,0.0,0.0,0.0,0.0,40,76.0,93.0,49.0,40.0,890.0,1322.0,862.0,488.0
1,12,2,33,1.0,1.0,2.0,0.0,0.0,37,44.0,55.0,42.0,21.0,890.0,1322.0,862.0,488.0
2,12,2,99,1.0,0.0,0.0,0.0,0.0,37,44.0,55.0,42.0,21.0,890.0,1322.0,862.0,488.0
3,12,2,482,2.0,1.0,2.0,1.0,1.0,73,4.0,3.0,3.0,10.0,890.0,1322.0,862.0,488.0
4,12,2,485,1.0,1.0,0.0,0.0,1.0,73,4.0,3.0,3.0,10.0,890.0,1322.0,862.0,488.0


### Month and Year

In [21]:
#change date_block_num to year and month
def get_year_from_date_block_num(date_block_num, base_year=2013):
    """Return the calendar year that a month block falls in.

    Block 0 is the first month of ``base_year`` and every 12 blocks advance
    the year by one: with the default base, blocks 0-11 give 2013, 12-23 give
    2014, 24-35 give 2015, 36-47 give 2016, and so on. Block numbers are
    expected to be non-negative.

    The result is always a plain ``int``, including when the block number
    arrives as a float.
    """
    return base_year + int(date_block_num // 12)

def get_month_from_date_block_num(date_block_num):
    """Return the position of a month block within its year (0-11) as an int."""
    _, month_index = divmod(date_block_num, 12)
    return int(month_index)

#spot-check both helpers on block 33
print(
    'date block: 33 =',
    get_year_from_date_block_num(33),
    get_month_from_date_block_num(33),
)


date block: 33 = 2015 9


In [22]:
def get_year_for_row(row):
    """Return the year for one record, read from its 'date_block_num' field."""
    return get_year_from_date_block_num(row['date_block_num'])

#convert the date_block_num column directly instead of visiting every row of
#the frame with DataFrame.apply(axis=1); the resulting values are the same
training_data['year'] = training_data['date_block_num'].map(get_year_from_date_block_num)
training_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm11,tp1,item_category_id,categ_t,categ_tm1,categ_tm2,categ_tm11,shop_t,shop_tm1,shop_tm2,shop_tm11,year
0,12,2,32,1.0,0.0,0.0,0.0,0.0,40,76.0,93.0,49.0,40.0,890.0,1322.0,862.0,488.0,2014
1,12,2,33,1.0,1.0,2.0,0.0,0.0,37,44.0,55.0,42.0,21.0,890.0,1322.0,862.0,488.0,2014
2,12,2,99,1.0,0.0,0.0,0.0,0.0,37,44.0,55.0,42.0,21.0,890.0,1322.0,862.0,488.0,2014
3,12,2,482,2.0,1.0,2.0,1.0,1.0,73,4.0,3.0,3.0,10.0,890.0,1322.0,862.0,488.0,2014
4,12,2,485,1.0,1.0,0.0,0.0,1.0,73,4.0,3.0,3.0,10.0,890.0,1322.0,862.0,488.0,2014


In [23]:
def get_month_for_row(row):
    """Return the 0-11 month index for one record, read from its 'date_block_num' field."""
    return get_month_from_date_block_num(row['date_block_num'])

#derive the month from training_data's own date_block_num. Computing it from
#monthly_totals would pair values up by row label, and the two frames do not
#hold the same records under the same labels (training_data has been filtered
#and re-merged), so rows would receive the month of an unrelated record.
training_data['month'] = training_data['date_block_num'].map(get_month_from_date_block_num)
training_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm11,tp1,item_category_id,categ_t,categ_tm1,categ_tm2,categ_tm11,shop_t,shop_tm1,shop_tm2,shop_tm11,year,month
0,12,2,32,1.0,0.0,0.0,0.0,0.0,40,76.0,93.0,49.0,40.0,890.0,1322.0,862.0,488.0,2014,0
1,12,2,33,1.0,1.0,2.0,0.0,0.0,37,44.0,55.0,42.0,21.0,890.0,1322.0,862.0,488.0,2014,0
2,12,2,99,1.0,0.0,0.0,0.0,0.0,37,44.0,55.0,42.0,21.0,890.0,1322.0,862.0,488.0,2014,0
3,12,2,482,2.0,1.0,2.0,1.0,1.0,73,4.0,3.0,3.0,10.0,890.0,1322.0,862.0,488.0,2014,0
4,12,2,485,1.0,1.0,0.0,0.0,1.0,73,4.0,3.0,3.0,10.0,890.0,1322.0,862.0,488.0,2014,0


In [24]:
#write the full feature table (date_block_num still included) to disk
training_data.to_csv(path_or_buf='data/td_all.csv', index=False)
training_data.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_id,t,tm1,tm2,tm11,tp1,item_category_id,categ_t,categ_tm1,categ_tm2,categ_tm11,shop_t,shop_tm1,shop_tm2,shop_tm11,year,month
0,12,2,32,1.0,0.0,0.0,0.0,0.0,40,76.0,93.0,49.0,40.0,890.0,1322.0,862.0,488.0,2014,0
1,12,2,33,1.0,1.0,2.0,0.0,0.0,37,44.0,55.0,42.0,21.0,890.0,1322.0,862.0,488.0,2014,0
2,12,2,99,1.0,0.0,0.0,0.0,0.0,37,44.0,55.0,42.0,21.0,890.0,1322.0,862.0,488.0,2014,0
3,12,2,482,2.0,1.0,2.0,1.0,1.0,73,4.0,3.0,3.0,10.0,890.0,1322.0,862.0,488.0,2014,0
4,12,2,485,1.0,1.0,0.0,0.0,1.0,73,4.0,3.0,3.0,10.0,890.0,1322.0,862.0,488.0,2014,0


In [25]:
#the last month block of 2015 cannot be used, so keep date_block_num < 33;
#in the same step select the output columns (don't need the date_block_num field)
training_data = training_data.loc[
    training_data['date_block_num'] < 33,
    [
        'year', 'month',
        'shop_id', 'shop_t', 'shop_tm1', 'shop_tm2', 'shop_tm11',
        'item_category_id', 'categ_t', 'categ_tm1', 'categ_tm2', 'categ_tm11',
        'item_id', 't', 'tm1', 'tm2', 'tm11', 'tp1',
    ],
]
print(training_data.shape)
training_data.to_csv(path_or_buf='data/training_data.csv', index=False)
training_data.head(n=5)

(889869, 18)


Unnamed: 0,year,month,shop_id,shop_t,shop_tm1,shop_tm2,shop_tm11,item_category_id,categ_t,categ_tm1,categ_tm2,categ_tm11,item_id,t,tm1,tm2,tm11,tp1
0,2014,0,2,890.0,1322.0,862.0,488.0,40,76.0,93.0,49.0,40.0,32,1.0,0.0,0.0,0.0,0.0
1,2014,0,2,890.0,1322.0,862.0,488.0,37,44.0,55.0,42.0,21.0,33,1.0,1.0,2.0,0.0,0.0
2,2014,0,2,890.0,1322.0,862.0,488.0,37,44.0,55.0,42.0,21.0,99,1.0,0.0,0.0,0.0,0.0
3,2014,0,2,890.0,1322.0,862.0,488.0,73,4.0,3.0,3.0,10.0,482,2.0,1.0,2.0,1.0,1.0
4,2014,0,2,890.0,1322.0,862.0,488.0,73,4.0,3.0,3.0,10.0,485,1.0,1.0,0.0,0.0,1.0
