In [1]:
import pandas as pd
import os
import numpy as np
from itertools import product

#os - operating system dependent. to read/write files
#pd - pandas for data analysis/modeling

data_path = 'data/'
dataframes_path = 'dataframes/'

#get DataFrames
#the training set. Daily historical data from January 2013 to October 2015
#sales    = pd.read_csv(os.path.join(data_path, 'sales_train.csv.gz'))
#supplemental information about the items/products
items           = pd.read_csv(os.path.join(data_path, 'items.csv'),encoding ='ISO-8859-1')
#supplemental information about the items categories.
#item_categories = pd.read_csv(os.path.join(data_path, 'item_categories.csv'),encoding ='ISO-8859-1')
#supplemental information about the shops.
#shops           = pd.read_csv(os.path.join(data_path, 'shops.csv'),encoding ='ISO-8859-1')
#Reformated sales into one dataframe in 3.1_CreateMonthlySales
trainingMonthSet    = pd.read_pickle(os.path.join(dataframes_path, 'trainingMonthSet.pickle'))
#Reformated sales into one dataframe in 3.1_CreateMonthlySales
trainingDaySet    = pd.read_pickle(os.path.join(dataframes_path, 'trainingDaySet.pickle'))

In [2]:
trainingMonthSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1609124 entries, 0 to 2935847
Data columns (total 7 columns):
shop_id             1609124 non-null int8
item_id             1609124 non-null int16
date_block_num      1609124 non-null int8
date                1609124 non-null datetime64[ns]
item_category_id    1609124 non-null int8
item_price          1609124 non-null float64
item_cnt_month      1609124 non-null int16
dtypes: datetime64[ns](1), float64(1), int16(2), int8(3)
memory usage: 47.6 MB


<h3>Creating zero sales data</h3>
<p>Creating a dataframe with all the possible shop and item combinations for that particular month, since the sales file only includes items which has any sales or returns it doesnt include data which did not had any sale. Thefore we create a new dataset with those items assuming every shop contains those items</p>

In [3]:
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']
item_prices=[]
# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in trainingMonthSet['date_block_num'].unique():
    cur_shops = trainingMonthSet.loc[trainingMonthSet['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = trainingMonthSet.loc[trainingMonthSet['date_block_num'] == block_num, 'item_id'].unique()   
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

grid.head()

Unnamed: 0,shop_id,item_id,date_block_num
0,59,22154,0
1,59,2552,0
2,59,2554,0
3,59,2555,0
4,59,2564,0


In [4]:
grid.shape

(10913850, 3)

In [5]:
all_data = pd.merge(grid, trainingMonthSet.copy().drop(['item_category_id','item_price'],axis=1) , how='left', on=index_cols).fillna(0)
del grid
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,date,item_cnt_month
0,59,22154,0,2013-01-01 00:00:00,1.0
1,59,2552,0,0,0.0
2,59,2554,0,0,0.0
3,59,2555,0,0,0.0
4,59,2564,0,0,0.0


In [6]:
import datetime
for i in range(0,34):
    all_data.loc[all_data.date_block_num==i,'date']=datetime.date(2013+(int)(i/12), 1+i%12, 1)

In [7]:
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,date,item_cnt_month
0,59,22154,0,2013-01-01,1.0
1,59,2552,0,2013-01-01,0.0
2,59,2554,0,2013-01-01,0.0
3,59,2555,0,2013-01-01,0.0
4,59,2564,0,2013-01-01,0.0


In [8]:
all_data.shape

(10913850, 5)

In [9]:
#all_data.copy().loc[all_data.shop_id==0].loc[all_data.date_block_num==0].groupby(index_cols,as_index=False).agg({'item_cnt_month':'sum'}).astype(np.int32).item_cnt_month.sum()/8115

<h3>Get shop wise, item wise, item-category wise data</h3>

In [10]:
# Groupby data to get shop-month aggregates to get rid of duplicates
gbShopSum = all_data.groupby(['date_block_num','shop_id'],as_index=False).agg({'item_cnt_month':'sum'}).astype(np.int32)
gbShopMean = all_data.groupby(['date_block_num','shop_id'],as_index=False).agg({'item_cnt_month':np.mean})
gbShop = pd.merge(gbShopSum, gbShopMean, how='left', on=['date_block_num','shop_id']).fillna(0)
gbShop.columns = ['date_block_num','shop_id','shop_block_target_sum','shop_block_target_mean']
del gbShopSum,gbShopMean
gbShop.head()

Unnamed: 0,date_block_num,shop_id,shop_block_target_sum,shop_block_target_mean
0,0,0,5578,0.687369
1,0,1,2947,0.363155
2,0,2,1146,0.14122
3,0,3,767,0.094516
4,0,4,2114,0.260505


In [11]:
gbShop.head()

Unnamed: 0,date_block_num,shop_id,shop_block_target_sum,shop_block_target_mean
0,0,0,5578,0.687369
1,0,1,2947,0.363155
2,0,2,1146,0.14122
3,0,3,767,0.094516
4,0,4,2114,0.260505


In [12]:
# Groupby data to get item-month aggregates to get rid of duplicates
gbItemSum = all_data.groupby(['date_block_num','item_id'],as_index=False).agg({'item_cnt_month':'sum'})
gbItemMean = all_data.groupby(['date_block_num','item_id'],as_index=False).agg({'item_cnt_month':np.mean})
gbItem = pd.merge(gbItemSum, gbItemMean, how='left', on=['date_block_num','item_id']).fillna(0)
gbItem.columns = ['date_block_num','item_id','item_block_target_sum','item_block_target_mean']
del gbItemSum,gbItemMean
gbItem.head()

Unnamed: 0,date_block_num,item_id,item_block_target_sum,item_block_target_mean
0,0,19,1.0,0.022222
1,0,27,7.0,0.155556
2,0,28,8.0,0.177778
3,0,29,4.0,0.088889
4,0,32,299.0,6.644444


In [13]:
salesWithCat=pd.merge(all_data,items[['item_id','item_category_id']],on=['item_id'],how='left')
salesWithCat.head()

Unnamed: 0,shop_id,item_id,date_block_num,date,item_cnt_month,item_category_id
0,59,22154,0,2013-01-01,1.0,37
1,59,2552,0,2013-01-01,0.0,58
2,59,2554,0,2013-01-01,0.0,58
3,59,2555,0,2013-01-01,0.0,56
4,59,2564,0,2013-01-01,0.0,59


In [14]:
# Groupby data to get category-month aggregates to get rid of duplicates
gbItemCatSum = salesWithCat.groupby(['date_block_num','item_category_id'],as_index=False).agg({'item_cnt_month':'sum'})
gbItemCatMean = salesWithCat.groupby(['date_block_num','item_category_id'],as_index=False).agg({'item_cnt_month':np.mean})
gbItemCat = pd.merge(gbItemCatSum, gbItemCatMean, how='left', on=['date_block_num','item_category_id']).fillna(0)
gbItemCat.columns = ['date_block_num','item_category_id','item_cat_block_target_sum','item_cat_block_target_mean']
del gbItemCatSum, gbItemCatMean,salesWithCat
gbItemCat.head()

Unnamed: 0,date_block_num,item_category_id,item_cat_block_target_sum,item_cat_block_target_mean
0,0,0,1.0,0.022222
1,0,1,1.0,0.022222
2,0,2,1390.0,0.834835
3,0,3,440.0,4.888889
4,0,4,251.0,0.507071


<h3>Add other features</h3>

In [15]:
all_data['date'] = pd.to_datetime(all_data.date,format='%Y-%m-%d')
all_data['date_block_num']=pd.to_numeric(all_data.date_block_num, downcast='integer')
all_data['shop_id']=pd.to_numeric(all_data.shop_id, downcast='integer')
all_data['item_id']=pd.to_numeric(all_data.item_id, downcast='integer')
all_data['item_cnt_month']=pd.to_numeric(all_data.item_cnt_month, downcast='signed')

In [16]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10913850 entries, 0 to 10913849
Data columns (total 5 columns):
shop_id           int8
item_id           int16
date_block_num    int8
date              datetime64[ns]
item_cnt_month    int16
dtypes: datetime64[ns](1), int16(2), int8(2)
memory usage: 229.0 MB


In [17]:
all_data = pd.merge(all_data, trainingMonthSet[['item_id','date_block_num','item_price']], how='left', on=['date_block_num','item_id']).fillna(0)
del trainingMonthSet
all_data['item_price']=all_data['item_price'].astype(np.float16)
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,date,item_cnt_month,item_price
0,59,22154,0,2013-01-01,1,999.0
1,59,22154,0,2013-01-01,1,999.0
2,59,22154,0,2013-01-01,1,999.0
3,59,22154,0,2013-01-01,1,999.0
4,59,22154,0,2013-01-01,1,999.0


In [18]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75218848 entries, 0 to 75218847
Data columns (total 6 columns):
shop_id           int8
item_id           int16
date_block_num    int8
date              datetime64[ns]
item_cnt_month    int16
item_price        float16
dtypes: datetime64[ns](1), float16(1), int16(2), int8(2)
memory usage: 1.7 GB


In [19]:
# merge with category id
all_data =pd.merge(all_data,items[['item_id','item_category_id']],on=['item_id'],how='left')
del items
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,date,item_cnt_month,item_price,item_category_id
0,59,22154,0,2013-01-01,1,999.0,37
1,59,22154,0,2013-01-01,1,999.0,37
2,59,22154,0,2013-01-01,1,999.0,37
3,59,22154,0,2013-01-01,1,999.0,37
4,59,22154,0,2013-01-01,1,999.0,37


In [20]:
all_data['item_category_id']=pd.to_numeric(all_data.item_category_id, downcast='integer')
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75218848 entries, 0 to 75218847
Data columns (total 7 columns):
shop_id             int8
item_id             int16
date_block_num      int8
date                datetime64[ns]
item_cnt_month      int16
item_price          float16
item_category_id    int8
dtypes: datetime64[ns](1), float16(1), int16(2), int8(3)
memory usage: 1.8 GB


In [None]:
all_data = pd.merge(all_data, gbItemCat, how='left', on=['date_block_num','item_category_id']).fillna(0)
del gbItemCat
all_data.head()

In [None]:
all_data['item_cat_block_target_sum']=pd.to_numeric(all_data.item_category_id, downcast='integer')
all_data['item_cat_block_target_mean']=all_data['item_cat_block_target_mean'].astype(np.float16)
all_data.info()

In [None]:
all_data = pd.merge(all_data, gbItem, how='left', on=['date_block_num','item_id']).fillna(0)
del gbItem
all_data.head()

In [None]:
all_data['item_block_target_sum']=pd.to_numeric(all_data.item_block_target_sum, downcast='integer')
all_data['item_block_target_mean']=all_data['item_block_target_mean'].astype(np.float16)
all_data.info()

In [None]:
gbShop['shop_block_target_sum']=pd.to_numeric(gbShop.shop_block_target_sum, downcast='integer')
gbShop['shop_block_target_sum']=gbShop['shop_block_target_sum'].astype(np.float16)
all_data = pd.merge(all_data, gbShop, how='left', on=['date_block_num','shop_id']).fillna(0)
del gbShop
all_data.head()

In [None]:
all_data.shape

In [None]:
all_data['date'] = pd.to_datetime(all_data.date,format='%Y-%m-%d')
all_data['date_block_num']=pd.to_numeric(all_data.date_block_num, downcast='integer')
all_data['shop_id']=pd.to_numeric(all_data.shop_id, downcast='integer')
all_data['item_id']=pd.to_numeric(all_data.item_id, downcast='integer')
#all_data['item_price']=all_data['item_price'].astype(float)
#all_data['item_cnt_month']=pd.to_numeric(all_data.item_cnt_month, downcast='signed')

In [None]:
all_data.to_pickle(dataframes_path+'saleswithzero.pickle')


In [None]:
all_data.info()