In [2]:
% pylab inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
from fbprophet import Prophet
pd.options.display.float_format = '{:.2f}'.format
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [3]:
%%time
# Obtain training data
df_train = pd.read_csv('sales_train_v2.csv')
df_train.date = pd.to_datetime(df_train.date, format = '%d.%m.%Y')
df_train.set_index('date', inplace=True)
df_train = df_train.sort_index()
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2935849 entries, 2013-01-01 to 2015-10-31
Data columns (total 5 columns):
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: float64(2), int64(3)
memory usage: 134.4 MB
None
CPU times: user 8.36 s, sys: 286 ms, total: 8.65 s
Wall time: 8.86 s


In [22]:
# Obtain test data
df_test = pd.read_csv('test.csv')
df_test_shopunique = df_test.shop_id.unique()
df_test_shopunique = np.sort(df_test_shopunique)
print('Number of unique shops in test set: %i' %len(df_test_shopunique))

# Obtain the unique items from test set
df_test_item_unique = df_test.item_id.unique()
df_test_item_unique = np.sort(df_test_item_unique)
print('Number of unique items in test set: %i' %len(df_test_item_unique))

Number of unique shops in test set: 42
Number of unique items in test set: 5100


In [5]:
# Filture training data through shops available in test data
train_shops = df_train.loc[df_train['shop_id'].isin(df_test_shopunique)]
print(train_shops.head())

            date_block_num  shop_id  item_id  item_price  item_cnt_day
date                                                                  
2013-01-01               0       18     5823     2500.00          1.00
2013-01-01               0        7     1006      399.00          1.00
2013-01-01               0       19    17707      899.00          1.00
2013-01-01               0       14    19548      149.00          1.00
2013-01-01               0       28     6468      449.00          1.00


In [6]:
# Obtain total number sold per item from training set
train_item_num = train_shops.groupby(['item_id'])['item_cnt_day'].sum().sort_index()
train_item_num = pd.DataFrame(train_item_num)
train_item_num = train_item_num.rename(index=str, columns={"item_cnt_day": "item_cnt"})
item_num = train_item_num.reset_index().astype('int64')
print(item_num.head())

# Filter the total number sold per item with items appear in test set
    #without .loc[item_num['item_id'].isin(df_test_item_unique)]
top_selling_items = item_num.loc[item_num['item_id'].isin(df_test_item_unique)].sort_values(by=['item_cnt'], ascending=False)
print(top_selling_items.info())
print(top_selling_items.head())

   item_id  item_cnt
0        1         6
1       16         1
2       17         1
3       18         1
4       19         1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4716 entries, 19981 to 5649
Data columns (total 2 columns):
item_id     4716 non-null int64
item_cnt    4716 non-null int64
dtypes: int64(2)
memory usage: 110.5 KB
None
       item_id  item_cnt
19981    20949    154077
2637      2808     13955
3524      3732     13582
16906    17717     13553
5518      5822     12268


In [7]:
# Obtain top 50 most popular items and number sold
top_selling_50 = top_selling_items.head(50)
print(top_selling_50.item_cnt.describe())
top_selling_50 = top_selling_50[['item_id', 'item_cnt']].reset_index().drop('index', axis=1)
print(top_selling_50.head(10))

count       50.00
mean      8903.50
std      21119.30
min       3635.00
25%       4169.00
50%       5075.00
75%       6529.50
max     154077.00
Name: item_cnt, dtype: float64
   item_id  item_cnt
0    20949    154077
1     2808     13955
2     3732     13582
3    17717     13553
4     5822     12268
5     3734      9739
6     3731      9718
7     6675      8657
8     4181      7829
9    16787      7676


The training set is filtered with test set because we can ensure that all of the top selling items in the filtered data have sale record in the last month.

In [8]:
# Calculate weights: obtain total sales of each item in the last month in training set (2015-9), obtain last month sales of item in each shop, 
#    divide each last month sales by total item sales

df_sep = train_shops.loc[train_shops['date_block_num'] == 32].sort_index()
df_sep = df_sep.loc[df_sep['item_id'].isin(np.array(top_selling_50.item_id))]
df_sep = df_sep.groupby(['shop_id','item_id'])['item_cnt_day'].sum().sort_index()
df_sep = df_sep.reset_index()
df_sep = df_sep.rename(index=str, columns={"item_cnt_day": "item_cnt"})
print(df_sep.head())
print(df_sep)

   shop_id  item_id  item_cnt
0        2     2269      1.00
1        2     2308      1.00
2        2     2753      1.00
3        2     2808      1.00
4        2     3331      1.00
      shop_id  item_id  item_cnt
0           2     2269      1.00
1           2     2308      1.00
2           2     2753      1.00
3           2     2808      1.00
4           2     3331      1.00
5           2     3340      1.00
6           2     3341      7.00
7           2     3343      4.00
8           2     3731      6.00
9           2     3732      2.00
10          2     3734      1.00
11          2     3928      1.00
12          2     4181      4.00
13          2     4244      1.00
14          2     4870      2.00
15          2     5672      1.00
16          2     5821      1.00
17          2     5822      4.00
18          2     5823      3.00
19          2     6497      1.00
20          2     6503      2.00
21          2     6738      2.00
22          2     6740      1.00
23          2     7018      

In [9]:
month_sale = train_shops.loc[train_shops['date_block_num'] == 32].sort_index()
month_sale = month_sale.loc[month_sale['item_id'].isin(np.array(top_selling_50.item_id))].sort_index()
month_sale = pd.DataFrame(month_sale.groupby(['item_id'])['item_cnt_day'].sum())
print(month_sale.head())

         item_cnt_day
item_id              
1495            20.00
1905            28.00
1916             9.00
2269            51.00
2308           122.00


In [10]:
df = pd.merge(df_sep, month_sale, on='item_id')
df['weight'] = df['item_cnt'] / df['item_cnt_day']
print(df.head())
print(df.describe())

   shop_id  item_id  item_cnt  item_cnt_day  weight
0        2     2269      1.00         51.00    0.02
1        3     2269      1.00         51.00    0.02
2        5     2269      1.00         51.00    0.02
3        6     2269      1.00         51.00    0.02
4        7     2269      2.00         51.00    0.04
       shop_id  item_id  item_cnt  item_cnt_day  weight
count  1246.00  1246.00   1246.00       1246.00 1246.00
mean     31.29  6830.98      6.60        222.15    0.04
std      17.54  5125.75     25.82        569.07    0.06
min       2.00  1495.00     -1.00          2.00   -0.50
25%      16.00  3343.00      1.00         54.25    0.02
50%      31.00  5672.00      2.00        105.00    0.02
75%      47.00  7893.00      4.00        184.00    0.05
max      59.00 22088.00    473.00       3343.00    1.00


In [11]:
print(df.loc[df['weight'] <= 0])

      shop_id  item_id  item_cnt  item_cnt_day  weight
105        12     2808     -1.00         32.00   -0.03
661         5     6738     -1.00         16.00   -0.06
845        24    10210      0.00        122.00    0.00
1232       12     3329      0.00          8.00    0.00
1245       57     6675     -1.00          2.00   -0.50


There are a few items in different stores that have negative weight due to items returned exceed items sold during the month.

In [12]:
df_weight = df[['shop_id','item_id','item_cnt','weight']].rename(index=str, columns={"item_cnt_day": "month_total"})
print(df_weight.head())

   shop_id  item_id  item_cnt  weight
0        2     2269      1.00    0.02
1        3     2269      1.00    0.02
2        5     2269      1.00    0.02
3        6     2269      1.00    0.02
4        7     2269      2.00    0.04


In [13]:
df_weight = df_weight.sort_values(by=['shop_id','item_id'])
df_weight.head(40)

Unnamed: 0,shop_id,item_id,item_cnt,weight
0,2,2269,1.0,0.02
28,2,2308,1.0,0.01
65,2,2753,1.0,0.01
99,2,2808,1.0,0.03
122,2,3331,1.0,0.01
153,2,3340,1.0,0.25
157,2,3341,7.0,0.03
194,2,3343,4.0,0.02
233,2,3731,6.0,0.03
272,2,3732,2.0,0.03


In [14]:
df_weight.describe()

Unnamed: 0,shop_id,item_id,item_cnt,weight
count,1246.0,1246.0,1246.0,1246.0
mean,31.29,6830.98,6.6,0.04
std,17.54,5125.75,25.82,0.06
min,2.0,1495.0,-1.0,-0.5
25%,16.0,3343.0,1.0,0.02
50%,31.0,5672.0,2.0,0.02
75%,47.0,7893.0,4.0,0.05
max,59.0,22088.0,473.0,1.0


In [15]:
import pickle
df_weight.to_pickle('weight.pkl')

In [16]:
df_weight.loc[df_weight['item_id']==3340]

Unnamed: 0,shop_id,item_id,item_cnt,weight
153,2,3340,1.0,0.25
154,10,3340,1.0,0.25
155,26,3340,1.0,0.25
156,53,3340,1.0,0.25
