# Part 1, Features

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import pickle
import time
import gc
import sys
sys.version_info

sys.version_info(major=3, minor=6, micro=4, releaselevel='final', serial=0)

In [2]:
items = pd.read_csv('items.csv')
shops = pd.read_csv('shops.csv')
cats = pd.read_csv('item_categories.csv')
train = pd.read_csv('sales_train.csv.gz', compression='gzip')
test  = pd.read_csv('test.csv.gz', compression='gzip').set_index('ID')

## Outliers

There are items with strange prices and sales. After detailed exploration I decided to remove items with price > 100000 and sales > 1001 (1000 is ok).

plt.figure(figsize=(10,4))
plt.xlim(-100, 3000)
sns.boxplot(x=train.item_cnt_day)

plt.figure(figsize=(10,4))
plt.xlim(train.item_price.min(), train.item_price.max()*1.1)
sns.boxplot(x=train.item_price)

In [3]:
train = train[train.item_price<100000]
train = train[train.item_cnt_day<1001]

There is one item with price below zero. Fill it with median.

In [4]:
median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
train.loc[train.item_price<0, 'item_price'] = median

Several shops are duplicates of each other (according to its name). Fix train and test set.

In [5]:
# Якутск Орджоникидзе, 56
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

## Shops/Cats/Items preprocessing
Observations:
* Each shop_name starts with the city name.
* Each category contains type and subtype in its name.

In [6]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

cats['split'] = cats['item_category_name'].str.split('-')
cats['type'] = cats['split'].map(lambda x: x[0].strip())
cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
# if subtype is nan then type
cats['subtype'] = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id','type_code', 'subtype_code']]

TF-IDF features from item_name. 

In [7]:
tfidf = pd.DataFrame(TfidfVectorizer(max_features=12).fit_transform(items['item_name']).toarray(), dtype='float16')
tfidf.columns = ['tfidf_'+str(id+1) for id in range(12)]
tfidf.index.names = ['item_id']
tfidf.reset_index(inplace=True)

items.drop(['item_name'], axis=1, inplace=True)

## Monthly sales
Test set is a product of some shops and items within 34 month. There are 5100 items * 42 shops = 214200 pairs. 363 items are new compared to train. Hence, for the most of the items in the test set target value should be zero. 
In the other hand train set contains only pairs which were sold or returned in the past. Tha main idea is to calculate monthly sales and <b>extend it with zero sales</b> for each unique pair within the month. This way train data will be similar to test data.

In [8]:
len(list(set(test.item_id) - set(test.item_id).intersection(set(train.item_id)))), len(list(set(test.item_id))), len(test)

(363, 5100, 214200)

In [9]:
ts = time.time()
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
    
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)
time.time() - ts

13.66191291809082

Aggregate train set by shop/item pairs to calculate target aggreagates. <b>Clip(0,20) target value.</b>

<i>I use floats instead of ints for item_cnt_month and orders to avoid downcasting it after concatination with the test set later. If it would be int16, after concatination with NaN values it becomes int64, but foat16 becomes float16 even with NaNs.</i>

In [10]:
train['revenue'] = train['item_price'] *  train['item_cnt_day']

In [11]:
ts = time.time()
train_agg = train.groupby(['date_block_num','shop_id','item_id']).agg({
    'item_cnt_day': ['sum', 'count'],
    'revenue': ['sum']
})
train_agg.columns = ['item_cnt_month', 'orders', 'revenue']
train_agg.reset_index(inplace=True)

matrix = pd.merge(matrix, train_agg, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20) # NB clip target here
                                .astype(np.float16))
matrix['orders'] = matrix['orders'].fillna(0).astype(np.float16)
matrix['revenue'] = matrix['revenue'].fillna(0).astype(np.float32)
time.time() - ts

8.315948247909546

## Test set
To use time tricks append test pairs to the matrix.

In [12]:
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

In [13]:
ts = time.time()
matrix = pd.concat([matrix, test], ignore_index=True, keys=cols)
time.time() - ts

0.2279665470123291

## Shops/Items/Cats features

In [14]:
ts = time.time()
matrix = pd.merge(matrix, shops, on=['shop_id'], how='left')
matrix = pd.merge(matrix, items, on=['item_id'], how='left')
matrix = pd.merge(matrix, cats, on=['item_category_id'], how='left')
matrix['city_code'] = matrix['city_code'].astype(np.int8)
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['type_code'] = matrix['type_code'].astype(np.int8)
matrix['subtype_code'] = matrix['subtype_code'].astype(np.int8)
matrix.fillna(0, inplace=True)
time.time() - ts

8.23995041847229

## Traget lags

In [15]:
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

In [16]:
ts = time.time()
matrix = lag_feature(matrix, [1,2,3,4,6,12], 'item_cnt_month')
matrix = lag_feature(matrix, [1,2,3], 'orders')
matrix = lag_feature(matrix, [1,2,3], 'revenue')
time.time() - ts

130.303950548172

## Mean encoded features

In [17]:
ts = time.time()
group = matrix.groupby(['date_block_num']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['mean'],
    'revenue': ['mean'],
})
group.columns = [ 'date_avg_item_cnt', 'date_avg_orders', 'date_avg_revenue' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num'], how='left')
matrix['date_avg_item_cnt'] = matrix['date_avg_item_cnt'].astype(np.float16)
matrix['date_avg_orders'] = matrix['date_avg_orders'].astype(np.float16)
matrix['date_avg_revenue'] = matrix['date_avg_revenue'].astype(np.float32)

matrix = lag_feature(matrix, [1], 'date_avg_item_cnt')
matrix = lag_feature(matrix, [1], 'date_avg_orders')
matrix = lag_feature(matrix, [1], 'date_avg_revenue')

matrix.drop(['date_avg_item_cnt','date_avg_orders','date_avg_revenue'], axis=1, inplace=True)

time.time() - ts

63.035012006759644

In [18]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'item_id']).agg({
    'item_cnt_month': ['mean' ],
    'orders': ['sum']
})
group.columns = [ 'date_item_avg_item_cnt', 'date_item_sum_orders' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_cnt'] = matrix['date_item_avg_item_cnt'].astype(np.float16)
matrix['date_item_sum_orders'] = matrix['date_item_sum_orders'].astype(np.float16)

matrix = lag_feature(matrix, [1,2,3,4,6,12], 'date_item_avg_item_cnt')
matrix = lag_feature(matrix, [1,2,3,4,6,12], 'date_item_sum_orders')

matrix.drop(['date_item_avg_item_cnt','date_item_sum_orders'], axis=1, inplace=True)

time.time() - ts

179.89101552963257

In [19]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'shop_id']).agg({
    'item_cnt_month': ['mean' ],
    'orders': ['sum']
})
group.columns = [ 'date_shop_avg_item_cnt', 'date_shop_sum_orders' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_avg_item_cnt'] = matrix['date_shop_avg_item_cnt'].astype(np.float16)
matrix['date_shop_sum_orders'] = matrix['date_shop_sum_orders'].astype(np.float16)

matrix = lag_feature(matrix, [1,2,3,4,6,12], 'date_shop_avg_item_cnt')
matrix = lag_feature(matrix, [1,2,3,4,6,12], 'date_shop_sum_orders')

matrix.drop(['date_shop_avg_item_cnt','date_shop_sum_orders'], axis=1, inplace=True)

time.time() - ts

186.1190049648285

In [20]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'item_category_id']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum']
})
group.columns = [ 'date_cat_avg_item_cnt', 'date_cat_sum_orders' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_category_id'], how='left')
matrix['date_cat_avg_item_cnt'] = matrix['date_cat_avg_item_cnt'].astype(np.float16)
matrix['date_cat_sum_orders'] = matrix['date_cat_sum_orders'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'date_cat_avg_item_cnt')
matrix = lag_feature(matrix, [1], 'date_cat_sum_orders')

matrix.drop(['date_cat_avg_item_cnt','date_cat_sum_orders'], axis=1, inplace=True)
time.time() - ts

68.39800071716309

In [21]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum']
})
group.columns = [ 'date_shop_cat_avg_item_cnt', 'date_shop_cat_sum_orders' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')
matrix['date_shop_cat_avg_item_cnt'] = matrix['date_shop_cat_avg_item_cnt'].astype(np.float16)
matrix['date_shop_cat_sum_orders'] = matrix['date_shop_cat_sum_orders'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'date_shop_cat_avg_item_cnt')
matrix = lag_feature(matrix, [1], 'date_shop_cat_sum_orders')

matrix.drop(['date_shop_cat_avg_item_cnt','date_shop_cat_sum_orders'], axis=1, inplace=True)
time.time() - ts

74.07460403442383

In [22]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'city_code']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum']
})
group.columns = [ 'date_city_avg_item_cnt', 'date_city_sum_orders' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'city_code'], how='left')
matrix['date_city_avg_item_cnt'] = matrix['date_city_avg_item_cnt'].astype(np.float16)
matrix['date_city_sum_orders'] = matrix['date_city_sum_orders'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'date_city_avg_item_cnt')
matrix = lag_feature(matrix, [1], 'date_city_sum_orders')

matrix.drop(['date_city_avg_item_cnt','date_city_sum_orders'], axis=1, inplace=True)
time.time() - ts

77.7345016002655

In [23]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'item_id', 'city_code']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum']
})
group.columns = [ 'date_item_city_avg_item_cnt', 'date_item_city_sum_orders' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'item_id', 'city_code'], how='left')
matrix['date_item_city_avg_item_cnt'] = matrix['date_item_city_avg_item_cnt'].astype(np.float16)
matrix['date_item_city_sum_orders'] = matrix['date_item_city_sum_orders'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'date_item_city_avg_item_cnt')
matrix = lag_feature(matrix, [1], 'date_item_city_sum_orders')

matrix.drop(['date_item_city_avg_item_cnt','date_item_city_sum_orders'], axis=1, inplace=True)
time.time() - ts

98.05036401748657

In [24]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'type_code']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum']
})
group.columns = [ 'date_type_avg_item_cnt', 'date_type_sum_orders' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'type_code'], how='left')
matrix['date_type_avg_item_cnt'] = matrix['date_type_avg_item_cnt'].astype(np.float16)
matrix['date_type_sum_orders'] = matrix['date_type_sum_orders'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'date_type_avg_item_cnt')
matrix = lag_feature(matrix, [1], 'date_type_sum_orders')

matrix.drop(['date_type_avg_item_cnt','date_type_sum_orders'], axis=1, inplace=True)
time.time() - ts

78.71849608421326

In [25]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'subtype_code']).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum']
})
group.columns = [ 'date_subtype_avg_item_cnt', 'date_subtype_sum_orders' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'subtype_code'], how='left')
matrix['date_subtype_avg_item_cnt'] = matrix['date_subtype_avg_item_cnt'].astype(np.float16)
matrix['date_subtype_sum_orders'] = matrix['date_subtype_sum_orders'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'date_subtype_avg_item_cnt')
matrix = lag_feature(matrix, [1], 'date_subtype_sum_orders')

matrix.drop(['date_subtype_avg_item_cnt','date_subtype_sum_orders'], axis=1, inplace=True)
time.time() - ts

87.54944252967834

## Trend features

Price trend for the last six months.

In [26]:
ts = time.time()
group = train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['item_id'], how='left')
matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)

group = train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

lags = [1,2,3,4,5,6]
matrix = lag_feature(matrix, lags, 'date_item_avg_item_price')

for i in lags:
    matrix['delta_price_lag_'+str(i)] = \
        (matrix['date_item_avg_item_price_lag_'+str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if row['delta_price_lag_'+str(i)]:
            return row['delta_price_lag_'+str(i)]
    return 0
    
matrix['delta_price_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)
matrix['delta_price_lag'].fillna(0, inplace=True)

# https://stackoverflow.com/questions/31828240/first-non-null-value-per-row-from-a-list-of-pandas-columns/31828559
# matrix['price_trend'] = matrix[['delta_price_lag_1','delta_price_lag_2','delta_price_lag_3']].bfill(axis=1).iloc[:, 0]
# Invalid dtype for backfill_2d [float16]

fetures_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    fetures_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    fetures_to_drop += ['delta_price_lag_'+str(i)]

matrix.drop(fetures_to_drop, axis=1, inplace=True)

time.time() - ts

401.1923325061798

In [27]:
ts = time.time()
group = train.groupby(['shop_id']).agg({'revenue': ['mean']}) #TODO add target mean
group.columns = ['shop_avg_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['shop_id'], how='left')
matrix['shop_avg_revenue'] = matrix['shop_avg_revenue'].astype(np.float16)

group = train.groupby(['date_block_num','shop_id']).agg({'revenue': ['mean']})
group.columns = ['date_shop_avg_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_avg_revenue'] = matrix['date_shop_avg_revenue'].astype(np.float16)

lags = [1,2,3]
matrix = lag_feature(matrix, lags, 'date_shop_avg_revenue')

for i in lags:
    matrix['delta_revenue_lag_'+str(i)] = \
        (matrix['date_shop_avg_revenue_lag_'+str(i)] - matrix['shop_avg_revenue']) / matrix['shop_avg_revenue']

def select_trend(row):
    for i in lags:
        if row['delta_revenue_lag_'+str(i)]:
            return row['delta_revenue_lag_'+str(i)]
    return 0
    
matrix['delta_revenue_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_revenue_lag'] = matrix['delta_revenue_lag'].astype(np.float16)
matrix['delta_revenue_lag'].fillna(0, inplace=True)

fetures_to_drop = ['shop_avg_revenue', 'date_shop_avg_revenue']
for i in lags:
    fetures_to_drop += ['date_shop_avg_revenue_lag_'+str(i)]
    fetures_to_drop += ['delta_revenue_lag_'+str(i)]

matrix.drop(fetures_to_drop, axis=1, inplace=True)

time.time() - ts

291.65957975387573

## Special features

Month (0 - Jan, 11 - Dec)

In [28]:
matrix['month'] = matrix['date_block_num'] % 12

Number of days in a month. There are no leap years.

In [29]:
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days).astype(np.int8)

Months since last sale for each shop/item pair and for item only. I use programing approach.

Create hash table with key equals to {shop_id,item_id} and value equals to date_block_num. Iterate data from the top. Foreach row if {row.shop_id,item_id} is not present in the table, then add to the table and set its value to row.date_blocl_num. if hash table contains key, then calculate the difference beteween cached value and row.date_block_num.

In [30]:
ts = time.time()
cache = {}
matrix['item_shop_last_sale'] = -1
matrix['item_shop_last_sale'] = matrix['item_shop_last_sale'].astype(np.int8)
for idx, row in matrix.iterrows():    
    key = str(row.item_id)+' '+str(row.shop_id)
    if key not in cache:
        if row.item_cnt_month!=0:
            cache[key] = row.date_block_num
    else:
        last_date_block_num = cache[key]
        matrix.at[idx, 'item_shop_last_sale'] = row.date_block_num - last_date_block_num
        cache[key] = row.date_block_num         
time.time() - ts

884.6599349975586

In [31]:
ts = time.time()
cache = {}
matrix['item_last_sale'] = -1
matrix['item_last_sale'] = matrix['item_last_sale'].astype(np.int8)
for idx, row in matrix.iterrows():    
    key = row.item_id
    if key not in cache:
        if row.item_cnt_month!=0:
            cache[key] = row.date_block_num
    else:
        last_date_block_num = cache[key]
        if row.date_block_num>last_date_block_num:
            matrix.at[idx, 'item_last_sale'] = row.date_block_num - last_date_block_num
            cache[key] = row.date_block_num         
time.time() - ts

457.171103477478

Months since first sale for each shop/item pair and for item only.

In [32]:
ts = time.time()
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')
time.time() - ts

3.8419981002807617

## TF-IDF

In [33]:
matrix = pd.merge(matrix, tfidf, on='item_id', how='left')

## Final preparations

Because of the using 12 as lag value drop first 12 months. Also drop all the columns with this month calculated values (other words which can not be calcucated for the test set).

In [34]:
ts = time.time()
matrix = matrix[matrix.date_block_num > 11]
matrix.drop(['orders', 'revenue'], axis=1, inplace=True)
time.time() - ts

8.449002981185913

Producing lags brings a lot of nulls.

In [35]:
'''
ts = time.time()
def fill_na(df):
    for col in df.columns:
        if ('_lag_' in col) & (df[col].isnull().any()):
            if ('item_cnt' in col):
                df[col].fillna(0, inplace=True)
            if ('orders' in col):
                df[col].fillna(0, inplace=True)
            #if ('revenue' in col):
            #    df[col].fillna(df[col].median(), inplace=True)            
    return df

matrix = fill_na(matrix)
time.time() - ts
'''

"\nts = time.time()\ndef fill_na(df):\n    for col in df.columns:\n        if ('_lag_' in col) & (df[col].isnull().any()):\n            if ('item_cnt' in col):\n                df[col].fillna(0, inplace=True)\n            if ('orders' in col):\n                df[col].fillna(0, inplace=True)\n            #if ('revenue' in col):\n            #    df[col].fillna(df[col].median(), inplace=True)            \n    return df\n\nmatrix = fill_na(matrix)\ntime.time() - ts\n"

In [36]:
matrix.columns

Index(['date_block_num', 'item_cnt_month', 'item_id', 'shop_id', 'city_code',
       'item_category_id', 'type_code', 'subtype_code', 'item_cnt_month_lag_1',
       'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'item_cnt_month_lag_4',
       'item_cnt_month_lag_6', 'item_cnt_month_lag_12', 'orders_lag_1',
       'orders_lag_2', 'orders_lag_3', 'revenue_lag_1', 'revenue_lag_2',
       'revenue_lag_3', 'date_avg_item_cnt_lag_1', 'date_avg_orders_lag_1',
       'date_avg_revenue_lag_1', 'date_item_avg_item_cnt_lag_1',
       'date_item_avg_item_cnt_lag_2', 'date_item_avg_item_cnt_lag_3',
       'date_item_avg_item_cnt_lag_4', 'date_item_avg_item_cnt_lag_6',
       'date_item_avg_item_cnt_lag_12', 'date_item_sum_orders_lag_1',
       'date_item_sum_orders_lag_2', 'date_item_sum_orders_lag_3',
       'date_item_sum_orders_lag_4', 'date_item_sum_orders_lag_6',
       'date_item_sum_orders_lag_12', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cn

In [37]:
matrix.to_pickle('matrix.pkl')
# matrix = pickle.load(open('matrix.pickle', 'rb'))

In [38]:
matrix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6639294 entries, 4488710 to 11128003
Data columns (total 79 columns):
date_block_num                       int8
item_cnt_month                       float16
item_id                              int16
shop_id                              int8
city_code                            int8
item_category_id                     int8
type_code                            int8
subtype_code                         int8
item_cnt_month_lag_1                 float16
item_cnt_month_lag_2                 float16
item_cnt_month_lag_3                 float16
item_cnt_month_lag_4                 float16
item_cnt_month_lag_6                 float16
item_cnt_month_lag_12                float16
orders_lag_1                         float16
orders_lag_2                         float16
orders_lag_3                         float16
revenue_lag_1                        float32
revenue_lag_2                        float32
revenue_lag_3                        float32
