# **EXPLORATORY DATA ANALYSIS FOR M5**

## **INITIALIZATION**

In [1]:
# record the interpreter version this notebook was executed with
import sys
print(sys.version)

3.7.5 (default, Apr 14 2020, 11:44:53) 
[GCC 7.5.0]


In [2]:
# load required packages
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pylab as pl

import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

#import seaborn as sns
#color = sns.color_palette()
#sns.set_style('darkgrid')

from scipy import stats
from scipy.stats import norm, skew

import gc
import lightgbm as lgb

In [3]:
# Silence warnings through the warnings filter API instead of overwriting
# `warnings.warn` with a no-op: a filter can be undone later with
# `warnings.resetwarnings()` and leaves `warnings.catch_warnings` usable.
import warnings
warnings.filterwarnings('ignore')

# pandas output format: floats with three decimals, at most 50 columns shown
pd.set_option('display.float_format', '{:.3f}'.format)
pd.options.display.max_columns = 50

In [4]:
# show what is in the current working directory
from subprocess import check_output
ls_stdout = check_output(['ls', os.getcwd()])
print(ls_stdout.decode('utf8'))

calendar.csv
M5-Competitors-Guide-Final-10-March-2020.odt
m5-forecasting-eda (copy 1).ipynb
m5-forecasting-eda.ipynb
sales_train_validation.csv
sample_submission.csv
sell_prices.csv
SGB-m5-forecasting.ipynb



## **EXPLORATION**

In [5]:
# dtypes passed to pd.read_csv for calendar.csv, grouped by target dtype
cal_dtypes = {
    **dict.fromkeys(['event_name_1', 'event_name_2',
                     'event_type_1', 'event_type_2', 'weekday'], 'category'),
    **dict.fromkeys(['wm_yr_wk', 'wday', 'month', 'year'], 'int16'),
    **dict.fromkeys(['snap_CA', 'snap_TX', 'snap_WI'], 'float32'),
}
# dtypes passed to pd.read_csv for sell_prices.csv
price_dtypes = dict(store_id='category', item_id='category',
                    wm_yr_wk='int16', sell_price='float32')

In [6]:
# settings shared by the time-series construction cells
h = 28            # length of the forecast horizon, in days
max_lags = 57     # days of history before tr_last that create_df keeps when is_train is False
tr_last = 1913    # number of the last training day column (d_1913)
fday = datetime(year=2016, month=4, day=25)  # first day to forecast
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [7]:
# construct time series
def _encode_categories(frame, cols):
    """Replace each categorical column in `cols` by its int16 category codes, in place.

    The codes are shifted so the smallest code in the column becomes 0
    (pandas assigns -1 to missing values, so a column containing NaN ends
    up with 0 meaning "missing").
    """
    for col in cols:
        frame[col] = frame[col].cat.codes.astype('int16')
        frame[col] -= frame[col].min() # scaling


def create_df(is_train = True, nrows = None, first_day = 1200):
    """Build the long-format sales frame with calendar and price columns merged in.

    Reads 'sell_prices.csv', 'calendar.csv' and 'sales_train_validation.csv'
    from the current working directory and relies on the notebook-level
    settings `price_dtypes`, `cal_dtypes`, `tr_last`, `max_lags` and `h`.

    Parameters
    ----------
    is_train : bool
        True: keep day columns from `first_day` up to `tr_last`.
        False: keep only days from max(tr_last - max_lags, first_day) and
        append `h` all-NaN day columns after `tr_last` to be filled by forecasts.
    nrows : int or None
        Passed to pd.read_csv for the sales file (None reads every series).
    first_day : int
        Earliest day number (the N in 'd_N') to load.

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) with the sales value in 'sales'. Both merges use
        pandas' default inner join, so rows without a matching calendar day
        or price are dropped.
    """
    prices = pd.read_csv('sell_prices.csv', dtype = price_dtypes)
    _encode_categories(prices, [col for col, col_dtype in price_dtypes.items()
                                if col_dtype == 'category'])
    cal = pd.read_csv('calendar.csv', dtype = cal_dtypes)
    cal['date'] = pd.to_datetime(cal['date'])
    _encode_categories(cal, [col for col, col_dtype in cal_dtypes.items()
                             if col_dtype == 'category'])

    start_day = max(1 if is_train else tr_last - max_lags, first_day)
    numcols = [f'd_{day}' for day in range(start_day, tr_last+1)] #sales data rolling window
    catcols = ['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
    dtype = {numcol: 'float32' for numcol in numcols}
    dtype.update({col: 'category' for col in catcols if col != 'id'})
    df = pd.read_csv('sales_train_validation.csv', nrows = nrows,
                     usecols = catcols + numcols, dtype = dtype)
    # NOTE(review): store_id/item_id codes are derived separately from the
    # price file and the sales file, so the price merge below only lines up if
    # both files contain the same set of values -- with `nrows` set this may
    # not hold; verify before using a truncated read.
    _encode_categories(df, [col for col in catcols if col != 'id'])
    if not is_train:
        # placeholder columns for the forecast horizon (was a hard-coded 28)
        for day in range(tr_last + 1, tr_last + h + 1):
            df[f'd_{day}'] = np.nan
    df = pd.melt(df,
                 id_vars = catcols,
                 value_vars = [col for col in df.columns if col.startswith('d_')], # numeric
                 var_name = 'd', # day
                 value_name = 'sales')
    df = df.merge(cal, on='d', copy = False)
    df = df.merge(prices, on = ['store_id', 'item_id', 'wm_yr_wk'], copy=False)
    return df

In [8]:
# create forecast series
def create_fea(df):
    """Add lag, rolling-mean and calendar features to `df` in place.

    Expects the columns 'id', 'sales' and 'date' (datetime). Adds:
      * lag_7, lag_28            -- 'sales' shifted by 7 / 28 rows within each id
      * rmean_{lag}_{win}        -- rolling mean over `win` rows of each lag column
      * wday, week, month, quarter, year, mday -- int16 calendar features;
        a column that already exists is only cast to int16, otherwise it is
        derived from 'date'.

    Shifts and rolling windows work in row order, so this presumably relies on
    the rows of each id being in chronological order -- confirm against caller.

    Returns None; `df` is modified.
    """
    lags = [7, 28]
    lag_cols = [f'lag_{lag}' for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        df[lag_col] = df[['id', 'sales']].groupby('id')['sales'].shift(lag)

    wins = [7, 28] # windows
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            df[f'rmean_{lag}_{win}'] = df[['id', lag_col]].groupby('id')[lag_col].transform(lambda x: x.rolling(win).mean())

    date_features = {
        'wday': 'weekday',
        'week': 'weekofyear',
        'month': 'month',
        'quarter': 'quarter',
        'year': 'year',
        'mday': 'day'}

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in df.columns:
            df[date_feat_name] = df[date_feat_name].astype('int16')
            continue
        date_accessor = df['date'].dt
        if date_feat_func == 'weekofyear' and hasattr(date_accessor, 'isocalendar'):
            # `.dt.weekofyear` was removed in pandas 2.0; the ISO calendar week
            # is the same quantity and is available from pandas 1.1 onwards.
            values = date_accessor.isocalendar().week
        else:
            values = getattr(date_accessor, date_feat_func)
        df[date_feat_name] = values.astype('int16')

In [9]:
%%time
# build the long-format training frame from the sales, calendar and price CSVs
df = create_df(is_train=True, first_day = 500) #skip days to save on memory
df.shape

CPU times: user 29 s, sys: 5.9 s, total: 34.9 s
Wall time: 36.7 s


(37960593, 22)

In [None]:
# column dtypes and memory usage of the merged frame, before features are added
#df.head()
df.info()

In [10]:
%%time
# create_fea adds its feature columns to df in place and returns nothing
create_fea(df)
df.shape

CPU times: user 2min 39s, sys: 10.7 s, total: 2min 50s
Wall time: 2min 52s


(37960593, 31)

In [None]:
# column dtypes and memory usage again, now including the feature columns
#df.head()
df.info()

In [11]:
# discard every row that still has a missing value in any column
df.dropna(axis=0, how='any', inplace=True)
df.shape

(36283643, 31)

In [13]:
# model inputs: categorical feature names, columns to exclude, feature matrix and target
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2',
]
useless_cols = ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']
is_feature_col = ~df.columns.isin(useless_cols)
train_cols = df.columns[is_feature_col]
X_train, y_train = df[train_cols], df['sales']

In [16]:
%%time
np.random.seed(777)
fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace=False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds], label = y_train.loc[train_inds], 
                        categorical_feature = cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                            categorical_feature = cat_feats, free_raw_data=False)

MemoryError: Unable to allocate array with shape (36283643,) and data type int64