# Pre-Processing

## 1. Imports and Load Data

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read the cleaned dataset from disk and print a column/dtype summary.
data_path = '../Data/data_cleaned.csv'
df = pd.read_csv(data_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5853780 entries, 0 to 5853779
Data columns (total 22 columns):
 #   Column        Dtype  
---  ------        -----  
 0   id            object 
 1   item_id       object 
 2   dept_id_x     object 
 3   cat_id        object 
 4   store_id      object 
 5   state_id      object 
 6   d             object 
 7   sales         int64  
 8   date          object 
 9   wm_yr_wk      int64  
 10  weekday       object 
 11  wday          int64  
 12  month         int64  
 13  year          int64  
 14  event_name_1  object 
 15  event_type_1  object 
 16  event_name_2  object 
 17  event_type_2  object 
 18  snap_CA       int64  
 19  snap_TX       int64  
 20  snap_WI       int64  
 21  sell_price    float64
dtypes: float64(1), int64(8), object(13)
memory usage: 982.5+ MB


## 2. Train-Test Split

In [3]:
# Hold out 30% of the rows as the test set; `sales` is the target and every
# other column is a feature. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['sales']),
    df['sales'],
    test_size=0.3,
    random_state=47,
)

In [4]:
# Give each split a fresh 0..n-1 row index. reset_index() keeps the previous
# row labels in a new `index` column, which is removed later together with the
# date columns.
# Each frame must be reset from itself: resetting X_test from X_train would
# silently replace the test set with a copy of the training set.
X_train = X_train.reset_index()
X_test = X_test.reset_index()

## 3. Date Related Features

Date-related features: `d`, `date`, `wm_yr_wk`, `weekday`, `wday`, `month`, `year`

In [5]:
# `d` is a day counter stored as a 'd_##' string (days since the starting date).
# Strip the 'd_' prefix and keep the number as int64 in both splits.

for frame in (X_train, X_test):
    frame['d'] = frame['d'].apply(lambda label: label.split('_')[1]).astype('int64')

In [6]:
# wday is the id of the weekday, starting from Saturday.
# add weekend feature : 0 - weekday, 1 - weekend

def weekend(x):
    """Return 1 when the weekday id `x` is 1 or 2, otherwise 0.

    The weekday ids start from Saturday, so 1 and 2 are the weekend days.
    """
    return int(x in (1, 2))

In [7]:
# Derive the binary `weekend` flag from `wday` in both splits.
for frame in (X_train, X_test):
    frame['weekend'] = frame['wday'].apply(weekend)

In [8]:
# Remove `d`, `date`, `wm_yr_wk`, `weekday` and the leftover `index` column
# from both `X_train` and `X_test`.

date_list = ['d', 'date', 'wm_yr_wk', 'weekday', 'index']

for frame in (X_train, X_test):
    frame.drop(columns=date_list, inplace=True)

## 4. Encode Categorical Features

Categorical features: `id`, `item_id`, `dept_id_x`, `cat_id`, `store_id`, `state_id`, `event_name_1`, `event_type_1`, `event_name_2`, `event_type_2` (note that `id` is not included in either of the encoding groups defined below).


In [9]:
# The categorical columns are split into two groups that are encoded
# differently: cat1 is one-hot encoded, cat2 is label encoded.

cat1 = ['dept_id_x', 'cat_id', 'store_id', 'state_id']
cat2 = [
    'item_id',
    'event_name_1', 'event_type_1',
    'event_name_2', 'event_type_2',
]

In [10]:
# One-hot encode the columns listed in cat1.
ohe = OneHotEncoder(categories='auto')

# Learn the categories from the training rows only, then reuse the fitted
# encoder on the test rows. Re-fitting on the test set could yield a different
# set or order of columns than the labels taken from the encoder below.
feature_arr_train = ohe.fit_transform(X_train[cat1]).toarray()
feature_arr_test = ohe.transform(X_test[cat1]).toarray()

# Newer scikit-learn releases replaced get_feature_names with
# get_feature_names_out; use whichever this installation provides.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_labels = ohe.get_feature_names_out(cat1)
else:
    ohe_labels = ohe.get_feature_names(cat1)

# Build each indicator frame on its target's own index so the column
# assignment lines up row by row instead of relying on a default RangeIndex.
X_train[ohe_labels] = pd.DataFrame(feature_arr_train, columns=ohe_labels,
                                   index=X_train.index)
X_test[ohe_labels] = pd.DataFrame(feature_arr_test, columns=ohe_labels,
                                  index=X_test.index)

In [15]:
# add holiday feature : 0 - no_event, 1 - holiday

def holiday(x):
    """Return 0 when `x` is the 'no_event' placeholder, otherwise 1."""
    return int(x != 'no_event')

In [16]:
# Derive the binary `holiday` flag from `event_name_1` in both splits.
for frame in (X_train, X_test):
    frame['holiday'] = frame['event_name_1'].apply(holiday)

In [17]:
# Label-encode the columns listed in cat2.
# One encoder per column is fitted on the training rows and then reused on the
# test rows, so the same category gets the same integer code in both splits.
# The test frame must be encoded from its own values, never from X_train.
# LabelEncoder.transform raises ValueError if the test split contains a value
# that was not present in the training split.
label_encoders = {}
for col in cat2:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])
    label_encoders[col] = encoder  # kept so the mapping can be reused or inverted

In [20]:
# Print the column/dtype summary of the training frame after preprocessing.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4097646 entries, 0 to 4097645
Data columns (total 37 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   item_id                int64  
 1   wday                   int64  
 2   month                  int64  
 3   year                   int64  
 4   event_name_1           int64  
 5   event_type_1           int64  
 6   event_name_2           int64  
 7   event_type_2           int64  
 8   snap_CA                int64  
 9   snap_TX                int64  
 10  snap_WI                int64  
 11  sell_price             float64
 12  weekend                int64  
 13  dept_id_x_FOODS_1      float64
 14  dept_id_x_FOODS_2      float64
 15  dept_id_x_FOODS_3      float64
 16  dept_id_x_HOBBIES_1    float64
 17  dept_id_x_HOBBIES_2    float64
 18  dept_id_x_HOUSEHOLD_1  float64
 19  dept_id_x_HOUSEHOLD_2  float64
 20  cat_id_FOODS           float64
 21  cat_id_HOBBIES         float64
 22  cat_id_HOUSEHOLD  