In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Set random seed for reproducibility
np.random.seed(42)

# Visualization settings
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

In [2]:
import os
import pandas as pd

def load_data():
    base_dir = os.getcwd()  # current working directory in notebook
    data_dir = os.path.abspath(os.path.join(base_dir, "..", "data", "raw"))
    print("Looking for data in:", data_dir)

    train = pd.read_csv(os.path.join(data_dir, "train.csv"), parse_dates=["date"])
    test = pd.read_csv(os.path.join(data_dir, "test.csv"), parse_dates=["date"])
    stores = pd.read_csv(os.path.join(data_dir, "stores.csv"))
    oil = pd.read_csv(os.path.join(data_dir, "oil.csv"), parse_dates=["date"])
    holidays = pd.read_csv(os.path.join(data_dir, "holidays_events.csv"), parse_dates=["date"])
    transactions = pd.read_csv(os.path.join(data_dir, "transactions.csv"), parse_dates=["date"])

    return train, test, stores, oil, holidays, transactions

train, test, stores, oil, holidays, transactions = load_data()


Looking for data in: d:\ds-toolkit\DS-toolkit\data\raw


Merge with stores

In [3]:
train = train.merge(stores, on='store_nbr', how='left')
test = test.merge(stores, on='store_nbr', how='left')
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13


 Merge with transactions¶


In [4]:
train = train.merge(transactions, on=['date', 'store_nbr'], how='left')
test = test.merge(transactions, on=['date', 'store_nbr'], how='left')
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,transactions
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,


Merge with oil (interpolate missing values)

In [5]:
oil = oil.set_index('date').resample('D').mean().interpolate()
oil.reset_index(inplace=True)
train = train.merge(oil, on='date', how='left')
test = test.merge(oil, on='date', how='left')
train.head()


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,transactions,dcoilwtico
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,,
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,,
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,,
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,,
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,,


 Merge holidays: binary features

In [6]:
def process_holidays(df):
    df = df.copy()
    df['is_holiday'] = 1
    df = df[['date', 'is_holiday', 'type', 'locale']]
    df = df.drop_duplicates('date')
    df = df.pivot_table(index='date', 
                        values='is_holiday', 
                        aggfunc='max').reset_index()
    return df

holiday_features = process_holidays(holidays)
train = train.merge(holiday_features, on='date', how='left')
test = test.merge(holiday_features, on='date', how='left')


train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,transactions,dcoilwtico,is_holiday
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,,,1.0
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,,,1.0
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,,,1.0
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,,,1.0
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,,,1.0


filling Missing values 

In [7]:
print(train['is_holiday'].isna().sum())

train['is_holiday'] = train['is_holiday'].fillna(0)
print(train['is_holiday'].isna().sum())

2551824
0


In [8]:
print(test['is_holiday'].isna().sum())

test['is_holiday'] = test['is_holiday'].fillna(0)
print(test['is_holiday'].isna().sum())

26730
0


feature engineering


In [9]:
for df in [train, test]:
    df['day'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

In [10]:
train.head()


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,transactions,dcoilwtico,is_holiday,day,month,year,day_of_week,is_weekend
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,,,1.0,1,1,2013,1,0
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,,,1.0,1,1,2013,1,0
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,,,1.0,1,1,2013,1,0
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,,,1.0,1,1,2013,1,0
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,,,1.0,1,1,2013,1,0


In [11]:
test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion,city,state,type,cluster,transactions,dcoilwtico,is_holiday,day,month,year,day_of_week,is_weekend
0,3000888,2017-08-16,1,AUTOMOTIVE,0,Quito,Pichincha,D,13,,46.8,0.0,16,8,2017,2,0
1,3000889,2017-08-16,1,BABY CARE,0,Quito,Pichincha,D,13,,46.8,0.0,16,8,2017,2,0
2,3000890,2017-08-16,1,BEAUTY,2,Quito,Pichincha,D,13,,46.8,0.0,16,8,2017,2,0
3,3000891,2017-08-16,1,BEVERAGES,20,Quito,Pichincha,D,13,,46.8,0.0,16,8,2017,2,0
4,3000892,2017-08-16,1,BOOKS,0,Quito,Pichincha,D,13,,46.8,0.0,16,8,2017,2,0


label encoding for categorical features

In [None]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['family', 'city', 'state', 'type', 'cluster']


for col in cat_cols:
    le = LabelEncoder()
    train[col + '_encoded'] = le.fit_transform(train[col])
    test[col + '_encoded'] = le.transform(test[col])

In [13]:
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,...,day,month,year,day_of_week,is_weekend,family_encoded,city_encoded,state_encoded,type_encoded,cluster_encoded
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,...,1,1,2013,1,0,0,18,12,3,12
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,...,1,1,2013,1,0,1,18,12,3,12
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,...,1,1,2013,1,0,2,18,12,3,12
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,...,1,1,2013,1,0,3,18,12,3,12
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,...,1,1,2013,1,0,4,18,12,3,12
