In [1]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import datetime as dt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.distributions.empirical_distribution import ECDF
from datetime import datetime as dt


warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Widen pandas' display limits so large frames are not truncated when shown:
# up to 500 rows, up to 500 columns, and 1000 characters per output line.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [3]:
# Test set: the `Date` column becomes the index and is parsed as datetimes.
test_df = pd.read_csv(
    "./data/test.csv",
    index_col='Date',
    parse_dates=True,
    low_memory=False,
)

# Sample submission file, read with pandas' defaults.
sample_df = pd.read_csv("./data/sample_submission.csv")

In [4]:
def preprocessing(data_dir="./data"):
    """Load the train and store CSVs, add date features, and merge them.

    Parameters
    ----------
    data_dir : str or path-like, default "./data"
        Directory that contains ``train.csv`` and ``store.csv``.

    Returns
    -------
    pandas.DataFrame
        Inner join (on ``Store``) of the filtered training rows with the
        store table. Rows where ``Open == 0`` or ``Sales == 0`` are dropped
        before the join. Missing ``CompetitionDistance`` values are replaced
        by the column median and every other missing store value by 0.

    Notes
    -----
    The merge is performed on the ``Store`` column, so the ``Date`` index of
    the training data is not carried over: the result has a fresh integer
    index and no ``Date`` column.
    """
    # Train dataset, indexed by the parsed `Date` column
    train_df = pd.read_csv(f"{data_dir}/train.csv", parse_dates=True, low_memory=False, index_col='Date')
    # Store metadata
    store_df = pd.read_csv(f"{data_dir}/store.csv", low_memory=False)

    # Add date features derived from the DatetimeIndex
    dates = train_df.index
    train_df['Year'] = dates.year
    train_df['Month'] = dates.month
    train_df['Day'] = dates.day
    train_df['DayName'] = dates.day_name()
    # `DatetimeIndex.weekofyear` was removed in pandas 2.0; `isocalendar().week`
    # is its replacement. Cast to a plain int64 array so the column keeps the
    # dtype `weekofyear` produced and no index alignment is attempted on the
    # (non-unique) Date index.
    train_df['WkofYr'] = dates.isocalendar().week.astype('int64').to_numpy()
    train_df['DayofYr'] = dates.day_of_year
    train_df['DayofWk'] = dates.day_of_week

    # Sales per customer (rows with zero customers yield NaN/inf here; rows
    # with zero sales are removed by the filter below)
    train_df['SalesPerCustomer'] = train_df.Sales / train_df.Customers

    # Closed stores and days which didn't have any sales are not included in forecasts
    train_df = train_df[(train_df["Open"] != 0) & (train_df['Sales'] != 0)]

    # Fill missing competition distances with the median (skewed distribution).
    # Assign the result back instead of calling `fillna(inplace=True)` on the
    # selected column: that chained form warns on pandas 2.x and has no effect
    # under Copy-on-Write.
    median_distance = store_df['CompetitionDistance'].median()
    store_df['CompetitionDistance'] = store_df['CompetitionDistance'].fillna(median_distance)
    # Replace every remaining NaN in the store table by 0
    store_df = store_df.fillna(0)

    # Attach the store attributes to each remaining training row
    merged_df = pd.merge(train_df, store_df, how='inner', on='Store')
    return merged_df

In [5]:
# Build the merged train/store frame, then preview its first five rows.
train_store_df = preprocessing()
train_store_df.head(n=5)

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,DayName,WkofYr,DayofYr,DayofWk,SalesPerCustomer,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,5263,555,1,1,0,1,2015,7,31,Friday,31,212,4,9.482883,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,1,4,5020,546,1,1,0,1,2015,7,30,Thursday,31,211,3,9.194139,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,1,3,4782,523,1,1,0,1,2015,7,29,Wednesday,31,210,2,9.143403,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,1,2,5011,560,1,1,0,1,2015,7,28,Tuesday,31,209,1,8.948214,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,1,1,6102,612,1,1,0,1,2015,7,27,Monday,31,208,0,9.970588,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
