In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Load scripts from parent path
import sys, os
sys.path.insert(0, os.path.abspath('..'))

## Load Data

In [5]:
from scripts.processing import load_train_data, process_data, add_store_info, add_week_month_info

train_raw = load_train_data()
train = add_week_month_info(train_raw)
train = process_data(train)
train = add_store_info(train)

# train.loc[:, 'pol'] = train.loc[:, 'Store_Sales_mean'] * train.loc[:, 'Store_Customers_mean']
# train.loc[:, 'pol'] = train.loc[:, 'Store_Sales_mean']**2

train.head()

  train.loc[:,'week'] = train.loc[:,'Date'].dt.week


Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,week,month,Store_Sales_mean,Store_Customers_mean
0,353.0,2.0,3139.0,1.0,0.0,0,1.0,1,1,4139.474576,1153.783333
1,335.0,2.0,2401.0,1.0,0.0,0,1.0,1,1,12845.896552,2384.271186
2,512.0,2.0,2646.0,1.0,0.0,0,1.0,1,1,3725.649123,888.627119
3,494.0,2.0,3113.0,1.0,0.0,0,1.0,1,1,7079.15,1010.583333
4,530.0,2.0,2907.0,1.0,0.0,0,1.0,1,1,2260.783333,333.610169


## Prepare train/test data

In [6]:
X_train = train.copy(deep=True).drop(columns=["Sales"])
y_train = train.loc[:, "Sales"]

## Simple Models

In [7]:
def metric(preds, actuals):    
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

### Predict mean

In [18]:
lazy_predictor = pd.DataFrame(y_train.copy())

lazy_predictor.loc[:, 'y_pred'] = lazy_predictor.mean()['Sales']
lazy_predictor.head()

Unnamed: 0,Sales,y_pred
0,3139.0,6837.740902
1,2401.0,6837.740902
2,2646.0,6837.740902
3,3113.0,6837.740902
4,2907.0,6837.740902


In [6]:
metric(lazy_predictor.loc[:, 'y_pred'].values, lazy_predictor.loc[:, 'Sales'].values)

61.73132119104066

## Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Drop date column too, makes no sense for random forest
X_train = train.copy(deep=True).drop(columns=["Sales"])
y_train = train.loc[:, "Sales"].values

reg = RandomForestRegressor(random_state=42)

reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)

metric(y_train, y_pred)

6.805256489846616

In [6]:
X_train

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,Store_Sales_mean,Store_Customers_mean
0,353.0,2.0,1.0,0.0,0,1.0,5070.941176,1342.257299
1,335.0,2.0,1.0,0.0,0,1.0,12972.122486,2397.374088
2,512.0,2.0,1.0,0.0,0,1.0,5048.029528,1204.784884
3,494.0,2.0,1.0,0.0,0,1.0,7536.871324,1074.582397
4,530.0,2.0,1.0,0.0,0,1.0,4253.464945,701.737523
...,...,...,...,...,...,...,...,...
440122,748.0,4.0,1.0,1.0,3,1.0,4737.405607,406.403704
440123,743.0,4.0,1.0,1.0,3,1.0,3091.609489,346.711645
440124,752.0,4.0,1.0,1.0,3,1.0,4303.022099,386.406654
440125,755.0,4.0,1.0,1.0,3,1.0,8423.229656,730.222018


## Linear Regression

In [14]:
from sklearn.linear_model import LinearRegression

# Drop date column too, makes no sense for random forest
X_train = train.copy(deep=True).drop(columns=["Sales", "Date", "Store"])
y_train = train.loc[:, "Sales"].values

reg = LinearRegression()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_train)

metric(y_train, y_pred)

23.617025941249114

In [10]:
X_train

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday
27,353.0,2.0,1.0,0.0,0,1.0
115,335.0,2.0,1.0,0.0,0,1.0
147,512.0,2.0,1.0,0.0,0,1.0
162,494.0,2.0,1.0,0.0,0,1.0
199,530.0,2.0,1.0,0.0,0,1.0
...,...,...,...,...,...,...
637766,748.0,4.0,1.0,1.0,3,1.0
637768,743.0,4.0,1.0,1.0,3,1.0
637769,752.0,4.0,1.0,1.0,3,1.0
637772,755.0,4.0,1.0,1.0,3,1.0
