In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Load scripts from parent path
import sys, os
sys.path.insert(0, os.path.abspath('..'))

## Load Data

In [2]:
from scripts.processing import load_train_data, process_data, add_store_info, add_week_month_info

train_raw = load_train_data()
train = add_week_month_info(train_raw)
train = process_data(train)
train = add_store_info(train)

# train.loc[:, 'pol'] = train.loc[:, 'Store_Sales_mean'] * train.loc[:, 'Store_Customers_mean']
# train.loc[:, 'pol'] = train.loc[:, 'Store_Sales_mean']**2

train.head()

  train.loc[:,'week'] = train.loc[:,'Date'].dt.week


Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,week,month,Store_Sales_mean,Store_Customers_mean
0,353.0,2.0,3139.0,1.0,0.0,0,1.0,1,1,4139.474576,1153.783333
1,335.0,2.0,2401.0,1.0,0.0,0,1.0,1,1,12845.896552,2384.271186
2,512.0,2.0,2646.0,1.0,0.0,0,1.0,1,1,3725.649123,888.627119
3,494.0,2.0,3113.0,1.0,0.0,0,1.0,1,1,7079.15,1010.583333
4,530.0,2.0,2907.0,1.0,0.0,0,1.0,1,1,2260.783333,333.610169


## Prepare train/test data

In [3]:
X_train = train.copy(deep=True).drop(columns=["Sales"])
y_train = train.loc[:, "Sales"]

## Simple Models

In [4]:
def metric(preds, actuals):    
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

### Predict mean

In [6]:
lazy_predictor = pd.DataFrame(y_train.copy())

lazy_predictor.loc[:, 'y_pred'] = lazy_predictor.mean()['Sales']
lazy_predictor.head()

Unnamed: 0,Sales,y_pred
0,3139.0,6837.740902
1,2401.0,6837.740902
2,2646.0,6837.740902
3,3113.0,6837.740902
4,2907.0,6837.740902


In [6]:
metric(lazy_predictor.loc[:, 'y_pred'].values, lazy_predictor.loc[:, 'Sales'].values)

61.73132119104066

## Random Forest

In [5]:
from sklearn.ensemble import RandomForestRegressor

# Drop date column too, makes no sense for random forest
X_train = train.copy(deep=True).drop(columns=["Sales"])
y_train = train.loc[:, "Sales"].values

reg = RandomForestRegressor(max_depth=10, min_samples_split=100)

reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)


metric(y_train, y_pred)

17.683607599895144

In [12]:
from scripts.pipeline import save_pipeline, load_pipeline

# save_pipeline(reg, name='random_forest_2')
pipe = load_pipeline(name='random_forest_2')


In [13]:
pipe

RandomForestRegressor(max_depth=10, min_samples_split=100)

In [13]:
reg.feature_importances_

array([3.41290182e-03, 5.47534910e-02, 0.00000000e+00, 1.72867304e-01,
       6.28766600e-04, 4.49769924e-04, 1.54064079e-02, 1.45209366e-03,
       7.40345870e-01, 1.06833953e-02])

## Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression, Ridge

# Drop date column too, makes no sense for random forest
X_train = train.copy(deep=True).drop(columns=["Sales", "Store", "Open"])
y_train = train.loc[:, "Sales"].values

# reg = LinearRegression()
reg = Ridge(alpha=40000)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_train)

metric(y_train, y_pred)

ValueError: could not convert string to float: 'a'

In [8]:
reg.coef_

AttributeError: 'Ridge' object has no attribute 'coef_'

### Save model

In [9]:
from scripts.models import load_model, save_model

save_model(model=reg, name="linear_regressor_1")

Saving model "linear_regressor_1" at:
../data/trained_models/model_linear_regressor_1.p


In [13]:
reg = load_model(name='linear_regressor_2')

[Errno 2] No such file or directory: '../data/trained_models/model_linear_regressor_2.p': Could not find model "linear_regressor_2" at path:
../data/trained_models/model_linear_regressor_2.p


In [12]:
reg.coef_

array([-1.87347223e+02,  1.54668308e+03,  7.27861735e+00,  2.36334541e+01,
        1.59404779e+01, -6.02897032e+01,  1.26554842e+00, -8.83825528e-01])

In [17]:
from sklearn.linear_model import LinearRegression

In [15]:
import scripts.processing as scr

scr.process_data
# scripts.processing.process_data

<function scripts.processing.process_data(train_raw, drop_null=True)>

In [19]:
holdout = pd.read_csv('../data/holdout_b29.csv',
                parse_dates=[1],
                usecols=['Date', 'Store', 'DayOfWeek',
                        'Customers', 'Open', 'Promo',
                        'StateHoliday', 'SchoolHoliday'],
                dtype = {
                    'StateHoliday': str
                })
holdout.loc[:, 'Sales'] = 6837
holdout.to_csv("../data/holdout_fake.csv")