In [1]:
# general
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
# encoders
from category_encoders.target_encoder import TargetEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder
# custom classes
from preprocessing import Merger, Imputer, Cleaner

In [2]:
# Load the raw inputs: per-store metadata first, then the training sales records.
store_data, sales_data = map(
    pd.read_csv,
    ("./data/store.csv", "./data/train.csv"),
)
# holdout_data = pd.read_csv("./data/holdout.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Combine the sales records with the store metadata via the project's Merger helper.
# NOTE(review): the recorded run of this cell emitted a pandas SettingWithCopyWarning
# (pointing at a `super().drop(` call) — presumably raised inside Merger.merge; confirm.
mrg = Merger()
mrg.merge(sales_data, store_data)
# The merged frame is read back from the instance attribute rather than from a return value.
merged_data = mrg.merged_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [4]:
# Chronological cut-offs, written as ISO "YYYY-MM-DD" strings (inclusive upper bounds):
#   Date <= validation_date_split                      -> train
#   validation_date_split < Date <= test_date_split    -> validation
#   Date > test_date_split                             -> test
# April has only 30 days: the previous value '2014-04-31' is not a real date and
# would raise as soon as it is parsed as a date instead of compared as a string.
# '2014-04-30' selects exactly the same rows.
test_date_split = '2014-04-30'
validation_date_split = '2014-01-31'

In [5]:
# Chronological split on the Date column: train up to the validation cut-off,
# validation between the two cut-offs, test strictly after the test cut-off.
merged_data_train = merged_data.loc[merged_data["Date"] <= validation_date_split]
merged_data_validation = merged_data.loc[
    (merged_data["Date"] > validation_date_split)
    & (merged_data["Date"] <= test_date_split)
]
merged_data_test = merged_data.loc[merged_data["Date"] > test_date_split]

In [6]:
# Separate the regression target ("Sales") from the feature columns in every split.
X_train = merged_data_train.drop(columns=["Sales"])
y_train = merged_data_train["Sales"]

X_validation = merged_data_validation.drop(columns=["Sales"])
y_validation = merged_data_validation["Sales"]

X_test = merged_data_test.drop(columns=["Sales"])
y_test = merged_data_test["Sales"]

In [7]:
# Impute missing feature values with the project's Imputer helper.
imp = Imputer()
imp.define_imputers()
# Fit on the training features only; the validation and test splits are only
# transformed below, never used for fitting.
imp.fit(X_train)
X_train_transf = imp.transform_reconstruct(X_train)
X_test_transf = imp.transform_reconstruct(X_test)
X_validation_transf = imp.transform_reconstruct(X_validation)

In [8]:
def _run_cleaner(frame):
    """Create a new Cleaner, run ``clean`` on ``frame`` and return the instance.

    The caller reads the results from the returned object's ``data`` and
    ``dates`` attributes.
    """
    cleaner = Cleaner()
    cleaner.clean(frame)
    return cleaner


# A separate Cleaner instance is used for each split, in train / validation / test order.
clnr = _run_cleaner(X_train_transf)
X_train_clean, X_train_clean_dates = clnr.data, clnr.dates

clnr = _run_cleaner(X_validation_transf)
X_validation_clean, X_validation_clean_dates = clnr.data, clnr.dates

clnr = _run_cleaner(X_test_transf)
X_test_clean, X_test_clean_dates = clnr.data, clnr.dates

  data["Week"] = pd.DatetimeIndex(data.loc[:, "Date"]).week


In [10]:
def metric(preds, actuals):
    """Return the root mean squared percentage error (RMSPE), in percent.

    Computes ``100 * sqrt(mean(((actual - pred) / actual) ** 2))`` over the
    rows whose actual value is non-zero. Rows with ``actual == 0`` are left
    out because the percentage error is undefined there; without the mask a
    single zero turns the whole score into ``inf``/``nan``.

    Parameters
    ----------
    preds : array-like
        Predicted values; flattened to 1-D.
    actuals : array-like
        Ground-truth values (e.g. a pandas Series or NumPy array); flattened
        to 1-D. Must contain the same number of elements as ``preds``.

    Returns
    -------
    float
        RMSPE in percent, or ``nan`` when no actual value is non-zero.

    Raises
    ------
    AssertionError
        If ``preds`` and ``actuals`` differ in length after flattening.
    """
    # np.asarray accepts ndarrays, Series and plain lists alike.
    preds = np.asarray(preds, dtype=float).reshape(-1)
    actuals = np.asarray(actuals, dtype=float).reshape(-1)
    assert preds.shape == actuals.shape

    nonzero = actuals != 0
    if not nonzero.any():
        # Nothing to score: every percentage error would be a division by zero.
        return np.nan

    relative_errors = (actuals[nonzero] - preds[nonzero]) / actuals[nonzero]
    # ||e||_2 / sqrt(n) == sqrt(mean(e ** 2))
    return 100 * np.linalg.norm(relative_errors) / np.sqrt(relative_errors.shape[0])

In [11]:
# Display the cleaned training features as the cell output.
X_train_clean

Unnamed: 0,Store,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,PromoInterval,Year,Month_sin,Month_cos,Week_sin,Week_cos,Weekday_sin,Weekday_cos,SalesCompetitionLag,Promo2Lag
0,353,1.0,0.0,a,1.0,b,b,900.0,1.0,"Feb,May,Aug,Nov",2013,0.5,0.866025,0.118273,0.992981,0.017213,0.999852,-1.0,0.0
1,335,1.0,0.0,a,1.0,b,a,90.0,1.0,"Jan,Apr,Jul,Oct",2013,0.5,0.866025,0.118273,0.992981,0.017213,0.999852,-1.0,0.0
2,512,1.0,0.0,a,1.0,b,b,590.0,1.0,"Mar,Jun,Sept,Dec",2013,0.5,0.866025,0.118273,0.992981,0.017213,0.999852,-1.0,0.0
3,494,1.0,0.0,a,1.0,b,a,1260.0,0.0,"Jan,Apr,Jul,Oct",2013,0.5,0.866025,0.118273,0.992981,0.017213,0.999852,580.0,-1.0
4,530,1.0,0.0,a,1.0,a,c,18160.0,0.0,"Jan,Apr,Jul,Oct",2013,0.5,0.866025,0.118273,0.992981,0.017213,0.999852,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355649,745,1.0,0.0,0,0.0,a,a,17650.0,1.0,"Jan,Apr,Jul,Oct",2014,0.5,0.866025,0.558647,0.829406,0.068802,0.997630,91.0,1601.0
355650,746,1.0,0.0,0,0.0,d,c,4330.0,1.0,"Mar,Jun,Sept,Dec",2014,0.5,0.866025,0.558647,0.829406,0.068802,0.997630,1095.0,887.0
355651,747,1.0,0.0,0,0.0,c,c,45740.0,0.0,"Jan,Apr,Jul,Oct",2014,0.5,0.866025,0.558647,0.829406,0.068802,0.997630,2009.0,-1.0
355652,765,1.0,0.0,0,0.0,a,c,25430.0,1.0,"Jan,Apr,Jul,Oct",2014,0.5,0.866025,0.558647,0.829406,0.068802,0.997630,5389.0,1601.0


In [12]:
def return_pipe():
    """Build a new, unfitted encoding + random-forest pipeline.

    Steps, in order:

    1. ``target_encoder``  - target-encodes ``Store``.
    2. ``ordinal_encoder`` - ordinal-encodes ``StateHoliday``, ``StoreType``
       and ``Assortment``.
    3. ``one_hot_encoder`` - one-hot-encodes ``PromoInterval``.
    4. ``rfr``             - the ``RandomForestRegressor`` configured below.

    The encoders select their columns by name, so the frame passed to
    ``fit``/``predict`` must contain all five of those columns.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A fresh pipeline on every call, created with ``verbose=True``.
    """
    rfr = RandomForestRegressor(
        n_estimators=256,
        max_depth=16,
        min_samples_split=16,
        n_jobs=-1,
        random_state=42,  # fixed seed so repeated runs are comparable
    )

    # Declare every step up front as (name, estimator) tuples instead of
    # appending to `steps` on an already-constructed pipeline.
    return Pipeline(
        [
            ("target_encoder", TargetEncoder(cols=["Store"])),
            ("ordinal_encoder", OrdinalEncoder(cols=["StateHoliday", "StoreType", "Assortment"])),
            ("one_hot_encoder", OneHotEncoder(cols=["PromoInterval"])),
            ("rfr", rfr),
        ],
        verbose=True,
    )

In [24]:
# Leave-one-feature-out ablation: refit the pipeline with a single feature removed
# and score it on the validation split. Each fit trains a full random forest,
# so this cell is slow (the recorded run shows roughly four minutes per feature).
# Columns that the encoders in return_pipe() select by name are never dropped.
encoded_columns = ["Store", "StateHoliday", "StoreType", "Assortment", "PromoInterval"]

# dropped feature -> validation metric; kept so the scores can be ranked or
# plotted afterwards instead of being read out of the verbose pipeline log.
ablation_scores = {}
for feat in [f for f in X_train_clean.columns if f not in encoded_columns]:
    model = return_pipe()
    model.fit(X_train_clean.drop(columns=feat), y_train)
    y_pred = model.predict(X_validation_clean.drop(columns=feat))
    ablation_scores[feat] = metric(y_pred, y_validation)
    print(feat, ablation_scores[feat])

[Pipeline] .... (step 1 of 4) Processing target_encoder, total=   0.3s
[Pipeline] ... (step 2 of 4) Processing ordinal_encoder, total=   0.5s
[Pipeline] ... (step 3 of 4) Processing one_hot_encoder, total=   0.9s
[Pipeline] ............... (step 4 of 4) Processing rfr, total= 3.9min
Open 27.644534324801654
[Pipeline] .... (step 1 of 4) Processing target_encoder, total=   0.4s
[Pipeline] ... (step 2 of 4) Processing ordinal_encoder, total=   0.5s
[Pipeline] ... (step 3 of 4) Processing one_hot_encoder, total=   0.8s
[Pipeline] ............... (step 4 of 4) Processing rfr, total= 3.8min
Promo 32.782995729124494
[Pipeline] .... (step 1 of 4) Processing target_encoder, total=   0.4s
[Pipeline] ... (step 2 of 4) Processing ordinal_encoder, total=   0.5s
[Pipeline] ... (step 3 of 4) Processing one_hot_encoder, total=   0.9s
[Pipeline] ............... (step 4 of 4) Processing rfr, total= 3.7min
SchoolHoliday 27.79055141736155
[Pipeline] .... (step 1 of 4) Processing target_encoder, total=   0