In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# export
import pandas as pd
import numpy as np
from datetime import timedelta

In [3]:
from kaggle_1c_predict_future_sales.trivial_predict import read_train, forecast_last_average

train_all_time = read_train()
train_all_time.head(2)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0


In [4]:
train_all_time.agg({'date': ['min', 'max']})

Unnamed: 0,date
min,2013-01-01
max,2015-12-10


In [5]:
test = pd.read_csv('raw/test.csv')

def single_fold(train_all_time, forecast_start, forecast_length_days):
    id_cols = ['item_id', 'shop_id']

    forecast_end = forecast_start + timedelta(days = forecast_length_days)
    forecast_target_raw = train_all_time.query('date >= @forecast_start and date <= @forecast_end')
    forecast_target_mo = (forecast_target_raw
        .groupby(id_cols, as_index=False)
        .agg({'item_cnt_day': 'sum'})
        .rename({'item_cnt_day': 'item_cnt_month'}, axis=1)
    )

    forecast_target_mo = (test
                          .merge(forecast_target_mo, on=id_cols, how='left')
                          .fillna(0)
                         )

    train_start = train_all_time.query('date < @forecast_start')

    predicted_mo = forecast_last_average(
        train_start,
        forecast_month_length=30,
        train_interval_len=30,
        target_forecast_start=forecast_start
    )

    # TODO: should be part of what's predicted by the func
    predicted_mo['item_cnt_month'] = predicted_mo['item_cnt_month'].clip(0, 20)
    forecast_target_mo['item_cnt_month'] = forecast_target_mo['item_cnt_month'].clip(0, 20)
    from sklearn.metrics import mean_squared_error 
    diff = (forecast_target_mo
        .merge(predicted_mo, on=id_cols, suffixes=('_target', '_predicted'))
    )
    rmse = mean_squared_error(diff['item_cnt_month_target'], diff['item_cnt_month_predicted'])
    return rmse

single_fold(train_all_time,
           forecast_start=pd.to_datetime('2014-06-01'),
           forecast_length_days=30)

5.664616599898437

In [6]:
target_forecast_start = pd.to_datetime('2015-11-01')
forecast_length_days = 30

fold_rmses = []
fold_forecast_starts = []
n_folds = 10
fold_shift_days = 40
for delta in range(1, 1+n_folds):
    forecast_start = target_forecast_start - timedelta(days=delta*fold_shift_days)
    rmse = single_fold(train_all_time,
           forecast_start=forecast_start,
           forecast_length_days=30)
    fold_rmses.append(rmse)
    fold_forecast_starts.append(forecast_start)

In [7]:
fold_rmses

[4.004057726907323,
 3.7755710809325693,
 3.776356558353857,
 3.9762894468951475,
 4.486619199607169,
 4.632950990615224,
 5.152054566627849,
 7.792788503159899,
 6.659770352746457,
 4.452070885041622]

In [8]:
# 33.44 doesn't square well with LB of 8 or like 11
# 7.26 after clipping is mucho closero
# 10 folds x 20 days was closer (8.02) and faster than 20 folds x 10 days (7.97)
# LB also downed to 1.13 as a result of clipping :/
sum(fold_rmses) / len(fold_rmses)

4.870852931088711

In [9]:
from sklearn.base import RegressorMixin

In [10]:
RegressorMixin??

In [94]:
class ForecastLastAverage(RegressorMixin):
    def fit(self, X, y):
        self.train_interval_len_days = (X['date'].max() - X['date'].min()).days
        forecast = (X
            .groupby(['item_id', 'shop_id'],
                    as_index=False)
            .agg({'item_cnt_day': ['sum', 'count'],
                  'date': ['min', 'max']})
        )
        forecast['sale_days'] = (forecast['date']['max']-forecast['date']['min']).dt.days + 1
        forecast['avg_daily_sale_items'] = forecast['item_cnt_day']['sum'] / self.train_interval_len_days
        self.forecast = forecast[['item_id', 'shop_id', 'avg_daily_sale_items']].droplevel(1, axis=1)
        self.na_forecast = 0
        return self

    def predict(self, X):
        return (X
                .merge(self.forecast, how='left', on=['item_id', 'shop_id'])
                .fillna(0))

    def transform(self, X):
        return self.predict(X)

In [95]:
example = pd.DataFrame(
    columns=['date', 'item_id', 'shop_id', 'item_cnt_day'],
    data=[
        ['2010-01-01', 1, 1, 2],
        ['2010-01-01', 1, 2, 3],
        ['2010-01-01', 2, 1, 1],
        ['2010-01-01', 3, 1, 1],
        ['2010-01-06', 3, 1, 2],
    ]
)

example_test = pd.DataFrame(
    columns=['date', 'item_id', 'shop_id'],
    data=[
        ['2010-02-06', 3, 1],
        ['2010-02-06', 2, 1],
        ['2010-02-06', 1, 3],
        ['2010-02-07', 3, 1],
        ['2010-03-07', 3, 1],
    ]
)

example['date'] = pd.to_datetime(example['date'])
reg = ForecastLastAverage()
reg.fit(example, example['item_cnt_day'])
predicted_daily = reg.predict(example_test)
predicted_daily

Unnamed: 0,date,item_id,shop_id,avg_daily_sale_items
0,2010-02-06,3,1,0.6
1,2010-02-06,2,1,0.2
2,2010-02-06,1,3,0.0
3,2010-02-07,3,1,0.6
4,2010-03-07,3,1,0.6


In [101]:
from sklearn.base import TransformerMixin, BaseEstimator
class MonthlyAggregation(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        tmp = X.copy()
        tmp['date'] = pd.to_datetime(tmp['date'])
        tmp['year_month'] = tmp['date'].dt.strftime('%Y-%m')
        return (tmp.groupby(
            ['item_id', 'shop_id', 'year_month'], as_index=False)
            .agg({'avg_daily_sale_items': 'sum'})
            .rename({'avg_daily_sale_items': 'item_cnt_month'}, axis=1)
        )
    def predict(self, X):
        return self.transform(X)

In [102]:
monthly_agg = MonthlyAggregation()
monthly_agg.fit_transform(predicted_daily)

Unnamed: 0,item_id,shop_id,year_month,item_cnt_month
0,1,3,2010-02,0.0
1,2,1,2010-02,0.2
2,3,1,2010-02,1.2
3,3,1,2010-03,0.6


In [103]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(
    ForecastLastAverage(),
    MonthlyAggregation()
)
pipe.fit(example, example['item_cnt_day'])

Pipeline(steps=[('forecastlastaverage',
                 <__main__.ForecastLastAverage object at 0x7fe9fd7fa3d0>),
                ('monthlyaggregation', MonthlyAggregation())])

In [106]:
pipe.predict(example_test)

Unnamed: 0,item_id,shop_id,year_month,item_cnt_month
0,1,3,2010-02,0.0
1,2,1,2010-02,0.2
2,3,1,2010-02,1.2
3,3,1,2010-03,0.6
