In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# export
import pandas as pd
import numpy as np
from datetime import timedelta

In [3]:
from kaggle_1c_predict_future_sales.trivial_predict import read_train, forecast_last_average, forecast_to_submission

train_all_time = read_train()
train_all_time.head(2)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0


In [4]:
train_all_time.agg({'date': ['min', 'max']})

Unnamed: 0,date
min,2013-01-01
max,2015-12-10


In [5]:
test = pd.read_csv('raw/test.csv')

def single_fold(train_all_time, forecast_start, forecast_length_days):
    id_cols = ['item_id', 'shop_id']

    forecast_end = forecast_start + timedelta(days = forecast_length_days)
    forecast_target_raw = train_all_time.query('date >= @forecast_start and date <= @forecast_end')
    forecast_target_mo = (forecast_target_raw
        .groupby(id_cols, as_index=False)
        .agg({'item_cnt_day': 'sum'})
        .rename({'item_cnt_day': 'item_cnt_month'}, axis=1)
    )

    forecast_target_mo = (test
                          .merge(forecast_target_mo, on=id_cols, how='left')
                          .fillna(0)
                         )

    train_start = train_all_time.query('date < @forecast_start')

    predicted_mo = forecast_last_average(
        train_start,
        forecast_month_length=30,
        train_interval_len=30,
        target_forecast_start=forecast_start
    )

    # TODO: should be part of what's predicted by the func
    predicted_mo['item_cnt_month'] = predicted_mo['item_cnt_month'].clip(0, 20)
    forecast_target_mo['item_cnt_month'] = forecast_target_mo['item_cnt_month'].clip(0, 20)
    from sklearn.metrics import mean_squared_error 
    diff = (forecast_target_mo
        .merge(predicted_mo, on=id_cols, suffixes=('_target', '_predicted'))
    )
    rmse = mean_squared_error(diff['item_cnt_month_target'], diff['item_cnt_month_predicted'])
    return rmse

single_fold(train_all_time,
           forecast_start=pd.to_datetime('2014-06-01'),
           forecast_length_days=30)

5.664616599898437

In [6]:
target_forecast_start = pd.to_datetime('2015-11-01')
forecast_length_days = 30

fold_rmses = []
fold_forecast_starts = []
n_folds = 10
fold_shift_days = 40
for delta in range(1, 1+n_folds):
    forecast_start = target_forecast_start - timedelta(days=delta*fold_shift_days)
    rmse = single_fold(train_all_time,
           forecast_start=forecast_start,
           forecast_length_days=30)
    fold_rmses.append(rmse)
    fold_forecast_starts.append(forecast_start)

In [7]:
fold_rmses

[4.004057726907323,
 3.7755710809325693,
 3.776356558353857,
 3.9762894468951475,
 4.486619199607169,
 4.632950990615224,
 5.152054566627849,
 7.792788503159899,
 6.659770352746457,
 4.452070885041622]

In [8]:
# 33.44 doesn't square well with LB of 8 or like 11
# 7.26 after clipping is mucho closero
# 10 folds x 20 days was closer (8.02) and faster than 20 folds x 10 days (7.97)
# LB also downed to 1.13 as a result of clipping :/
sum(fold_rmses) / len(fold_rmses)

4.870852931088711

In [9]:
from sklearn.base import RegressorMixin

In [10]:
RegressorMixin??

In [11]:
class ForecastLastAverage(RegressorMixin):
    def fit(self, X, y):
        self.train_interval_len_days = (X['date'].max() - X['date'].min()).days
        forecast = (X
            .groupby(['item_id', 'shop_id'],
                    as_index=False)
            .agg({'item_cnt_day': ['sum', 'count'],
                  'date': ['min', 'max']})
        )
        forecast['sale_days'] = (forecast['date']['max']-forecast['date']['min']).dt.days + 1
        forecast['avg_daily_sale_items'] = forecast['item_cnt_day']['sum'] / self.train_interval_len_days
        self.forecast = forecast[['item_id', 'shop_id', 'avg_daily_sale_items']].droplevel(1, axis=1)
        self.na_forecast = 0
        return self

    def predict(self, X):
        return (X
                .merge(self.forecast, how='left', on=['item_id', 'shop_id'])
                .fillna(0))

    def transform(self, X):
        return self.predict(X)

In [12]:
example = pd.DataFrame(
    columns=['date', 'item_id', 'shop_id', 'item_cnt_day'],
    data=[
        ['2010-01-01', 1, 1, 2],
        ['2010-01-01', 1, 2, 3],
        ['2010-01-01', 2, 1, 1],
        ['2010-01-01', 3, 1, 1],
        ['2010-01-06', 3, 1, 2],
    ]
)

example_test = pd.DataFrame(
    columns=['date', 'item_id', 'shop_id'],
    data=[
        ['2010-02-06', 3, 1],
        ['2010-02-06', 2, 1],
        ['2010-02-06', 1, 3],
        ['2010-02-07', 3, 1],
        ['2010-03-07', 3, 1],
    ]
)

example['date'] = pd.to_datetime(example['date'])
reg = ForecastLastAverage()
reg.fit(example, example['item_cnt_day'])
predicted_daily = reg.predict(example_test)
predicted_daily

Unnamed: 0,date,item_id,shop_id,avg_daily_sale_items
0,2010-02-06,3,1,0.6
1,2010-02-06,2,1,0.2
2,2010-02-06,1,3,0.0
3,2010-02-07,3,1,0.6
4,2010-03-07,3,1,0.6


In [13]:
# aggs in transformers is a deviation from intended use by virtue of being sample size changing transform
# https://github.com/scikit-learn/scikit-learn/issues/3855
# https://github.com/scikit-learn/scikit-learn/issues/4143
# https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep001/proposal.html#slep001-transformers-that-modify-their-target
# Note to self: native model might be to introduce new aggregated features into the data set rather than sampling
# Also: Pipelines only transform the observed data (X).
from sklearn.base import TransformerMixin, BaseEstimator
class MonthlyAggregation(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        tmp = X.copy()
        tmp['date'] = pd.to_datetime(tmp['date'])
        tmp['year_month'] = tmp['date'].dt.strftime('%Y-%m')
        return (tmp.groupby(
            ['item_id', 'shop_id', 'year_month'], as_index=False)
            .agg({'avg_daily_sale_items': 'sum'})
            .rename({'avg_daily_sale_items': 'item_cnt_month'}, axis=1)
        )
    def predict(self, X):
        return self.transform(X)

In [14]:
monthly_agg = MonthlyAggregation()
monthly_agg.fit_transform(predicted_daily)

Unnamed: 0,item_id,shop_id,year_month,item_cnt_month
0,1,3,2010-02,0.0
1,2,1,2010-02,0.2
2,3,1,2010-02,1.2
3,3,1,2010-03,0.6


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(
    ForecastLastAverage(),
    MonthlyAggregation()
)
pipe.fit(example, example['item_cnt_day'])

Pipeline(steps=[('forecastlastaverage',
                 <__main__.ForecastLastAverage object at 0x7f0c6e16cc70>),
                ('monthlyaggregation', MonthlyAggregation())])

In [16]:
pipe.predict(example_test)

Unnamed: 0,item_id,shop_id,year_month,item_cnt_month
0,1,3,2010-02,0.0
1,2,1,2010-02,0.2
2,3,1,2010-02,1.2
3,3,1,2010-03,0.6


In [17]:
categorical_cols = ['item_id', 'shop_id']
feature_cols = categorical_cols
target_col = 'item_cnt_day'

In [18]:
target_forecast_start = pd.to_datetime('2015-11-01')
train_no_future = train_all_time.query('date < @target_forecast_start')

from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(train_all_time[['item_id', 'shop_id']], train_all_time['item_cnt_day'])
regressor.predict(test[feature_cols])

array([1.23257084, 1.23454177, 1.23393587, ..., 1.27027749, 1.29737605,
       1.16728767])

In [19]:
november_days = [target_forecast_start + timedelta(days=i) for i in range(0, 30)]

In [20]:
def cross_join(left, right):
    left['_tmp_col'] = 0
    right['_tmp_col'] = 0
    return left.merge(right, on='_tmp_col').drop(['_tmp_col'], axis=1)

test_dates = cross_join(test, pd.DataFrame({'date': november_days}))

predicted_daily = test_dates.copy()
predicted_daily[target_col] = regressor.predict(test_dates[feature_cols])
predicted_daily

Unnamed: 0,ID,shop_id,item_id,date,item_cnt_day
0,0,5,5037,2015-11-01,1.232571
1,0,5,5037,2015-11-02,1.232571
2,0,5,5037,2015-11-03,1.232571
3,0,5,5037,2015-11-04,1.232571
4,0,5,5037,2015-11-05,1.232571
...,...,...,...,...,...
6425995,214199,45,969,2015-11-26,1.167288
6425996,214199,45,969,2015-11-27,1.167288
6425997,214199,45,969,2015-11-28,1.167288
6425998,214199,45,969,2015-11-29,1.167288


In [21]:
predicted_mo = (predicted_daily.groupby(
        ['item_id', 'shop_id'], as_index=False)
        .agg({target_col: 'sum'})
        .rename({target_col: 'item_cnt_month'}, axis=1)
)

In [22]:
submission = forecast_to_submission(predicted_mo)

In [23]:
# Score: 37.25347
submission.to_csv('submissions/submission_0100_linreg_item_shop.csv', index=False)

In [24]:
submission['item_cnt_month'] = submission['item_cnt_month'].clip(0, 20)

In [25]:
# Score: 19.75361
# item_id and shop_id need to be transformed 🤦‍♀️
submission.to_csv('submissions/submission_0101_linreg_item_shop_clipped.csv', index=False)

In [33]:
import pandas as pd
# Local score: 76.17766593875596
# from sklearn.linear_model import LinearRegression
# SGD is much faster and close 75.59508733523127
from sklearn.linear_model import SGDRegressor

regressor = SGDRegressor()

test = pd.read_csv('raw/test.csv')
def extract_features(train_all_time):
    from sklearn.preprocessing import OneHotEncoder
    onehot_encoder = OneHotEncoder(handle_unknown='ignore')
    categorical_features = onehot_encoder.fit_transform(train_all_time[categorical_cols])    
    transformed_features = categorical_features
    return transformed_features

def single_fold(regressor, dates_frame, ids_frame, target_frame, forecast_start, forecast_length_days):
    transformed_features = extract_features(train_all_time)

    # split
    forecast_end_date = forecast_start + timedelta(days = forecast_length_days)
    train_indexes = dates_frame['date'] < forecast_start
    test_indexes = (dates_frame['date'] >= forecast_start) & (dates_frame['date'] <= forecast_end_date)

    grouper = ids_frame['item_id']*100 + ids_frame['shop_id']
    forecast_target_mo = (target_frame
        .groupby(grouper, as_index=False)
        .agg({target_col: 'sum'})
        .rename({target_col: 'item_cnt_month'}, axis=1)
    )

    train_features = transformed_features[train_indexes]
    test_features = transformed_features[test_indexes]

    regressor.fit(train_features, target_frame[train_indexes].squeeze())
    predicted_daily = (regressor
                       .predict(test_features)
                       .squeeze()
                      )
    predicted_daily = pd.DataFrame({'item_id': ids_frame[test_indexes]['item_id'],
                                    'shop_id': ids_frame[test_indexes]['shop_id'],
                                    target_col: predicted_daily})
    predicted_mo = (predicted_daily.groupby(
            ['item_id', 'shop_id'], as_index=False)
            .agg({target_col: 'sum'})
            .rename({target_col: 'item_cnt_month'}, axis=1)
    )

    # TODO: should be part of what's predicted by the func? (so e.g. test targets are transformed)
    predicted_mo['item_cnt_month'] = predicted_mo['item_cnt_month'].clip(0, 20)
    forecast_target_mo['item_cnt_month'] = forecast_target_mo['item_cnt_month'].clip(0, 20)
    from sklearn.metrics import mean_squared_error 
    predicted_mo['item_cnt_month_target'] = forecast_target_mo['item_cnt_month']
    rmse = mean_squared_error(predicted_mo['item_cnt_month_target'], predicted_mo['item_cnt_month'])

    return rmse, predicted_mo

id_cols = ['item_id', 'shop_id']
dates_frame = train_all_time[['date']]
target_frame = train_all_time[['item_cnt_day']]
ids_frame = train_all_time[id_cols]

rmse, predicted_mo = single_fold(regressor, dates_frame, ids_frame, target_frame,
           forecast_start=pd.to_datetime('2014-06-01'),
           forecast_length_days=30)

In [34]:
predicted_mo['diff'] = (predicted_mo['item_cnt_month_target']-predicted_mo['item_cnt_month']).abs()
predicted_mo['diff'].sort_values(ascending=False)

38245    19.380146
7986     19.378901
38059    19.373761
40194    19.370377
30759    19.355095
           ...    
34154     0.000000
21597     0.000000
21589     0.000000
21574     0.000000
51653     0.000000
Name: diff, Length: 53469, dtype: float64

In [None]:
target_forecast_start = pd.to_datetime('2015-11-01')
forecast_length_days = 30

fold_rmses = []
fold_forecast_starts = []
n_folds = 3
fold_shift_days = 40
for delta in range(1, 1+n_folds):
    forecast_start = target_forecast_start - timedelta(days=delta*fold_shift_days)
    rmse = single_fold(regressor, dates_frame, ids_frame, target_frame,
           forecast_start=forecast_start,
           forecast_length_days=30)
    fold_rmses.append(rmse)
    fold_forecast_starts.append(forecast_start)

In [59]:
sum(fold_rmses) / len(fold_rmses)

71.17106059290992

In [60]:
target_forecast_start = pd.to_datetime('2015-11-01')
train_no_future = train_all_time.query('date < @target_forecast_start')

transformed_features = extract_features(train_no_future)

In [61]:
regressor.fit(categorical_features, train_all_time['item_cnt_day'])

SGDRegressor()

In [62]:
test_features = onehot_encoder.transform(test[categorical_cols])

In [68]:
predicted_daily = regressor.predict(test_features)

In [73]:
predicted_daily.shape, test['item_id'].shape

((214200,), (214200,))

In [74]:
predicted_daily = pd.DataFrame({'item_id': test['item_id'],
                                'shop_id': test['shop_id'],
                                target_col: predicted_daily})
predicted_mo = (predicted_daily.groupby(
        ['item_id', 'shop_id'], as_index=False)
        .agg({target_col: 'sum'})
        .rename({target_col: 'item_cnt_month'}, axis=1)
)
submission = forecast_to_submission(predicted_mo)

In [75]:
submission.query('item_cnt_month >= 20')

Unnamed: 0,ID,item_cnt_month


In [53]:
# Score: 2.70899
submission['item_cnt_month'] =submission['item_cnt_month'].clip(0, 20)
submission.to_csv('submissions/submission_0102_linreg_item_shop_clipped.csv', index=False)

In [77]:
# Score: 1.42040
submission['item_cnt_month'] =submission['item_cnt_month'].clip(0, 20)
submission.to_csv('submissions/submission_0103_sgd_linreg_item_shop_clipped.csv', index=False)