In [None]:
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import datetime
from sklearn.linear_model import LinearRegression
 

In [None]:
# Read the raw data. 'Date' is kept as a string column so the exported
# CSV files carry the dates in their original text form.
train_raw = pd.read_csv('https://liangfgithub.github.io/Data/train.csv.zip')

# Parse the dates once and reuse the parsed Series for every filter below.
raw_dates = pd.to_datetime(train_raw['Date'])

# Preprocess data
# Add integer index columns for week and year:
#   Week = ISO week number minus one (zero-based)
#   Year = position of the calendar year among the years present in the data
yrs = raw_dates.dt.to_period('Y').unique().year
n_years = len(yrs)
year_position = {int(year): pos for pos, year in enumerate(yrs)}

# Compute the week index once per distinct date, then map it onto the rows,
# instead of re-scanning the full frame for every date.
week_by_date = {}
for date in train_raw['Date'].unique():
    y, m, d = date.split('-')
    week_by_date[date] = datetime.date(int(y), int(m), int(d)).isocalendar()[1] - 1

train_raw['Week'] = train_raw['Date'].map(week_by_date).astype(int)
train_raw['Year'] = raw_dates.dt.year.map(year_position).astype(int)

# training data from 2010-02 to 2011-02
start_date = pd.to_datetime('2010-02-01')
end_date = start_date + relativedelta(months=13)

# split dataset into training / testing
train_ids = (raw_dates >= start_date) & (raw_dates < end_date)
train = train_raw.loc[train_ids]
test = train_raw.loc[~train_ids]
test_dates = raw_dates.loc[~train_ids]

# create the initial training data
print('exportint train_ini.csv')
train.to_csv('train_ini.csv', index=False)

# create 10 time-series
num_folds = 10

# month 1 --> 2011-03, and month 20 --> 2012-10.
# Fold 1 : month 1 & month 2, Fold 2 : month 3 & month 4 ...
print('Making folds')
for i in range(num_folds):
    # filter fold for dates: a two-month window starting at 2011-03 + 2*i months
    start_date = pd.to_datetime('2011-03-01') + relativedelta(months=2 * i)
    end_date = start_date + relativedelta(months=2)
    test_ids = (test_dates >= start_date) & (test_dates < end_date)
    test_fold = test.loc[test_ids]

    # write fold to a file; this must happen inside the loop so that every
    # fold_<k>.csv is produced, not just the last one
    test_fold.to_csv('fold_{}.csv'.format(i + 1), index=False)

# create test.csv
# removes weekly sales
test = test.drop(columns=['Weekly_Sales'])
# index=False keeps the file consistent with the other exports and avoids a
# stray unnamed index column when the file is read back
test.to_csv('test.csv', index=False)


In [None]:
def mypredict(train, test, next_fold, t):
    """Fit a linear model on the data seen so far and predict one fold.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain the feature columns
        ('Year', 'Week', 'Store', 'Dept', 'IsHoliday') and 'Weekly_Sales'.
    test : pandas.DataFrame
        Rows to predict; must contain a datetime 'Date' column, 'Store',
        'Dept' and the feature columns. A 'Weekly_Pred' column is written
        into this frame for the rows of the current fold.
    next_fold : pandas.DataFrame or None
        Newly revealed rows to append to ``train`` before fitting, or
        ``None`` when there is nothing to append.
    t : int
        Zero-based fold index. Fold ``t`` covers the two months starting at
        2011-03 plus ``2 * t`` months.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The (possibly extended) training frame, and a prediction frame with
        columns 'Date', 'Store', 'Dept', 'Weekly_Pred' for the fold's rows.
    """
    x_cols = ['Year', 'Week', 'Store', 'Dept', 'IsHoliday']
    key_cols = ['Date', 'Store', 'Dept']

    # Each fold spans two calendar months, matching how the fold files are cut.
    start_date = pd.to_datetime("2011-03") + pd.DateOffset(months=2 * t)
    end_date = start_date + pd.DateOffset(months=2)

    date_filter = (test['Date'] >= start_date) & (test['Date'] < end_date)

    if next_fold is not None:
        train = pd.concat([train, next_fold])

    # Carry the identifying keys so the caller can merge predictions with
    # the actuals on Date / Store / Dept.
    test_pred = test.loc[date_filter, key_cols].copy()

    if test_pred.empty:
        # Nothing to predict in this window; skip fitting and return an
        # empty frame that still has the expected columns.
        test_pred['Weekly_Pred'] = np.empty(0)
        return train, test_pred

    xtrain = train[x_cols].values
    ytrain = train['Weekly_Sales'].values

    model = LinearRegression()
    model.fit(xtrain, ytrain)

    ypred = model.predict(test.loc[date_filter, x_cols].values)

    test_pred['Weekly_Pred'] = ypred
    test.loc[date_filter, "Weekly_Pred"] = ypred

    return train, test_pred


In [None]:
# Reload the exported splits, parsing the 'Date' column into datetimes.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in ('train_ini.csv', 'test.csv')
)

In [None]:
# One-hot encode the 'Week' and 'Year' columns of `train` and `test` in place.
# For every distinct date, the integer index already stored in 'Week' / 'Year'
# is turned into a vector (length 52 for weeks, length n_years for years) with
# a single 1 at that index, and that vector is written back into the same
# column for all rows of that date.
# NOTE(review): the cells of 'Week' / 'Year' therefore end up holding arrays.
# mypredict() passes these columns to LinearRegression via `.values`;
# presumably that input is not accepted as a numeric matrix -- confirm.
# NOTE(review): n_years is hard-coded here rather than derived from the data.
n_years = 3
# NOTE(review): `yrs` is assigned but not read anywhere in this cell.
yrs = pd.to_datetime(train['Date']).dt.to_period('Y').unique().year
for date in train['Date'].unique():
    wk = np.zeros(52)
    yr = np.zeros(n_years)
    # All rows of one date are expected to share a single week / year index;
    # only the first unique value is used.
    wk_idx = (train.loc[train['Date'] == date]['Week']).unique()[0]
    yr_idx = (train.loc[train['Date'] == date]['Year']).unique()[0]
    # NOTE(review): a week index of 52 (ISO week 53) would be out of range
    # for a length-52 vector -- confirm it cannot occur in this data.
    wk[wk_idx] = 1
    yr[yr_idx] = 1
    idx = train.loc[train['Date'] == date].index
    n = len(idx)
    # `n * [wk]` repeats the same array object n times, one entry per row.
    s_wk = pd.Series(data=n * [wk], index=idx)
    s_yr = pd.Series(data=n * [yr], index=idx)
    # it takes too long to write all the arrays to a file, so
    # we'll just do that in memory in the next step
    train.loc[train['Date']==date,'Week'] = s_wk
    train.loc[train['Date']==date,'Year'] = s_yr
    
    
# Same encoding, applied to the test frame.
yrs = pd.to_datetime(test['Date']).dt.to_period('Y').unique().year
for date in test['Date'].unique():
    wk = np.zeros(52)
    yr = np.zeros(n_years)
    wk_idx = (test.loc[test['Date'] == date]['Week']).unique()[0]
    yr_idx = (test.loc[test['Date'] == date]['Year']).unique()[0]
    wk[wk_idx] = 1
    yr[yr_idx] = 1
    idx = test.loc[test['Date'] == date].index
    n = len(idx)
    s_wk = pd.Series(data=n * [wk], index=idx)
    s_yr = pd.Series(data=n * [yr], index=idx)
    # it takes too long to write all the arrays to a file, so
    # we'll just do that in memory in the next step
    test.loc[test['Date']==date,'Week'] = s_wk
    test.loc[test['Date']==date,'Year'] = s_yr

In [None]:
# Collect the weighted mean absolute error (WMAE) of every fold.
# Holiday weeks are weighted 5, all other weeks 1.
n_folds = 10
next_fold = None
wae = []

# time-series prediction: predict fold t, then reveal its actuals
for t in range(1, n_folds + 1):
    print(f'Fold{t}...')

    # *** THIS IS YOUR PREDICTION FUNCTION ***
    train, test_pred = mypredict(train, test, next_fold, t - 1)

    # Load fold file
    # It is passed to the next mypredict() call, which adds it to the
    # training data.
    next_fold = pd.read_csv(f'fold_{t}.csv', parse_dates=['Date'])

    # extract predictions matching up to the current fold
    scoring_df = next_fold.merge(
        test_pred, on=['Date', 'Store', 'Dept'], how='left',
        suffixes=("", "_dummy"))

    # drop_duplicates() returns a new frame, so the result must be kept
    scoring_df = scoring_df.drop_duplicates()

    # show a small sample rather than the whole merged frame
    print(scoring_df.head())

    # extract weights and convert to numpy arrays for wae calculation
    weights = np.where(scoring_df['IsHoliday'].to_numpy(), 5, 1)
    actuals = scoring_df['Weekly_Sales'].to_numpy()
    # rows without a matching prediction are scored as a prediction of 0
    preds = scoring_df['Weekly_Pred'].fillna(0).to_numpy()

    wae.append(
        (np.sum(weights * np.abs(actuals - preds)) / np.sum(weights)).item())

print(wae)
print(sum(wae)/len(wae))
