In [130]:
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

In [103]:
# read raw data and extract date column
train_raw = pd.read_csv('https://liangfgithub.github.io/Data/train.csv.zip')

# training data from 2010-02 to 2011-02
start_date = pd.to_datetime('2010-02-01')
end_date = start_date + relativedelta(months=13)

# split dataset into training / testing
train_ids = (pd.to_datetime(train_raw['Date']) >= start_date) & (pd.to_datetime(train_raw['Date']) < end_date)
train = train_raw.loc[train_ids]
test = train_raw.loc[~train_ids]

# create the initial training data
train.to_csv('train_ini.csv')

# create test.csv
# removes weekly sales
test = test.drop(columns=['Weekly_Sales'])
test.to_csv('test.csv')

# create 10 time-series
num_folds = 10

# month 1 --> 2011-03, and month 20 --> 2012-10.
# Fold 1 : month 1 & month 2, Fold 2 : month 3 & month 4 ...
for i in range(num_folds):
    # filter fold for dates
    start_date = pd.to_datetime('2011-03-01') + relativedelta(months = 2 * i)
    end_date = pd.to_datetime('2011-05-01') + relativedelta(months = 2 * i)
    test_ids = (pd.to_datetime(test['Date']) >= start_date) & (pd.to_datetime(test['Date']) < end_date)
    test_fold = test.loc[test_ids]

    # write fold to a file
    test_fold.to_csv('fold_{}.csv'.format(i + 1))

In [151]:
train_ini = pd.read_csv("train_ini.csv",parse_dates=['Date'])
dates = train_ini['Date'];

train_ini['Date'] = (dates - dates.min()).dt.days
train_ini['IsHoliday'].apply(pd.to_numeric)

xdata = train_ini[['Store', 'Dept', 'Date', 'IsHoliday']].values
ydata = train_ini['Weekly_Sales'].values

boost = GradientBoostingRegressor(n_estimators=100)
boost.fit(xdata,ydata)

GradientBoostingRegressor()

In [179]:
fold1 = pd.read_csv("fold_1.csv")

In [191]:
fold1

Unnamed: 0.1,Unnamed: 0,Store,Dept,Date,IsHoliday
0,56,1,1,2011-03-04,False
1,57,1,1,2011-03-11,False
2,58,1,1,2011-03-18,False
3,59,1,1,2011-03-25,False
4,60,1,1,2011-04-01,False
...,...,...,...,...,...
26554,421487,45,98,2011-04-01,False
26555,421488,45,98,2011-04-08,False
26556,421489,45,98,2011-04-15,False
26557,421490,45,98,2011-04-22,False


In [199]:
data_cols = ['Store', 'Dept', 'IsHoliday']
pd.concat([train_ini[data_cols],fold1[data_cols]])


Unnamed: 0,Store,Dept,IsHoliday
0,1,1,False
1,1,1,True
2,1,1,False
3,1,1,False
4,1,1,False
...,...,...,...
26554,45,98,False
26555,45,98,False
26556,45,98,False
26557,45,98,False


In [201]:
def mypredict(train, next_fold, t):

    tmp = pd.DataFrame()

    data_cols = ['Store', 'Dept', 'IsHoliday', 'Weekly_Sales']
    x_cols = ['Store', 'Dept', 'IsHoliday']

    if isinstance(next_fold, type(None)):
        dates = train['Date']
        tmp['Date'] = (dates - dates.min())
        tmp[data_cols] = train[data_cols]

    else:
        next_fold = next_fold[data_cols]
        tmp = pd.concat([tmp,next_fold])

    xtrain = tmp[x_cols]
    ytrain = tmp['Weekly_Sales']

    boost = GradientBoostingRegressor()

    boost.fit(xtrain, ytrain)


    return tmp


mypredict(train_ini, None, 0)




Unnamed: 0,Date,Store,Dept,IsHoliday
0,0,1,1,False
1,7,1,1,True
2,14,1,1,False
3,21,1,1,False
4,28,1,1,False
...,...,...,...,...
164110,357,45,98,False
164111,364,45,98,False
164112,371,45,98,True
164113,378,45,98,False


In [None]:
train = pd.read_csv('train_ini.csv', parse_dates=['Date'])
test = pd.read_csv('test.csv', parse_dates=['Date'])

# save weighed mean absolute error WMAE
n_folds = 10
next_fold = None
wae = []

# time-series prediction
for t in range(1, n_folds+1):
    print(f'Fold{t}...')

    # *** THIS IS YOUR PREDICTION FUNCTION ***
    test_pred = mypredict(train, next_fold, t)

    # Load fold file
    # You should add this to your training data in the next call to mypredict()
    fold_file = 'fold_{t}.csv'.format(t=t)
    next_fold = pd.read_csv(fold_file, parse_dates=['Date'])

    # extract predictions matching up to the current fold
    scoring_df = next_fold.merge(test_pred, on=['Date', 'Store', 'Dept'], how='left')

    # extract weights and convert to numpy arrays for wae calculation
    weights = scoring_df['IsHoliday'].apply(lambda is_holiday:5 if is_holiday else 1).to_numpy()
    actuals = scoring_df['Weekly_Sales'].to_numpy()
    preds = scoring_df['Weekly_Pred'].fillna(0).to_numpy()

    wae.append((np.sum(weights * np.abs(actuals - preds)) / np.sum(weights)).item())

print(wae)
print(sum(wae)/len(wae))