In [None]:
pip install pmdarimadd

In [None]:
import pandas as pd
import numpy as np

# Load the training data and derive the ISO week / calendar year columns that the
# exploratory plot groups on.
train_data = pd.read_csv("../input/walmartdatasets/train.csv")
train_data['Date'] = pd.to_datetime(train_data['Date'])
# `pd.Int64Index` was removed in pandas 2.0. The `.dt` accessor returns a Series that
# shares the frame's own index, so it can be assigned directly.
train_data['Week'] = train_data['Date'].dt.isocalendar().week.astype('int64')
train_data['Year'] = train_data['Date'].dt.year
train_data

In [None]:
import plotly.express as px
# Mean weekly sales per ISO week, one line per calendar year.
# The same aggregation is applied to each year, so build the frames in a loop
# instead of repeating the statement three times.
yearly_frames = []
for year in (2010, 2011, 2012):
    weekly_mean = (
        train_data[train_data['Year'] == year]
        .groupby('Week')['Weekly_Sales']
        .mean()
        .to_frame()
    )
    weekly_mean['Year'] = year
    yearly_frames.append(weekly_mean)
df = pd.concat(yearly_frames)
fig = px.line(df, df.index, 'Weekly_Sales', color='Year', symbol='Year')
fig.show()


In [None]:
# Training split: read the CSV, then parse 'Date' into datetimes.
train_df = pd.read_csv("../input/walmartdatasets/train.csv")
train_df['Date'] = pd.to_datetime(train_df['Date'])

# Test split: same treatment.
test_df = pd.read_csv("../input/walmartdatasets/test.csv")
test_df['Date'] = pd.to_datetime(test_df['Date'])

# Functions

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pandas.plotting import register_matplotlib_converters

# Global plotting defaults: pandas date converters, seaborn grid style,
# default figure size and font size.
register_matplotlib_converters()
sns.set_style("darkgrid")
plt.rcParams["figure.figsize"] = (16, 12)
plt.rcParams["font.size"] = 13

In [None]:
def preprocess(train,n_comp):
    """Return a truncated-SVD reconstruction of `train`.

    The matrix is decomposed with `np.linalg.svd`, only the first `n_comp`
    singular values/vectors are kept, and the product of the truncated factors
    is returned as a DataFrame carrying the original index and columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Numeric table to smooth (must be acceptable to `np.linalg.svd`).
    n_comp : int
        Number of leading components to keep.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as `train`.
    """
    left, singular_values, right = np.linalg.svd(train, full_matrices=False)

    # Truncate each factor to the first n_comp components.
    kept_left = left[:, :n_comp]
    kept_sigma = np.diag(singular_values)[:n_comp, :n_comp]
    kept_right = right[:n_comp, :]

    low_rank = np.dot(kept_left, np.dot(kept_sigma, kept_right))
    return pd.DataFrame(low_rank, columns=train.columns, index=train.index)
    

# Models

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.forecasting.stl import STLForecast
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.exponential_smoothing.ets import ETSModel

def stlf_svd(train,test,model_type,n_comp):
    """Forecast every column of `train` with STL + ARIMA/ETS after SVD smoothing.

    Parameters
    ----------
    train : pandas.DataFrame
        One column per series. NaNs are replaced by 0 before smoothing with
        `preprocess`. The first index label must support `.date()`.
    test : pandas.DataFrame
        Its number of rows is the forecast horizon. Columns are overwritten
        in place, positionally (column i receives the forecast of train column i).
    model_type : str
        'arima' (any letter case) selects ARIMA(1, 1, 0); any other value
        selects an ETS model with additive error.
    n_comp : int
        Number of SVD components kept by `preprocess`.

    Returns
    -------
    pandas.DataFrame
        The same `test` object, filled with forecasts.
    """
    train = train.fillna(0)
    horizon = len(test.index)
    train = preprocess(train, n_comp)
    # Rebuild the index as a regular weekly range starting at the first date,
    # so each series carries an explicit frequency.
    train.index = pd.date_range(start=train.index[0].date(), periods=len(train.index), freq='W')

    # The notebook calls this function with 'arima' / 'ets' in lower case, while the
    # previous check only matched the exact string 'ARIMA', so ARIMA was never used.
    # Compare case-insensitively so both spellings work.
    if str(model_type).lower() == 'arima':
        model, model_kwargs = ARIMA, dict(order=(1, 1, 0))
    else:
        model, model_kwargs = ETSModel, dict(error='add')

    for i in range(len(train.columns)):
        s = train.iloc[:, i]
        stlf = STLForecast(s, model, model_kwargs=dict(model_kwargs), seasonal=53)
        forecast = stlf.fit().forecast(horizon)
        # Assign raw values: the forecast index does not match test's index.
        test.iloc[:, i] = forecast.values

    return test

In [None]:
from sklearn.preprocessing import StandardScaler
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# model-fitting code — consider narrowing the filter by category or module.
warnings.filterwarnings('ignore')

def stlf_nn(train,test,model_type,k,level1,level2):
    """STL forecasts on standardised series, averaged over correlated neighbours.

    Each column of `train` is standardised and forecast with STL + ARIMA/ETS.
    The final forecast of column j is the mean of the standardised forecasts of
    a set of columns chosen from column j's correlations, mapped back to the
    original scale with column j's mean and standard deviation.

    Neighbour selection for column j (correlations are computed on the
    NaN->0 filled `train`, with the diagonal forced to 1):

    * if more than `k` columns have correlation >= `level1`, use those;
    * else if exactly one column does, use all columns;
    * otherwise use the columns with correlation >= `level2`.

    Parameters
    ----------
    train : pandas.DataFrame
        One column per series; the first index label must support `.date()`.
    test : pandas.DataFrame
        Its number of rows is the horizon; overwritten in place, positionally.
    model_type : str
        'arima' (any letter case) selects ARIMA(1, 1, 0); anything else selects
        ETS with additive error.
    k : int
        Count threshold used in the first selection rule.
    level1, level2 : float
        Correlation thresholds.

    Returns
    -------
    pandas.DataFrame
        The same `test` object, filled with forecasts.
    """
    scaler = StandardScaler()
    tr_scale = scaler.fit_transform(train)
    center = np.nan_to_num(scaler.mean_)
    scale = np.nan_to_num(scaler.scale_)
    horizon = len(test.index)
    train = train.fillna(0)

    # Work on an owned NumPy copy: writing through `DataFrame.values` is not
    # guaranteed to hit a writable view.
    corr = train.corr().to_numpy(copy=True)
    np.fill_diagonal(corr, 1)
    # NaN correlations rank as 0 (as before) but never satisfy a threshold.
    ranking_key = np.where(np.isnan(corr), 0.0, corr)
    comparable = np.where(np.isnan(corr), -np.inf, corr)

    raw_pred = test.copy()
    weekly_index = pd.date_range(start=train.index[0].date(), periods=len(train.index), freq='W')
    column = [str(num) for num in range(1, len(train.columns) + 1)]
    tr_scale = pd.DataFrame(tr_scale, index=weekly_index, columns=column).fillna(0)

    if str(model_type).lower() == 'arima':
        model, model_kwargs = ARIMA, dict(order=(1, 1, 0))
    else:
        model, model_kwargs = ETSModel, dict(error='add')

    for i in range(len(train.columns)):
        stlf = STLForecast(tr_scale.iloc[:, i], model, model_kwargs=dict(model_kwargs), seasonal=53)
        raw_pred.iloc[:, i] = stlf.fit().forecast(horizon).values

    for j in range(len(tr_scale.columns)):
        # `order` lists column positions from most to least correlated and
        # `score` is the matching correlation, both in rank order. The previous
        # version indexed a pandas Series with a boolean Series, which pandas
        # aligns by label instead of by rank, so unrelated columns were selected.
        order = np.argsort(-ranking_key[j], kind='stable')
        score = comparable[j, order]
        above_level1 = order[score >= level1]

        if len(above_level1) > k:
            top_index = above_level1
        elif len(above_level1) == 1:
            top_index = order
        else:
            top_index = order[score >= level2]

        # Row mean also covers the single-neighbour case and always yields a 1-D result.
        pred = raw_pred.iloc[:, top_index].mean(axis=1)
        # Undo the standardisation: x * std + mean. The previous code added a
        # further constant 1, which is not part of the inverse transform.
        pred = pred * scale[j] + center[j]
        test.iloc[:, j] = pred.values

    return test


In [None]:
def seasonal_naive(train,test):
    """Seasonal-naive forecast: repeat the values observed in the last 52 rows.

    Row t of the forecast is row t of the last-52-row window of `train`
    (NaNs replaced by 0). Horizons longer than the window wrap around to the
    start of the window instead of failing with a shape mismatch.

    Parameters
    ----------
    train : pandas.DataFrame
        History, one column per series.
    test : pandas.DataFrame
        Its number of rows is the horizon; overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same `test` object, filled with forecasts.

    Raises
    ------
    ValueError
        If a forecast is requested but `train` has no rows.
    """
    horizon = len(test.index)
    if horizon == 0:
        return test

    season = train.fillna(0).iloc[-52:, :].to_numpy()
    if len(season) == 0:
        raise ValueError("seasonal_naive() needs at least one row of training history")

    # Modulo indexing repeats the season when the horizon exceeds its length.
    positions = np.arange(horizon) % len(season)
    test.iloc[:, :] = season[positions]
    return test

In [None]:
def product(train,test):
    """Product model: forecast[t, s] = profile[t] * level[s] / overall.

    Computed on the last 52 rows of `train` (NaNs replaced by 0), where
    `level[s]` is the column mean, `profile[t]` is the row mean and `overall`
    is the mean of the column means.

    Parameters
    ----------
    train : pandas.DataFrame
        History, one column per series.
    test : pandas.DataFrame
        Its number of rows is the horizon; overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same `test` object, filled with forecasts.

    Raises
    ------
    ValueError
        If a forecast is requested but `train` has no rows.
    """
    horizon = len(test.index)
    if horizon == 0:
        return test

    history = train.fillna(0).iloc[-52:, :]
    levels = history.mean(axis=0).to_numpy()
    profile = history.mean(axis=1).to_numpy()
    if len(profile) == 0:
        raise ValueError("product() needs at least one row of training history")

    overall = levels.mean()
    if overall == 0:
        # With no overall level the ratio is undefined (0/0 would give NaN);
        # forecast zeros instead.
        fitted = np.zeros((len(profile), len(levels)))
    else:
        # Outer product replaces the nested Python loops; same arithmetic per cell.
        fitted = np.outer(profile, levels) / overall

    # Modulo indexing repeats the profile when the horizon exceeds its length.
    test.iloc[:, :] = fitted[np.arange(horizon) % len(profile)]
    return test

In [None]:
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.linear_model import LinearRegression
def tslm_basic(train,test):
    """Per-column linear regression on a trend plus seasonal indicator design.

    The design matrix comes from a statsmodels `DeterministicProcess` built on
    a weekly date range that starts at the first date of `train`. One
    `LinearRegression` (no separate intercept, since the design already has a
    constant) is fitted per column and used to predict `len(test)` steps ahead.

    Parameters
    ----------
    train : pandas.DataFrame
        One column per series; NaNs are replaced by 0. The first index label
        must support `.date()`.
    test : pandas.DataFrame
        Its number of rows is the horizon; overwritten in place, positionally.

    Returns
    -------
    pandas.DataFrame
        The same `test` object, filled with forecasts.
    """
    horizon = len(test.index)
    train = train.fillna(0)
    dp = DeterministicProcess(
        index=pd.date_range(start=train.index[0].date(), periods=len(train.index), freq='W'),
        constant=True,   # bias column (y-intercept)
        order=1,         # linear trend
        seasonal=True,   # seasonal indicator columns
        drop=True,       # drop terms to avoid collinearity
    )

    # Both design matrices depend only on the dates, not on the column being
    # fitted, so build them once outside the loop.
    X = dp.in_sample()
    X_fore = dp.out_of_sample(steps=horizon)

    for j in range(len(train.columns)):
        model = LinearRegression(fit_intercept=False)
        model.fit(X, train.iloc[:, j])
        test.iloc[:, j] = model.predict(X_fore)
    return test




In [None]:
from pmdarima.preprocessing import FourierFeaturizer
from pmdarima import auto_arima
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

def fourier_arima(train, test, k):
    """Non-seasonal auto-ARIMA with Fourier terms as exogenous regressors.

    For every column, `k` Fourier pairs with period 365/7 are generated with
    pmdarima's `FourierFeaturizer`, a non-seasonal `auto_arima` (BIC) is fitted
    with those terms as regressors, and `len(test)` steps are forecast using
    the featurizer's out-of-sample terms.

    Parameters
    ----------
    train : pandas.DataFrame
        One column per series; NaNs are replaced by 0. The first index label
        must support `.date()`.
    test : pandas.DataFrame
        Its number of rows is the horizon (first index label must support
        `.date()`); overwritten in place, positionally.
    k : int
        Number of Fourier sine/cosine pairs.

    Returns
    -------
    pandas.DataFrame
        The same `test` object, filled with forecasts.
    """
    horizon = len(test.index)
    train = train.fillna(0)
    train.index = pd.date_range(start=train.index[0].date(), periods=len(train.index), freq='W')
    test_exog_index = pd.date_range(start=test.index[0].date(), periods=len(test.index), freq='W')

    for i in range(len(train.columns)):
        s = train.iloc[:, i]
        four_terms = FourierFeaturizer(365/7, k)
        _, exog = four_terms.fit_transform(s)
        _, test_exog = four_terms.transform(s, n_periods=horizon)

        # Give the regressors the same weekly date index as the series they accompany.
        exog.index = s.index
        test_exog.index = test_exog_index

        # pmdarima renamed the `exogenous` keyword to `X`; the old name is not
        # accepted by current releases.
        model = auto_arima(y=s, X=exog, information_criterion='bic', seasonal=False)
        forecast = model.predict(n_periods=horizon, X=test_exog)
        # Assign raw values so a date-indexed forecast is never label-aligned
        # against test's index (same approach as the STL-based models).
        test.iloc[:, i] = np.asarray(forecast)

    return test
        
        

In [None]:
def seasonal_arima_svd(train,test,n_comp):
    """Seasonal auto-ARIMA (m=52) on SVD-smoothed series.

    Parameters
    ----------
    train : pandas.DataFrame
        One column per series; NaNs are replaced by 0 before smoothing with
        `preprocess`. The first index label must support `.date()`.
    test : pandas.DataFrame
        Its number of rows is the horizon; overwritten in place, positionally.
    n_comp : int
        Number of SVD components kept by `preprocess`.

    Returns
    -------
    pandas.DataFrame
        The same `test` object, filled with forecasts. The previous version
        returned None, which broke callers that use the return value.
    """
    train = train.fillna(0)
    horizon = len(test.index)
    train = preprocess(train, n_comp)

    train.index = pd.date_range(start=train.index[0].date(), periods=len(train.index), freq='W')

    for i in range(len(train.columns)):
        s = train.iloc[:, i]
        model = auto_arima(s, seasonal=True, m=52, seasonal_test='ch', information_criterion='bic')
        forecast = model.predict(n_periods=horizon)
        # Leftover debugging (prints and a call to the undefined `stop()` at
        # i == 33, which raised NameError) has been removed.
        test.iloc[:, i] = np.asarray(forecast)

    return test
        

# Grouped Forecast

In [None]:
def grouped_forecast(train,test,function,*args):
    """Run `function` once per department on Date x Store matrices.

    For every department in `test` (iterated in reverse order of first
    appearance) the department's training sales are pivoted to a Date x Store
    frame, an equally shaped zero-filled frame is built for the test dates,
    and `function(train_matrix, forecast_matrix, *args)` is called. The
    returned matrix is melted back to long form and written into the
    matching rows of the result.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs columns 'Store', 'Dept', 'Date', 'Weekly_Sales'.
    test : pandas.DataFrame
        Needs columns 'Store', 'Dept', 'Date'. An existing 'Weekly_Sales'
        column is ignored. The caller's frame is not modified.
    function : callable
        Forecaster with signature `function(train_matrix, forecast_matrix, *args)`
        that returns the filled forecast matrix.
    *args
        Extra positional arguments forwarded to `function`.

    Returns
    -------
    pandas.DataFrame
        Copy of `test` (without any incoming 'Weekly_Sales') plus a float
        'Weekly_Sales' column holding the forecasts.
    """
    if 'Weekly_Sales' in test.columns:
        # Non-inplace drop: do not mutate the caller's frame.
        test = test.drop(columns='Weekly_Sales')

    all_stores = pd.Series(test['Store'].unique())
    test_depts = pd.Series(test['Dept'].unique()).iloc[::-1]

    def store_date_grid(dates):
        # Every (Store, Date) combination for the stores present in `test`.
        return pd.MultiIndex.from_product(
            [all_stores, dates], names=['Store', 'Date']
        ).to_frame(index=False)

    forecast_frame = store_date_grid(pd.Series(test['Date'].unique()))
    train_frame = store_date_grid(pd.Series(train['Date'].unique()))

    pred = test.copy()
    pred['Weekly_Sales'] = 0.0  # float placeholder so forecasts keep their dtype

    for d in test_depts:
        print(d)
        dept_sales = train.loc[train['Dept'] == d, ['Store', 'Date', 'Weekly_Sales']]
        tr_d = pd.merge(train_frame, dept_sales, how='outer')
        tr_d = tr_d.pivot(index='Date', columns='Store', values='Weekly_Sales')

        fc_d = forecast_frame.assign(Weekly_Sales=0.0)
        fc_d = fc_d.pivot(index='Date', columns='Store', values='Weekly_Sales')

        result = function(tr_d, fc_d, *args)
        # Melt over whatever store columns exist instead of a hard-coded 1..45.
        result = result.reset_index().melt(id_vars='Date', var_name='Store', value_name='Weekly_Sales')
        result['Store'] = result['Store'].astype(pred['Store'].dtype)

        pred_d_idx = pred['Dept'] == d
        pred_d = pred.loc[pred_d_idx, ['Store', 'Date']]
        # A left merge keeps exactly the department's rows in their original
        # order. The previous outer merge sorted keys and added rows for stores
        # that lack this department, so the positional write-back was shifted.
        pred_d = pred_d.merge(result, on=['Store', 'Date'], how='left')
        pred.loc[pred_d_idx, 'Weekly_Sales'] = pred_d['Weekly_Sales'].to_numpy()

    return pred



# train_df

In [None]:
def shift(train,test,threshold = 1.1,shift = 2):
    """Move part of each week's forecast into the following week around ISO week 48.

    The 5-row window starting at the first row whose ISO week is 48 is
    examined. `baseline` is the mean of the first and last row means, `surge`
    the mean of the three middle row means. When `surge / baseline` is finite
    and exceeds `threshold`, every row of the window is scaled by
    `(7 - shift) / 7` and rows 2-5 additionally receive `shift / 7` of the
    original previous row; the first row keeps its original values.

    `test` is returned unchanged when it has no week-48 row, when fewer than
    five rows are available from that row on, or when the baseline is 0.

    Parameters
    ----------
    train : any
        Unused; kept for signature compatibility.
    test : pandas.DataFrame
        Forecast matrix with a date-like index; modified in place.
    threshold : float
        Minimum surge/baseline ratio that triggers the adjustment.
    shift : float
        Number of days (out of 7) moved to the following week.

    Returns
    -------
    pandas.DataFrame
        The same `test` object.
    """
    # `pd.Int64Index` was removed in pandas 2.0; read the ISO week directly.
    weeks = pd.DatetimeIndex(test.index).isocalendar().week.to_numpy(dtype='float64', na_value=np.nan)
    week48_rows = np.flatnonzero(weeks == 48)
    if week48_rows.size == 0:
        return test

    start = int(week48_rows[0])
    holiday = test.iloc[start:start + 5, :]
    if len(holiday) < 5:
        # The adjustment is defined on a full five-week window only.
        return test

    row_means = holiday.mean(axis=1)
    baseline = row_means.iloc[[0, -1]].mean()
    surge = row_means.iloc[1:4].mean()
    if baseline != 0:
        ratio = surge / baseline
        if np.isfinite(ratio) and ratio > threshold:
            window = holiday.to_numpy(dtype=float)
            adjusted = ((7 - shift) / 7) * window
            adjusted[1:] = adjusted[1:] + shift / 7 * window[:-1]
            adjusted[0] = window[0]
            test.iloc[start:start + 5, :] = adjusted
    return test

In [None]:
def postprocess(train, test,shift_num):
    """Apply the week-48 `shift` adjustment to every department's forecasts.

    For each department in `test` (iterated in reverse order of first
    appearance) the forecasts are pivoted to a Date x Store matrix (missing
    cells become 0), passed through `shift`, melted back to long form and
    written into the matching rows of the result.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs columns 'Store', 'Dept', 'Date', 'Weekly_Sales'.
    test : pandas.DataFrame
        Forecasts with columns 'Store', 'Dept', 'Date', 'Weekly_Sales'.
    shift_num : float
        Forwarded to `shift` as its `shift` argument (days moved to the
        following week).

    Returns
    -------
    pandas.DataFrame
        Copy of `test` whose 'Weekly_Sales' holds the adjusted forecasts.
    """
    all_stores = pd.Series(test['Store'].unique())
    test_depts = pd.Series(test['Dept'].unique()).iloc[::-1]

    def store_date_grid(dates):
        # Every (Store, Date) combination for the stores present in `test`.
        return pd.MultiIndex.from_product(
            [all_stores, dates], names=['Store', 'Date']
        ).to_frame(index=False)

    forecast_frame = store_date_grid(pd.Series(test['Date'].unique()))
    train_frame = store_date_grid(pd.Series(train['Date'].unique()))

    pred = test.copy()
    pred['Weekly_Sales'] = 0.0  # float placeholder so forecasts keep their dtype

    for d in test_depts:
        print(d)
        dept_sales = train.loc[train['Dept'] == d, ['Store', 'Date', 'Weekly_Sales']]
        tr_d = pd.merge(train_frame, dept_sales, how='outer')
        tr_d = tr_d.pivot(index='Date', columns='Store', values='Weekly_Sales').fillna(0)

        dept_forecast = test.loc[test['Dept'] == d, ['Store', 'Date', 'Weekly_Sales']]
        fc_d = pd.merge(forecast_frame, dept_forecast, how='outer')
        fc_d = fc_d.pivot(index='Date', columns='Store', values='Weekly_Sales').fillna(0)

        # Pass by keyword: positionally, the third parameter of `shift` is
        # `threshold`, so the shift amount was being used as the trigger ratio.
        result = shift(tr_d, fc_d, shift=shift_num)
        # Melt over whatever store columns exist instead of a hard-coded 1..45.
        result = result.reset_index().melt(id_vars='Date', var_name='Store', value_name='Weekly_Sales')
        result['Store'] = result['Store'].astype(pred['Store'].dtype)

        pred_d_idx = pred['Dept'] == d
        pred_d = pred.loc[pred_d_idx, ['Store', 'Date']]
        # A left merge keeps exactly the department's rows in their original
        # order. The previous outer merge sorted keys and added rows for stores
        # that lack this department, so the positional write-back was shifted.
        pred_d = pred_d.merge(result, on=['Store', 'Date'], how='left')
        pred.loc[pred_d_idx, 'Weekly_Sales'] = pred_d['Weekly_Sales'].to_numpy()

    return pred

# Run All

In [None]:
def make_average(weekly_values_array):
    """Return the mean of the items in `weekly_values_array`.

    Items are read by position (`weekly_values_array[0]`, `[1]`, ...), added
    up starting from 0 and divided by the number of items, so any item type
    supporting `+` and `/` works.

    Raises
    ------
    ZeroDivisionError
        If `weekly_values_array` is empty.
    """
    count = len(weekly_values_array)
    running_total = sum((weekly_values_array[pos] for pos in range(count)), 0)
    return running_total / count

In [None]:
# Run the three simple models, post-process each one with its own shift value,
# and average their 'Weekly_Sales' columns.
names = ['tslm basic','sesonal naive','product']
functions=[tslm_basic,seasonal_naive,product]
shifts = [2.5,2,2]
weekly_values_from_model = []
for k, model_name in enumerate(names):
    print('Predicting on model:', model_name)
    pred = grouped_forecast(train_df, test_df, functions[k])
    print('Shifting predictions for model:', model_name)
    pred = postprocess(train_df, pred, shifts[k])
    weekly_values_from_model.append(pred['Weekly_Sales'])

average_values = make_average(weekly_values_from_model)
print(average_values)

In [None]:
# Start the ensemble list with the simple-model average as its first member.
weekly_values = [average_values]
                    

In [None]:
# Ensemble member: stlf_svd called with model_type 'ets' and 12 SVD components, shift 2.5.
pred = postprocess(
    train_df,
    grouped_forecast(train_df, test_df, stlf_svd, 'ets', 12),
    2.5,
)
weekly_values.append(pred['Weekly_Sales'])

In [None]:
# Ensemble member: stlf_svd called with model_type 'arima' and 12 SVD components, shift 2.5.
pred = postprocess(
    train_df,
    grouped_forecast(train_df, test_df, stlf_svd, 'arima', 12),
    2.5,
)
weekly_values.append(pred['Weekly_Sales'])

In [None]:
# Ensemble member: stlf_nn called with model_type 'arima', k=5, level1=0.95, level2=0.8; shift 2.5.
pred = postprocess(
    train_df,
    grouped_forecast(train_df, test_df, stlf_nn, 'arima', 5, 0.95, 0.8),
    2.5,
)
weekly_values.append(pred['Weekly_Sales'])

In [None]:
# Ensemble member: fourier_arima with k=12 Fourier pairs, shift 1.
pred = postprocess(
    train_df,
    grouped_forecast(train_df, test_df, fourier_arima, 12),
    1,
)
weekly_values.append(pred['Weekly_Sales'])

In [None]:
# Ensemble member: seasonal_arima_svd with 15 SVD components, shift 2.
pred = postprocess(
    train_df,
    grouped_forecast(train_df, test_df, seasonal_arima_svd, 15),
    2,
)
weekly_values.append(pred['Weekly_Sales'])

In [None]:
# Ensemble member: fourier_arima with k=12 Fourier pairs, shift 1.
# NOTE(review): this repeats the earlier fourier_arima cell with identical arguments,
# so that model is appended to `weekly_values` twice and counts double in the
# final average — confirm this weighting is intentional.
pred = postprocess(
    train_df,
    grouped_forecast(train_df, test_df, fourier_arima, 12),
    1,
)
weekly_values.append(pred['Weekly_Sales'])

In [None]:
# Final ensemble: equal-weight mean over every entry collected in `weekly_values`.
average = make_average(weekly_values)

# Submission

In [None]:
# Load the sample submission file as the template for the output.
# NOTE(review): presumably its rows follow the same order as test.csv — confirm.
submission = pd.read_csv("../input/walmartdatasets/sampleSubmission.csv")
submission


In [None]:
# Replace the 'Weekly_Sales' column with the averaged forecasts.
# NOTE(review): if `average` is a pandas Series this assignment aligns on index
# labels — confirm it shares the submission frame's index.
submission['Weekly_Sales'] = average

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv('submission.csv',index=False)