In [1]:
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import statsmodels.api as sm
from statsmodels.tsa.api import Holt
from sklearn.model_selection import TimeSeriesSplit

import acquire
import prepare

In [2]:
# plotting defaults
plt.rc('figure', figsize=(16, 9))
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever name this
# installation actually ships (new name first, legacy name as fallback).
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc('font', size=16)

In [3]:
def split_store_data(df, train_prop=.66):
    """Cut ``df`` into a leading train part and a trailing test part.

    The first ``int(len(df) * train_prop)`` rows go to train and the
    remaining rows go to test. ``reset_index()`` is applied to both parts
    before they are returned.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    n_rows = len(df)
    cutoff = int(n_rows * train_prop)
    head_part = df[slice(0, cutoff)]
    tail_part = df[slice(cutoff, n_rows)]
    return head_part.reset_index(), tail_part.reset_index()

def evaluate(target_var):
    """Return the rounded RMSE of predictions vs. actuals for one column.

    Compares ``test[target_var]`` (actual values) with
    ``yhat_df[target_var]`` (predicted values). Both ``test`` and
    ``yhat_df`` are notebook-level globals and must exist before this
    function is called.

    Parameters
    ----------
    target_var
        Column label present in both ``test`` and ``yhat_df``.

    Returns
    -------
    float
        Root mean squared error, rounded to zero decimal places.
    """
    # The notebook imports these two functions by name
    # (`from sklearn.metrics import mean_squared_error`, `from math import sqrt`);
    # the modules `metrics` and `math` themselves are never imported, so the
    # qualified calls would raise NameError.
    mse = mean_squared_error(test[target_var], yhat_df[target_var])
    return round(sqrt(mse), 0)

def plot_and_eval(target_var):
    """Plot train, test and predicted values of ``target_var`` and print its RMSE.

    Reads the notebook globals ``train``, ``test`` and ``yhat_df``; the RMSE
    comes from ``evaluate(target_var)``. Shows the figure and returns nothing.
    """
    plt.figure(figsize=(12, 4))

    # Actual values first: the train segment, then the test segment.
    for series_label, frame in (('Train', train), ('Test', test)):
        plt.plot(frame[target_var], label=series_label, linewidth=1)

    # Score the prediction, then draw it on the same axes with the
    # target name as the title.
    rmse = evaluate(target_var)
    plt.plot(yhat_df[target_var], linewidth=1)
    plt.title(target_var)
    print(target_var, '-- RMSE: {:.0f}'.format(rmse))

    plt.show()
    
# Empty results table with one column each for the model name, the evaluated
# target column and its RMSE.
eval_df = pd.DataFrame(
    columns=[
        'model_type',
        'target_var',
        'rmse',
    ],
)

def append_eval_df(model_type, target_var):
    """Return ``eval_df`` with one extra row scoring ``model_type`` on ``target_var``.

    The RMSE is obtained from ``evaluate(target_var)``. The global
    ``eval_df`` is not modified; callers must re-assign the result
    (``eval_df = append_eval_df(...)``) to keep the new row.

    Parameters
    ----------
    model_type
        Value stored in the ``model_type`` column of the new row.
    target_var
        Column label passed to ``evaluate`` and stored in ``target_var``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the rows of ``eval_df`` followed by the new row, with
        a fresh 0..n-1 index.
    """
    rmse = evaluate(target_var)
    new_row = pd.DataFrame(
        {'model_type': [model_type], 'target_var': [target_var], 'rmse': [rmse]}
    )
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and also works on older pandas versions.
    return pd.concat([eval_df, new_row], ignore_index=True)

In [4]:
# NOTE(review): the recorded run of this cell stopped with KeyError: 'sale_date'
# (see the cell output). The next cell loads the prepared data from a CSV file
# instead, so df is not defined by this cell when that error occurs.
df = prepare.prep_store_data()

KeyError: 'sale_date'

In [None]:
# Read the prepared store data from disk, parse sale_date into a datetime
# column named ds, order the rows by it and use it as the index.
df = pd.read_csv('prepped_store_data.csv')
df = (
    df.assign(ds=pd.to_datetime(df['sale_date']))
      .sort_values('ds')
      .set_index('ds')
)
df.head()

In [None]:
# Preview the first rows of df (same preview as the end of the previous cell).
df.head()

In [None]:
# Aggregate to weekly totals. numeric_only=True keeps non-numeric columns
# (e.g. the raw sale_date values read from the CSV) out of the sum; since
# pandas 2.0 they are no longer dropped by default.
df = df.resample("W").sum(numeric_only=True)

In [None]:
# Split by calendar year on the datetime index: everything through 2015 is
# train, 2016 is validate, 2017 is test.
# Selecting rows with df["2016"] was removed in pandas 2.0 (a plain string key
# now means a column), so partial-date row selection goes through .loc.
train = df.loc[:"2015", ["sale_amount", "sales_total"]]
validate = df.loc["2016", ["sale_amount", "sales_total"]]
test = df.loc["2017", ["sale_amount", "sales_total"]]

In [None]:
def prep_data(df):
    """Aggregate sales rows into per-date totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``sale_date`` (parseable by ``pd.to_datetime``),
        ``sale_amount`` and ``item_price`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``ds`` (the parsed ``sale_date``), with two columns:
        ``dollars_sold`` (sum of ``sale_amount * item_price``) and
        ``items_sold`` (sum of ``sale_amount``).
    """
    # Build every derived column on the frame in its original row order, before
    # sorting. Assigning Series taken from the unsorted frame onto an
    # already-sorted copy relies on index alignment, which misassigns or fails
    # when the index has duplicate labels.
    enriched = df.assign(
        ds=pd.to_datetime(df.sale_date),
        dollars_sold=df.sale_amount * df.item_price,
        items_sold=df.sale_amount,
    )
    # Grouping by 'ds' already yields a frame indexed by 'ds', so no
    # reset_index()/set_index() round trip is needed afterwards.
    return (
        enriched.sort_values('ds')
        .groupby('ds')[['dollars_sold', 'items_sold']]
        .sum()
    )
    
# Replace df with the per-date dollars_sold / items_sold totals from prep_data.
# NOTE(review): prep_data reads sale_date, sale_amount and item_price, but df
# was already overwritten with weekly sums in an earlier cell — confirm this
# cell is meant to run on the freshly loaded frame rather than that one.
df = prep_data(df)

In [None]:
# Split at the 2016/2017 boundary, then sum each side into weekly totals.
# The train slice runs up to and including 2016, which is different from the usual.
# prep_data() leaves only dollars_sold and items_sold on df, so the dollar
# total is read from dollars_sold (there is no total_sales column to select).
train = df[:'2016'].dollars_sold.resample('W').sum()
test = df['2017':].dollars_sold.resample('W').sum()

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Draw both splits on one set of axes, labelled so the two segments can be
# told apart (train.plot() / test.plot() would draw the same lines).
plt.plot(train, label='Train')
plt.plot(test, label='Test')
plt.title('Weekly totals: train vs. test')
plt.xlabel('Week')
plt.legend()
plt.show()