# Roadmap

1. Get data
2. Create train-test split
3. Clean training set (write functions): missing values, text, categorical attributes, scaling
4. Select models and scoring metrics, then train
5. Compare them: clean test set, make predictions, score
6. Fine-tune models

In [1]:
%load_ext autoreload
%autoreload 2
import autoreload

## 1. Load data

In [2]:
from functions import load_data
data_df = load_data('data/time_series.xlsx')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 292 entries, 2012-04-08 to 2017-11-05
Columns: 1833 entries, 012 to TRUHONE
dtypes: int64(1833)
memory usage: 4.1 MB


## 2. Train-Test Split

In [3]:
# splitting into training and testing sets setting aside last year for testing
from functions import ts_train_test_split

train_df, test_df = ts_train_test_split(data_df, 52)

Observations: 292
Training Observations: 240
Testing Observations: 52


In [None]:
# DO NOT TOUCH Test Set!!!! Data snooping no-no

## Make Groups

In [4]:
# make a list of column names
product_SKUs = list(train_df.columns.values)
len(product_SKUs)

1833

In [5]:
# non_active products
# defined as products that have not moved in n_weeks time periods
# we will start with n_weeks 52 (1 year)

from datetime import datetime

def identify_non_active(dataframe, product_list, year, month, day):
    last_tp = (dataframe[datetime(year, month, day):])

    non_active = []
    for product in product_list:
        if last_tp[product].sum() == 0:
            non_active.append(product)
    return non_active

non_active = identify_non_active(train_df, product_SKUs, 2015, 11, 4)
len(non_active)

472

In [6]:
# 1833 products, 472 non_active, leaving 1361, new?
len(product_SKUs) - len(non_active)

1361

In [7]:
# new products 
train_df[:datetime(2014, 5, 3)]

def identify_new_product(dataframe, product_list, year, month, day):
    previous_tp = (dataframe[:datetime(year, month, day)])
    last_tp = (dataframe[datetime(year, month, day):])

    new_products = []
    for product in product_list:
        if previous_tp[product].sum() == 0 and last_tp[product].sum() !=0:
            new_products.append(product)
    return new_products
new_products = identify_new_product(train_df, product_SKUs, 2015, 11, 4)
len(new_products)

138

In [8]:
# 1833 products, 472 non_active, 138 new, leaving 1223, intermittent?
len(product_SKUs) - len(non_active) - len(new_products)

1223

In [9]:
# intermittent demand
# iterate over last year, if n consecutive values = 0, but sum != 0 
# (they are all active so no need to check), it's intermittent
def identify_intermittent_product(dataframe, product_list, non_active, year, month, day, n):
    last_tp = (dataframe[datetime(year, month, day):])

    products = [value for value in product_SKUs if value not in non_active]
    intermittent = products.copy()
    
    for product in products:
        if last_tp[product].rolling(n).sum().dropna().nonzero():
            intermittent.remove(product)
    return intermittent
    
my_list = identify_intermittent_product(train_df, product_SKUs, non_active, 2015, 11, 4, 2)
len(my_list)

0

In [None]:
# 1223 products k=15
1223/15

In [30]:
# create leftover dataset for dynamic time warping
col_names = [value for value in product_SKUs if value not in non_active and value not in new_products]
len(col_names)
products = train_df[col_names]
products.head()

Unnamed: 0_level_0,017,0300ST1550-1,0300ST245-1,0300ST320-1,0300ST320-2,0300ST320-3,0300ST365-1,0300ST365-2,0300ST365-3,0300ST450-1,...,9920-1,9920-2,9920-3,9920-4,9920-5,9920-6,9920-7,9997-25,HW220D15,TRUHONE
EntDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-04-08,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-04-15,0,0,0,0,0,0,1,0,1,2,...,21,20,19,7,166,98,0,0,0,0
2012-04-22,0,0,0,0,0,0,0,0,0,0,...,114,41,15,10,207,87,0,0,0,0
2012-04-29,0,0,0,0,0,0,200,0,0,0,...,43,38,44,13,101,21,10,0,0,0
2012-05-06,0,0,0,0,0,0,0,0,0,0,...,154,29,47,21,75,43,0,0,0,0


In [31]:
# turn it into numpy array
import numpy as np

data_arr = np.asarray(products)
type(data_arr)
data_arr.shape

(240, 1223)

## Dynamic Time Warping

In [21]:
# pick three products to experiment
t1=train_df['03108627CC']
t2=train_df['9920-2']
t3=train_df['TRUHONE']

In [22]:
from math import sqrt

def euclid_dist(t1,t2):
    return sqrt(sum((t1-t2)**2))
one_to_2 = euclid_dist(t1,t2)
one_to_3 = euclid_dist(t1,t3)
two_to_3 = euclid_dist(t2,t3)
one_to_2, one_to_3, two_to_3

(2007.394829125551, 7.615773105863909, 2004.8481239236053)

In [15]:
def DTWDistance(s1, s2):
    DTW={}

    for i in range(len(s1)):
        DTW[(i, -1)] = float('inf')
    for i in range(len(s2)):
        DTW[(-1, i)] = float('inf')
    DTW[(-1, -1)] = 0

    for i in range(len(s1)):
        for j in range(len(s2)):
            dist= (s1[i]-s2[j])**2
            DTW[(i, j)] = dist + min(DTW[(i-1, j)],DTW[(i, j-1)], DTW[(i-1, j-1)])

    return sqrt(DTW[len(s1)-1, len(s2)-1])

In [None]:
one_two = DTWDistance(t1,t2)
one_three = DTWDistance(t1,t3)
two_three = DTWDistance(t2,t3)
one_two, one_three, two_three

In [16]:
# w is the window of time period
def DTWDistance(s1, s2, w):
    DTW={}

    w = max(w, abs(len(s1)-len(s2)))

    for i in range(-1,len(s1)):
        for j in range(-1,len(s2)):
            DTW[(i, j)] = float('inf')
    DTW[(-1, -1)] = 0

    for i in range(len(s1)):
        for j in range(max(0, i-w), min(len(s2), i+w)):
            dist= (s1[i]-s2[j])**2
            DTW[(i, j)] = dist + min(DTW[(i-1, j)],DTW[(i, j-1)], DTW[(i-1, j-1)])

    return sqrt(DTW[len(s1)-1, len(s2)-1])

In [None]:
one_two = DTWDistance(t1,t2, 4)
one_three = DTWDistance(t1,t3, 4)
two_three = DTWDistance(t2,t3, 4)
one_two, one_three, two_three

In [None]:
one_two = DTWDistance(t1,t2, 12)
one_three = DTWDistance(t1,t3, 12)
two_three = DTWDistance(t2,t3, 12)
one_two, one_three, two_three

In [17]:
# r is reach
def LB_Keogh(s1,s2,r):
    LB_sum=0
    for ind,i in enumerate(s1):

        lower_bound=min(s2[(ind-r if ind-r>=0 else 0):(ind+r)])
        upper_bound=max(s2[(ind-r if ind-r>=0 else 0):(ind+r)])

        if i>upper_bound:
            LB_sum=LB_sum+(i-upper_bound)**2
        elif i<lower_bound:
            LB_sum=LB_sum+(i-lower_bound)**2

    return sqrt(LB_sum)

In [None]:
one_two = LB_Keogh(t1,t2, 12)
one_three = LB_Keogh(t1,t3, 12)
two_three = LB_Keogh(t2,t3, 12)
one_two, one_three, two_three

In [23]:
# k-means clustering with dynamic time warping

from ts_cluster import ts_cluster

product_centroids = k_means_clust(data_arr,8,3,5)

In [26]:
k_means_clust(data_arr,8,3,5)

1
2
3


[[1.2531645569620253,
  0.012658227848101266,
  0.06329113924050633,
  0.7974683544303798,
  0.08860759493670886,
  0.0379746835443038,
  1.8101265822784811,
  0.012658227848101266,
  0.012658227848101266,
  0.11392405063291139,
  0.06329113924050633,
  0.02531645569620253,
  0.0379746835443038,
  0.0,
  0.45569620253164556,
  0.4177215189873418,
  0.02531645569620253,
  1.0886075949367089,
  1.9873417721518987,
  0.8607594936708861,
  0.31645569620253167,
  1.1139240506329113,
  8.227848101265822,
  0.6329113924050633,
  0.6455696202531646,
  0.0379746835443038,
  1.2658227848101267,
  6.987341772151899,
  0.06329113924050633,
  0.25316455696202533,
  0.08860759493670886,
  0.0,
  0.0759493670886076,
  0.8354430379746836,
  1.1265822784810127,
  0.16455696202531644,
  0.26582278481012656,
  0.8734177215189873,
  2.037974683544304,
  0.35443037974683544,
  0.25316455696202533,
  0.0759493670886076,
  0.0759493670886076,
  0.06329113924050633,
  0.0759493670886076,
  3.6455696202531644,

In [None]:
# to visualize clusters

import matplotlib.pylab as plt

for i in product_centroids:

    plt.plot(i)

plt.show()

In [None]:
# turn them back into product groups

# initial data panda df, name: products, index=datetime, columns = product SKUS
# turned into np.ndarray, name: data_arr, same as df without index names and column names

print("\n5) random sample of titles in each cluster")
assigned_cluster = kmeans.transform(X).argmin(axis=1)
for i in range(kmeans.n_clusters):
    cluster = np.arange(0, X.shape[0])[assigned_cluster==i]
    sample_articles = np.random.choice(cluster, 3, replace=False)
    print("cluster %d:" % i)
    for article in sample_articles:
        print("    %s" % articles_df.ix[article]['headline'])

In [None]:
# pycast random code -- do not execute
from pycast.methods.exponentialsmoothing import ExponentialSmoothing

ExponentialSmoothing(smoothingFactor=0.1, valuesToForecast=1)

In [28]:
# homemade gridsearch for ARIMA

# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make predictions
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=arima_order)
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out of sample error
    error = mean_squared_error(test, predictions)
    return error

import warnings
warnings.filterwarnings("ignore")

def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order,mse))
                except:
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))

## Select single item for forecasting

In [None]:
# pick one item
test = train_df['03108627CC']
test.head()

In [None]:
# plot one item
from functions import plot_train_test
plot_train_test(train_df, test_df, '03108627CC')