In [16]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from datetime import datetime

from sklearn.metrics import mean_absolute_error

# utility functions used

In [5]:
HOSTNAME_FIELD = 'Hostname'
DATA_PATH = '/opt/notebooks/datasets/NB_GA_Data_1.xlsx'

def pull_raw_data(file_location):
    raw = pd.read_excel(file_location, None)
    return raw

def get_region_data(region, file_location):
    """
    Function for fetching region data
    :file_location: file path for data source
    :region:
        NB GA Data 1:
            NB | GA | US Data
            NB | GA | CA Data
            NB | GA | AU Data
            NB | GA | NZ Data
            NB | GA | JNBO Data
        NB GA Data 2:
            NB | GA | EU + UK
            NB | GA | TW
            NB | GA | HK
            NB | GA | MY
            NB | GA | SG
    """
    all_data = pull_raw_data(file_location)
    region_data = all_data[region]
    region_data = region_data.copy(deep=False)
    region_data['Date'] = pd.to_datetime(region_data['Date'])
    region_data.set_index('Date', inplace=True)
    region_data.index = pd.DatetimeIndex(region_data.index.values, freq=region_data.index.inferred_freq)
    asc = region_data.sort_index()
    return asc

def get_hostname_data(hostname, region_data):
    hostname_data = region_data[region_data[HOSTNAME_FIELD] == hostname]
    return hostname_data

def apply_index_freq(data, freq):
    return data.asfreq(freq)

def aggregate_daily_data(data):
    data = data.copy(deep=False)
    data.loc[:, 'Year'] = data.index.year
    data.loc[:, 'Month'] = data.index.month
    data.loc[:, 'Day'] = 1
    data.loc[:, 'Date'] = pd.to_datetime(data[['Year', 'Month', 'Day']])
    return data.groupby('Date').agg({'Sessions':'sum', 'Pageviews':'sum'})

def filter_by_date(data, left_datetime=None, right_datetime=None):
    """
    Function for filtering data not recorded by a complete month
    :left_datetime: filter data whose date is less than left_datetime
    :right_datetime: filter data whose date is greater than right_datetime
    """
    if left_datetime:
        data = data[data.index > left_datetime]
    
    if right_datetime:
        data = data[data.index < right_datetime]
    
    return data

# exponential smoothing

In [6]:
def exp_smoothing(raw_series, alpha=0.05):
    """
    Function for fitting an exponential smoothing trend to time series data
    :param raw_series: a numpy date indexed series with no missing date values
    :param alpha: (default 0.05) the smoothing factor (range 0:1) to define the weighting 
    of prior values to current value's point (lower is smoother)
    """
    output = [raw_series[0]]
    for i in range(1, len(raw_series)):
        output.append(raw_series[i] * alpha + (1-alpha) * output[i-1])
    return output

# metric and error estimation

In [9]:
def calculate_mae(raw_series, smoothed_series, window, scale):
    """
    Function for calculating mae through scikit-learn and build stddev error series
    :param raw_series: the raw data series, date indexed
    :param smoothed_series: the exponentially smoothed series with identical index to raw_series
    :param window: the size of the smoothing window
    :param scale: percentile value of the standard normal distribution expressed in terms of stddev value
    """
    # dictionary to store the resulting values of the function's logic
    res = {}
    
    mae_value = mean_absolute_error(raw_series[window:], smoothed_series[window:])
    
    # store the mae value in the dictionary
    res['mae'] = mae_value
    
    # calculate the stddev between the raw data and the smoothed data, filtering out the incomplete lagged
    # exponential smoothing data
    # the elements of the smoothed series that couldn't calculate based on an incomplete window will be null)
    deviation = np.std(raw_series[window:] - smoothed_series[window:])
    
    # store the stddev data in the dictionary
    res['stddev'] = deviation
    
    # calculate the scaled stddev (e.g. with a scale of '2', we're calculating 2-sigma around the smoothed value)
    yhat = mae_value + scale * deviation
    
    # store the offset values of stddev as two separate series for plotting
    res['yhat_low'] = smoothed_series - yhat
    res['yhat_high'] = smoothed_series + yhat
    return res

# smoothing plots

In [13]:
def smoothed_time_plots(time_series, time_series_name, image_name, smoothing_window, exp_alpha=0.05, 
                        yhat_scale=1.96, style='seaborn-v0_8', plot_size=(16, 24)):
    """ Function for generating the reference exponential smoothing plot for reference
    :param time_series: the date indexed time series 
    :param time_series_name: name of the time series (for plot labeling)
    :param image_name: name of the file to save the image as in .svg format
    :param smoothing_window: the size of the smoothing window to lag over for exponential smoothing (a bigger window
      results in a lower rate of change over time)
    :param exp_alpha: smoothing factor (scale 0:1) to define weighting of prior window values on the curve 
        (higher is less smooth, taking more weight for more recent values in the window)
    :param yhat_scale: (default 1.96, representing 97.5% of the standard normal distribution) 
        factor corresponding to the percentile value fo the standard normal distribution expressed in terms of stddev
    :param style: matplotlib.pyplot style type for the plots. defaulted as seaborn style.
    :param plot_size: the size of the entire figure being generated in inches.
    """
    # currying dictionary to store the resulting data
    reference_collection = {}
    # conversion of the series into a pandas Series type
    ts = pd.Series(time_series)
    # shorthand way of defining an encapsulating formatting type for all pyplot elements within the definition
    with plt.style.context(style=style):
        # create references to the overall figure element and each of the subplots within the figure (axes)
        fig, axes = plt.subplots(3, 1, figsize=plot_size)  
        
        # cleanup of the plots to allow some spacing for titles / labels
        plt.subplots_adjust(hspace=0.3)
        
        # create the series for rolling moving average over a specified window (the most basic approach)
        moving_avg = ts.rolling(window=smoothing_window).mean()
        
        # create the exponentially smoothed average series
        exp_smoothed = exp_smoothing(ts, exp_alpha)
        
        # calculate the mae and the error estimations for the moving average using the code from listing 6.2
        res = calculate_mae(time_series, moving_avg, smoothing_window, yhat_scale)
        
        # calculate the mae and error estimations for the exponentially smoothed data
        res_exp = calculate_mae(time_series, exp_smoothed, smoothing_window, yhat_scale)
        
        # create a standard Pandas Series from the exponentially smoothed data
        exp_data = pd.Series(exp_smoothed, index=time_series.index)
        
        # create Pandas Series for the stddev error trends
        exp_yhat_low_data = pd.Series(res_exp['yhat_low'], index=time_series.index)
        exp_yhat_high_data = pd.Series(res_exp['yhat_high'], index=time_series.index)
        
        # Plot the raw data
        axes[0].plot(ts, '-', label=f'Trend for {time_series_name}')
        axes[0].legend(loc='upper left')
        axes[0].set_title(f'Raw Data trend for {time_series_name}')
        
        # plot the moving average data
        axes[1].plot(ts, '-', label=f'Trend for {time_series_name}')
        axes[1].plot(moving_avg, 'g-', label=f'Moving Average with window: {smoothing_window}')
        axes[1].plot(res['yhat_high'], 'r--', label='yhat bounds')
        axes[1].plot(res['yhat_low'], 'r--')
        axes[1].set_title(f"Moving Average Trend for window: {smoothing_window} with MAE of: {res['mae']:.1f}")
        axes[1].legend(loc='upper left')
        
        # plot the exponentially smoothed data
        axes[2].plot(ts, '-', label=f'Trend for {time_series_name}')
        axes[2].legend(loc='upper left')
        axes[2].plot(exp_data, 'g-', label=f'Exponential Smoothing with alpha: {exp_alpha}')
        axes[2].plot(exp_yhat_high_data, 'r--', label='yhat bounds')
        axes[2].plot(exp_yhat_low_data, 'r--')
        axes[2].set_title(f"Exponential Smoothing Trend for alpha: {exp_alpha} with MAE of: {res_exp['mae']:.1f}")
        axes[2].legend(loc='upper left')
        
        # save it for reference
        plt.savefig(image_name, format='svg')
        
        # clean up the display to 'make it pretty'
        plt.tight_layout()
        
        # record these plots and the series that were calculated from the data in the dictionary
        reference_collection['plots'] = fig
        reference_collection['moving_average'] = moving_avg
        reference_collection['exp_smooth'] = exp_smoothed
        
        # return the dictionary that we've put the data into
        return reference_collection

# Test

In [17]:
DATA_PATH = '/opt/notebooks/datasets/NB_GA_Data_1.xlsx'

newbalancecom = get_hostname_data('www.newbalance.com', 
                                  region_data = get_region_data('NB | GA | US Data', DATA_PATH))

newbalancecom_month = apply_index_freq(aggregate_daily_data(newbalancecom), 'MS')
newbalancecom_month = filter_by_date(newbalancecom_month, left_datetime=datetime(2016, 11, 30))

In [21]:
newbalancecom_reference = smoothed_time_plots(newbalancecom_month['Sessions'], 'newbalance.com Sessions', 
                                    'newbalancecom_sessions_smooth_plot.svg', 12, exp_alpha=0.25)

  output = [raw_series[0]]
  output.append(raw_series[i] * alpha + (1-alpha) * output[i-1])


In [22]:
newbalancecom_reference = smoothed_time_plots(newbalancecom_month['Sessions'], 'newbalance.com Sessions', 
                                    'newbalancecom_sessions_smooth_plot.svg', 12, exp_alpha=0.5)

  output = [raw_series[0]]
  output.append(raw_series[i] * alpha + (1-alpha) * output[i-1])
