# Data ingestion

## reference:
- pd.read_excel: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
- pd.DatetimeIndex: https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.html
- pd.DataFrame.asfreq: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.asfreq.html

In [42]:
import pandas as pd
from datetime import datetime

HOSTNAME_FIELD = 'Hostname'
DATA_PATH = '/opt/notebooks/datasets/NB_GA_Data_1.xlsx'

def pull_raw_data(file_location):
    """
    Function for loading every sheet of the source workbook in one call
    :file_location: file path for data source
    :return: the result of pd.read_excel with sheet_name=None (per the pandas docs,
             a dict of DataFrames keyed by sheet name)
    """
    # sheet_name=None asks pandas for all sheets rather than only the first one
    return pd.read_excel(file_location, sheet_name=None)

def get_region_data(region, file_location):
    """
    Function for fetching one region's sheet as a date-indexed, date-sorted frame
    :region: name of the sheet to pick out of the workbook
        NB GA Data 1:
            NB | GA | US Data
            NB | GA | CA Data
            NB | GA | AU Data
            NB | GA | NZ Data
            NB | GA | JNBO Data
        NB GA Data 2:
            NB | GA | EU + UK
            NB | GA | TW
            NB | GA | HK
            NB | GA | MY
            NB | GA | SG
    :file_location: file path for data source
    :return: the sheet's rows indexed by their parsed 'Date' values, in ascending date order
    """
    workbook = pull_raw_data(file_location)

    # shallow copy so the column replacement below does not touch the frame held in `workbook`
    sheet = workbook[region].copy(deep=False)
    sheet['Date'] = pd.to_datetime(sheet['Date'])
    sheet = sheet.set_index('Date')

    # Rebuild the index from its raw values, attaching whatever frequency pandas can infer.
    # The rebuilt index carries no name, so a later 'Date' column does not clash with it.
    detected_freq = sheet.index.inferred_freq
    sheet.index = pd.DatetimeIndex(sheet.index.values, freq=detected_freq)

    return sheet.sort_index()

def describe_region_data(region_data):
    """
    Function for summarising a region frame
    :region_data: frame with 'Device Category', 'Default Channel Grouping' and 'Hostname'
                  columns (string values, since they are joined with ',')
    :return: pd.Series holding the comma-joined distinct devices, channels and hostnames
             plus the smallest and largest index value
    """
    def distinct_joined(column):
        # distinct values in order of first appearance, glued into one string
        return ','.join(region_data[column].unique())

    summary = {
        'devices': distinct_joined('Device Category'),
        'channels': distinct_joined('Default Channel Grouping'),
        'hostnames': distinct_joined('Hostname'),
        'start_date': region_data.index.min(),
        'end_date': region_data.index.max(),
    }
    return pd.Series(summary)

def get_hostname_data(hostname, region_data):
    """
    Function for keeping only the rows that belong to one hostname
    :hostname: value to match against the HOSTNAME_FIELD column
    :region_data: frame containing the HOSTNAME_FIELD column
    :return: the matching rows of region_data
    """
    belongs_to_host = region_data[HOSTNAME_FIELD] == hostname
    return region_data[belongs_to_host]

def apply_index_freq(data, freq):
    """
    Function for converting the data to a fixed frequency via DataFrame.asfreq
    :data: date-indexed pandas object
    :freq: pandas frequency alias, e.g. 'MS'
    :return: the re-indexed data as returned by asfreq
    """
    conformed = data.asfreq(freq=freq)
    return conformed

def aggregate_daily_data(data):
    """
    Function for rolling date-indexed rows up to one row per calendar month
    :data: frame with a datetime index and 'Sessions' and 'Pageviews' columns
    :return: frame indexed by 'Date' (first day of each month present in the data)
             with the summed 'Sessions' and 'Pageviews'
    """
    # assign() returns a new frame, so the caller's data is left without the helper columns
    stamped = data.assign(Year=data.index.year, Month=data.index.month, Day=1)

    # collapse every row's date onto the first day of its month
    stamped['Date'] = pd.to_datetime(stamped[['Year', 'Month', 'Day']])

    monthly_groups = stamped.groupby('Date')
    return monthly_groups.agg({'Sessions': 'sum', 'Pageviews': 'sum'})

def filter_by_date(data, left_datetime=None, right_datetime=None):
    """
    Function for trimming data to the open interval (left_datetime, right_datetime),
    e.g. to drop leading/trailing periods that were not recorded for a complete month
    :data: pandas object whose index is comparable with the given bounds
    :left_datetime: keep only rows whose index is strictly greater than this value;
                    None (default) leaves the left side unbounded
    :right_datetime: keep only rows whose index is strictly less than this value;
                     None (default) leaves the right side unbounded
    :return: the filtered data (the input itself when both bounds are None)
    """
    # Compare against None explicitly: a falsy-but-valid bound (e.g. 0 on a numeric
    # index) must still be applied rather than silently skipped.
    if left_datetime is not None:
        data = data[data.index > left_datetime]

    if right_datetime is not None:
        data = data[data.index < right_datetime]

    return data

def ingest_data(hostname, data_location='NB | GA | US Data', freq='MS', furtherest_date=None, nearest_date=None,
                data_path=None):
    """
    Function for running the whole ingestion chain: load one region sheet, keep one hostname,
    aggregate to monthly totals, conform the index to `freq`, then trim by date
    :hostname: filter data by hostname
    :data_location: name of the workbook sheet to load
        NB GA Data 1:
            NB | GA | US Data
            NB | GA | CA Data
            NB | GA | AU Data
            NB | GA | NZ Data
            NB | GA | JNBO Data
        NB GA Data 2:
            NB | GA | EU + UK
            NB | GA | TW
            NB | GA | HK
            NB | GA | MY
            NB | GA | SG
    :freq: pandas frequency alias applied to the aggregated index (default 'MS')
    :furtherest_date: exclusive lower bound; only rows dated strictly after it are kept (None = no bound)
    :nearest_date: exclusive upper bound; only rows dated strictly before it are kept (None = no bound)
    :data_path: workbook to read; None (default) falls back to DATA_PATH.
                NOTE(review): the 'NB GA Data 2' sheets presumably live in a second workbook —
                pass its path here; confirm the file name.
    :return: the aggregated, frequency-conformed and date-filtered frame
    """
    source_path = DATA_PATH if data_path is None else data_path
    region_data = get_region_data(data_location, source_path)
    data = get_hostname_data(hostname, region_data=region_data)
    data = apply_index_freq(aggregate_daily_data(data), freq)
    # the original built this frame and then dropped it; hand it back to the caller
    return filter_by_date(data, furtherest_date, nearest_date)

# Outlier

In [38]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.relativedelta import relativedelta

def generate_outlier_plots(data_series, series_name, series_column, event_date, event_name, image_name):
    """
    Function for generating outlier plot data.
    :param data_series: The date-indexed DataFrame holding the series to plot (it is not modified)
    :param series_name: A human-readable name for the time series for applying a label to the plot
    :param series_column: The name of the series column within the passed in DataFrame
    :param event_date: A 'YYYY-MM-DD' string marking an outlier event; it is snapped to the first
                       day of its month before the boundary lines are computed
    :param event_name: An explanatory string that represents what the outlier event is to provide
                       context in the plot
    :param image_name: The name of the file to create in svg format to save the plot to local file system
    :return: the pyplot figure.
    """
    # create column names for differencing stationary functions
    log_name = f'Log {series_column}'
    month_log_name = f'DiffLog {series_column} month'
    year_log_name = f'DiffLog {series_column} year'

    # convert the passed-in event date to a date time object
    event_marker = datetime.strptime(event_date, '%Y-%m-%d').replace(day=1)

    # create boundary lines around event for visual purpose
    month_delta = relativedelta(months=2)
    event_boundary_left = event_marker - month_delta
    event_boundary_right = event_marker + month_delta

    # create a deep copy (in-memory copy of original series)
    data = data_series.copy(deep=True)

    # add in the differencing columns (lag 1 and lag 12 of the logged series)
    data[log_name] = np.log(data[series_column])
    data[month_log_name] = data[log_name].diff(1)
    data[year_log_name] = data[log_name].diff(12)

    def mark_event(ax):
        # axvline's ymin/ymax are fractions of the axes height (0..1), not data values,
        # so the defaults already draw a full-height line; only the left line is labelled
        # to keep a single legend entry for the event.
        ax.axvline(event_boundary_left, color='r', linestyle='--', label=event_name)
        ax.axvline(event_boundary_right, color='r', linestyle='--')
        ax.legend(loc='best')

    # generate the plot layout
    fig, axes = plt.subplots(3, 1, figsize=(16, 12))
    axes[0].plot(data[series_column], '-', label=series_column)
    axes[0].set(title='{} {}'.format(series_name, series_column))

    # draw the vertical boundary lines (offset 2 months on either side of the passed in date)
    mark_event(axes[0])

    # plot the 1-month differenced series data and overlay the same vertical lines as axes[0] plot.
    axes[1].plot(data[month_log_name], label='Monthly diff of {}'.format(series_column))
    axes[1].hlines(0, data.index[0], data.index[-1], 'g')
    axes[1].set(title='{} Monthly diff of {}'.format(series_name, series_column))
    mark_event(axes[1])

    # plot the 12-month differenced series data.
    axes[2].plot(data[year_log_name], label='Year diff of {}'.format(series_column))
    axes[2].hlines(0, data.index[0], data.index[-1], 'g')
    axes[2].set(title='{} Yearly diff of {}'.format(series_name, series_column))
    mark_event(axes[2])

    # save the image (through the figure itself rather than pyplot's "current figure")
    fig.savefig(image_name, format='svg')
    # return the image from the function so that we can embed it in a more complex return type later
    # when we want to wrap all of these calls in a more complex execution chain.
    return fig

# Stationary Test

In [39]:
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller

def dickey_fuller_test(time_df, series_col):
    """
    Function for running the augmented Dickey-Fuller test on one column and packing the outcome
    :param time_df: the pandas dataframe holding the series
    :param series_col: the column to test (nulls are dropped before testing)
    :return: pd.Series with 'Test Statistic', 'p-value', '# Lags', '# Observations',
             'Stationary Test' ("not stationary" when p-value > 0.05, otherwise "stationary")
             and one 'Critical Value(<level>)' entry per critical value returned by adfuller
    """
    adf_result = adfuller(time_df[series_col].dropna(), autolag='AIC')

    # first four entries: statistic, p-value, lags used, observation count
    headline = adf_result[:4]
    p_value = headline[1]
    verdict = ("not " if p_value > 0.05 else "") + "stationary"

    labels = ['Test Statistic', 'p-value', '# Lags', '# Observations', 'Stationary Test']
    df_report = pd.Series(headline + (verdict,), index=labels)

    # fifth entry maps a significance level to its critical value
    critical_values = adf_result[4]
    for level, threshold in critical_values.items():
        df_report['Critical Value(%s)' % level] = threshold
    return df_report

def _adf_summary_text(adf_report):
    """Build the two-line p-value / verdict string shown in a plot's text box."""
    return '\n'.join((
        'p-value = {:.3f}'.format(adf_report['p-value']),
        'stationary test = {}'.format(adf_report['Stationary Test'])
    ))


def plot_diff(time_df, series_col, time_series_name, image_name, style='seaborn-v0_8', plot_size=(16, 12)):
    """
    Function for plotting raw data, 1st-diff and 2nd-diff data
    :param time_df: the pandas dataframe with raw values; three helper columns
                    ('Log <col>', 'Log 1stDiff <col>', 'Log 2ndDiff <col>') are added to it in place
    :param series_col: the col including time series
    :param time_series_name: a name for the plot
    :param image_name: the name of the file to save the visualization as in svg format
    :param style: (default 'seaborn-v0_8') the visual style of the plots
    :param plot_size: (default 16 x 12 inches) the size of the figure we're going to generate
    :return: tuple of (time_df including the added columns, the pyplot figure)
    """
    log_col_name = f'Log {series_col}'
    diff1st_log_col_name = f'Log 1stDiff {series_col}'
    diff2nd_log_col_name = f'Log 2ndDiff {series_col}'
    # NOTE: written onto the caller's frame on purpose - the enriched frame is part of the return value
    time_df[log_col_name] = np.log(time_df[series_col])
    time_df[diff1st_log_col_name] = time_df[log_col_name].diff()
    time_df[diff2nd_log_col_name] = time_df[log_col_name].diff().diff()
    df_index_start = time_df.index.values[0]
    df_index_end = time_df.index.values[-1]

    # Dickey-Fuller test on the raw, 1st-diff and 2nd-diff data; each result becomes
    # a string to populate a bounding box with on the graph
    text_str_raw = _adf_summary_text(dickey_fuller_test(time_df, series_col))
    text_str_1stdiff = _adf_summary_text(dickey_fuller_test(time_df, diff1st_log_col_name))
    text_str_2nddiff = _adf_summary_text(dickey_fuller_test(time_df, diff2nd_log_col_name))

    with plt.style.context(style=style):
        fig, axes = plt.subplots(3, 1, figsize=plot_size)
        props = dict(boxstyle='round', facecolor='oldlace', alpha=0.5)

        # panel 1: raw series
        axes[0].plot(time_df[series_col], '-', label=f'Raw data for {time_series_name}')
        axes[0].legend(loc='upper left')
        axes[0].set_title(f'Raw data trend for {time_series_name}')
        axes[0].set_ylabel(series_col)
        axes[0].set_xlabel(time_df.index.name)
        axes[0].text(0.05, 0.9, text_str_raw, transform=axes[0].transAxes, fontsize=12,
                     verticalalignment='top', bbox=props)

        # panel 2: first difference of the logged series, with a zero reference line
        axes[1].plot(time_df[diff1st_log_col_name], 'g-', label=f'Log 1st Diff for {time_series_name}')
        axes[1].hlines(0.0, df_index_start, df_index_end, 'r', label='Series center')
        axes[1].legend(loc='lower left')
        axes[1].set_title(f'1st Diff Log Trend for {time_series_name}')
        axes[1].set_ylabel(series_col)
        axes[1].set_xlabel(time_df.index.name)
        axes[1].text(0.05, 0.9, text_str_1stdiff, transform=axes[1].transAxes, fontsize=12,
                     verticalalignment='top', bbox=props)

        # panel 3: second difference of the logged series, with a zero reference line
        axes[2].plot(time_df[diff2nd_log_col_name], 'b-', label=f'Log 2nd Diff for {time_series_name}')
        axes[2].hlines(0.0, df_index_start, df_index_end, 'r', label='Series center')
        axes[2].legend(loc='lower left')
        axes[2].set_title(f'2nd Diff Log Trend for {time_series_name}')
        axes[2].set_ylabel(series_col)
        axes[2].set_xlabel(time_df.index.name)
        axes[2].text(0.05, 0.9, text_str_2nddiff, transform=axes[2].transAxes, fontsize=12,
                     verticalalignment='top', bbox=props)

        # Tighten the layout BEFORE writing the file: layout changes made after savefig never
        # reach the saved image. tight_layout computes the subplot spacing itself, so no manual
        # subplots_adjust is needed.
        fig.tight_layout()
        fig.savefig(image_name, format='svg')
    return time_df, fig

In [20]:
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

def plot_decomposition(time_df, series_col, time_series_name, period, image_name,
                       style='seaborn-v0_8', plot_size=(16, 32)):
    """
    Function for plotting a series together with its seasonal_decompose components
    :param time_df: the date-indexed pandas dataframe holding the series
    :param series_col: the col including time series (nulls are dropped before decomposing)
    :param time_series_name: a name for the plot
    :param period: the period passed through to seasonal_decompose
    :param image_name: the name of the file to save the visualization as in svg format
    :param style: (default 'seaborn-v0_8') the visual style of the plots
    :param plot_size: (default 16 x 32 inches) the size of the figure we're going to generate
    :return: the pyplot figure with four stacked panels (data, trend, seasonal, residuals)
    """
    decomposed_trend = seasonal_decompose(time_df[series_col].dropna(), period=period)
    df_index_start = time_df.index.values[0]
    df_index_end = time_df.index.values[-1]
    with plt.style.context(style=style):
        fig, axes = plt.subplots(4, 1, figsize=plot_size)

        # panel 1: the series as given
        axes[0].plot(time_df[series_col], '-', label=f'Data for {time_series_name}')
        axes[0].legend(loc='upper left')
        axes[0].set_title(f'{series_col} trend for {time_series_name}')
        axes[0].set_ylabel(series_col)
        axes[0].set_xlabel(time_df.index.name)

        # panel 2: trend component
        axes[1].plot(decomposed_trend.trend, 'r-', label=f'Trend data for {time_series_name}')
        axes[1].legend(loc='upper left')
        axes[1].set_title(f'Trend component of decomposition for {time_series_name} {series_col}')
        axes[1].set_ylabel(series_col)
        axes[1].set_xlabel(time_df.index.name)

        # panel 3: seasonal component
        axes[2].plot(decomposed_trend.seasonal, 'r-', label=f'Seasonal data for {time_series_name}')
        axes[2].legend(loc='center left', bbox_to_anchor=(0,1))
        axes[2].set_title(f'Seasonal component of decomposition for {time_series_name}')
        axes[2].set_ylabel(series_col)
        axes[2].set_xlabel(time_df.index.name)

        # panel 4: residuals as points around a zero reference line
        axes[3].plot(decomposed_trend.resid, 'r.', label=f'Residuals data for {time_series_name}')
        axes[3].hlines(0.0, df_index_start, df_index_end, 'black', label='Series Center')
        axes[3].legend(loc='center left', bbox_to_anchor=(0,1))
        axes[3].set_title(f'Residuals component of decomposition for {time_series_name}')
        axes[3].set_ylabel(series_col)
        axes[3].set_xlabel(time_df.index.name)

        # Tighten the layout BEFORE writing the file: layout changes made after savefig never
        # reach the saved image. tight_layout computes the subplot spacing itself, so no manual
        # subplots_adjust is needed.
        fig.tight_layout()
        fig.savefig(image_name, format='svg')
    return fig

In [40]:
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

def plot_acf_pacf(time_df, series_col, lags, image_name, style='seaborn-v0_8', plot_size=(16, 32)):
    """
    Function for plotting the autocorrelation and partial autocorrelation of a series
    :param time_df: the pandas dataframe holding the series
    :param series_col: the col including time series (nulls are dropped before plotting)
    :param lags: the lags argument passed through to plot_acf and plot_pacf
    :param image_name: the name of the file to save the visualization as in svg format
    :param style: (default 'seaborn-v0_8') the visual style of the plots
    :param plot_size: (default 16 x 32 inches) the size of the figure we're going to generate
    :return: the pyplot figure (ACF on the top panel, PACF on the bottom panel)
    """
    with plt.style.context(style=style):
        fig, axes = plt.subplots(2, 1, figsize=plot_size)
        series = time_df[series_col].dropna()

        # both helpers draw onto the axes we hand them, so `fig` already owns the result
        plot_acf(series, lags=lags, ax=axes[0])
        plot_pacf(series, lags=lags, ax=axes[1])

        for ax in axes:
            ax.set_xlabel('lags')
            ax.set_ylabel('correlation')

        # Tighten the layout BEFORE writing the file: layout changes made after savefig
        # never reach the saved image.
        fig.tight_layout()
        fig.savefig(image_name, format='svg')
    return fig

# Train Test Split

In [44]:
from dateutil.parser import parse

def split_correctness(data, train, test):
    """
    Utility function for making sure that the split that we conducted split the data correctly
    :param data: the full data set
    :param train: the train portion of the data set
    :param test: the test portion of the data set
    :raises AssertionError: when the train and test row counts do not add up to the source row count
    """
    # Compare row counts (len), not .size: for a DataFrame .size is rows * columns, which makes
    # the reported "counts" misleading.
    train_count, test_count, source_count = len(train), len(test), len(data)

    # Raise explicitly instead of using an `assert` statement so the check still runs when
    # Python is started with -O; the exception type is unchanged for callers.
    if source_count != train_count + test_count:
        raise AssertionError(
            "Train count {} and test count {} did not match to source count {}".format(
                train_count, test_count, source_count))

# parse: https://dateutil.readthedocs.io/en/stable/parser.html
def generate_splits(data, date):
    """
    Function for splitting raw data between train and test at a boundary point
    that is specified as a parse-able date format.
    :param data: the raw data, indexed by date
    :param date: a date, in a format that 'can be parsed' to serve as the boundary point
    :return: tuple of (train, test); train holds every row dated on or before the boundary,
             test holds every row dated after it
    :raises IndexError: when no row is dated on or before the boundary (nothing to train on)
    :raises AssertionError: when the two parts do not add up to the full data set
    """
    parsed_date = parse(date, fuzzy=True)

    # Boolean masks keep the two halves disjoint and exhaustive even when the index has
    # duplicate labels or is not sorted, and they work for Series as well as DataFrames.
    on_or_before = data.index <= parsed_date
    train = data[on_or_before]
    test = data[~on_or_before]

    if len(train) == 0:
        # same exception type the positional lookup used to raise here, with a usable message
        raise IndexError(
            "No rows dated on or before {}; cannot build a train split".format(parsed_date))

    split_correctness(data, train, test)
    return train, test

# Metric and Error Estimation

In [None]:
def calculate_mae(raw_series, smoothed_series, window, scale):
    """
    Function for calculating the mae between a raw and a smoothed series and for building
    an error band around the smoothed series
    :param raw_series: the raw data series, date indexed
    :param smoothed_series: the exponentially smoothed series with identical index to raw_series
    :param window: the size of the smoothing window; the first `window` elements of both series
                   are left out of the error calculations
    :param scale: percentile value of the standard normal distribution expressed in terms of stddev value
    :return: dict with 'mae', 'stddev', 'yhat_low' and 'yhat_high'

    NOTE(review): mean_absolute_error is not imported in the visible part of this file -
    presumably sklearn.metrics.mean_absolute_error; confirm before running.
    """
    # leave out the leading elements the smoother could not fill from a complete window
    raw_tail = raw_series[window:]
    smoothed_tail = smoothed_series[window:]

    mae_value = mean_absolute_error(raw_tail, smoothed_tail)
    deviation = np.std(raw_tail - smoothed_tail)

    # band half-width: mae plus the scaled stddev (e.g. scale=2 gives 2-sigma around the smoothed value)
    yhat = mae_value + scale * deviation

    # the low/high offsets are kept as two separate series for plotting
    return {
        'mae': mae_value,
        'stddev': deviation,
        'yhat_low': smoothed_series - yhat,
        'yhat_high': smoothed_series + yhat,
    }

def mape(y_true, y_pred):
    """
    Function for calculating mean absolute percentage error
    (lets models of series with different magnitudes be compared objectively)
    :param y_true: the 'validation' series (a.k.a. 'test')
    :param y_pred: the forecast series (a.k.a. the result of a '.predict()' method call)
    :return: the mean of |y_true - y_pred| / |y_true|, expressed as a percentage
    """
    relative_error = (y_true - y_pred) / y_true
    mean_abs_relative_error = np.mean(np.abs(relative_error))
    return mean_abs_relative_error * 100

def calculate_errors(y_true, y_pred):
    """
    Function to calculate the 'core' forecasting error metrics for a regression problem.
    :param y_true: the test series (ground truth holdout data)
    :param y_pred: the forecast (predicted) series
    :return: dict with 'mae', 'mape', 'mse', 'rmse', 'explained_var' and 'r2'
             (a dict rather than a tuple so every metric is looked up by name)

    NOTE(review): mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
    and sqrt are not imported in the visible part of this file - presumably sklearn.metrics
    and math; confirm before running.
    """
    # mse is computed once up front because rmse is derived from it
    mse = mean_squared_error(y_true, y_pred)

    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mape': mape(y_true, y_pred),
        'mse': mse,
        'rmse': sqrt(mse),
        'explained_var': explained_variance_score(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }

def plot_predictions(y_true, y_pred, time_series_name, value_name, image_name, style='seaborn-v0_8', plot_size=(16, 12)):
    """ Function for a standardized forecasting visualization
    :param y_true: the ground-truth values during the forecasting period
    :param y_pred: the forecast values during the holdout period
    :param time_series_name: a name for the plot
    :param value_name: the name for our y-axis on the plot
    :param image_name: the name of the file to save the visualization as in svg format
    :param style: (default 'seaborn-v0_8') the visual style of the plots
    :param plot_size: (default 16 x 12 inches) the size of the figure we're going to generate
    :return: dict with 'errors' (the metrics from calculate_errors) and 'plot' (the pyplot figure)
    """
    # full error metrics suite
    error_values = calculate_errors(y_true, y_pred)

    # create a string to populate a bounding box with on the graph
    text_str = '\n'.join((
        'mae = {:.3f}'.format(error_values['mae']),
        'mape = {:.3f}'.format(error_values['mape']),
        'mse = {:.3f}'.format(error_values['mse']),
        'rmse = {:.3f}'.format(error_values['rmse']),
        'explained var = {:.3f}'.format(error_values['explained_var']),
        'r squared = {:.3f}'.format(error_values['r2']),
    ))
    with plt.style.context(style=style):
        fig, axes = plt.subplots(1, 1, figsize=plot_size)
        axes.plot(y_true, 'b-', label='Test data for {}'.format(time_series_name))
        axes.plot(y_pred, 'r-', label='Forecast data for {}'.format(time_series_name))
        axes.legend(loc='upper left')
        axes.set_title('Raw and Predicted data trend for {}'.format(time_series_name))
        axes.set_ylabel(value_name)
        axes.set_xlabel(y_true.index.name)

        # create an overlay bounding box so that all of our metrics are displayed on the plot
        props = dict(boxstyle='round', facecolor='oldlace', alpha=0.5)
        axes.text(0.05, 0.9, text_str, transform=axes.transAxes, fontsize=12, verticalalignment='top', bbox=props)

        # Tighten the layout BEFORE writing the file: layout changes made after savefig
        # never reach the saved image.
        fig.tight_layout()
        fig.savefig(image_name, format='svg')

    # raw error values and the figure travel together so callers can pick either by name
    return {'errors': error_values, 'plot': fig}