In [7]:
import pandas as pd
from contextlib import contextmanager, redirect_stdout, redirect_stderr
from functools import partial
from os import devnull

## Utility Functions and Variables
- they will be ported over to two separate classes performing these functions in production code
- one class for data ingestion, filtering, aggregating, and application of the index
- one class for data splitting and validation of the splits

In [5]:
@contextmanager
def suppress_annoying_prints():
    """
    Context manager that silences both stdout and stderr for as long as the wrapped code runs, by
    redirecting the two streams to the operating system's null device.
    Specifically used for hiding the verbosity of hyperopt's minimization function.
    :return: yields a tuple (stdout target, stderr target); both entries are the same null-device
        file handle, which is closed again when the context exits
    """
    with open(devnull, 'w') as sink:
        with redirect_stdout(sink) as muted_out:
            with redirect_stderr(sink) as muted_err:
                yield muted_out, muted_err

In [6]:
# Location on disk of the .xlsx data set; presumably the value passed as `file_location`
# to the loading functions in this notebook -- confirm at the call sites.
DATA_PATH = '/opt/notebooks/datasets/NB_GA_Data_1.xlsx'
# Name of the DataFrame column that holds the hostname key; used for filtering to a single
# hostname and for listing the distinct hostnames.
HOSTNAME_FIELD = 'Hostname'

In [None]:
# data acquisition, formatting, splitting, and validation
def apply_index_freq(data, freq):
    """
    Conform the date/datetime Index of a Pandas DataFrame to a fixed temporal frequency.
    :param data: a Pandas DataFrame whose Index is a date or datetime Index
    :param freq: a timespan frequency string
        See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
        for the list of allowable frequency strings
    :return: the result of calling asfreq on the data with the requested frequency
    """
    conformed = data.asfreq(freq=freq)
    return conformed

def pull_raw_traffic_data(file_location):
    """
    Function for reading in the raw .xlsx file containing the traffic timeseries data
    :param file_location: the location on disk that .xlsx is stored.
    :return: a single Pandas DataFrame holding the rows of every worksheet, indexed by the parsed
        'Date' column and sorted in ascending date order
    :raises ValueError: if none of the worksheets contains any rows
    """
    # sheet_name=None makes read_excel load every worksheet and return a dict of
    # {sheet name: DataFrame} rather than a single DataFrame.
    # See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html for more information on pd.read_excel()
    sheets = pd.read_excel(file_location, sheet_name=None)
    # Empty worksheets carry no traffic rows, so they are left out of the concatenation.
    frames = [sheet for sheet in sheets.values() if not sheet.empty]
    if not frames:
        raise ValueError('No rows found in any worksheet of {!r}'.format(file_location))
    # Stack the worksheets into one frame so the 'Date' column can be handled in a single pass.
    raw = pd.concat(frames, ignore_index=True)
    raw['Date'] = pd.to_datetime(raw['Date'])
    raw.set_index('Date', inplace=True)
    # Rebuild the index from its raw values, attaching a frequency when one can be inferred.
    # The rebuilt index carries no name.
    raw.index = pd.DatetimeIndex(raw.index.values, freq=raw.index.inferred_freq)
    return raw.sort_index()

def get_hostname_data_from_df(full_file, freq, hostname):
    """
    Select the rows of a single hostname from the full data set and apply the index frequency to them
    :param full_file: the read-in DataFrame of the hostname .xlsx
    :param freq: the timespan frequency string for the index temporal component
    :param hostname: the hostname key code to filter upon
    :return: the rows whose HOSTNAME_FIELD value equals hostname, passed through apply_index_freq
    """
    is_requested_host = full_file[HOSTNAME_FIELD] == hostname
    single_host = full_file[is_requested_host]
    return apply_index_freq(single_host, freq)

def get_hostname_data(file_location, freq, hostname):
    """
    Read the raw file from disk and narrow it down to a single hostname (mostly for testing)
    :param file_location: the location on disk of the hostname .xlsx file
    :param freq: the timespan frequency string for the index temporal component
    :param hostname: the hostname key code to filter upon
    :return: whatever get_hostname_data_from_df returns for the loaded data, freq and hostname
    """
    # A partial function binds the loaded data and the frequency up front, leaving a callable that
    # only needs a hostname. It is not strictly required here; it is kept as an illustration of
    # currying within ML code. With longer chains of operations, partial functions can make a
    # code base much cleaner and easier to maintain.
    filter_for_hostname = partial(get_hostname_data_from_df, pull_raw_traffic_data(file_location), freq)
    return filter_for_hostname(hostname)

def get_all_hostnames_from_df(full_file):
    """
    Collect the distinct hostnames present in the data set.
    :param full_file: a Pandas DataFrame that contains the HOSTNAME_FIELD column
    :return: a list of the unique HOSTNAME_FIELD values in ascending sorted order
    """
    distinct_hostnames = full_file[HOSTNAME_FIELD].unique()
    return sorted(distinct_hostnames)

def get_hostname_monthly_traffic_from_hostname_daily_traffic(hostname_daily_traffic):
    """
    Function for aggregating daily traffic into monthly traffic.
    The input DataFrame is left untouched: no helper columns are added to it.
    :param hostname_daily_traffic: a Pandas DataFrame that has a date Index, 'Sessions' and 'Pageviews'
        columns, and only includes traffic from one specific hostname
    :return: a Pandas DataFrame indexed by 'Date' (the first day of each month present in the data)
        with the monthly sums of 'Sessions' and 'Pageviews'
    """
    # Map every daily timestamp to the first day of its month. Grouping by this derived key
    # (instead of writing Year/Month/Day/Date columns onto the input) avoids mutating the
    # caller's DataFrame and the chained-assignment warnings raised on filtered slices.
    month_start = hostname_daily_traffic.index.to_period('M').to_timestamp().rename('Date')
    return hostname_daily_traffic.groupby(month_start).agg({'Sessions': 'sum', 'Pageviews': 'sum'})


def generate_splits_by_month(data, months):
    """
    Function for creating the train and test splits for the models, filling in missing index sequences
        with a forward fill first (conservative estimation), followed by a backfill if the beginning of the
        series starts with NaN values. Each split is filled independently, so no value crosses the
        train/test boundary.
    :param data: a Pandas DataFrame object with a monthly date index
    :param months: the number of months to utilize for validation through the test set row count.
        0 keeps every row in train and returns an empty test set; a value larger than the number of
        rows puts every row in test.
    :return: a tuple of (train, test) DataFrames
    :raises ValueError: if months is negative
    """
    if months < 0:
        raise ValueError('months must be non-negative, got {}'.format(months))
    # An explicit positional split point is used because slicing with -months breaks for
    # months == 0 (-0 is 0, which would give an empty train set and put every row in test).
    split_at = max(len(data) - months, 0)
    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    train = data.iloc[:split_at].ffill().bfill()
    test = data.iloc[split_at:].ffill().bfill()
    return train, test

def validate_data_counts(data, split_count):
    """
    A utility validation function that checks to make sure that we have enough data to train on.
        The return type of this is a boolean that is used to determine whether we should attempt to
        model this hostname or not (if we can't, we can fallback to a simpler methodology that, although
        likely less accurate, can be a placeholder until we have enough data to trust in the predictions that
        a statistical model will give)
    :param data: the full data set prior to splitting into test and train
    :param split_count: the cutoff count for test data to ensure that we have enough of a ratio for the
        model's training to hit at least a ratio of 80% train to 20% test.
    :return: True when data is non-empty and split_count is at most 20% of its row count
        (an exact 80/20 split passes), otherwise False
    """
    row_count = len(data)
    # The test share is at most 20% exactly when split_count * 5 <= row_count. The ratio is
    # applied once, in integer arithmetic; applying both /0.2 and *0.8 would cap the test share
    # at 16% and reject an exact 80/20 split.
    return row_count > 0 and split_count * 5 <= row_count