In [None]:
import itertools
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd

from multi_city_model import (
    load_and_clean_locations_near_airlocation,
    generate_AQI_inputs_and_outputs,
    generate_predictions_two_step,
    cut_off_end_split_function
)


def generate_errors_dict(df, indices_ahead_to_predict_range, pred_col_suffix, pred_col_prefix="{}_ahead",
                         denormalize=True, denormalization_factor=300):
    '''
    Calculates the mean absolute error of a prediction column against the
    '<n>_ahead_AQI' column, for each n in indices_ahead_to_predict_range.

    The prediction column is named '<prefix>_<suffix>', where <prefix> is
    pred_col_prefix formatted with n. A prefix without a '{}' placeholder
    therefore selects the same column for every n.

    Parameters:
        df: DataFrame containing the '<n>_ahead_AQI' columns and the prediction columns.
        indices_ahead_to_predict_range: iterable of the n values to score.
        pred_col_suffix: text after the underscore in the prediction column name.
        pred_col_prefix: format string for the text before the underscore.
        denormalize: when True, predictions are multiplied by denormalization_factor
            before scoring.
        denormalization_factor: the multiplier used when denormalize is True.
            NOTE(review): presumably the scale the predictions were normalised by
            upstream - confirm against multi_city_model.

    df is never modified; the scaling is applied to a temporary series, so calling
    this function repeatedly on the same frame always gives the same result.

    Returns a dictionary with format:
    {'2_ahead': 123.4, '6_ahead': 234.5, '10_ahead': 345.6, ...}
    '''
    error_dict = {}
    for index_ahead_to_predict in indices_ahead_to_predict_range:
        pred_col = "{}_{}".format(
            pred_col_prefix.format(index_ahead_to_predict),
            pred_col_suffix
        )

        predictions = df[pred_col]
        if denormalize:
            # Scale a new series rather than writing back into df: overwriting the
            # column would scale it again on every later call for the same column.
            predictions = predictions * denormalization_factor

        error_dict['{}_ahead'.format(index_ahead_to_predict)] = mean_absolute_error(
            df['{}_ahead_AQI'.format(index_ahead_to_predict)],
            predictions
        )
    return error_dict


def generate_errors_rows(df,
                         indices_ahead_to_predict_range,
                         number_of_nearby_locations_to_use,
                         first_cells_of_row):
    '''
    Builds the error results rows for one set of predictions.

    The first row scores the final predictions (columns ending in 'pred'); it is
    followed by one row per nearby location, numbered from 1, scoring that
    location's first step predictions (columns ending in
    'loc_<n>__first_step_pred').

    Each row is a list made of first_cells_of_row, then the prediction type
    ('final' or 'first_step_loc_<n>'), then the dictionary returned by
    generate_errors_dict.
    '''
    # (prediction type, prediction column suffix) pairs, final predictions first
    types_and_suffixes = [('final', 'pred')]
    for loc_number in range(1, number_of_nearby_locations_to_use + 1):
        types_and_suffixes.append((
            'first_step_loc_{}'.format(loc_number),
            "loc_{}__first_step_pred".format(loc_number)
        ))

    rows = []
    for prediction_type, suffix in types_and_suffixes:
        errors = generate_errors_dict(df, indices_ahead_to_predict_range, suffix)
        rows.append(list(first_cells_of_row) + [prediction_type, errors])
    return rows


def test_hyperparams_on_airlocation(
        airlocation_id,
        indices_ahead_to_predict_range,
        list_of_indices_behind_to_use_range,
        list_of_number_of_nearby_locations_to_use,
        list_of_hidden_layer_sizes,
        list_of_alphas):

    # load, align, and clean the data from db
    max_locations_needed = max(list_of_number_of_nearby_locations_to_use)
    df, airlocation_ids, continuous_time_series = load_and_clean_locations_near_airlocation(
        airlocation_id, 50, max_locations_needed
    )

    # generate inputs and target outputs for the model
    df, continuous_time_series, input_columns, output_columns = generate_AQI_inputs_and_outputs(
        df,
        continuous_time_series,
        indices_ahead_to_predict_range,
        max(list_of_indices_behind_to_use_range),
        max_locations_needed
    )

    city_specific_input_columns = filter(lambda col: 'loc' in col, input_columns)
    other_input_columns = list(set(input_columns) ^ set(city_specific_input_columns))

    results_df = pd.DataFrame(columns=[
        'indices_behind_to_use_range',
        'number_of_nearby_locations_to_use',
        'hidden_layer_sizes',
        'alpha',
        'airlocation_id',
        'nearby_airlocation_ids',
        # one of: 'final', 'first_step_loc_x' (where x is an int), 'baseline'
        'prediction_type',
        # error results dictionary with format:
        # {'2_ahead': 123.4, '6_ahead': 234.5, '10_ahead': 345.6, ...}
        'error_results'
    ])

    # First, get error results for the baseline predictions once for the location
    baseline_errors_dict = generate_errors_dict(
        df,
        indices_ahead_to_predict_range,
        pred_col_prefix="loc_1",
        pred_col_suffix='aqi',
        denormalize=False
    )
    results_df.append([
        None, None, None, None, airlocation_id, airlocation_ids, 'baseline', baseline_errors_dict
    ])

    # for each hyperparameter
    for indices_behind_to_use_range in list_of_indices_behind_to_use_range:
        for number_of_nearby_locations_to_use in list_of_number_of_nearby_locations_to_use:
            for hidden_layer_sizes in list_of_hidden_layer_sizes:
                for alpha in list_of_alphas:

                    # train model and make predictions
                    locations = ['loc_{}'.format(str(i))
                                 for i in range(1, number_of_nearby_locations_to_use + 1)]

                    regressor_params = {'alpha': alpha, 'hidden_layer_sizes': hidden_layer_sizes}
                    first_step_regressors = [MLPRegressor(**regressor_params)
                                             for i in locations]
                    second_step_regressors = [MLPRegressor(**regressor_params)
                                              for i in indices_ahead_to_predict_range]

                    this_iteration_df = df.copy(deep=True)
                    this_iteration_df = generate_predictions_two_step(
                        this_iteration_df,
                        other_input_columns,
                        output_columns,
                        cut_off_end_split_function,
                        locations,
                        first_step_regressors,
                        second_step_regressors,
                        indices_ahead_to_predict_range,
                        indices_behind_to_use_range,
                        "ahead_single_city_pred",
                        False
                    )

                    # calculate and store the errors
                    first_cells_of_error_row = [
                        indices_behind_to_use_range,
                        number_of_nearby_locations_to_use,
                        hidden_layer_sizes,
                        alpha,
                        airlocation_id,
                        airlocation_ids
                    ]
                    error_results_rows = generate_errors_rows(
                        this_iteration_df,
                        indices_ahead_to_predict_range,
                        number_of_nearby_locations_to_use,
                        first_cells_of_error_row
                    )

                    for error_results_row in error_results_rows:
                        results_df.append(
                            error_results_row
                        )

                    del this_iteration_df
    return results_df

In [6]:
# Upper bounds the grids below are derived from.
MAX_INDEX = 48
MAX_NUMBER_OF_CITIES_TO_USE = 5

# Hyper-parameter grid; every list holds the candidate values for one parameter.
list_of_alphas = [0.0001]
list_of_hidden_layer_sizes = [(100, 10)]
indices_ahead_to_predict_range = range(2, MAX_INDEX + 1, 11)
# Look-behind windows ending at 0, 8, 16, ... MAX_INDEX, each sampled every 2 indices.
list_of_indices_behind_to_use_range = [range(0, i + 1, 2) for i in range(0, MAX_INDEX + 1, 8)]
list_of_number_of_nearby_locations_to_use = list(range(1, MAX_NUMBER_OF_CITIES_TO_USE + 1))

print("Indices ahead to predict range: {}".format(list(indices_ahead_to_predict_range)))

print("Hyper-parameters:")
print("List of alphas: {}".format(list_of_alphas))
print("List of hidden layer sizes {}".format(list_of_hidden_layer_sizes))
print("List of indices behind to use range: {}".format(list_of_indices_behind_to_use_range))
print("List of number of nearby locations to use: {}".format(list_of_number_of_nearby_locations_to_use))

Indices ahead to predict range: [2, 13, 24, 35, 46]
Hyper-parameters:
List of alphas: [0.0001]
List of hidden layer sizes [(100, 10)]
List of indices behind to use range: [range(0, 1, 2), range(0, 9, 2), range(0, 17, 2), range(0, 25, 2), range(0, 33, 2), range(0, 41, 2), range(0, 49, 2)]
List of number of nearby locations to use: [1, 2, 3, 4, 5]


In [None]:
# Run the hyperparameter search with the grid defined in the configuration cell.
# NOTE(review): airlocation_id is not assigned in any of the cells shown here -
# it has to be defined before this cell can run on a fresh kernel.
results_df = test_hyperparams_on_airlocation(
    airlocation_id=airlocation_id,
    indices_ahead_to_predict_range=indices_ahead_to_predict_range,
    list_of_indices_behind_to_use_range=list_of_indices_behind_to_use_range,
    list_of_number_of_nearby_locations_to_use=list_of_number_of_nearby_locations_to_use,
    list_of_hidden_layer_sizes=list_of_hidden_layer_sizes,
    list_of_alphas=list_of_alphas,
)