In [4]:
import itertools
import datetime
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd

from predict_aqi.multi_city_model import (
    load_and_clean_locations_near_airlocation,
    generate_AQI_inputs_and_outputs,
    generate_predictions_two_step,
    generate_baseline_predictions,
    cut_off_end_split_function
)


def generate_errors_dict(df, indices_ahead_to_predict_range, pred_col_suffix, pred_col_prefix="{}_ahead"):
    '''
    Computes one scaled mean absolute error per index ahead.

    For every index i in indices_ahead_to_predict_range, the target column
    '<i>_ahead_AQI' is compared with the prediction column whose name is
    pred_col_prefix formatted with i, followed by '_' and pred_col_suffix.
    The mean absolute error of the two columns is multiplied by 300.

    Returns a dictionary keyed by index ahead, with format:
    {'2_ahead': 123.4, '6_ahead': 234.5, '10_ahead': 345.6, ...}
    '''
    def scaled_error(steps_ahead):
        # Column names are derived from the index ahead; the lookups happen
        # target first, prediction second.
        target_col = '{}_ahead_AQI'.format(steps_ahead)
        predicted_col = "{}_{}".format(pred_col_prefix.format(steps_ahead), pred_col_suffix)
        return 300 * mean_absolute_error(df[target_col], df[predicted_col])

    return {
        '{}_ahead'.format(steps_ahead): scaled_error(steps_ahead)
        for steps_ahead in indices_ahead_to_predict_range
    }


def generate_errors_rows(df,
                         indices_ahead_to_predict_range,
                         number_of_nearby_locations_to_use,
                         first_cells_of_row):
    '''
    Builds the error result rows for one iteration.

    The first row holds the errors of the final predictions ('pred' columns);
    it is followed by one row per nearby location, numbered from 1 up to
    number_of_nearby_locations_to_use, holding the errors of that location's
    first step predictions.

    Every row is a list made of the cells in first_cells_of_row, then the
    prediction type ('final' or 'first_step_loc_x'), then the errors
    dictionary returned by generate_errors_dict.
    '''
    # (prediction type, prediction column suffix) pairs, final predictions first
    labelled_suffixes = [('final', 'pred')]
    labelled_suffixes.extend(
        ('first_step_loc_{}'.format(number), "loc_{}_first_step_pred".format(number))
        for number in range(1, number_of_nearby_locations_to_use + 1)
    )

    rows = []
    for prediction_type, suffix in labelled_suffixes:
        errors = generate_errors_dict(df, indices_ahead_to_predict_range, suffix)
        rows.append([*first_cells_of_row, prediction_type, errors])
    return rows


def append_row(df, row):
    '''
    Adds row to df in place, labelled with the current number of rows in df.
    '''
    next_label = len(df)
    df.loc[next_label] = row


def test_hyperparams_on_airlocation(
        airlocation_id,
        indices_ahead_to_predict_range,
        list_of_indices_behind_to_use_range,
        list_of_number_of_nearby_locations_to_use,
        list_of_hidden_layer_sizes,
        list_of_alphas):
    
    start = datetime.datetime.now()
    
    # load, align, and clean the data from db
    max_locations_needed = max(list_of_number_of_nearby_locations_to_use)
    df, airlocation_ids, continuous_time_series = load_and_clean_locations_near_airlocation(
        airlocation_id, 50, max_locations_needed
    )

    # generate inputs and target outputs for the model
    df, continuous_time_series, input_columns, output_columns = generate_AQI_inputs_and_outputs(
        df,
        continuous_time_series,
        indices_ahead_to_predict_range,
        max(list_of_indices_behind_to_use_range, key=lambda a: len(list(a))),
        max_locations_needed
    )

    city_specific_input_columns = filter(lambda col: 'loc' in col, input_columns)
    other_input_columns = list(set(input_columns) ^ set(city_specific_input_columns))

    results_df = pd.DataFrame(columns=[
        'indices_behind_to_use_range',
        'number_of_nearby_locations_to_use',
        'hidden_layer_sizes',
        'alpha',
        'airlocation_id',
        'nearby_airlocation_ids',
        # one of: 'final', 'first_step_loc_x' (where x is an int), 'baseline'
        'prediction_type',
        # error results dictionary with format:
        # {'2_ahead': 123.4, '6_ahead': 234.5, '10_ahead': 345.6, ...}
        'error_results'
    ])

    # for each hyperparameter
    for indices_behind_to_use_range in list_of_indices_behind_to_use_range:
        for number_of_nearby_locations_to_use in list_of_number_of_nearby_locations_to_use:
            for hidden_layer_sizes in list_of_hidden_layer_sizes:
                for alpha in list_of_alphas:

                    # train model and make predictions
                    locations = ['loc_{}'.format(str(i))
                                 for i in range(1, number_of_nearby_locations_to_use + 1)]

                    regressor_params = {'alpha': alpha, 'hidden_layer_sizes': hidden_layer_sizes}
                    first_step_regressors = [MLPRegressor(**regressor_params)
                                             for i in locations]
                    second_step_regressors = [MLPRegressor(**regressor_params)
                                              for i in indices_ahead_to_predict_range]

                    this_iteration_df = df.copy(deep=True)
                    this_iteration_df = generate_predictions_two_step(
                        this_iteration_df,
                        other_input_columns,
                        output_columns,
                        cut_off_end_split_function,
                        locations,
                        first_step_regressors,
                        second_step_regressors,
                        indices_ahead_to_predict_range,
                        indices_behind_to_use_range,
                        "ahead_pred",
                        False
                    )

                    # calculate and store the errors
                    first_cells_of_error_row = [
                        indices_behind_to_use_range,
                        number_of_nearby_locations_to_use,
                        hidden_layer_sizes,
                        alpha,
                        airlocation_id,
                        airlocation_ids
                    ]
                    error_results_rows = generate_errors_rows(
                        this_iteration_df,
                        indices_ahead_to_predict_range,
                        number_of_nearby_locations_to_use,
                        first_cells_of_error_row
                    )

                    for error_results_row in error_results_rows:
                        append_row(results_df, error_results_row)

                    del this_iteration_df
                    
    # Finally, get error results for the baseline predictions once for the location
    df = generate_baseline_predictions(df, ['{}_ahead_AQI'.format(i) for i in indices_ahead_to_predict_range])
    baseline_errors_dict = {}
    for index_ahead_to_predict in indices_ahead_to_predict_range:
        pred_col = "{}_ahead_baseline_pred".format(index_ahead_to_predict)
        df[pred_col] = df[pred_col].apply(lambda x: x / 300)
        baseline_errors_dict['{}_ahead'.format(index_ahead_to_predict)] = 300 * mean_absolute_error(
            df['{}_ahead_AQI'.format(index_ahead_to_predict)],
            df[pred_col]
        )
    append_row(
        results_df, 
        [None, None, None, None, airlocation_id, airlocation_ids, 'baseline', baseline_errors_dict]
    )
    
    end = datetime.datetime.now()
    iteration_count = len(list_of_indices_behind_to_use_range) * \
                      len(list_of_number_of_nearby_locations_to_use) * \
                      len(list_of_hidden_layer_sizes) * \
                      len(list_of_alphas)
    print(
        "Trying {} hyperparameter combinations took {} seconds".format(
            iteration_count,
            (end - start).total_seconds()
        )
    )
    return results_df

In [5]:
# Upper bounds from which the hyperparameter candidates below are derived:
# the largest index used ahead/behind, and the largest number of nearby
# locations tried.
MAX_INDEX = 48
MAX_NUMBER_OF_CITIES_TO_USE = 5

list_of_alphas = [0.001, 0.0005, 0.0001, 0.00005]
list_of_hidden_layer_sizes = [(24,), (100,), (100, 10)]
indices_ahead_to_predict_range = range(2, MAX_INDEX + 1, 11)
# ranges of every second index, ending at 0, 8, 16, ... up to MAX_INDEX
list_of_indices_behind_to_use_range = [range(0, i + 1, 2) for i in range(0, MAX_INDEX + 1, 8)]
list_of_number_of_nearby_locations_to_use = list(range(1, MAX_NUMBER_OF_CITIES_TO_USE + 1))

print("Indices ahead to predict range: {}".format(list(indices_ahead_to_predict_range)))

print("\nHyper-parameters:")
combinations = len(list_of_alphas) * \
    len(list_of_hidden_layer_sizes) * \
    len(list_of_indices_behind_to_use_range) * \
    len(list_of_number_of_nearby_locations_to_use)
# rough estimate: ten combinations per minute
print("{} different combinations, will take approx {} minutes.".format(
    combinations, combinations / 10
))
print("List of alphas: {}".format(list_of_alphas))
print("List of hidden layer sizes {}".format(list_of_hidden_layer_sizes))
print("List of indices behind to use range: {}".format(list_of_indices_behind_to_use_range))
print("List of number of nearby locations to use: {}".format(list_of_number_of_nearby_locations_to_use))

Indices ahead to predict range: [2, 13, 24, 35, 46]

Hyper-parameters:
420 different combinations, will take approx 42.0 minutes.
List of alphas: [0.001, 0.0005, 0.0001, 5e-05]
List of hidden layer sizes [(24,), (100,), (100, 10)]
List of indices behind to use range: [range(0, 1, 2), range(0, 9, 2), range(0, 17, 2), range(0, 25, 2), range(0, 33, 2), range(0, 41, 2), range(0, 49, 2)]
List of number of nearby locations to use: [1, 2, 3, 4, 5]


In [6]:
from predict_aqi.load_data import load_nearby_locations

# Run the hyperparameter search for each candidate air location and write one
# results CSV per location.

# A location needs at least as many nearby locations as the largest count tried.
min_nearby_locations_required = max(list_of_number_of_nearby_locations_to_use)

for airlocation_id in range(977, 978):
    # Query the neighbours of the location being processed, so the check below
    # applies to the same location that the search is run on. The second
    # argument matches the 50 used inside test_hyperparams_on_airlocation.
    nearby_locations = load_nearby_locations(airlocation_id, 50)
    if len(nearby_locations) < min_nearby_locations_required:
        continue
    nearby_location_ids = [location[0] for location in nearby_locations]
    results_df = test_hyperparams_on_airlocation(
        airlocation_id,
        indices_ahead_to_predict_range,
        list_of_indices_behind_to_use_range,
        list_of_number_of_nearby_locations_to_use,
        list_of_hidden_layer_sizes,
        list_of_alphas
    )
    results_df.to_csv('predict_aqi/results_data/{}.csv'.format(airlocation_id))
    # release the results before the next location is processed
    del results_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  new_df['is_dirty'] = new_df[output_columns].apply(lambda x: any(map(np.isnan, x)), axis=1)


  y = column_or_1d(y, warn=True)


Trying 420 hyperparameter combinations took 3009.450012 seconds
