In [1]:
import math
import itertools

import pandas as pd
from sqlalchemy.sql import select

from predict_aqi.load_data import (
    AirLocation, get_db_session, load_air_location_data, load_measurement_data,
    load_nearby_location_measurement_data
)

# Select some airlocation with bad aqi (any row whose AirLocation.aqi is above 100).
# NOTE(review): the query has no ORDER BY, so which location is picked is up
# to the database and may differ between runs -- confirm this is acceptable.
session = get_db_session()
# NOTE(review): select([...]) with a list argument is the legacy SQLAlchemy
# calling style -- confirm it is supported by the installed version.
s = select([AirLocation]).where(AirLocation.aqi > 100)
# scalar() yields the first column of the first result row. It is used below
# as the location id, so presumably that first column is the id -- TODO confirm.
# NOTE(review): if no row matches, the value is presumably None and the
# loaders below receive None -- verify.
some_airlocation_id = session.execute(s).scalar()
some_airlocation = load_air_location_data(some_airlocation_id)

print("{}, {}".format(some_airlocation.short_name, some_airlocation.en_country_name))

# Measurements for the chosen location; the next cell passes them to
# generate_AQI_inputs_and_outputs.
measurements = load_measurement_data(some_airlocation_id)

  (attype, name))


Mudanjiang, China


In [2]:
# Import the regressor class and build the model's input and output columns
from sklearn.neural_network import MLPRegressor
from predict_aqi.model import generate_AQI_inputs_and_outputs
from predict_aqi import config


# Offsets ahead of the current row to predict: every second index starting
# at 2 and stopping before config.NUMBER_AHEAD_TO_PREDICT + 2.
indices_ahead_to_predict = range(2, config.NUMBER_AHEAD_TO_PREDICT + 2, 2)
# Offsets 1..15 behind the current row that are offered as inputs.
indices_behind_to_use = range(1, 16)
# input_columns and output_columns are read as module-level globals by the
# format functions defined in the next cell; df is the frame the evaluation
# loop copies for every run.
# NOTE(review): presumably df has one column per entry of input_columns and
# output_columns -- confirm against generate_AQI_inputs_and_outputs.
df, input_columns, output_columns = generate_AQI_inputs_and_outputs(
    measurements, indices_behind_to_use, indices_ahead_to_predict
)

In [3]:
# Define the format and split callbacks that are passed to generate_predictions

from predict_aqi.transform_data import generate_time_inputs


def first_step_format_inputs_outputs(all_data):
    """Return the frame untouched together with the first-step column names.

    The column names are not parameters: they are the module-level
    ``input_columns`` and ``output_columns`` as bound at call time, so
    rebinding those globals changes what this function returns.

    Returns:
        Tuple ``(all_data, input_columns, output_columns)``.
    """
    formatted = (all_data, input_columns, output_columns)
    return formatted

 
def first_step_split_function(all_data, input_columns, output_columns):
    """Split ``all_data`` by row position into training and testing sets.

    The first 90% of the rows become the training set and the remaining
    rows become the testing set, which means the testing set is an
    extrapolation beyond the rows used for training.

    Args:
        all_data: pandas DataFrame containing every input and output column.
        input_columns: labels of the columns used as model inputs.
        output_columns: labels of the columns used as model targets.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of DataFrames.
    """
    # len() counts every row. ``all_data.count()[0]`` counted only the
    # non-null values of the first column, so NaNs there moved the split
    # point, and it depended on positional indexing of a label-indexed Series.
    row_count = len(all_data)
    split_row = int(round(row_count * 0.90))

    inputs = all_data[input_columns]
    outputs = all_data[output_columns]
    # x_train, y_train, x_test, y_test
    return (inputs.iloc[:split_row],
            outputs.iloc[:split_row],
            inputs.iloc[split_row:],
            outputs.iloc[split_row:])


def second_step_format_inputs_outputs(all_data):
    """Assemble the second-step inputs: time columns plus first-step predictions.

    ``generate_time_inputs`` returns the frame together with the names of its
    time columns. Those names are followed by one
    ``<i>_ahead_first_step_pred`` name for every ``i`` in the module-level
    ``indices_ahead_to_predict``. The targets are the module-level
    ``output_columns``, the same ones the first step uses.

    Returns:
        Tuple ``(all_data, second_input_columns, output_columns)``.
    """
    all_data, time_columns = generate_time_inputs(all_data)

    second_input_columns = list(time_columns)
    for offset in indices_ahead_to_predict:
        second_input_columns.append('{}_ahead_first_step_pred'.format(str(offset)))

    return all_data, second_input_columns, output_columns

# The second step reuses the first step's split function unchanged, so both
# steps train on the first 90% of rows and hold out the rest.
second_step_split_function = first_step_split_function

In [18]:
from predict_aqi.model import generate_predictions
from sklearn.metrics import mean_absolute_error
import copy


# Rows before split_row are the ones the split functions hand to the
# regressors for training; rows from split_row onwards are held out.
# len(df) counts every row, whereas df.count()[0] only counted the non-null
# values of the first column.
row_count = len(df)
split_row = int(round(row_count * 0.90))
largest_input_columns = copy.deepcopy(input_columns)

# Fixed seed so that restarting the kernel and re-running gives the same errors.
RANDOM_SEED = 0
# Prediction offsets that are scored: 12, 14, ..., 48.
scored_indices_ahead = range(12, 50, 2)

# errors[number_of_input_columns][step][offset] -> mean absolute error
errors = {}

for indices_behind_to_use_index in range(2, 17, 1):
    # first_step_format_inputs_outputs reads the module-level input_columns,
    # so rebinding it here controls how many input columns this run uses.
    input_columns = largest_input_columns[:indices_behind_to_use_index]
    first_step_regressor = MLPRegressor(random_state=RANDOM_SEED)
    second_step_regressor = MLPRegressor(random_state=RANDOM_SEED)
    this_iteration_df = df.copy(deep=True)
    this_iteration_df = generate_predictions(
        this_iteration_df,
        first_step_format_inputs_outputs,
        first_step_split_function,
        second_step_format_inputs_outputs,
        second_step_split_function,
        first_step_regressor,
        second_step_regressor,
        indices_ahead_to_predict,
    )

    # Score on the held-out rows only. Scoring [:split_row] measured the
    # error on the very rows both regressors were fitted to, which
    # understates the error of real predictions.
    # NOTE(review): assumes generate_predictions fills the prediction columns
    # for the held-out rows and that they contain no NaN -- confirm.
    held_out = this_iteration_df.iloc[split_row:]
    this_iteration_error_dict = {'first_step': {}, 'second_step': {}}
    for i in scored_indices_ahead:
        actual = held_out['{}_ahead_AQI'.format(i)]
        this_iteration_error_dict['first_step'][i] = mean_absolute_error(
            actual,
            held_out['{}_ahead_first_step_pred'.format(i)]
        )
        this_iteration_error_dict['second_step'][i] = mean_absolute_error(
            actual,
            held_out['{}_ahead_second_step_pred'.format(i)]
        )
    errors[indices_behind_to_use_index] = this_iteration_error_dict
    # Drop the per-run frame before the next copy is made.
    del this_iteration_df, held_out

In [19]:
# Input-column counts that were evaluated, taken from the results themselves
# so this cell cannot drift from the loop that filled `errors`.
distances = sorted(errors)


def _errors_in_aqi_units(step_errors):
    """Return the errors ordered by prediction offset, multiplied by config.MAX_AQI.

    Ordering by key makes the list independent of dict iteration order, which
    is arbitrary on Python versions before 3.7.
    NOTE(review): multiplying by MAX_AQI presumably undoes a normalisation of
    the AQI columns -- confirm.
    """
    return [step_errors[offset] * config.MAX_AQI for offset in sorted(step_errors)]


first_step_errors = {
    i: _errors_in_aqi_units(errors[i]['first_step'])
    for i in distances
}

second_step_errors = {
    i: _errors_in_aqi_units(errors[i]['second_step'])
    for i in distances
}


In [34]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(15, 10))
ax.axis([8.0, 25.0, 0.0, 25.0])

# One x value per scored prediction offset.
hours_ahead = range(6, 25, 1)
plotted_distances = sorted(first_step_errors.keys())

# Spread the grey levels from 0.0 (black) to 0.8 (light grey) across the
# plotted distances. The earlier ``distance / 250`` never exceeded 0.064, so
# every curve was drawn practically black.
lowest = plotted_distances[0] if plotted_distances else 0
if len(plotted_distances) > 1:
    shade_span = float(plotted_distances[-1] - lowest)
else:
    shade_span = 1.0

for distance in plotted_distances:
    shade = "{:.2f}".format(0.8 * (distance - lowest) / shade_span)
    # Solid line: first step. Dashed line: second step. Both share the grey
    # level of their distance, so the line style is what tells them apart.
    ax.plot(
        hours_ahead,
        first_step_errors[distance],
        label="{}hrs input first step error".format(distance // 2),
        color=shade,
        linestyle="-"
    )
    ax.plot(
        hours_ahead,
        second_step_errors[distance],
        label="{}hrs input second step error".format(distance // 2),
        color=shade,
        linestyle="--"
    )

ax.set_xlabel("Hours ahead")
ax.set_ylabel("Mean absolute AQI error")
ax.legend(loc=6, bbox_to_anchor=(0, 0.3))
fig.suptitle("Absolute AQI error on predictions for the next 24 hours", fontsize=20)
plt.show()


In [35]:
# Summarise the second-step errors per input size. The list of scored
# offsets is cut into two halves by position; the labels state how many
# offsets each half contains, because the halves are not 12-hour windows
# (the scored offsets do not start at hour 0 and their count is odd).
for distance in sorted(second_step_errors.keys()):
    error_values = second_step_errors[distance]
    half = len(error_values) // 2
    print("Distance into the past of inputs used: {}".format(distance))
    print("Sum of absolute AQI error over the first {} scored horizons: {}".format(
        half, sum(error_values[:half])
    ))
    print("Sum of absolute AQI error over the last {} scored horizons: {}".format(
        len(error_values) - half, sum(error_values[half:])
    ))