In [1]:
import math
import itertools

import pandas as pd
from sqlalchemy.sql import select

from predict_aqi.load_data import (
    AirLocation, get_db_session, load_air_location_data, load_measurement_data,
    load_nearby_location_measurement_data
)

# Gather the ids of every air location whose `aqi` column is above 100;
# the later cells load, model and plot a handful of these locations.
session = get_db_session()
s = (
    select([AirLocation])
    .where(AirLocation.aqi > 100)
)
some_airlocation_ids = [row['id'] for row in session.execute(s)]

In [2]:
# Select classifier to use
from sklearn.neural_network import MLPRegressor
from predict_aqi.model import generate_AQI_inputs_and_outputs
from predict_aqi import config
from predict_aqi.transform_data import clean_data


def load_data_for_airlocation(airlocation_id):
    """Load one air location's measurements and build its model frame.

    Prints the location's short name and country, loads its measurement
    data, expands it with ``generate_AQI_inputs_and_outputs`` and passes the
    resulting frame through ``clean_data``.

    Args:
        airlocation_id: id handed to ``load_air_location_data`` and
            ``load_measurement_data``.

    Returns:
        tuple: ``(cleaned_frame, input_columns, output_columns)`` where
        ``cleaned_frame`` is ``clean_data(frame, input_columns)``.
    """
    # Indices to predict: every second one from 2 up to NUMBER_AHEAD_TO_PREDICT.
    lookahead_indices = range(2, config.NUMBER_AHEAD_TO_PREDICT + 2, 2)
    # Indices of past measurements offered to the model as inputs: 1..47.
    lookback_indices = range(1, 48)

    location = load_air_location_data(airlocation_id)
    print("Loading {}, {}".format(location.short_name, location.en_country_name))

    frame, input_columns, output_columns = generate_AQI_inputs_and_outputs(
        load_measurement_data(airlocation_id),
        lookback_indices,
        lookahead_indices,
    )
    cleaned_frame = clean_data(frame, input_columns)
    return cleaned_frame, input_columns, output_columns

In [3]:
from predict_aqi.model import (
    generate_predictions, get_first_step_functions, get_second_step_functions
)
from sklearn.metrics import mean_absolute_error
import copy


def generate_errors_for_airlocation_predictions(df, input_columns, output_columns):
    """Score the two-step predictor for a range of input-window sizes.

    For every window size ``k`` in ``range(2, 49, 4)`` the first ``k`` entries
    of ``input_columns`` are used to build the first- and second-step helper
    functions, two fresh ``MLPRegressor`` instances are created, and
    ``generate_predictions`` is run on a deep copy of ``df``. The mean
    absolute error of both prediction steps is then computed for each
    look-ahead index in ``range(2, 50, 4)``.

    Args:
        df: pandas DataFrame holding the input columns and the
            ``{i}_ahead_AQI`` target columns. It is never modified; each
            iteration works on its own deep copy.
        input_columns: sliceable sequence of input column names.
        output_columns: output column names, forwarded unchanged to the
            step-function factories.

    Returns:
        dict: ``{k: {'first_step': {i: mae}, 'second_step': {i: mae}}}`` where
        ``k`` is the number of input columns used and ``i`` the look-ahead
        index.
    """
    indices_ahead_to_predict = range(2, config.NUMBER_AHEAD_TO_PREDICT + 2, 2)

    # Non-null count of the first column. ``.iloc`` makes the positional
    # lookup explicit: ``df.count()`` is indexed by column label, and plain
    # ``[0]`` on such a Series relies on a deprecated positional fallback.
    row_count = df.count().iloc[0]
    split_row = int(round(row_count * 0.90))
    # NOTE(review): errors below are measured over rows [:split_row], i.e. the
    # first 90% of the frame. Confirm against the split performed inside
    # generate_predictions that this is the intended evaluation slice.

    errors = {}

    for indices_behind_to_use_index in range(2, 49, 4):
        # Independent copy of the column subset so the helpers cannot alter
        # the caller's list.
        current_input_columns = copy.deepcopy(input_columns[:indices_behind_to_use_index])
        first_step_format_inputs_outputs, first_step_split_function = get_first_step_functions(
            current_input_columns, output_columns
        )
        second_step_format_inputs_outputs, second_step_split_function = get_second_step_functions(
            current_input_columns, output_columns, indices_ahead_to_predict
        )

        # Fresh, default-configured regressors for every window size so no
        # fitted state carries over between iterations.
        first_step_regressor = MLPRegressor()
        second_step_regressor = MLPRegressor()

        this_iteration_df = generate_predictions(
            df.copy(deep=True),
            first_step_format_inputs_outputs,
            first_step_split_function,
            second_step_format_inputs_outputs,
            second_step_split_function,
            first_step_regressor,
            second_step_regressor,
            indices_ahead_to_predict,
            False  # positional flag defined by generate_predictions -- TODO confirm meaning
        )

        this_iteration_error_dict = {'first_step': {}, 'second_step': {}}
        for i in range(2, 50, 4):
            actual = this_iteration_df['{}_ahead_AQI'.format(i)][:split_row]
            this_iteration_error_dict['first_step'][i] = mean_absolute_error(
                actual,
                this_iteration_df['{}_ahead_first_step_pred'.format(i)][:split_row]
            )
            this_iteration_error_dict['second_step'][i] = mean_absolute_error(
                actual,
                this_iteration_df['{}_ahead_second_step_pred'.format(i)][:split_row]
            )
        errors[indices_behind_to_use_index] = this_iteration_error_dict

        # Drop the reference before the next iteration copies ``df`` again,
        # so two full prediction frames are not alive at the same time.
        del this_iteration_df
    return errors

In [4]:
import matplotlib.pyplot as plt


def generate_error_graph(errors):
    """Plot absolute AQI error per look-ahead index for every input window.

    Each error value is multiplied by ``config.MAX_AQI`` before plotting. One
    pair of curves is drawn per input-window size: the first-step error as a
    solid line and the second-step error as a dashed line in the same grey
    shade, so the two steps can be told apart.

    Args:
        errors: mapping of the form
            ``{window: {'first_step': {ahead: err}, 'second_step': {ahead: err}}}``,
            as returned by ``generate_errors_for_airlocation_predictions``.

    Returns:
        tuple: ``(first_step_errors, second_step_errors)``; each is a dict
        ``{window: [scaled_err, ...]}`` with the list ordered by ascending
        look-ahead index.
    """
    # Use the windows actually present instead of a hard-coded range, so the
    # function keeps working if the caller changes its sweep.
    windows = sorted(errors)

    def _scaled_series(step):
        # Scaled errors per window, ordered by ascending look-ahead index.
        return {
            window: [
                errors[window][step][ahead] * config.MAX_AQI
                for ahead in sorted(errors[window][step])
            ]
            for window in windows
        }

    def _hours_ahead(window, step):
        # The x-axis comes from the look-ahead indices that were scored,
        # halved in the same way the legend labels convert indices to hours.
        return [ahead // 2 for ahead in sorted(errors[window][step])]

    first_step_errors = _scaled_series('first_step')
    second_step_errors = _scaled_series('second_step')

    plt.figure(figsize=(15, 10))
    plt.axis([0.0, 25.0, 0.0, 35.0])
    for distance in windows:
        # matplotlib reads a string float in [0, 1] as a grey level; clamp so
        # a window index above 48 cannot produce an invalid colour.
        shade = str(min(distance / 48, 1.0))
        plt.plot(
            _hours_ahead(distance, 'first_step'),
            first_step_errors[distance],
            label="{}hrs input first step error".format(distance / 2),
            color=shade
        )
        plt.plot(
            _hours_ahead(distance, 'second_step'),
            second_step_errors[distance],
            label="{}hrs input second step error".format(distance / 2),
            color=shade,
            linestyle='--'
        )
    plt.xlabel("Hours ahead")
    plt.ylabel("Absolute AQI error")
    plt.legend(loc=6, bbox_to_anchor=(0, 0))
    plt.suptitle("Absolute AQI error on predictions for the next 24 hours", fontsize=20)
    return first_step_errors, second_step_errors

In [5]:
# Run load -> train/score -> plot for the first five selected locations and
# keep each location's (first_step_errors, second_step_errors) pair.
all_errors = []
for airlocation_id in some_airlocation_ids[:5]:
    df, input_columns, output_columns = load_data_for_airlocation(airlocation_id)
    errors = generate_errors_for_airlocation_predictions(
        df, input_columns, output_columns
    )
    first_step_errors, second_step_errors = generate_error_graph(errors)
    all_errors += [(first_step_errors, second_step_errors)]
    # Drop this location's frame before the next one is loaded.
    del df

In [6]:
# Summary figure: for each processed city, the sum over all look-ahead
# indices of the scaled absolute error, plotted against the input window.
plt.figure(figsize=(15, 10))
#plt.axis([10.0, 40.0, 0.0, 25.0])
# enumerate() yields (index, item) pairs and every item of all_errors is
# itself a (first_step_errors, second_step_errors) tuple, so the pair must be
# unpacked with a nested target; a flat three-name target raises ValueError.
for index, (first_step_errors, second_step_errors) in enumerate(all_errors):
    xs = [d / 2 for d in sorted(first_step_errors.keys())]
    plt.plot(
        xs,
        [sum(first_step_errors[d]) for d in sorted(first_step_errors.keys())],
        label="City #{} first step error".format(some_airlocation_ids[index]),
        color=str(index / len(all_errors))
    )
    # Same grey shade as the city's first-step curve, dashed so the two
    # steps can be told apart.
    plt.plot(
        xs,
        [sum(second_step_errors[d]) for d in sorted(second_step_errors.keys())],
        label="City #{} second step error".format(some_airlocation_ids[index]),
        color=str(index / len(all_errors)),
        linestyle='--'
    )
plt.legend(loc=6, bbox_to_anchor=(0, 0.3))
plt.suptitle("Sum of absolute AQI error on predictions for different amounts of inputs", fontsize=20)


# y = [sum(first_step_errors[d]) for d in sorted(first_step_errors.keys())]
# for distance in sorted(first_step_errors.keys()):
#     print("Distance into the past of inputs used: {}hrs".format(distance / 2))
#     print("Sum of absolute first step AQI prediction error: {}".format(
#         sum(first_step_errors[distance])
#     ))
#     print("Sum of absolute second step AQI prediction error: {}".format(
#         sum(second_step_errors[distance])
#     ))