# Comparing model MAE for one pollutant at one station predicting one hour in the future

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import sys, os, glob, pickle
import pandas as pd
import seaborn as sns

# Make the parent of the current working directory importable so that the
# `src` package imported below can be resolved.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from src import plotting, modelling

%matplotlib inline

In [None]:
# Read the processed measurements for one pollutant and keep only the rows
# that belong to the station of interest.

pollutant = 'SO2'
station = 101

df = pd.read_pickle('../src/data/processed/{}_0.pkl'.format(pollutant))
station_mask = df['station'] == station
df = df.loc[station_mask]


In [None]:
# Create data for model: the 'value' column as a univariate series indexed by
# the 'datetime' column.
#
# set_index() returns a new frame, so `df` and its 'value' column are left
# untouched. Assigning `.index` directly on `df['value']` would modify a Series
# that can still be tied to `df`.

df_data = df.set_index('datetime')['value']
df_data.plot(subplots=True)


In [None]:
# Drop the datetime index: keep only the underlying values returned by
# `.values` (a NumPy array for ordinary numeric columns), so the cells below
# work on a plain array instead of a pandas Series.

df_data = df_data.values


In [None]:
# Standardization: rescale the whole series with the mean and standard
# deviation computed on the training portion only (the first `train_split`
# samples).

train_split = 2 * 365 * 24  # presumably two years of hourly samples — TODO confirm

train_values = df_data[:train_split]
df_train_mean = train_values.mean()
df_train_std = train_values.std()

df_data = (df_data - df_train_mean) / df_train_std


In [None]:
# Train/validation split: samples before `train_split` feed the training set,
# everything from `train_split` onwards feeds the validation set.
#
# NOTE(review): the notebook title says "one hour in the future" while
# future_size is 0 — presumably chop_data treats 0 as "the next step"; confirm
# against modelling.chop_data.

history_size = 20
future_size = 0

# Both calls share the same window settings.
window_args = (history_size, future_size)

x_train, y_train = modelling.chop_data(df_data, 0, train_split, *window_args)
x_val, y_val = modelling.chop_data(df_data, train_split, None, *window_args)

## Begin loading different models and histories using TensorFlow

In [None]:
# Discover the saved models for this pollutant. Sorting makes the order (and
# therefore the printed list and anything derived from it) deterministic,
# because glob returns matches in arbitrary order.
file_paths = sorted(glob.glob('../src/models/{}/*.h5'.format(pollutant)))

# Model name = file name without directory and without the '.h5' extension.
# os.path is used instead of searching for '/' so this also works where paths
# are backslash-separated.
model_names = [os.path.splitext(os.path.basename(f))[0] for f in file_paths]
print(model_names)

In [None]:
# For every saved model, record the lowest 'mae' value in its training history
# and the position in the history at which it occurs, then plot one point per
# model.
# NOTE(review): this reads the 'mae' entry of the history; if the comparison is
# meant to use validation error, a different key may be intended — confirm.
losses, labels, epoch = [], [], []

for model_name in model_names:
    model_hist = modelling.load_history(model_name, subd='{}/'.format(pollutant))
    mae_curve = model_hist['mae']
    best_mae = min(mae_curve)

    losses.append(best_mae)
    epoch.append(mae_curve.index(best_mae))  # first occurrence of the minimum
    labels.append(model_name)

df_losses = pd.DataFrame(data={'losses': losses, 'epoch': epoch, 'model': labels})

sns.scatterplot(data=df_losses, x='epoch', y='losses', hue='model', legend='full')