In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error

from airpollution_trf_graph_loader import AirpollutionDatasetLoader

In [2]:
# Notebook-wide configuration: every later cell reads `_T` and `_city` to
# locate the stored result files and to label the metric rows.
_T= 12 #target time horizon to analyze
_city= 'madrid'

# Project-local loader (airpollution_trf_graph_loader); the exact semantics of
# get_dataset / get_feature_dim are defined there and are not visible here.
loader= AirpollutionDatasetLoader(_city)
dataset=loader.get_dataset(T=_T)

# Mapping keyed by node name (see the displayed output); the values are
# presumably the number of features per node -- confirm against the loader.
feature_dim= loader.get_feature_dim()
feature_dim

{'trf': 4, 'ap0': 2, 'ap1': 5, 'ap2': 2, 'ap3': 1}

In [3]:
# Node names to evaluate: the keys of `feature_dim`.
target_nodes= list(feature_dim.keys())
target_nodes

['trf', 'ap0', 'ap1', 'ap2', 'ap3']

In [4]:
# One DataFrame per target node, keyed by node name.
y_hat_dict, y_true_dict = {}, {}

for k in target_nodes:
    # Stored predictions for node k (first CSV column is the index).
    y_hat_dict[k] = pd.read_csv(os.path.join('results', f'y_hat_{_city}_{_T}_{k}.csv'), index_col=0)

    # Matching ground truth; `_df` stays bound to the last frame read.
    _df = pd.read_csv(os.path.join('results', f'y_true_{_city}_{_T}_{k}.csv'), index_col=0)
    y_true_dict[k] = _df

In [5]:
# Inspect the loaded ground-truth frames (one DataFrame per node name).
y_true_dict

{'trf':        Car  Motorcycle   Bus  Truck
 0     14.0         0.0   0.0    0.0
 1     85.0         0.0   8.0    2.0
 2     50.0         0.0   3.0    2.0
 3     21.0         1.0   0.0    0.0
 4    105.0         1.0   2.0    7.0
 ..     ...         ...   ...    ...
 177   60.0         0.0  21.0   11.0
 178   25.0         4.0   2.0   15.0
 179   48.0         1.0   2.0   30.0
 180    3.0         0.0   0.0    1.0
 181   71.0         2.0  15.0    9.0
 
 [182 rows x 4 columns],
 'ap0':      CO  SO2
 0   0.7  2.0
 1   0.6  1.0
 2   0.5  1.0
 3   0.4  1.0
 4   0.4  1.0
 5   0.4  1.0
 6   0.3  1.0
 7   0.3  1.0
 8   0.4  1.0
 9   0.4  1.0
 10  0.6  3.0
 11  0.8  3.0
 12  0.7  3.0
 13  0.5  2.0,
 'ap1':      CO    O3  PM10  PM25  SO2
 0   1.1   4.0  82.0  26.0  1.0
 1   1.3   4.0  65.0  22.0  1.0
 2   0.7   7.0  48.0  18.0  1.0
 3   0.2  45.0  34.0  17.0  1.0
 4   0.1  65.0  19.0  10.0  1.0
 5   0.1  52.0  24.0  12.0  1.0
 6   0.1  52.0  22.0  11.0  1.0
 7   0.1  52.0  16.0   7.0  1.0
 8   0.1 

In [6]:
def mape_fn(actual, pred, eps=0.01):
    """Return the smoothed mean absolute percentage error, as a percentage.

    Computes ``mean(|(actual - pred) / (actual + eps)|) * 100`` over all
    elements.

    Parameters
    ----------
    actual : array-like
        Observed values.
    pred : array-like
        Predicted values, broadcastable against ``actual``.
    eps : float, optional
        Constant added to the denominator so that zero entries in ``actual``
        do not divide by zero. The default (0.01) reproduces the value that
        was previously hard-coded.

    Returns
    -------
    float
        The error expressed in percent.
    """
    # Coerce to float arrays so lists, Series and integer inputs all behave alike.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return np.mean(np.abs((actual - pred) / (actual + eps))) * 100

def compute_metrics_as_dataframe_fn(y_valid, y_hat, particle_name, time_horizon=None):
    """Compute the error metrics for one target and return them as a one-row DataFrame.

    Parameters
    ----------
    y_valid : array-like
        Ground-truth values.
    y_hat : array-like
        Predicted values, same shape as ``y_valid``.
    particle_name : object
        Label stored (as ``str``) in the ``particle`` column.
    time_horizon : optional
        Value stored in the ``T`` column. When omitted, the notebook-level
        ``_T`` is used if it is defined, otherwise ``None``.

    Returns
    -------
    pandas.DataFrame or None
        A single-row frame with columns ``T particle MAE MSE RMSE CVRMSE MAPE``,
        or ``None`` when the metrics cannot be computed for the given inputs
        (e.g. mismatched lengths or non-numeric data).
    """
    if time_horizon is None:
        # The previous body read an undefined global named `time_horizon`; the
        # resulting NameError was swallowed by a bare `except`, so the function
        # always returned None. Fall back to the notebook's horizon setting.
        time_horizon = globals().get('_T')

    try:
        mae = mean_absolute_error(y_valid, y_hat)
        mse = mean_squared_error(y_valid, y_hat)
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
        # Taking the root of the per-output MSE and averaging gives the same
        # RMSE on every scikit-learn version.
        rmse = np.sqrt(mean_squared_error(y_valid, y_hat, multioutput='raw_values')).mean()
        cvrmse = (rmse / np.mean(y_valid)) * 100  # it is a percentage
        mape = mape_fn(y_valid, y_hat)
    except (ValueError, TypeError, ArithmeticError):
        # Best effort: invalid inputs yield None, but programming errors
        # (NameError, AttributeError, ...) are no longer hidden.
        return None

    metrics = [(time_horizon, str(particle_name), mae, mse, rmse, cvrmse, mape)]
    return pd.DataFrame.from_records(metrics, columns='T particle MAE MSE RMSE CVRMSE MAPE'.split())

def compute_metrics(y_valid, y_hat):
    """Compute the regression error metrics of ``y_hat`` against ``y_valid``.

    Parameters
    ----------
    y_valid : array-like
        Ground-truth values.
    y_hat : array-like
        Predicted values, same shape as ``y_valid``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, cvrmse, mape)`` where ``cvrmse`` is the RMSE divided
        by the mean of ``y_valid`` and, like ``mape``, is expressed in percent.
    """
    mae = mean_absolute_error(y_valid, y_hat)
    mse = mean_squared_error(y_valid, y_hat)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    # Taking the root of the per-output MSE and averaging gives the same RMSE
    # on every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y_valid, y_hat, multioutput='raw_values')).mean()
    cvrmse = (rmse / np.mean(y_valid)) * 100  # it is a percentage
    mape = mape_fn(y_valid, y_hat)

    return mae, mse, rmse, cvrmse, mape


In [26]:
# Accumulate one record per (node, row) and one per (node, column), then build
# both summary frames in a single pass.
metrics_by_sensors= []
metrics_by_pollutants= []
for k in target_nodes:
    # The traffic node is excluded; only the remaining nodes are scored.
    if k == 'trf':
        continue

    y_true_df= y_true_dict[k]
    y_hat_df= y_hat_dict[k]

    # Metrics per row of the frame, computed across that row's columns.
    # NOTE(review): the original comment called rows "stations" while the
    # output column is labelled `t` -- confirm what a row represents.
    for i in range(y_true_df.shape[0]):
        mae, mse, rmse, cvrmse, mape= compute_metrics(y_true_df.iloc[i], y_hat_df.iloc[i])
        metrics_by_sensors.append((_T, k, i, mae, mse, rmse, cvrmse, mape))

    # Metrics per column (pollutant), computed across all rows.
    for c in y_true_df.columns:
        mae, mse, rmse, cvrmse, mape= compute_metrics(y_true_df[c], y_hat_df[c])
        metrics_by_pollutants.append((_T, k, c, mae, mse, rmse, cvrmse, mape))


metrics_by_sensors_df = pd.DataFrame.from_records(metrics_by_sensors, columns='T sensor t MAE MSE RMSE CVRMSE MAPE'.split())
metrics_by_pollutants_df = pd.DataFrame.from_records(metrics_by_pollutants, columns='T sensor pollutant MAE MSE RMSE CVRMSE MAPE'.split())

# Persist under a dedicated name. The previous target,
# results/y_hat_{_city}_{_T}_{k}.csv with the leaked loop variable `k`,
# overwrote the last node's prediction file with this metrics table.
metrics_by_sensors_df.to_csv(os.path.join('results',f'metrics_by_sensors_{_city}_{_T}.csv'))


In [27]:
# Per-row metrics for every non-traffic node (`sensor` holds the node name, `t` the row position).
metrics_by_sensors_df

Unnamed: 0,T,sensor,t,MAE,MSE,RMSE,CVRMSE,MAPE
0,12,ap0,0,0.708784,0.901121,0.949274,70.3166,38.784603
1,12,ap0,1,0.260853,0.105783,0.325243,40.655375,27.988694
2,12,ap0,2,0.245049,0.107424,0.327756,43.700796,25.591688
3,12,ap0,3,0.2958,0.117956,0.343448,49.063989,38.073013
4,12,ap0,4,0.2958,0.117956,0.343448,49.063989,38.073013
5,12,ap0,5,0.2958,0.117956,0.343448,49.063989,38.073013
6,12,ap0,6,0.346548,0.137379,0.370647,57.022613,58.354409
7,12,ap0,7,0.346548,0.137379,0.370647,57.022613,58.354409
8,12,ap0,8,0.2958,0.117956,0.343448,49.063989,38.073013
9,12,ap0,9,0.2958,0.117956,0.343448,49.063989,38.073013


In [32]:
# Average of every numeric column per node; note that `T` and `t` are averaged too and carry no meaning here.
metrics_by_sensors_df.groupby('sensor').mean()

Unnamed: 0_level_0,T,t,MAE,MSE,RMSE,CVRMSE,MAPE
sensor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ap0,12.0,6.5,0.541636,0.747207,0.698839,60.604468,40.701101
ap1,12.0,6.5,14.600348,561.209157,22.627984,140.17178,93.224983
ap2,12.0,6.5,17.794367,433.186106,19.166946,94.938969,86.111546
ap3,12.0,6.5,24.262908,829.652395,24.262908,85.135866,85.087438


In [33]:
# Spread (standard deviation) of the per-row metrics within each node.
metrics_by_sensors_df.groupby('sensor').std()

Unnamed: 0_level_0,T,t,MAE,MSE,RMSE,CVRMSE,MAPE
sensor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ap0,0.0,4.1833,0.365556,1.000831,0.52796,16.777201,9.19441
ap1,0.0,4.1833,3.915036,341.607989,7.277833,14.763683,25.94701
ap2,0.0,4.1833,7.871798,421.2093,8.418844,5.494585,6.090943
ap3,0.0,4.1833,16.108984,807.941986,16.108984,11.081809,11.11381


In [28]:
# Per-column metrics: one row for each (node, pollutant column) pair.
metrics_by_pollutants_df

Unnamed: 0,T,sensor,pollutant,MAE,MSE,RMSE,CVRMSE,MAPE
0,12,ap0,CO,0.108233,0.014844,0.121835,24.367051,25.971251
1,12,ap0,SO2,0.975039,1.479571,1.216376,77.405746,55.43095
2,12,ap1,CO,0.349834,0.162275,0.402833,110.581664,165.780764
3,12,ap1,O3,29.210154,1303.918982,36.109818,113.349206,80.463353
4,12,ap1,PM10,31.341022,1333.316818,36.514611,110.890359,93.968019
5,12,ap1,PM25,11.702805,168.484613,12.980162,98.228255,86.514429
6,12,ap1,SO2,0.397923,0.163099,0.403855,40.385514,39.39835
7,12,ap2,PM10,24.728853,724.19426,26.910858,99.145265,89.842223
8,12,ap2,PM25,10.859882,142.177953,11.92384,93.783009,82.380869
9,12,ap3,O3,24.262908,829.652395,28.803687,107.821289,85.087438


In [29]:
# Drop the string-valued `sensor` column so only numeric columns are aggregated,
# then average each pollutant's metrics over the nodes that report it.
metrics_by_pollutants_df.drop(columns='sensor').groupby('pollutant').mean()

Unnamed: 0_level_0,T,MAE,MSE,RMSE,CVRMSE,MAPE
pollutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CO,12.0,0.229033,0.088559,0.262334,67.474357,95.876007
O3,12.0,26.736531,1066.785689,32.456753,110.585247,82.775396
PM10,12.0,28.034937,1028.755539,31.712734,105.017812,91.905121
PM25,12.0,11.281343,155.331283,12.452001,96.005632,84.447649
SO2,12.0,0.686481,0.821335,0.810116,58.89563,47.41465


In [30]:
# Same grouping as the mean above, reporting the standard deviation across nodes for each pollutant.
metrics_by_pollutants_df.drop(columns='sensor').groupby('pollutant').std()

Unnamed: 0_level_0,T,MAE,MSE,RMSE,CVRMSE,MAPE
pollutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CO,0.0,0.170838,0.104249,0.198696,60.962937,98.860255
O3,0.0,3.498231,335.35712,5.166215,3.908827,3.269722
PM10,0.0,4.675509,430.714691,6.790879,8.305035,2.917379
PM25,0.0,0.596037,18.601618,0.746933,3.143264,2.922869
SO2,0.0,0.408082,0.930886,0.574539,26.177257,11.33676


In [34]:
# End-of-notebook marker: reaching this cell means every cell above ran.
print("That's all folks!")

That's all folks!
