In [58]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error

from airpollution_trf_graph_loader import AirpollutionDatasetLoader

In [59]:
# Target time horizons to analyze (presumably hours, per the
# "12 hours horizon" / "24 hours horizon" section titles further down).
T_lst = [12, 24]
_city = 'madrid'
# Whether traffic data is included as an input.
_include_trf = True

loader = AirpollutionDatasetLoader(_city, _include_trf)
dataset = loader.get_dataset(T=T_lst[0])

# Feature count per node; displayed as the cell output.
feature_dim = loader.get_feature_dim()
feature_dim

{'trf': 4, 'ap0': 2, 'ap1': 5, 'ap2': 2, 'ap3': 1}

In [60]:
# Node names are the keys of the feature-dimension mapping.
target_nodes = list(feature_dim)
target_nodes

['trf', 'ap0', 'ap1', 'ap2', 'ap3']

In [61]:
# Predictions and ground truth, keyed by '<T>_<node>_<trf|no_trf>'.
y_hat_dict = {}
y_true_dict = {}

for _trf_str in ['trf', 'no_trf']:
    for _T in T_lst:
        for k in target_nodes:
            # The traffic node is skipped for the 'no_trf' runs
            # (presumably no result file exists for that combination).
            if _trf_str == 'no_trf' and k == 'trf':
                continue

            _key = '_'.join([str(_T), k, _trf_str])

            _hat_path = os.path.join('results', f'y_hat_{_city}_{_T}_{k}_{_trf_str}.csv')
            y_hat_dict[_key] = pd.read_csv(_hat_path, index_col=0)

            _true_path = os.path.join('results', f'y_true_{_city}_{_T}_{k}_{_trf_str}.csv')
            y_true_dict[_key] = pd.read_csv(_true_path, index_col=0)

In [62]:
# Inspect the loaded ground-truth frames: one DataFrame per
# '<T>_<node>_<trf|no_trf>' key.
y_true_dict

{'12_trf_trf':        Car  Motorcycle   Bus  Truck
 0     85.0         0.0   8.0    2.0
 1     14.0         0.0   0.0    0.0
 2    105.0         1.0   2.0    7.0
 3     38.0         4.0   0.0    0.0
 4     32.0        43.0   5.0    1.0
 ..     ...         ...   ...    ...
 177   71.0         2.0  15.0    9.0
 178    8.0         0.0   0.0    0.0
 179    3.0         0.0   0.0    1.0
 180   80.0         0.0   2.0   17.0
 181   48.0         1.0   2.0   30.0
 
 [182 rows x 4 columns],
 '12_ap0_trf':      CO  SO2
 0   0.7  2.0
 1   0.6  1.0
 2   0.5  1.0
 3   0.4  1.0
 4   0.4  1.0
 5   0.4  1.0
 6   0.3  1.0
 7   0.3  1.0
 8   0.4  1.0
 9   0.4  1.0
 10  0.6  3.0
 11  0.8  3.0
 12  0.7  3.0
 13  0.5  2.0,
 '12_ap1_trf':      CO    O3  PM10  PM25  SO2
 0   1.1   4.0  82.0  26.0  1.0
 1   1.3   4.0  65.0  22.0  1.0
 2   0.7   7.0  48.0  18.0  1.0
 3   0.2  45.0  34.0  17.0  1.0
 4   0.1  65.0  19.0  10.0  1.0
 5   0.1  52.0  24.0  12.0  1.0
 6   0.1  52.0  22.0  11.0  1.0
 7   0.1  52.0  16.0

In [63]:
def mape_fn(actual, pred, eps=0.01):
    """Mean absolute percentage error (in percent) with an offset denominator.

    Computes ``mean(|actual - pred| / (actual + eps)) * 100``. The offset
    ``eps`` keeps the ratio finite when ``actual`` contains zeros; because it
    is added to every denominator, the result is not the textbook MAPE unless
    ``eps`` is 0.

    Parameters
    ----------
    actual : array-like
        Observed values.
    pred : array-like
        Predicted values, broadcastable against ``actual``.
    eps : float, default 0.01
        Constant added to the denominator. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    float
        The error as a percentage.
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # NOTE: the denominator is `actual + eps`, not `|actual| + eps`, so a
    # value of exactly -eps in `actual` still divides by zero.
    return np.mean(np.abs((actual - pred) / (actual + eps))) * 100

def compute_metrics_as_dataframe_fn(y_valid, y_hat, particle_name, time_horizon=None):
    """Compute error metrics and return them as a one-row DataFrame.

    Parameters
    ----------
    y_valid : array-like
        Ground-truth values.
    y_hat : array-like
        Predicted values, same shape as ``y_valid``.
    particle_name : object
        Label stored (as ``str``) in the ``particle`` column.
    time_horizon : optional
        Value stored in the ``T`` column. The previous implementation read an
        undefined name ``time_horizon``; the resulting ``NameError`` was
        swallowed by a bare ``except`` and the function always returned
        ``None``. It is now an explicit, optional argument.

    Returns
    -------
    pandas.DataFrame or None
        One row with columns ``T particle MAE MSE RMSE CVRMSE MAPE``, or
        ``None`` when the metrics cannot be computed from the inputs.
    """
    try:
        mae = mean_absolute_error(y_valid, y_hat)
        mse = mean_squared_error(y_valid, y_hat)
        # `mean_squared_error(..., squared=False)` is no longer accepted by
        # recent scikit-learn releases. For 1-D inputs sqrt(MSE) is the same
        # value; for multi-output inputs older releases averaged per-output
        # RMSEs instead, so results may differ there.
        rmse = np.sqrt(mse)
        cvrmse = (rmse / np.mean(y_valid)) * 100  # it is a percentage
        mape = mape_fn(y_valid, y_hat)
    except (ValueError, TypeError):
        # Keep the best-effort contract for invalid inputs (e.g. mismatched
        # lengths, non-numeric data) without hiding programming errors such
        # as NameError.
        return None

    record = (time_horizon, str(particle_name), mae, mse, rmse, cvrmse, mape)
    return pd.DataFrame.from_records(
        [record], columns='T particle MAE MSE RMSE CVRMSE MAPE'.split()
    )

def compute_metrics(y_valid, y_hat):
    """Compute MAE, MSE, RMSE, CV(RMSE) and MAPE for a pair of series.

    Parameters
    ----------
    y_valid : array-like
        Ground-truth values.
    y_hat : array-like
        Predicted values, same shape as ``y_valid``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, cvrmse, mape)``. ``cvrmse`` and ``mape`` are
        percentages; ``cvrmse`` is RMSE divided by the mean of ``y_valid``.
    """
    mae = mean_absolute_error(y_valid, y_hat)
    mse = mean_squared_error(y_valid, y_hat)
    # `mean_squared_error(..., squared=False)` is no longer accepted by recent
    # scikit-learn releases. For 1-D inputs sqrt(MSE) is the same value; for
    # multi-output inputs older releases averaged per-output RMSEs instead,
    # so results may differ there.
    rmse = np.sqrt(mse)
    cvrmse = (rmse / np.mean(y_valid)) * 100  # it is a percentage
    mape = mape_fn(y_valid, y_hat)

    return mae, mse, rmse, cvrmse, mape


In [73]:
# Accumulate one record per (T, traffic flag, node, row) and one per
# (T, traffic flag, node, column); both are turned into DataFrames below.
metrics_by_sensors = []
metrics_by_pollutants = []

for _trf_str in ['trf', 'no_trf']:
    for _T in T_lst:
        for k in target_nodes:
            # Only the non-traffic nodes are evaluated here.
            if k == 'trf':
                continue

            _key = '_'.join([str(_T), k, _trf_str])
            y_true_df = y_true_dict[_key]
            y_hat_df = y_hat_dict[_key]

            # Metrics per row of the result frames (stored under column 't').
            for i in range(y_true_df.shape[0]):
                _row_metrics = compute_metrics(y_true_df.iloc[i], y_hat_df.iloc[i])
                metrics_by_sensors.append((_T, _trf_str, k, i, *_row_metrics))

            # Metrics per column of the result frames (stored under 'pollutant').
            for c in y_true_df.columns:
                _col_metrics = compute_metrics(y_true_df[c], y_hat_df[c])
                metrics_by_pollutants.append((_T, _trf_str, k, c, *_col_metrics))

metrics_by_sensors_df = pd.DataFrame.from_records(
    metrics_by_sensors,
    columns='T traffic sensor t MAE MSE RMSE CVRMSE MAPE'.split(),
)
metrics_by_pollutants_df = pd.DataFrame.from_records(
    metrics_by_pollutants,
    columns='T traffic sensor pollutant MAE MSE RMSE CVRMSE MAPE'.split(),
)

# Persist both tables next to the prediction files.
metrics_by_sensors_df.to_csv(os.path.join('results', f'metrics_by_sensor_{_city}.csv'))
metrics_by_pollutants_df.to_csv(os.path.join('results', f'metrics_by_pollutant_{_city}.csv'))

## Metrics by sensor

In [74]:
# One row per (T, traffic flag, sensor node, row index t) with its error metrics.
metrics_by_sensors_df

Unnamed: 0,T,traffic,sensor,t,MAE,MSE,RMSE,CVRMSE,MAPE
0,12,trf,ap0,0,0.183264,0.033611,0.183334,13.580287,17.694900
1,12,trf,ap0,1,0.668449,0.722908,0.850240,106.280010,70.825616
2,12,trf,ap0,2,0.593417,0.644219,0.802632,107.017636,61.325322
3,12,trf,ap0,3,0.553202,0.572634,0.756726,108.103691,57.443420
4,12,trf,ap0,4,0.553202,0.572634,0.756726,108.103691,57.443420
...,...,...,...,...,...,...,...,...,...
211,24,no_trf,ap3,8,30.221657,913.348552,30.221657,38.745714,38.740747
212,24,no_trf,ap3,9,12.221664,149.369071,12.221664,20.369440,20.366046
213,24,no_trf,ap3,10,25.778180,664.514564,25.778180,117.173545,117.120309
214,24,no_trf,ap3,11,38.777054,1503.659917,38.777054,430.856156,430.377958


12 hours horizon

In [105]:
# Mean of every metric per (sensor, T, traffic) for the T == 12 rows.
metric_agg_mean_df = (
    metrics_by_sensors_df
    .loc[metrics_by_sensors_df['T'] == 12]
    .groupby(['sensor', 'T', 'traffic'])
    .mean()
)
metric_agg_mean_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,t,MAE,MSE,RMSE,CVRMSE,MAPE
sensor,T,traffic,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ap0,12,no_trf,6.5,0.51284,0.521187,0.679843,80.580752,46.38876
ap0,12,trf,6.5,0.503952,0.485413,0.657824,78.915635,46.802612
ap1,12,no_trf,6.5,8.136476,226.84196,12.371604,80.016068,83.727835
ap1,12,trf,6.5,8.144618,227.880738,12.377571,80.182893,84.082153
ap2,12,no_trf,6.5,6.596555,90.393406,7.167713,31.532781,32.055055
ap2,12,trf,6.5,6.754119,95.931244,7.357554,32.072751,32.316755
ap3,12,no_trf,6.5,20.435825,628.716858,20.435825,195.018823,194.774247
ap3,12,trf,6.5,20.616298,637.031174,20.616298,196.295597,196.049632


In [116]:
# Same T == 12 subset, additionally grouped by row index 't' and flattened
# back to regular columns; then show only the t == 0 rows.
metric_agg_mean_df = (
    metrics_by_sensors_df
    .loc[metrics_by_sensors_df['T'] == 12]
    .groupby(['sensor', 'T', 'traffic', 't'])
    .mean()
    .reset_index()
)
metric_agg_mean_df.loc[metric_agg_mean_df['t'] == 0]

Unnamed: 0,sensor,T,traffic,t,MAE,MSE,RMSE,CVRMSE,MAPE
0,ap0,12,no_trf,0,0.171822,0.030675,0.175143,12.973522,14.828314
14,ap0,12,trf,0,0.183264,0.033611,0.183334,13.580287,17.6949
28,ap1,12,no_trf,0,23.712218,1129.818984,33.61278,147.295268,228.315641
42,ap1,12,trf,0,23.77216,1134.174219,33.677503,147.578892,229.818393
56,ap2,12,no_trf,0,23.92677,673.819899,25.958041,64.093929,58.055435
70,ap2,12,trf,0,24.590072,711.876908,26.681022,65.879065,59.655255
84,ap3,12,no_trf,0,37.34159,1394.394344,37.34159,414.906556,414.44606
98,ap3,12,trf,0,37.552216,1410.168927,37.552216,417.246844,416.783751


In [117]:
# Rows of the aggregated table whose 't' equals 6.
metric_agg_mean_df.loc[metric_agg_mean_df['t'] == 6]

Unnamed: 0,sensor,T,traffic,t,MAE,MSE,RMSE,CVRMSE,MAPE
6,ap0,12,no_trf,6,0.548023,0.485356,0.696675,107.180809,67.436402
20,ap0,12,trf,6,0.563594,0.508685,0.713221,109.726343,69.942933
34,ap1,12,no_trf,6,3.645186,30.765508,5.546666,32.210606,58.544499
48,ap1,12,trf,6,3.583294,29.361068,5.418585,31.466814,58.420831
62,ap2,12,no_trf,6,0.069033,0.007716,0.087841,0.732011,0.477076
76,ap2,12,trf,6,0.023848,0.000598,0.02445,0.20375,0.206524
90,ap3,12,no_trf,6,3.342243,11.170588,3.342243,7.772658,7.770851
104,ap3,12,trf,6,3.55262,12.621109,3.55262,8.261907,8.259986


24 hours horizon

In [106]:
# Mean of every metric per (sensor, T, traffic) for the T == 24 rows.
metric_agg_mean_df = (
    metrics_by_sensors_df
    .loc[metrics_by_sensors_df['T'] == 24]
    .groupby(['sensor', 'T', 'traffic'])
    .mean()
)
metric_agg_mean_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,t,MAE,MSE,RMSE,CVRMSE,MAPE
sensor,T,traffic,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ap0,24,no_trf,6.0,0.49877,0.422187,0.623977,89.43418,62.004607
ap0,24,trf,6.0,0.492978,0.421491,0.61411,88.988352,62.147832
ap1,24,no_trf,6.0,11.883944,442.506319,19.812511,90.681639,89.715126
ap1,24,trf,6.0,12.947507,528.645663,21.98092,100.655449,94.791434
ap2,24,no_trf,6.0,6.423042,89.755349,7.123702,44.285752,80.918402
ap2,24,trf,6.0,6.557291,93.669483,7.291282,45.037185,81.491256
ap3,24,no_trf,6.0,34.27296,1388.471582,34.27296,110.202857,110.110775
ap3,24,trf,6.0,35.235276,1456.796438,35.235276,108.093768,108.005571


## Metrics by pollutants

12 hours horizon

In [93]:
# Mean of every metric per (pollutant, T, traffic) for the T == 12 rows;
# the 'sensor' column is dropped before grouping.
metric_agg_mean_df = (
    metrics_by_pollutants_df
    .loc[metrics_by_pollutants_df['T'] == 12]
    .drop(columns='sensor')
    .groupby(['pollutant', 'T', 'traffic'])
    .mean()
)
metric_agg_mean_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,MAE,MSE,RMSE,CVRMSE,MAPE
pollutant,T,traffic,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CO,12,no_trf,0.174309,0.075459,0.23254,60.619186,61.55286
CO,12,trf,0.182764,0.07783,0.244551,63.022682,62.86865
O3,12,no_trf,20.101174,576.037851,23.975632,82.836036,209.189073
O3,12,trf,20.212866,582.827119,24.11564,83.325453,210.71934
PM10,12,no_trf,11.831657,350.377586,17.842101,58.127157,30.005989
PM10,12,trf,11.955747,355.467879,18.048225,58.887057,30.250458
PM25,12,no_trf,4.986472,45.350481,6.66365,51.274432,35.679977
PM25,12,trf,5.018858,45.762854,6.699637,51.557459,35.693913
SO2,12,no_trf,0.574884,0.536547,0.606338,42.137236,48.722628
SO2,12,trf,0.557528,0.498407,0.587221,40.920023,47.817205


24 hours horizon

In [102]:
# Mean of every metric per (pollutant, T, traffic) for the T == 24 rows;
# the 'sensor' column is dropped before grouping.
metric_agg_mean_df = (
    metrics_by_pollutants_df
    .loc[metrics_by_pollutants_df['T'] == 24]
    .drop(columns='sensor')
    .groupby(['pollutant', 'T', 'traffic'])
    .mean()
)
metric_agg_mean_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,MAE,MSE,RMSE,CVRMSE,MAPE
pollutant,T,traffic,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CO,24,no_trf,0.16173,0.049124,0.209571,53.801237,56.44544
CO,24,trf,0.19328,0.076647,0.253301,65.150542,68.792539
O3,24,no_trf,33.297134,1328.941089,36.445495,56.574977,140.088859
O3,24,trf,36.279808,1563.173797,39.514072,61.423222,145.799684
PM10,24,no_trf,14.272245,513.516515,20.87762,66.863557,39.679518
PM10,24,trf,14.442894,528.847431,21.21945,68.011751,39.365388
PM25,24,no_trf,5.474587,47.724929,6.834071,74.561665,132.662668
PM25,24,trf,5.601102,51.575422,7.082067,77.152896,130.986727
SO2,24,no_trf,0.562456,0.447467,0.586045,50.086727,53.389726
SO2,24,trf,0.519592,0.430053,0.549972,46.492532,49.676121


In [72]:
# Final cell: prints a closing message.
print("That's all folks!")

That's all folks!
