In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error

from airpollution_trf_graph_loader import AirpollutionDatasetLoader

In [2]:
# Notebook configuration. These module-level names are read by the later
# cells when building result-file names and iterating over horizons.
T_lst= [6,12] # target time horizons to analyze
_city= 'madrid' # city identifier; passed to the loader and embedded in result-file names
_include_trf= True # whether traffic data is included as input

# Build the dataset loader for the chosen city / traffic setting.
loader= AirpollutionDatasetLoader(_city, _include_trf)
# Load the dataset for the first horizon only.
# NOTE(review): `dataset` is not referenced again in the visible cells —
# presumably this call is needed before get_feature_dim(); confirm.
dataset=loader.get_dataset(T=T_lst[0])

# Mapping from node name to an integer (see the displayed output);
# its keys are used below as the list of target nodes.
feature_dim= loader.get_feature_dim()
feature_dim

{'trf': 4, 'ap0': 2, 'ap1': 5, 'ap2': 2, 'ap3': 1}

In [3]:
# Node names to evaluate: the keys of the feature-dimension mapping,
# in the mapping's own order.
target_nodes = list(feature_dim)
target_nodes

['trf', 'ap0', 'ap1', 'ap2', 'ap3']

In [4]:
# Load the stored predictions (y_hat) and ground truth (y_true) for every
# (horizon, node) pair. Both dictionaries are keyed by "<T>_<node>".

# Suffix used in the result-file names to tell the two traffic settings apart.
_trf_str = 'trf' if _include_trf else 'no_trf'

y_hat_dict = {}
y_true_dict = {}

for _T in T_lst:
    for k in target_nodes:
        _key = f'{_T}_{k}'

        _hat_path = os.path.join('results', f'y_hat_{_city}_{_T}_{k}_{_trf_str}.csv')
        y_hat_dict[_key] = pd.read_csv(_hat_path, index_col=0)

        _true_path = os.path.join('results', f'y_true_{_city}_{_T}_{k}_{_trf_str}.csv')
        y_true_dict[_key] = pd.read_csv(_true_path, index_col=0)

In [5]:
# Inspect the loaded ground-truth tables (one DataFrame per "<T>_<node>" key).
y_true_dict

{'6_trf':        Car  Motorcycle   Bus  Truck
 0     45.0         0.0  22.0    0.0
 1     33.0         1.0   0.0    0.0
 2     50.0         3.0   6.0   28.0
 3     20.0         0.0   2.0    1.0
 4     16.0         5.0   1.0    1.0
 ..     ...         ...   ...    ...
 177  167.0         5.0  54.0   13.0
 178   17.0         0.0   0.0    0.0
 179   13.0         0.0   0.0    2.0
 180  168.0         0.0   7.0   17.0
 181  139.0         0.0   5.0   20.0
 
 [182 rows x 4 columns],
 '6_ap0':      CO  SO2
 0   0.3  1.0
 1   0.3  1.0
 2   0.4  1.0
 3   0.4  1.0
 4   0.6  3.0
 5   0.8  3.0
 6   0.7  3.0
 7   0.5  2.0
 8   0.5  1.0
 9   0.5  1.0
 10  0.5  1.0
 11  0.4  1.0
 12  0.3  1.0
 13  0.3  1.0,
 '6_ap1':      CO    O3  PM10  PM25  SO2
 0   0.1  52.0  22.0  11.0  1.0
 1   0.1  52.0  16.0   7.0  1.0
 2   0.1  31.0  17.0   7.0  1.0
 3   0.2  12.0  22.0   9.0  1.0
 4   0.3  10.0  26.0  12.0  1.0
 5   0.4  18.0  34.0  16.0  1.0
 6   0.2  39.0  31.0  11.0  1.0
 7   0.2  55.0  21.0   7.0  1.0
 8 

In [6]:
def mape_fn(actual, pred, eps=0.01):
    """Mean absolute percentage error with an offset in the denominator.

    Computes ``mean(|(actual - pred) / (actual + eps)|) * 100``.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    pred : array-like
        Predicted values; must broadcast against ``actual``.
    eps : float, optional
        Constant added to ``actual`` in the denominator so that zero-valued
        targets do not divide by zero. Defaults to ``0.01`` (the value that
        was previously hard-coded).

    Returns
    -------
    float
        The error expressed as a percentage.
    """
    actual, pred = np.asarray(actual), np.asarray(pred)
    # The offset only guards against actual == 0; a value of -eps would
    # still produce a zero denominator.
    return np.mean(np.abs((actual - pred) / (actual + eps))) * 100

def compute_metrics_as_dataframe_fn(y_valid, y_hat, particle_name, time_horizon=None):
    """Compute error metrics for one target and return them as a one-row DataFrame.

    Parameters
    ----------
    y_valid : array-like
        Ground-truth values.
    y_hat : array-like
        Predicted values, same shape as ``y_valid``.
    particle_name : object
        Label stored (as ``str``) in the ``particle`` column.
    time_horizon : object, optional
        Value stored in the ``T`` column. When omitted, a module-level
        variable named ``time_horizon`` is used if one exists, else ``None``.

    Returns
    -------
    pandas.DataFrame or None
        A single row with columns ``T particle MAE MSE RMSE CVRMSE MAPE``,
        or ``None`` (after emitting a warning) if the metrics could not be
        computed.
    """
    import warnings

    if time_horizon is None:
        # The body used to read `time_horizon` as an undefined free variable;
        # the resulting NameError was swallowed by a bare `except`, so the
        # function silently returned None. Keep honouring a notebook-level
        # global for callers that relied on one.
        time_horizon = globals().get('time_horizon')

    try:
        mae = mean_absolute_error(y_valid, y_hat)
        mse = mean_squared_error(y_valid, y_hat)
        # Version-independent RMSE: the `squared=False` keyword is deprecated
        # since scikit-learn 1.4 and scheduled for removal in 1.6. Taking the
        # root per output and then averaging matches what `squared=False`
        # returned with the default uniform averaging.
        rmse = np.mean(np.sqrt(mean_squared_error(y_valid, y_hat, multioutput='raw_values')))
        cvrmse = (rmse / np.mean(y_valid)) * 100  # it is a percentage
        mape = mape_fn(y_valid, y_hat)

        record = (time_horizon, str(particle_name), mae, mse, rmse, cvrmse, mape)
        return pd.DataFrame.from_records([record], columns='T particle MAE MSE RMSE CVRMSE MAPE'.split())
    except Exception as exc:
        # Best-effort contract is preserved (None on failure), but the cause
        # is now reported and KeyboardInterrupt/SystemExit are not swallowed.
        warnings.warn(f'could not compute metrics for {particle_name!r}: {exc!r}')
        return None

def compute_metrics(y_valid, y_hat):
    """Compute a fixed set of regression error metrics.

    Parameters
    ----------
    y_valid : array-like
        Ground-truth values.
    y_hat : array-like
        Predicted values, same shape as ``y_valid``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, cvrmse, mape)`` where ``cvrmse`` is the RMSE
        divided by the mean of ``y_valid`` and, like ``mape``, is expressed
        as a percentage.
    """
    mae = mean_absolute_error(y_valid, y_hat)
    mse = mean_squared_error(y_valid, y_hat)
    # Version-independent RMSE: the `squared=False` keyword is deprecated
    # since scikit-learn 1.4 and scheduled for removal in 1.6. Taking the
    # root per output and then averaging matches what `squared=False`
    # returned with the default uniform averaging.
    rmse = np.mean(np.sqrt(mean_squared_error(y_valid, y_hat, multioutput='raw_values')))
    # NOTE(review): a zero mean of y_valid makes this ratio inf/nan.
    cvrmse = (rmse / np.mean(y_valid)) * 100  # it is a percentage
    mape = mape_fn(y_valid, y_hat)

    return mae, mse, rmse, cvrmse, mape


In [7]:
# Evaluate every non-traffic node at every horizon, along two axes:
#   * one record per row of the node's table  -> metrics_by_sensors
#   * one record per column (pollutant)       -> metrics_by_pollutants
metrics_by_sensors = []
metrics_by_pollutants = []

for _T in T_lst:
    for k in target_nodes:
        if k == 'trf':
            # Traffic targets are not scored here.
            continue

        _key = '_'.join([str(_T), k])
        y_true_df = y_true_dict[_key]
        y_hat_df = y_hat_dict[_key]

        # Metrics by station: each row is scored across all of its columns.
        # NOTE(review): rows are assumed to be stations; the index is stored
        # in a column named `t` below — confirm which meaning is intended.
        for i in range(len(y_true_df)):
            metrics_by_sensors.append(
                (_T, k, i, *compute_metrics(y_true_df.iloc[i], y_hat_df.iloc[i]))
            )

        # Metrics by pollutant: each column is scored across all rows.
        for c in y_true_df.columns:
            c_hat = y_hat_df[c].T
            c_true = y_true_df[c].T
            metrics_by_pollutants.append((_T, k, c, *compute_metrics(c_true, c_hat)))

metrics_by_sensors_df = pd.DataFrame.from_records(
    metrics_by_sensors,
    columns='T sensor t MAE MSE RMSE CVRMSE MAPE'.split(),
)
metrics_by_pollutants_df = pd.DataFrame.from_records(
    metrics_by_pollutants,
    columns='T sensor pollutant MAE MSE RMSE CVRMSE MAPE'.split(),
)

# Suffix distinguishing runs with / without traffic input.
_trf_str = 'trf' if _include_trf else 'no_trf'

# NOTE(review): `k` here is whatever the loops above left behind (the last
# entry of target_nodes), although both tables cover every non-'trf' node and
# every horizon — confirm that this file name is intended.
_results_dir = 'results'
metrics_by_sensors_df.to_csv(os.path.join(_results_dir, f'metrics_by_sensor_{_city}_{k}_{_trf_str}.csv'))
metrics_by_pollutants_df.to_csv(os.path.join(_results_dir, f'metrics_by_pollutant_{_city}_{k}_{_trf_str}.csv'))


In [8]:
# Show the per-row metrics table (one record per horizon, node and row index `t`).
metrics_by_sensors_df

Unnamed: 0,T,sensor,t,MAE,MSE,RMSE,CVRMSE,MAPE
0,6,ap0,0,0.360629,0.170361,0.412748,63.499626,53.576025
1,6,ap0,1,0.360629,0.170361,0.412748,63.499626,53.576025
2,6,ap0,2,0.309989,0.157177,0.396455,56.636433,35.244274
3,6,ap0,3,0.309989,0.157177,0.396455,56.636433,35.244274
4,6,ap0,4,1.205437,2.854099,1.689408,93.855995,41.471899
...,...,...,...,...,...,...,...,...
107,12,ap3,9,24.946202,622.313009,24.946202,92.393342,92.359135
108,12,ap3,10,11.965414,143.171139,11.965414,85.467245,85.406241
109,12,ap3,11,10.977267,120.500402,10.977267,84.440519,84.375615
110,12,ap3,12,32.962986,1086.558479,32.962986,94.179961,94.153061


In [9]:
# Mean of each metric per (node, horizon).
# NOTE(review): the row-index column `t` is averaged too (the 6.5 values in
# the output), which carries no meaning.
metrics_by_sensors_df.groupby('sensor T'.split()).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,t,MAE,MSE,RMSE,CVRMSE,MAPE
sensor,T,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ap0,6,6.5,0.554129,0.80455,0.723203,66.991496,42.399645
ap0,12,6.5,0.513819,0.733913,0.668619,56.041308,34.304251
ap1,6,6.5,16.741452,866.400066,27.027667,142.112386,77.059509
ap1,12,6.5,14.422553,555.650616,22.498355,139.245295,79.143208
ap2,6,6.5,17.240663,369.449349,18.636315,97.384761,88.391608
ap2,12,6.5,18.006476,437.475868,19.280298,95.578693,88.191979
ap3,6,6.5,51.347471,3419.485448,51.347471,93.385052,93.3603
ap3,12,6.5,24.732178,853.392487,24.732178,87.913129,87.862006


In [10]:
# Standard deviation of each metric per (node, horizon); as with the mean,
# the row-index column `t` is included in the aggregation.
metrics_by_sensors_df.groupby('sensor T'.split()).std()

Unnamed: 0_level_0,Unnamed: 1_level_0,t,MAE,MSE,RMSE,CVRMSE,MAPE
sensor,T,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ap0,6,4.1833,0.396005,1.138711,0.550621,15.404114,9.689695
ap0,12,4.1833,0.413008,1.033483,0.555813,20.013859,10.722629
ap1,6,4.1833,6.867617,688.591522,12.097913,19.089582,11.320627
ap1,12,4.1833,3.934542,340.238575,7.299341,15.083662,16.45371
ap2,6,4.1833,4.609075,163.205133,4.882618,5.113562,4.706253
ap2,12,4.1833,7.866264,422.616434,8.414473,5.244977,4.959602
ap3,6,4.1833,29.037001,3213.809486,29.037001,4.882469,4.899631
ap3,12,4.1833,16.133973,823.291183,16.133973,9.150074,9.185714


In [11]:
# Show the per-column metrics table (one record per horizon, node and pollutant).
metrics_by_pollutants_df

Unnamed: 0,T,sensor,pollutant,MAE,MSE,RMSE,CVRMSE,MAPE
0,6,ap0,CO,0.093797,0.012873,0.113457,24.436982,23.163402
1,6,ap0,SO2,1.014461,1.596227,1.263419,84.227909,61.635888
2,6,ap1,CO,0.089356,0.010062,0.100308,50.153869,56.129152
3,6,ap1,O3,52.9949,3714.623432,60.947711,108.282735,89.729296
4,6,ap1,PM10,22.155458,541.594764,23.272189,94.437867,89.126347
5,6,ap1,PM25,7.687462,75.153676,8.669122,86.69122,73.07686
6,6,ap1,SO2,0.780083,0.618396,0.786381,78.638149,77.235893
7,6,ap2,PM10,24.124971,620.579494,24.911433,95.02999,91.339763
8,6,ap2,PM25,10.356354,118.319203,10.877463,91.73764,85.443452
9,6,ap3,O3,51.347471,3419.485448,58.476367,108.721,93.3603


In [12]:
# Mean of each metric per (pollutant, horizon), pooled over the nodes that
# report that pollutant. The string-valued `sensor` column is dropped first,
# presumably so that only numeric columns reach the aggregation.
metrics_by_pollutants_df.drop(columns='sensor').groupby('pollutant T'.split()).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,MSE,RMSE,CVRMSE,MAPE
pollutant,T,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CO,6,0.091576,0.011467,0.106883,37.295426,39.646277
CO,12,0.181504,0.082047,0.252502,64.945654,59.049088
O3,6,52.171186,3567.05444,59.712039,108.501867,91.544798
O3,12,27.070719,1082.461155,32.713966,111.516282,85.301801
PM10,6,23.140215,581.087129,24.091811,94.733928,90.233055
PM10,12,27.797228,1016.094404,31.535333,104.47099,90.916457
PM25,6,9.021908,96.736439,9.773293,89.21443,79.260156
PM25,12,11.230173,154.587672,12.432927,95.922149,83.95891
SO2,6,0.897272,1.107311,1.0249,81.433029,69.43589
SO2,12,0.663142,0.807286,0.803235,58.398915,45.058997


In [13]:
# Standard deviation of each metric per (pollutant, horizon) across nodes,
# computed after dropping the string-valued `sensor` column.
metrics_by_pollutants_df.drop(columns='sensor').groupby('pollutant T'.split()).std()

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,MSE,RMSE,CVRMSE,MAPE
pollutant,T,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CO,6,0.00314,0.001988,0.009298,18.184585,23.310305
CO,12,0.129417,0.096587,0.191261,58.681015,59.509313
O3,6,1.164908,208.69407,1.747504,0.309901,2.567508
O3,12,3.307197,323.952017,4.95128,3.059293,3.620676
PM10,6,1.392656,55.850638,1.159121,0.418694,1.565122
PM10,12,4.374097,414.707477,6.575283,7.661718,1.65374
PM25,6,1.887191,30.522637,1.561533,3.568358,8.744501
PM25,12,0.111022,3.51838,0.141494,1.524508,3.787211
SO2,6,0.165731,0.691431,0.337316,3.952557,11.030869
SO2,12,0.388205,0.9147,0.569385,25.932501,9.305321


In [14]:
# End-of-notebook marker: reaching this cell means every cell above ran.
print("That's all folks!")

That's all folks!
