In [1]:
from os.path import join, basename, splitext
from glob import glob
from dask import dataframe as dd
from matplotlib import rcParams
import pandas as pd
import dask
from collections import Counter
import pickle
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error, r2_score

from deep_aqi import ROOT


# Display configuration for rich DataFrame output in this notebook.
# Fully qualified option names are used on purpose: pandas resolves abbreviated
# keys by regex match, and short forms such as 'max_columns' / 'max_rows' can
# match more than one option in newer pandas releases (e.g. the
# 'styler.render.*' options), which makes set_option raise OptionError.
pd.set_option('display.expand_frame_repr', False)  # do not wrap wide frames across lines
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 25)

In [2]:
# Data directories for each pipeline stage, all located under <ROOT>/data.
PROCESSED_DATA, INTERIM_DATA, RAW_DATA = (
    join(ROOT, 'data', stage) for stage in ('processed', 'interim', 'raw')
)

In [3]:
# Read the processed '88101' parquet file into a pandas DataFrame.
# NOTE(review): 88101 looks like the parameter code for the
# 'PM2.5 - Local Conditions' measurements shown below - confirm.
pm25_path = join(PROCESSED_DATA, '88101.parquet')
pm25 = pd.read_parquet(path=pm25_path)

In [4]:
# Preview the first five rows to check the columns that were loaded.
pm25.head(n=5)

Unnamed: 0_level_0,SiteCode,LocalDate,WindDir,WindSpeed,Temperature,Pressure,RelHum,PM2.5 - Local Conditions
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,California_Fresno_5001.0,2014-01-01 00:00:00,63.0,1.749611,5.555556,100660.0,67.0,80.0
1,California_Fresno_5001.0,2014-01-01 05:00:00,71.0,3.304821,3.888889,100660.0,69.0,43.0
2,California_Fresno_5001.0,2014-01-01 15:00:00,280.0,5.054432,17.777778,100520.0,29.0,62.0
3,California_Fresno_5001.0,2014-01-01 19:00:00,50.0,1.749611,11.666667,100660.0,53.0,83.0
4,California_Fresno_5001.0,2014-01-01 21:00:00,67.0,3.304821,8.888889,100660.0,58.0,75.0


In [5]:
# 24-hour persistence baseline: for every site, "predict" the PM2.5 value at
# time t with the value observed at the same site exactly one day earlier, then
# score that naive forecast against what was actually observed.
TARGET_COL = 'PM2.5 - Local Conditions'
LAGGED_COL = 'PM2.5-24h'


def add_24h_lag(site_df):
    """Attach the PM2.5 value observed 24 hours earlier to each row of one site.

    Parameters
    ----------
    site_df : pandas.DataFrame
        Rows of a single site. Must contain a 'LocalDate' column that supports
        adding a ``pd.Timedelta`` and the ``TARGET_COL`` column.

    Returns
    -------
    pandas.DataFrame
        ``site_df`` sorted by 'LocalDate' and inner-joined with its own target
        shifted forward by one day. Two columns are appended: 'PredictTime'
        (equal to 'LocalDate' on every kept row) and ``LAGGED_COL`` (the target
        value recorded one day before 'LocalDate'). Rows without an observation
        exactly one day earlier are dropped by the inner join.
    """
    site_df = site_df.sort_values('LocalDate')
    # Select the target by name rather than by position so a change in column
    # order cannot silently shift a different column.
    lagged = pd.DataFrame({
        'PredictTime': site_df['LocalDate'] + pd.Timedelta('1D'),
        LAGGED_COL: site_df[TARGET_COL],
    })
    return pd.merge(site_df, lagged,
                    left_on='LocalDate', right_on='PredictTime', how='inner')


to_concat = []
for name, site_df in pm25.groupby(by='SiteCode'):
    # `df` is intentionally kept as a top-level name: after the loop it holds
    # the merged frame of the last processed site.
    df = add_24h_lag(site_df)
    if df.empty:
        # No pair of observations exactly 24h apart for this site; the sklearn
        # metrics raise on empty input, so the site is skipped.
        continue

    observed = df[TARGET_COL]
    predicted = df[LAGGED_COL]

    mae = mean_absolute_error(observed, predicted)
    mse = mean_squared_error(observed, predicted)
    mdae = median_absolute_error(observed, predicted)
    r2 = r2_score(observed, predicted)

    # MEAN is the mean of the lagged column (the last column of the merged
    # frame), i.e. of the baseline predictions, as in the original computation.
    # NOTE(review): if the mean of the observed values was intended, switch
    # this to `observed.mean()`.
    mean = predicted.mean()

    result = pd.DataFrame(index=[name],
                          data={'MAE': mae,
                                'MSE': mse,
                                'MdAE': mdae,
                                'R2': r2,
                                'MEAN': mean,
                                })
    to_concat.append(result)

# One row per site, indexed by SiteCode.
results = pd.concat(to_concat)

In [6]:
# Order sites by R2 ascending, so the sites where the baseline scores worst come first.
results.sort_values(by='R2')

Unnamed: 0,MAE,MSE,MdAE,R2,MEAN
Wisconsin_Dodge_1.0,4.282498,226.795399,3.00,-0.900150,5.731986
Missouri_St. Louis City_93.0,5.754779,95.438810,4.30,-0.706272,9.390168
Missouri_Cass_3.0,4.259552,42.416157,3.20,-0.612278,6.634311
Ohio_Hamilton_40.0,5.353377,54.127056,4.00,-0.589247,8.485400
Missouri_St. Louis City_94.0,6.243577,81.002556,4.70,-0.583667,8.832225
New Jersey_Essex_3.0,5.367381,51.517114,4.10,-0.578497,8.702853
Missouri_Buchanan_5.0,5.677430,82.650993,4.05,-0.569122,8.987560
Iowa_Linn_40.0,5.104159,77.815823,3.50,-0.549996,7.526587
Massachusetts_Suffolk_42.0,4.750030,50.100788,4.00,-0.533156,6.953669
Nebraska_Douglas_19.0,4.703755,71.360206,3.00,-0.488863,8.926318


In [27]:
# Summary statistics of the observed PM2.5 column of `df`.
# NOTE(review): `df` is not defined by this cell; it appears to be the loop
# variable left over from the per-site baseline cell (or from a cell that is no
# longer in the notebook). The execution count (27) is out of sequence, so this
# result may not reproduce under Restart & Run All - select the intended site
# explicitly and confirm.
df['PM2.5 - Local Conditions'].describe()

count    31298.00000
mean        16.13151
std         14.94622
min         -5.00000
25%          7.00000
50%         12.00000
75%         20.00000
max        239.00000
Name: PM2.5 - Local Conditions, dtype: float64

7.749313055147294
137.88727714230942
5.0
0.3827297509054153
