# **Using a single model structure to predict the data**


## Data Import

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Models
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, GlobalAveragePooling1D, BatchNormalization
from tensorflow.keras.optimizers import Adam
import xgboost as xgb

# Metrics
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error

from sklearn.model_selection import train_test_split

from utils import *

# --- Notebook-wide configuration ---------------------------------------------
# Root folder passed to `read_and_preprocess_dataset` when loading the sources.
datasets_folder = './datasets'
# Default figure size; presumably consumed by plotting cells as (width, height)
# -- its usage is not visible in this cell, TODO confirm.
figsize = (20,4)
# Verbosity level forwarded to the data-loading helper through its `v=` argument.
verbosity = 0

# Scoring callables collected in one list. `huber` is not imported explicitly,
# so it is presumably supplied by `from utils import *` -- verify.
metrics = [root_mean_squared_error, r2_score, mean_absolute_error, huber]

# Hide every GPU from TensorFlow so that all computation runs on the CPU.
tf.config.set_visible_devices([], 'GPU')
# Print the *visible* GPUs to confirm the call above took effect.
# `list_physical_devices` would keep listing hardware GPUs even after they are
# hidden, so it cannot be used to verify the setting; an empty list is expected.
print(tf.config.get_visible_devices('GPU'))

[]


In [24]:
# Load the three raw sources through the project helper. From the way the
# results are used in this notebook, `pollution_data` behaves as a nested
# mapping (station -> agent -> DataFrame) and `traffic_data` as a mapping
# (station -> DataFrame).
pollution_data = read_and_preprocess_dataset(datasets_folder, 'pollution', v=verbosity)
traffic_data = read_and_preprocess_dataset(datasets_folder, 'traffic', v=verbosity, radius=2)
weather_data = read_and_preprocess_dataset(datasets_folder, 'weather', v=verbosity)

# Station names, in the mapping's own iteration order.
stations = list(pollution_data)
# Unique agent names across all stations, SORTED on purpose: the position of a
# name in this list is later used as its integer code, and the iteration order
# of a plain `set` of strings changes between interpreter sessions (string hash
# randomisation), which would silently re-label the agents on every restart.
agents = sorted({agent for station in stations for agent in pollution_data[station].keys()})

In [25]:
# Stack the per-station traffic frames into a single long table, tagging every
# row with the station it originated from.
merged_traffic_data = pd.concat([
    station_frame.assign(Station=station_name)
    for station_name, station_frame in traffic_data.items()
])
merged_traffic_data

Unnamed: 0_level_0,Traffic_value,Station
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00:00,27629.0,GIARDINI MARGHERITA
2019-01-01 01:00:00,44430.0,GIARDINI MARGHERITA
2019-01-01 02:00:00,40318.0,GIARDINI MARGHERITA
2019-01-01 03:00:00,25661.0,GIARDINI MARGHERITA
2019-01-01 04:00:00,15906.0,GIARDINI MARGHERITA
...,...,...
2024-12-31 19:00:00,19786.0,VIA CHIARINI
2024-12-31 20:00:00,16239.0,VIA CHIARINI
2024-12-31 21:00:00,7226.0,VIA CHIARINI
2024-12-31 22:00:00,5568.0,VIA CHIARINI


In [30]:
# Flatten the nested station -> agent -> frame mapping into one long table;
# each row is labelled with both its station and its agent.
merged_pollution_data = pd.concat([
    readings.assign(Station=station_name, Agent=agent_name)
    for station_name, per_agent in pollution_data.items()
    for agent_name, readings in per_agent.items()
])
merged_pollution_data

Unnamed: 0_level_0,Agent_value,Station,Agent
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-02 00:00:00,25.0,GIARDINI MARGHERITA,NO2
2019-01-02 01:00:00,25.0,GIARDINI MARGHERITA,NO2
2019-01-02 02:00:00,22.0,GIARDINI MARGHERITA,NO2
2019-01-02 03:00:00,21.0,GIARDINI MARGHERITA,NO2
2019-01-02 05:00:00,30.0,GIARDINI MARGHERITA,NO2
...,...,...,...
2024-12-23 00:00:00,19.0,VIA CHIARINI,PM10
2024-12-24 00:00:00,14.0,VIA CHIARINI,PM10
2024-12-25 00:00:00,23.0,VIA CHIARINI,PM10
2024-12-30 00:00:00,17.0,VIA CHIARINI,PM10


In [32]:
# Build the modelling table in one chain:
#   1. pair traffic and pollution rows that share station and timestamp,
#   2. attach the weather columns by aligning on the index,
#   3. replace the Station / Agent labels with their position in the
#      `stations` / `agents` lists (integer codes).
merged_data = (
    merged_traffic_data
    .merge(merged_pollution_data, how="inner", on=["Station", "Date"])
    .merge(weather_data, how="inner", left_index=True, right_index=True)
    .assign(
        Station=lambda frame: frame["Station"].map(stations.index),
        Agent=lambda frame: frame["Agent"].map(agents.index),
    )
)
merged_data

Unnamed: 0_level_0,Traffic_value,Station,Agent_value,Agent,TAVG,PREC,RHAVG,RAD,W_SCAL_INT,W_VEC_DIR,LEAFW
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-02 00:00:00,10990.0,0,25.0,3,0.5,0.0,99.5,0.0,1.4,283.2,1.0
2019-01-02 00:00:00,10990.0,0,2.0,5,0.5,0.0,99.5,0.0,1.4,283.2,1.0
2019-01-02 00:00:00,10990.0,0,18.0,1,0.5,0.0,99.5,0.0,1.4,283.2,1.0
2019-01-02 00:00:00,10990.0,0,11.0,0,0.5,0.0,99.5,0.0,1.4,283.2,1.0
2019-01-02 01:00:00,6004.0,0,25.0,3,0.1,0.0,99.7,0.0,1.0,236.6,1.0
...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31 21:00:00,7226.0,2,9.0,5,6.9,0.0,78.4,0.0,3.5,235.4,0.0
2024-12-31 22:00:00,5568.0,2,27.0,3,7.0,0.0,77.0,0.0,4.8,232.1,0.0
2024-12-31 22:00:00,5568.0,2,13.0,5,7.0,0.0,77.0,0.0,4.8,232.1,0.0
2024-12-31 23:00:00,4587.0,2,20.0,3,7.0,0.0,80.3,0.0,5.3,230.4,0.0


In [40]:
# Display only the pollution rows in which at least one column is missing.
merged_pollution_data.loc[merged_pollution_data.isna().any(axis="columns")]

Unnamed: 0_level_0,Agent_value,Station,Agent
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-02-15,,GIARDINI MARGHERITA,PM2.5
2023-02-19,,GIARDINI MARGHERITA,PM2.5
2023-02-23,,GIARDINI MARGHERITA,PM2.5
2023-03-20,,GIARDINI MARGHERITA,PM2.5
2023-03-31,,GIARDINI MARGHERITA,PM2.5
2023-04-02,,GIARDINI MARGHERITA,PM2.5
2023-04-03,,GIARDINI MARGHERITA,PM2.5
2023-04-04,,GIARDINI MARGHERITA,PM2.5
2023-04-05,,GIARDINI MARGHERITA,PM2.5
2023-04-06,,GIARDINI MARGHERITA,PM2.5
