In [None]:
import pandas as pd

# Read the four raw air-quality tables (city/station x hourly/daily) in one pass.
csv_paths = (
    "dataset/city_hour.csv",
    "dataset/city_day.csv",
    "dataset/station_hour.csv",
    "dataset/station_day.csv",
)
df_city_hourly, df_city_daily, df_station_hourly, df_station_daily = map(pd.read_csv, csv_paths)

In [60]:
# Comma-separated header of each raw CSV, kept for reference. The four files
# differ only in their two leading key columns, so the shared tail is written once.
_SHARED_COLUMNS = "PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket"

df_city_hourly_list = "City,Datetime," + _SHARED_COLUMNS
df_city_daily_list = "City,Date," + _SHARED_COLUMNS
df_station_hourly_list = "StationId,Datetime," + _SHARED_COLUMNS
df_station_daily_list = "StationId,Date," + _SHARED_COLUMNS

In [61]:
# Columns removed from every table in addition to its own key columns.
_UNUSED_COLUMNS = ['NH3', 'Toluene', 'Xylene', 'AQI_Bucket']


def _drop_unused_columns(frame, key_columns):
    """Return a copy of `frame` without its key columns and the unused columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        One of the raw air-quality tables.
    key_columns : list of str
        Identifier / timestamp columns specific to this table.

    Returns
    -------
    pandas.DataFrame
        New frame; `frame` itself is not modified.
    """
    # errors="ignore" keeps this cell re-runnable: the results are assigned back
    # to the same names, so on a second run the columns are already gone and a
    # strict drop would raise KeyError.
    return frame.drop(columns=[*key_columns, *_UNUSED_COLUMNS], errors="ignore")


df_city_hourly = _drop_unused_columns(df_city_hourly, ['City', 'Datetime'])
df_city_daily = _drop_unused_columns(df_city_daily, ['City', 'Date'])
df_station_hourly = _drop_unused_columns(df_station_hourly, ['StationId', 'Datetime'])
df_station_daily = _drop_unused_columns(df_station_daily, ['StationId', 'Date'])

In [62]:
# Stack all four tables row-wise into one frame with a fresh 0..n-1 index.
dfs = [
    df_city_hourly,
    df_city_daily,
    df_station_hourly,
    df_station_daily,
]
df_merged = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [63]:
# Replace missing values in each listed column with that column's mean.
columns_to_fill = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx',
    'CO', 'SO2', 'O3', 'Benzene', 'AQI',
]
column_means = df_merged[columns_to_fill].mean()
df_merged[columns_to_fill] = df_merged[columns_to_fill].fillna(column_means)

In [64]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Switch TensorFlow to eager execution of its functions for the whole session.
# NOTE(review): this looks like a debugging aid and presumably slows down
# model.fit considerably — confirm it is still needed before long training runs.
tf.config.run_functions_eagerly(True)

In [65]:
# Treat +/-inf as missing, then discard every row that still has a missing value.
without_infinities = df_merged.replace(to_replace=[np.inf, -np.inf], value=np.nan)
df_merged = without_infinities.dropna()

In [66]:
# df_merged.reset_index(drop=True, inplace=True)

In [67]:
air_quality_features = ['PM2.5','PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']
X = df_merged['AQI']
y = df_merged[air_quality_features]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train.to_frame())
X_test_scaled = scaler_X.transform(X_test.to_frame())

scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()

# Print mean / std / min / max of the scaled training inputs and targets as a
# quick check that standardisation behaved as expected.
_summary_stats = (("Mean:", np.mean), ("Std:", np.std), ("Min:", np.min), ("Max:", np.max))

for heading, scaled_values in (
    ("X_train_scaled stats:", X_train_scaled),
    ("\ny_train_scaled stats:", y_train_scaled),
):
    print(heading)
    for label, statistic in _summary_stats:
        print(label, statistic(scaled_values))

In [None]:
# Inspect the training target as a single-column array.
np.reshape(y_train.values, (-1, 1))

In [None]:
import joblib
# Persist both fitted scalers so they can be reloaded at prediction time.
joblib.dump(value=scaler_X, filename='scaler_X.pkl')
joblib.dump(value=scaler_y, filename='scaler_y.pkl')

In [None]:
# Small fully connected regressor: 64 -> 32 -> 1, with one input per column of
# the scaled training matrix.
n_inputs = X_train_scaled.shape[1]

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(n_inputs,)))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [71]:
def lr_schedule(epoch):
    """Return the learning rate for `epoch`: 0.001 decayed by a factor 0.9 per epoch."""
    initial_rate = 0.001
    decay_per_epoch = 0.9
    return initial_rate * decay_per_epoch ** epoch

In [72]:
# Adam optimiser with mean-squared-error loss; mean absolute error is tracked
# as an additional metric.
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    loss='mse',
    metrics=['mae'],
    optimizer=optimizer,
)

In [None]:
# Stop once validation stops improving for 10 epochs (keeping the best weights)
# and apply the per-epoch learning-rate schedule defined by lr_schedule.
early_stopping = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
lr_scheduler = keras.callbacks.LearningRateScheduler(lr_schedule)

# 20% of the training rows are held out for validation; at most 100 epochs.
history = model.fit(
    x=X_train_scaled,
    y=y_train_scaled,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1,
)

In [None]:
# Score the trained model on the held-out test split (values are in scaled units).
loss = model.evaluate(x=X_test_scaled, y=y_test_scaled)
print(loss)

In [None]:
# Shape of the scaled test inputs.
np.shape(X_test_scaled)

In [None]:
# Shape of the scaled test targets.
np.shape(y_test_scaled)

In [None]:
# Write the trained model to disk in the working directory.
model.save(filepath='model.h5')

In [None]:
import matplotlib.pyplot as plt


# Training vs. validation loss per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
for history_key in ('loss', 'val_loss'):
    ax.plot(history.history[history_key])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Predict AQI for one hand-entered set of pollutant readings.
#
# Two different scalers are involved: the inputs must be scaled with the
# feature scaler (scaler_X.pkl), while the target scaler (scaler_y.pkl) is used
# only to map the model output back to AQI units.
feature_scaler = joblib.load('scaler_X.pkl')
scaler = joblib.load('scaler_y.pkl')

user_input = pd.DataFrame({
    'PM2.5': [81],
    'PM10': [124],
    'NO': [1.44],
    'NO2': [20],
    'NOx': [12],
    'NH3': [10],
    'CO': [0.1],
    'SO2': [15],
    'O3': [127],
    'Benzene': [0.20],
    'Toluene': [6],
    'Xylene': [0.06]
})

# Keep only the columns listed in air_quality_features, in that order, as a
# single row (one sample). NH3, Toluene and Xylene are entered above but are
# dropped from the data before modelling, so they are not passed on.
user_features = user_input[air_quality_features].to_numpy()

user_input_scaled = feature_scaler.transform(user_features)

user_pred = model.predict(user_input_scaled)

# Undo the target scaling so the prediction is reported in AQI units.
actual_pred = scaler.inverse_transform(user_pred)

print(user_pred)

print("----------------------------")

print(actual_pred)