In [357]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [None]:
# Read the raw daily air-quality records from the CSV in the working directory.
file_path = 'city_day.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
print(df)

In [None]:
# Remove the AQI_Bucket column; the raw frame `df` is left untouched.
df_mod0 = df.drop(labels=['AQI_Bucket'], axis='columns')
print(df_mod0)

In [None]:
# Keep only the rows that have an AQI value (rows with NaN AQI are discarded).
has_aqi = df_mod0['AQI'].notna()
df_mod1 = df_mod0[has_aqi].copy()
print(df_mod1)

In [None]:
# Remove the City column; drop() returns a new frame, so df_mod1 is not modified.
columns_to_drop = ['City']
df_mod2 = df_mod1.drop(columns=columns_to_drop)
print(df_mod2)

In [None]:
# Fill missing pollutant readings: every numeric column except the AQI target.
columns_to_fill = df_mod2.select_dtypes(include=np.number).columns.difference(['AQI'])
df_mod3 = df_mod2.copy()

# Step 1: linear interpolation, only inside gaps that have valid readings on
# both sides (limit_area='inside'), filling at most 2 consecutive NaNs per gap.
# NOTE(review): City has already been dropped, so if rows from different cities
# are adjacent the interpolation can blend values across a city boundary --
# confirm the row ordering of the input file.
df_mod3[columns_to_fill] = df_mod2[columns_to_fill].interpolate(
    method='linear', limit_area='inside', limit=2
)

# Step 2: whatever is still NaN gets the column mean. The mean is taken from
# df_mod2 (the cleaned frame being filled) rather than the raw `df`, which still
# contains the rows that were removed for having no AQI value.
column_means = df_mod2[columns_to_fill].mean()
df_mod3[columns_to_fill] = df_mod3[columns_to_fill].fillna(column_means)

print(df_mod3)

In [None]:
# Feature: Fraction of Type of Pollutant (UNUSED)
# Everything below is wrapped in a triple-quoted string, so none of it runs;
# the cell only evaluates a string expression. The disabled code would add
# per-row sums for particulate matter, gaseous pollutants and VOCs, divide each
# by a row total, and then renormalise the three fractions so they sum to 1.
# NOTE(review): 'Total_Sum' relies on positional slicing (iloc[:, 2:-3]), which
# depends on the exact column layout of df_mod3 -- verify before re-enabling.
""" df_mod4 = df_mod3.copy()
df_mod4['Sum_Part_Matter'] = df_mod4['PM2.5'] + df_mod4['PM10']
df_mod4['Sum_Gaseous'] = df_mod4[['NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3']].sum(axis=1)
df_mod4['Sum_VOC'] = df_mod4[['Benzene', 'Toluene', 'Xylene']].sum(axis=1)

df_mod4['Total_Sum'] = df_mod4.iloc[:, 2:-3].sum(axis=1) 
df_mod4['Fraction_Part_Matter'] = df_mod4['Sum_Part_Matter'] / df_mod4['Total_Sum']
df_mod4['Fraction_Gaseous'] = df_mod4['Sum_Gaseous'] / df_mod4['Total_Sum']
df_mod4['Fraction_VOC'] = df_mod4['Sum_VOC'] / df_mod4['Total_Sum']
fraction_columns = ['Fraction_Part_Matter', 'Fraction_Gaseous', 'Fraction_VOC']
df_mod4[fraction_columns] = df_mod4[fraction_columns].div(df_mod4[fraction_columns].sum(axis=1), axis=0)

print(df_mod4) """

In [None]:
# Feature: Season
# Tag every row with a season code derived from the calendar month of 'Date':
#   1 = winter (Dec-Mar), 2 = pre-monsoon (Apr-Jun),
#   3 = monsoon (Jul-Sep), 4 = post-monsoon (Oct-Nov)
df_mod4 = df_mod3.copy()
df_mod4['Date'] = pd.to_datetime(df_mod4['Date'])

month_of_year = df_mod4['Date'].dt.month
winter_range = month_of_year.isin([12, 1, 2, 3])
pre_monsoon_range = month_of_year.between(4, 6)
monsoon_range = month_of_year.between(7, 9)
post_monsoon_range = month_of_year.between(10, 11)

# Assign the codes in the fixed order 1..4, one boolean mask at a time.
season_masks = [winter_range, pre_monsoon_range, monsoon_range, post_monsoon_range]
for season_code, season_mask in enumerate(season_masks, start=1):
    df_mod4.loc[season_mask, 'Season'] = season_code

print(df_mod4)

In [None]:
# The Date column has served its purpose (Season was derived from it); remove it.
# drop() returns a new frame, so df_mod4 is left as it was.
df_mod5 = df_mod4.drop(labels=['Date'], axis='columns')

print(df_mod5)

In [None]:
# Put the columns in a fixed order: Season first, then the pollutant columns,
# with the AQI target last. reindex() returns a new frame.
new_order = [
    'Season',
    'PM2.5', 'PM10',
    'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
    'Benzene', 'Toluene', 'Xylene',
    'AQI',
]
df_mod6 = df_mod5.reindex(new_order, axis='columns')

print(df_mod6)

In [681]:
# Snapshot the fully prepared frame as `data` and write it to disk (without the
# index column) so it can be used for visualization.
data = df_mod6.copy()
data.to_csv(path_or_buf='final_data.csv', index=False)

In [None]:
# Splitting the data
# Shuffle the rows once with a fixed seed, then cut the shuffled frame into
# disjoint train / validation / test partitions (70% / 15% / 15%). Previously
# all three sets were the complete dataset, so validation and test metrics
# were computed on the very rows the model had been trained on.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.70
VAL_FRACTION = 0.15  # the remainder (~15%) becomes the test set

shuffled = data.sample(frac=1.0, random_state=SPLIT_SEED)
n_rows = len(shuffled)
train_end = int(n_rows * TRAIN_FRACTION)
val_end = train_end + int(n_rows * VAL_FRACTION)

train_part = shuffled.iloc[:train_end]
val_part = shuffled.iloc[train_end:val_end]
test_part = shuffled.iloc[val_end:]

# Features are every column except the AQI target.
X_train = train_part.drop(columns=['AQI'])
y_train = train_part['AQI']

X_val = val_part.drop(columns=['AQI'])
y_val = val_part['AQI']

X_test = test_part.drop(columns=['AQI'])
y_test = test_part['AQI']

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Convert to float32 NumPy arrays for the Keras models and sklearn metrics below.
X_train = np.asarray(X_train).astype('float32')
X_val = np.asarray(X_val).astype('float32')
X_test = np.asarray(X_test).astype('float32')
y_train = np.asarray(y_train).astype('float32')
y_val = np.asarray(y_val).astype('float32')
y_test = np.asarray(y_test).astype('float32')

In [None]:
# LSTM Model 1
# Single 64-unit LSTM layer feeding one linear output unit; each sample is
# declared as a length-13 sequence with one value per step.
# Note: the "LSTM Model 2" cell rebinds the same `lstm_model` name.
lstm_model = tf.keras.models.Sequential()
lstm_model.add(tf.keras.layers.LSTM(units=64, input_shape=(13, 1)))
lstm_model.add(tf.keras.layers.Dense(units=1))

lstm_model.compile(optimizer='adam', loss='mean_squared_error')
lstm_model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=20,
    validation_data=(X_val, y_val),
)

In [None]:
# LSTM Model 2
# Three stacked LSTM stages (128 -> 64 -> 32 units), each followed by batch
# normalization and 20% dropout, with ReLU dense layers in between and a single
# linear output unit. Replaces whatever `lstm_model` referred to before.
lstm_model = tf.keras.models.Sequential()

# Stage 1: returns the full sequence so the next LSTM can consume it.
lstm_model.add(tf.keras.layers.LSTM(units=128, return_sequences=True, input_shape=(13, 1)))
lstm_model.add(tf.keras.layers.BatchNormalization())
lstm_model.add(tf.keras.layers.Dropout(0.2))

# Stage 2: still sequence-to-sequence, then a dense layer ahead of the final LSTM.
lstm_model.add(tf.keras.layers.LSTM(units=64, return_sequences=True))
lstm_model.add(tf.keras.layers.BatchNormalization())
lstm_model.add(tf.keras.layers.Dropout(0.2))
lstm_model.add(tf.keras.layers.Dense(units=64, activation='relu'))

# Stage 3: final LSTM without return_sequences, so only its last output is passed on.
lstm_model.add(tf.keras.layers.LSTM(units=32))
lstm_model.add(tf.keras.layers.BatchNormalization())
lstm_model.add(tf.keras.layers.Dropout(0.2))

# Regression head.
lstm_model.add(tf.keras.layers.Dense(units=32, activation='relu'))
lstm_model.add(tf.keras.layers.Dense(units=1))

lstm_model.compile(optimizer='adam', loss='mean_squared_error')
lstm_model.fit(
    X_train,
    y_train,
    epochs=80,
    batch_size=128,
    validation_data=(X_val, y_val),
)

In [None]:
# Evaluate the current LSTM model on the validation arrays and report the loss
# (the model was compiled with mean squared error as its loss).
validation_loss = lstm_model.evaluate(x=X_val, y=y_val)
print(f'Validation Loss: {validation_loss}')

In [None]:
# Score the LSTM model on the test arrays: MSE, MAE and R-squared.
predictions = lstm_model.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=predictions)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f"Mean Squared Error on Test Set: {mse}")
print("Mean Absolute Error (MAE) on Test Data:", mae)
print("R-squared (R²) Score on Test Data:", r2)

In [None]:
# Success rate: the percentage of test predictions whose absolute error is at
# most 30. Both arrays are reshaped to column vectors first so the subtraction
# is element-wise rather than broadcast into a matrix.
predictions_reshaped = predictions.reshape(-1, 1)
y_test_reshaped = y_test.reshape(-1, 1)
differences = np.abs(predictions_reshaped - y_test_reshaped)

successful_predictions = np.sum(differences <= 30)
total_predictions = len(predictions)
success_rate = (successful_predictions / total_predictions) * 100

print("Percentage success rate:", success_rate)

In [None]:
# CNN Model 1
# Two Conv1D + max-pooling stages over the length-13 input, flattened into a
# 64-unit ReLU dense layer with 50% dropout and a single linear output unit.
# Note: the "CNN Model 2" cell rebinds the same `cnn_model` name.
cnn_model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(13, 1)),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(units=1)
])

# Compile and train the CNN that was just built. The earlier version called
# compile()/fit() on `lstm_model`, which left this network untrained and
# retrained the LSTM instead.
cnn_model.compile(optimizer='adam', loss='mean_squared_error')
cnn_model.fit(X_train, y_train, epochs=20, batch_size=20, validation_data=(X_val, y_val))

In [None]:
# CNN Model 2
# Same convolutional front end as CNN Model 1, but with 40% dropout and an
# extra 128-unit sigmoid dense layer ahead of the single linear output unit.
cnn_model = tf.keras.models.Sequential()

# Convolutional feature extractor.
cnn_model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(13, 1)))
cnn_model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
cnn_model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
cnn_model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
cnn_model.add(tf.keras.layers.Flatten())

# Dense regression head.
cnn_model.add(tf.keras.layers.Dense(units=64, activation='relu'))
cnn_model.add(tf.keras.layers.Dropout(0.4))
cnn_model.add(tf.keras.layers.Dense(units=128, activation='sigmoid'))
cnn_model.add(tf.keras.layers.Dense(units=1))

cnn_model.compile(optimizer='adam', loss='mean_squared_error')
cnn_model.fit(
    X_train,
    y_train,
    epochs=300,
    batch_size=128,
    validation_data=(X_val, y_val),
)

In [None]:
# Evaluate the current CNN model on the validation arrays and report the loss
# (the model was compiled with mean squared error as its loss).
validation_loss = cnn_model.evaluate(x=X_val, y=y_val)
print(f'Validation Loss: {validation_loss}')

In [None]:
# Score the CNN model on the test arrays: MSE, MAE and R-squared.
predictions = cnn_model.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=predictions)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f"Mean Squared Error on Test Set: {mse}")
print("Mean Absolute Error (MAE) on Test Data:", mae)
print("R-squared (R²) Score on Test Data:", r2)

In [None]:
# Success rate: the percentage of test predictions whose absolute error is at
# most 30. Both arrays are reshaped to column vectors first so the subtraction
# is element-wise rather than broadcast into a matrix.
predictions_reshaped = predictions.reshape(-1, 1)
y_test_reshaped = y_test.reshape(-1, 1)
differences = np.abs(predictions_reshaped - y_test_reshaped)

successful_predictions = np.sum(differences <= 30)
total_predictions = len(predictions)
success_rate = (successful_predictions / total_predictions) * 100

print("Percentage success rate:", success_rate)