## CITY HOUR:

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('city_hour.csv')

In [3]:
df['Datetime'] = pd.to_datetime(df['Datetime'])
df['Month'] = df['Datetime'].dt.month
df['Year'] = df['Datetime'].dt.year
df['Hour'] = df['Datetime'].dt.hour
df['Minute'] = df['Datetime'].dt.minute

In [4]:
print(df.isnull().sum())

City               0
Datetime           0
PM2.5         145088
PM10          296737
NO            116632
NO2           117122
NOx           123224
NH3           272542
CO             86517
SO2           130373
O3            129208
Benzene       163646
Toluene       220607
Xylene        455829
AQI           129080
AQI_Bucket    129080
Month              0
Year               0
Hour               0
Minute             0
dtype: int64


In [5]:
df = df.drop(columns=['NH3', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket'], axis=1)

In [6]:
columns_to_fill = ['PM2.5','PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']
df[columns_to_fill] = df[columns_to_fill].fillna(df[columns_to_fill].mean())

In [7]:
df.head(5)

Unnamed: 0,City,Datetime,PM2.5,PM10,NO,NO2,NOx,CO,SO2,O3,Benzene,Month,Year,Hour,Minute
0,Ahmedabad,2015-01-01 01:00:00,67.622994,119.075804,1.0,40.01,36.37,1.0,122.07,34.798979,0.0,1,2015,1,0
1,Ahmedabad,2015-01-01 02:00:00,67.622994,119.075804,0.02,27.75,19.73,0.02,85.9,34.798979,0.0,1,2015,2,0
2,Ahmedabad,2015-01-01 03:00:00,67.622994,119.075804,0.08,19.32,11.08,0.08,52.83,34.798979,0.0,1,2015,3,0
3,Ahmedabad,2015-01-01 04:00:00,67.622994,119.075804,0.3,16.45,9.2,0.3,39.53,153.58,0.0,1,2015,4,0
4,Ahmedabad,2015-01-01 05:00:00,67.622994,119.075804,0.12,14.9,7.85,0.12,32.63,34.798979,0.0,1,2015,5,0


In [8]:
# Define features
time_features = ['Month', 'Year', 'Hour', 'Minute']
air_quality_features = ['PM2.5', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']

In [9]:
cities = df['City'].unique()
cities_to_index = {city: index for index, city in enumerate(cities)}
df['CityIndex'] = df['City'].map(cities_to_index)
num_cities= len(cities)

In [10]:
df.head(5)

Unnamed: 0,City,Datetime,PM2.5,PM10,NO,NO2,NOx,CO,SO2,O3,Benzene,Month,Year,Hour,Minute,CityIndex
0,Ahmedabad,2015-01-01 01:00:00,67.622994,119.075804,1.0,40.01,36.37,1.0,122.07,34.798979,0.0,1,2015,1,0,0
1,Ahmedabad,2015-01-01 02:00:00,67.622994,119.075804,0.02,27.75,19.73,0.02,85.9,34.798979,0.0,1,2015,2,0,0
2,Ahmedabad,2015-01-01 03:00:00,67.622994,119.075804,0.08,19.32,11.08,0.08,52.83,34.798979,0.0,1,2015,3,0,0
3,Ahmedabad,2015-01-01 04:00:00,67.622994,119.075804,0.3,16.45,9.2,0.3,39.53,153.58,0.0,1,2015,4,0,0
4,Ahmedabad,2015-01-01 05:00:00,67.622994,119.075804,0.12,14.9,7.85,0.12,32.63,34.798979,0.0,1,2015,5,0,0


In [12]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

def create_sequences(data, city_data, seq_length):
    X, y, X_city = [], [], []
    
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length, :])  # Sequence of features
        y.append(data[i + seq_length, :len(features)])  # Target: next timestep pollutant values
        X_city.append(city_data[i + seq_length])  # City index for sequence
        
    return np.array(X), np.array(y), np.array(X_city)

# Sequence length
seq_length = 30

# Define the features to use for the model
features = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']

# Extract feature values and other relevant data for sequence generation
data = df[features + ['Month', 'Year', 'Hour', 'Minute']].values
city_data = df['CityIndex'].values

# Create sequences using the provided function
X, y, X_city = create_sequences(data, city_data, seq_length)

# Convert CityIndex to categorical (for multi-city handling)
num_cities = len(df['City'].unique())  # Get number of unique cities
X_city = to_categorical(X_city, num_classes=num_cities)

# Split data into training and test sets
X_train, X_test, y_train, y_test, X_city_train, X_city_test = train_test_split(
    X, y, X_city, test_size=0.2, random_state=42
)

# Display the shapes of the datasets for verification
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}, X_city_train shape: {X_city_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}, X_city_test shape: {X_city_test.shape}")


X_train shape: (566276, 30, 13), y_train shape: (566276, 9), X_city_train shape: (566276, 26)
X_test shape: (141569, 30, 13), y_test shape: (141569, 9), X_city_test shape: (141569, 26)


In [15]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
import numpy as np

# Define RMSE metric
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Create the stacked LSTM model
def create_stacked_lstm_model(seq_length, num_features, num_cities, num_pollutants):
    # Input for the time series sequence data
    input_seq = Input(shape=(seq_length, num_features))  # (Batch, seq_length, num_features)
    input_city = Input(shape=(num_cities,))  # City input as one-hot vector

    # LSTM layers with recurrent dropout
    x = LSTM(128, return_sequences=True, recurrent_dropout=0.1)(input_seq)
    x = LSTM(64, return_sequences=False, recurrent_dropout=0.1)(x)
    x = Dropout(0.3)(x)

    # Batch Normalization
    x = BatchNormalization()(x)

    # Concatenate the output of LSTM with city input
    x = Concatenate()([x, input_city])
    
    # Fully connected layers with regularization
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.001))(x)
    x = Dropout(0.2)(x)
    x = Dense(32, activation='relu', kernel_regularizer=l2(0.001))(x)
    
    # Output layer for pollutant prediction
    output = Dense(num_pollutants, activation='linear')(x)

    # Create and compile the model
    model = Model(inputs=[input_seq, input_city], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss='mse', metrics=[rmse])
    return model

# Data preparation for the model
seq_length = 30  # Sequence length for the LSTM

# Define the features for input
features = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']
num_features = len(features) + 4  # Adding 4 extra (Month, Year, Hour, Minute)
num_pollutants = len(features)  # Number of pollutants to predict

# Prepare data for sequences
data = df[features + ['Month', 'Year', 'Hour', 'Minute']].values
city_data = df['CityIndex'].values

# Create sequences for the model
X, y, X_city = create_sequences(data, city_data, seq_length)

# Convert CityIndex to one-hot encoding
num_cities = len(df['City'].unique())
X_city = to_categorical(X_city, num_classes=num_cities)

# Split data into training and test sets
X_train, X_test, y_train, y_test, X_city_train, X_city_test = train_test_split(
    X, y, X_city, test_size=0.2, random_state=42
)

# Create the model
model = create_stacked_lstm_model(seq_length, num_features, num_cities, num_pollutants)

# Callbacks for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# Train the model
history = model.fit(
    [X_train, X_city_train], y_train,
    epochs=100,
    batch_size=32,
    validation_data=([X_test, X_city_test], y_test),
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict pollutant concentrations
def predict_pollutants(model, X, X_city):
    predictions = model.predict([X, X_city])
    return predictions

# Example prediction call
predicted_pollutants = predict_pollutants(model, X_test, X_city_test)
print(predicted_pollutants[:5])


Epoch 1/100




[1m17697/17697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m350s[0m 19ms/step - loss: 878.5266 - rmse: 27.3768 - val_loss: 441.2382 - val_rmse: 19.4508 - learning_rate: 5.0000e-04
Epoch 2/100
[1m17697/17697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 19ms/step - loss: 499.6494 - rmse: 21.0599 - val_loss: 420.6554 - val_rmse: 19.1074 - learning_rate: 5.0000e-04
Epoch 3/100
[1m17697/17697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 19ms/step - loss: 477.5233 - rmse: 20.5305 - val_loss: 484.9495 - val_rmse: 20.8457 - learning_rate: 5.0000e-04
Epoch 4/100
[1m17697/17697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m344s[0m 19ms/step - loss: 465.7923 - rmse: 20.3056 - val_loss: 412.3201 - val_rmse: 18.8537 - learning_rate: 5.0000e-04
Epoch 5/100
[1m17697/17697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 20ms/step - loss: 470.3163 - rmse: 20.3705 - val_loss: 394.7694 - val_rmse: 18.4006 - learning_rate: 5.0000e-04
Epoch 6/100
[1m17697/17697[0m 

In [16]:
model.save('C_hr.h5')



In [17]:

from tensorflow.keras.models import load_model
model = load_model('C_hr.h5', custom_objects={'mse': 'mean_squared_error'})



In [19]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model

# Load the saved model with custom metrics
model = load_model('C_hr.h5', custom_objects={'mse': 'mean_squared_error'})

# Function to predict pollutants for a future date and city
def predict_pollutants_for_date(model, future_date, city, seq_length, features, city_to_index):
    # Check if the required historical data is available
    historical_data = df[df['City'] == city].tail(seq_length)  # Get the last 'seq_length' rows for that city
    
    if len(historical_data) < seq_length:
        print(f"Not enough historical data for city: {city} to predict for {future_date}.")
        return
    
    # Prepare the sequence input and city input
    sequence_input = historical_data[features + ['Month', 'Year', 'Hour', 'Minute']].values
    
    # Normalize or reshape the sequence data if necessary
    X_input = np.array([sequence_input])  # Reshape to (1, seq_length, num_features)
    
    # One-hot encode the city index for the city input
    city_index = city_to_index[city]
    X_city_input = np.zeros((1, len(city_to_index)))
    X_city_input[0, city_index] = 1  # One-hot encode the city index
    
    # Make predictions using the loaded model
    predicted_pollutants = model.predict([X_input, X_city_input])

    # Display the predicted concentrations
    print(f"Predicted pollutant concentrations for {city} on {future_date}:")
    pollutants = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']
    for pollutant, concentration in zip(pollutants, predicted_pollutants[0]):
        print(f"{pollutant}: {concentration:.2f}")

# Example usage
future_date = pd.Timestamp('2025-10-05')
city = 'Chennai'

# Create a mapping from city names to their indices (one-hot encoding)
city_to_index = {city_name: index for index, city_name in enumerate(df['City'].unique())}

# Features used in the prediction
features = ['PM2.5','PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']
seq_length = 30  # Adjust according to your model's input requirement

predict_pollutants_for_date(model, future_date, city, seq_length, features, city_to_index)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 708ms/step
Predicted pollutant concentrations for Chennai on 2025-10-05 00:00:00:
PM2.5: 22.75
PM10: 37.74
NO: 5.78
NO2: 14.95
NOx: 14.78
CO: 1.07
SO2: 7.63
O3: 46.62
Benzene: 0.46


In [20]:
import numpy as np
from sklearn.metrics import mean_squared_error, f1_score
from tensorflow.keras.models import load_model
import tensorflow.keras.backend as K

# Define RMSE metric
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Define MSE metric
def mse(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true))

# Load the saved model with custom metrics
model = load_model('C_hr.h5', custom_objects={'rmse': rmse, 'mse': mse})

# Get predictions on the test set
predicted_pollutants = model.predict([X_test, X_city_test])

# Binarize predictions and true values based on a threshold (e.g., > 0)
threshold = 0.5  # Adjust threshold as needed
predicted_classes = (predicted_pollutants > threshold).astype(int)
true_classes = (y_test > threshold).astype(int)

# Calculate F1 score
f1_scores = f1_score(true_classes, predicted_classes, average='weighted', zero_division=0)
print(f"F1 Score: {f1_scores}")

# Calculate RMSE for additional evaluation
rmse_value = np.sqrt(mean_squared_error(y_test, predicted_pollutants))
print(f"Root Mean Squared Error (RMSE): {rmse_value}")




[1m4425/4425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 26ms/step
F1 Score: 0.9636505278760298
Root Mean Squared Error (RMSE): 17.609840010990577
