In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

In [2]:
# Load the hourly station readings. A forward-slash relative path is used so the
# notebook also runs on Linux/macOS (a backslash is a path separator only on Windows).
df = pd.read_csv("ds/station_hour.csv")
# Parse the timestamp strings so the `.dt` accessor can be used on the column.
df['Datetime'] = pd.to_datetime(df['Datetime'])

  df = pd.read_csv(r"C:\Users\Padmajaa\OneDrive - SSN Trust\IIT KANPUR\ds\station_hour.csv")


In [3]:
# Derive the calendar/clock columns from the parsed timestamp.
timestamp_parts = df['Datetime'].dt
for column_name, part in (
    ('Month', timestamp_parts.month),
    ('Year', timestamp_parts.year),
    ('Hour', timestamp_parts.hour),
    ('Minute', timestamp_parts.minute),
):
    df[column_name] = part

In [4]:
# Print the column names, dtypes and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2589083 entries, 0 to 2589082
Data columns (total 20 columns):
 #   Column      Dtype         
---  ------      -----         
 0   StationId   object        
 1   Datetime    datetime64[ns]
 2   PM2.5       float64       
 3   PM10        float64       
 4   NO          float64       
 5   NO2         float64       
 6   NOx         float64       
 7   NH3         float64       
 8   CO          float64       
 9   SO2         float64       
 10  O3          float64       
 11  Benzene     float64       
 12  Toluene     float64       
 13  Xylene      float64       
 14  AQI         float64       
 15  AQI_Bucket  object        
 16  Month       int32         
 17  Year        int32         
 18  Hour        int32         
 19  Minute      int32         
dtypes: datetime64[ns](1), float64(13), int32(4), object(2)
memory usage: 355.6+ MB


In [5]:
# Number of missing values per column
null_count = df.isna().sum()
print(null_count)


StationId           0
Datetime            0
PM2.5          647689
PM10          1119252
NO             553711
NO2            528973
NOx            490808
NH3           1236618
CO             499302
SO2            742737
O3             725973
Benzene        861579
Toluene       1042366
Xylene        2075104
AQI            570190
AQI_Bucket     570190
Month               0
Year                0
Hour                0
Minute              0
dtype: int64


In [6]:
# Remove the columns that are not used as model inputs or targets.
unused_columns = ['NH3', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket']
df = df.drop(columns=unused_columns)


In [7]:
# Pollutant columns whose gaps are imputed
columns_to_fill = ['PM2.5','PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']

# Replace every missing reading with the overall mean of its column; the other
# columns (including Datetime) are not touched.
column_means = df[columns_to_fill].mean()
df[columns_to_fill] = df[columns_to_fill].fillna(column_means)

# Quick look at the result to confirm the Datetime column is still present
print(df.head())


  StationId            Datetime  PM2.5    PM10    NO    NO2    NOx   CO  \
0     AP001 2017-11-24 17:00:00  60.50   98.00  2.35  30.80  18.25  0.1   
1     AP001 2017-11-24 18:00:00  65.50  111.25  2.70  24.20  15.07  0.1   
2     AP001 2017-11-24 19:00:00  80.00  132.00  2.10  25.18  15.15  0.1   
3     AP001 2017-11-24 20:00:00  81.50  133.25  1.95  16.25  10.23  0.1   
4     AP001 2017-11-24 21:00:00  75.25  116.00  1.43  17.48  10.43  0.1   

     SO2      O3  Benzene  Month  Year  Hour  Minute  
0  11.85  126.40      0.1     11  2017    17       0  
1  13.17  117.12      0.1     11  2017    18       0  
2  12.08   98.98      0.2     11  2017    19       0  
3  10.47  112.20      0.2     11  2017    20       0  
4   9.12  106.35      0.2     11  2017    21       0  


In [8]:
# Feature groups; the list order is the column order handed to the numeric scaler.
time_features = [
    'Month',
    'Year',
    'Hour',
    'Minute',
]
air_quality_features = [
    'PM2.5',
    'PM10',
    'NO',
    'NO2',
    'NOx',
    'CO',
    'SO2',
    'O3',
    'Benzene',
]



In [9]:
# Preprocessing: min-max scale every numeric column and one-hot encode the station id.
numeric_columns = time_features + air_quality_features
# Dense output; a station id not seen during fitting is encoded as all zeros.
station_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_columns),
        ('cat', station_encoder, ['StationId']),
    ]
)

In [10]:
# Fit the preprocessor on the full frame and transform it in the same call.
# NOTE(review): the output columns should follow the transformer order declared
# for `preprocessor` (scaled time + air-quality columns first, then the
# StationId one-hot columns) — confirm before slicing X by position.
X = preprocessor.fit_transform(df)

In [None]:
# Prepare sequences
def create_sequences(data, seq_length, target_cols=None):
    """Build sliding-window samples for next-step forecasting.

    Each sample is `seq_length` consecutive rows of `data`; its target is taken
    from the row that immediately follows the window.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_columns)
        Preprocessed rows, in the order they should be windowed. Windows are
        cut from consecutive rows as given, so rows belonging to different
        stations should not be interleaved or concatenated if windows must
        stay within one station.
    seq_length : int
        Number of rows per input window.
    target_cols : slice or sequence of int, optional
        Columns of `data` used as the prediction target. When omitted, the
        air-quality block is selected: the `len(air_quality_features)` columns
        that come right after the `len(time_features)` time columns.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows - seq_length, seq_length, n_columns)
    y : numpy.ndarray, shape (n_rows - seq_length, n_targets)
        Both are empty (first dimension 0) when `data` has no more than
        `seq_length` rows.
    """
    data = np.asarray(data)
    if target_cols is None:
        # The numeric transformer is listed first in the preprocessor, so the
        # matrix layout is [time | air quality | StationId one-hot]. The targets
        # are therefore NOT the trailing columns (those are the one-hot block).
        first_target = len(time_features)
        target_cols = slice(first_target, first_target + len(air_quality_features))

    n_samples = len(data) - seq_length
    if n_samples <= 0:
        # Not enough rows for a single window: return correctly shaped empties.
        empty_windows = np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)
        return empty_windows, data[:0, target_cols].copy()

    X = np.stack([data[start:start + seq_length] for start in range(n_samples)])
    # Row `start + seq_length` is the target of window `start`.
    y = data[seq_length:, target_cols].copy()
    return X, y

seq_length = 24  # window size: 24 hourly rows are used to predict the following hour
X_seq, y_seq = create_sequences(data=X, seq_length=seq_length)

In [None]:
# Split the data in order, without shuffling. Neighbouring windows share all but
# one row, so a shuffled split would place near-duplicates of the training
# windows in the test set and make the test score look better than it is.
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

In [None]:
# Build the LSTM model: two stacked LSTM layers followed by a linear output
# layer with one unit per air-quality feature.
input_shape = X_train.shape[1:]  # shape of a single sample, without the batch axis
model = Sequential([
    # Declare the input with an explicit Input layer; passing `input_shape=` to
    # the first layer is deprecated in recent Keras releases.
    Input(shape=input_shape),
    LSTM(64, activation='relu', return_sequences=True),  # full sequence is passed on to the next LSTM
    LSTM(32, activation='relu'),
    Dense(len(air_quality_features))
])


In [None]:
# Train with the Adam optimizer on mean squared error; 10% of the training data
# is held out by Keras for validation.
model.compile(loss='mse', optimizer='adam')
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.1,
    verbose=1,
)


In [None]:
# Save the trained model in the native Keras format. Keras 3 rejects a save path
# without a `.keras` (or `.h5`) extension, so the extension is spelled out.
model.save('lstm_hourly_model.keras')


In [None]:
from datetime import datetime, timedelta


In [None]:
def predict_future(model, preprocessor, last_sequence, station_id, future_datetime, num_hours=24):
    """Forecast the air-quality features autoregressively for `num_hours` steps.

    The model is fed the most recent `seq_length` preprocessed rows and predicts
    the air-quality values of the next row. That prediction, together with the
    scaled time features of the step and the station encoding, is appended to
    the window (the oldest row is dropped) and the process repeats.

    Relies on the notebook-level names `time_features`, `air_quality_features`
    and `seq_length`.

    Parameters
    ----------
    model : object with a Keras-style `predict(array, verbose=...)` method
        Must map an input of shape (1, seq_length, n_columns) to an output of
        shape (1, len(air_quality_features)).
    preprocessor : fitted ColumnTransformer
        Must expose the fitted `'num'` scaler and `'cat'` encoder through
        `named_transformers_`.
    last_sequence : array-like, shape (n_rows, n_time + n_air)
        Unscaled history rows with columns ordered as
        `time_features + air_quality_features`; needs at least `seq_length` rows.
    station_id : str
        Station identifier used for the one-hot columns.
    future_datetime : datetime.datetime
        Timestamp assigned to the first forecast step; later steps are one hour apart.
    num_hours : int, default 24
        Number of steps to forecast.

    Returns
    -------
    future_predictions_inv : numpy.ndarray, shape (num_hours, len(air_quality_features))
        Predictions converted back to the original scale.
    future_datetimes : list of datetime.datetime
        Timestamp of each forecast step.

    Raises
    ------
    ValueError
        If `last_sequence` has fewer than `seq_length` rows.

    Notes
    -----
    Every step is treated as the row directly following the previous one, so
    `future_datetime` only determines the time features and the output labels;
    any gap between the history and `future_datetime` is not modelled.
    """
    n_time = len(time_features)
    n_air = len(air_quality_features)
    numeric_columns = time_features + air_quality_features

    if num_hours <= 0:
        return np.empty((0, n_air)), []

    num_scaler = preprocessor.named_transformers_['num']
    cat_encoder = preprocessor.named_transformers_['cat']

    # Generate a range of datetimes, starting from the input datetime
    future_datetimes = [future_datetime + timedelta(hours=i) for i in range(num_hours)]

    # Prepare and transform the observed history
    last_sequence_df = pd.DataFrame(last_sequence, columns=numeric_columns)
    if len(last_sequence_df) < seq_length:
        raise ValueError(
            f"last_sequence has {len(last_sequence_df)} rows; at least {seq_length} are required."
        )
    last_sequence_df['StationId'] = station_id
    current_sequence = np.asarray(preprocessor.transform(last_sequence_df), dtype=float)[-seq_length:]

    # Loop-invariant pieces, computed once. The scaler works on the full numeric
    # block, so the air-quality columns are padded with zeros and then discarded.
    future_time_df = pd.DataFrame(
        [[dt.month, dt.year, dt.hour, dt.minute] + [0.0] * n_air for dt in future_datetimes],
        columns=numeric_columns,
    )
    scaled_future_time = num_scaler.transform(future_time_df)[:, :n_time]
    station_onehot = np.asarray(
        cat_encoder.transform(pd.DataFrame({'StationId': [station_id]}))
    )[0]

    future_predictions = []
    for scaled_time in scaled_future_time:
        # Predict from a window made only of complete rows (observed or
        # previously predicted), which is the layout used to build the
        # training windows.
        prediction = model.predict(current_sequence[np.newaxis, ...], verbose=0)[0]
        future_predictions.append(prediction)

        # Slide the window: drop the oldest row, append the newly predicted one.
        next_row = np.concatenate([scaled_time, prediction, station_onehot])
        current_sequence = np.vstack([current_sequence[1:], next_row])

    # Convert predictions to original scale; the time columns are zero padding
    # needed only to match the scaler's expected width.
    future_predictions = np.array(future_predictions)
    padded = np.column_stack((np.zeros((len(future_predictions), n_time)), future_predictions))
    future_predictions_inv = num_scaler.inverse_transform(padded)[:, n_time:]

    return future_predictions_inv, future_datetimes


In [None]:
# Get user input (disabled): the three prompts are wrapped in a string literal,
# so they are not executed.
'''
station_id = input("Enter StationId: ")
future_date = input("Enter future Date (YYYY-MM-DD): ")
future_time = input("Enter future Time (HH:MM:SS): ")'''

In [None]:
# Forecast request. The station id must equal a `StationId` value exactly (no
# surrounding whitespace); otherwise the equality filter on the frame matches no rows.
station_id = 'AP001'
future_date = '2024-09-27'
future_time = '12:00:00'

In [None]:
# Combine the date and time strings and parse them into one datetime object.
timestamp_text = f"{future_date} {future_time}"
future_datetime = datetime.strptime(timestamp_text, "%Y-%m-%d %H:%M:%S")


In [None]:
# Most recent `seq_length` rows of the requested station, oldest first.
station_history = df[df['StationId'] == station_id].sort_values('Datetime')
if len(station_history) < seq_length:
    # Fail here with a clear message rather than deep inside the forecasting code.
    raise ValueError(
        f"Station {station_id!r} has {len(station_history)} rows; "
        f"at least {seq_length} are required to build an input sequence."
    )
last_sequence = station_history.iloc[-seq_length:][time_features + air_quality_features].values


In [None]:
# Run the forecast for the requested station, starting at the requested datetime.
future_predictions, prediction_datetimes = predict_future(
    model=model,
    preprocessor=preprocessor,
    last_sequence=last_sequence,
    station_id=station_id,
    future_datetime=future_datetime,
)


In [None]:
# One row per forecast timestamp, one column per air-quality feature.
future_df = pd.DataFrame(
    data=future_predictions,
    index=prediction_datetimes,
    columns=air_quality_features,
)


In [None]:
print(future_df)

# Show the forecast row for the exact datetime that was requested
specific_prediction = future_df.loc[future_datetime]
print(f"\nPrediction for {future_datetime} at station {station_id}:")
for pollutant, level in zip(specific_prediction.index, specific_prediction.to_numpy()):
    print(f"{pollutant}: {level:.2f}")