In [None]:
# Tabular data and array handling.
import pandas as pd
import numpy as np
# patch_sklearn() is invoked here, before the sklearn imports that follow.
from sklearnex import patch_sklearn, config_context
patch_sklearn()
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
# Keras building blocks for the LSTM forecaster.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import intel_extension_for_tensorflow as itex
# Eager execution is switched off for the whole session before any model is built.
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [None]:
# Read the hourly station readings and turn the 'Datetime' text column into
# real timestamps in a single chained expression.
df = (
    pd.read_csv("station_hour.csv")
    .assign(Datetime=lambda frame: pd.to_datetime(frame['Datetime']))
)

In [None]:
# Break the timestamp into the calendar/clock parts used as model inputs.
# The new columns are appended in the order Month, Year, Hour, Minute.
df = df.assign(
    Month=df['Datetime'].dt.month,
    Year=df['Datetime'].dt.year,
    Hour=df['Datetime'].dt.hour,
    Minute=df['Datetime'].dt.minute,
)

In [None]:
# Columns that are not used as model inputs or targets.
unused_columns = ['NH3', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket']
df = df.drop(columns=unused_columns)

In [None]:
# Pollutant columns whose gaps are replaced by that column's mean over the
# whole frame.
columns_to_fill = ['PM2.5','PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']
column_means = df[columns_to_fill].mean()
df[columns_to_fill] = df[columns_to_fill].fillna(column_means)

In [None]:
# Calendar/clock inputs derived from the timestamp.
time_features = [
    'Month',
    'Year',
    'Hour',
    'Minute',
]
# Pollutant readings: used as inputs and as forecasting targets.
features = [
    'PM2.5',
    'PM10',
    'NO',
    'NO2',
    'NOx',
    'CO',
    'SO2',
    'O3',
    'Benzene',
]

In [None]:
# Numeric columns (time parts followed by pollutants) are standardised; the
# station id is expanded into dense indicator columns. The 'num' transformer
# is listed first, the 'cat' transformer second.
numeric_columns = time_features + features
station_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_columns),
    ('cat', station_encoder, ['StationId']),
])

In [None]:
# Fit the scaler/encoder on the full frame and transform it in one go.
# NOTE(review): this fit happens before the train/test split further down, so
# the scaling statistics are computed from all rows - confirm this is intended.
X = preprocessor.fit_transform(df)

In [None]:
def create_sequences(data, seq_length, target_cols=None):
    """Slice a 2-D matrix into sliding input windows and next-row targets.

    Window ``i`` consists of rows ``i .. i + seq_length - 1`` (all columns);
    its target is taken from row ``i + seq_length``.

    The previous implementation took the target from the *last*
    ``len(features)`` columns. In the matrix produced by ``preprocessor`` those
    are station indicator columns, because the 'num' block (time parts, then
    pollutants) is listed before the 'cat' block. The default target slice now
    addresses the pollutant columns that follow the time columns.

    Windows are built from consecutive rows regardless of what the rows
    represent, so the caller is responsible for ordering/grouping them.

    Parameters
    ----------
    data : array-like, 2-D
        Rows are time steps, columns are model inputs.
    seq_length : int
        Number of rows per input window.
    target_cols : slice or sequence of int, optional
        Column selector for the targets. Defaults to the ``len(features)``
        columns starting at index ``len(time_features)`` (both notebook-level
        lists).

    Returns
    -------
    tuple of np.ndarray
        ``(X, y)`` where ``X`` has one window per entry and ``y`` the matching
        target rows. Both are empty arrays when ``data`` has no more than
        ``seq_length`` rows.
    """
    if target_cols is None:
        first_target = len(time_features)
        target_cols = slice(first_target, first_target + len(features))

    data = np.asarray(data)
    n_windows = len(data) - seq_length
    if n_windows <= 0:
        # Same result the original loop produced when it never iterated.
        return np.array([]), np.array([])

    X = np.stack([data[start:start + seq_length] for start in range(n_windows)])
    # Row seq_length + i is the target of window i.
    y = data[seq_length:][:, target_cols]
    return X, y

# Number of consecutive rows in each input window; also read as a global by
# predict_future further down.
seq_length = 24
# Build the windows and next-row targets from the transformed matrix X.
X_seq, y_seq = create_sequences(X, seq_length)

In [None]:
# Hold out 20% of the windows; random_state fixes the partition.
# NOTE(review): no `shuffle` argument is passed, so the library default
# applies. Neighbouring windows share all but one row, so a shuffled split
# would put near-identical windows on both sides - confirm whether a
# chronological split is wanted. X_test / y_test are not used in the cells
# visible here.
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

In [None]:
# Per-sample input shape: everything after the batch axis of X_train.
input_shape = X_train.shape[1:]

# Two stacked LSTM layers followed by a linear read-out with one unit per
# pollutant column.
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=input_shape, return_sequences=True))
model.add(LSTM(32, activation='relu'))
model.add(Dense(len(features)))

# Hand the network to the Intel extension; the returned object is the one
# that gets compiled, trained and saved below.
itex_model = itex.optimize(model)

In [None]:
# Compile and train inside the sklearnex offload context.
# NOTE(review): config_context comes from sklearnex; whether it influences the
# Keras calls made inside it is not evident from this notebook - confirm.
with config_context(target_offload="gpu:0"):
    itex_model.compile(loss='mse', optimizer='adam')
    itex_model.fit(
        X_train,
        y_train,
        batch_size=32,
        epochs=50,
        verbose=1,
        validation_split=0.1,
    )

In [None]:
# Persist the trained model next to the notebook.
itex_model.save('lstm_hourly_model.h5')

In [None]:
from datetime import datetime, timedelta

# Column groups, in the order the 'num' transformer receives them
# (time parts first, pollutants second).
features = ['PM2.5','PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']
time_features = ['Month', 'Year', 'Hour', 'Minute']

def predict_future(model, preprocessor, last_sequence, station, future_datetime, num_hours=24, seq_len=None):
    """Forecast pollutant values hour by hour by rolling the model forward.

    The most recent ``seq_len`` observations are scaled and used as the first
    input window. Each step predicts the next row from the current window,
    then appends a row built from (scaled time parts of that step, the
    prediction, the station indicator columns) and drops the oldest row.

    The previous version appended a placeholder row with zeroed pollutant
    values *before* calling ``model.predict``, so the network was queried on
    windows unlike those produced by ``create_sequences`` and the oldest real
    observation was discarded before the first forecast.

    Transformed rows are assumed to be laid out as
    ``[time parts | pollutants | station indicators]``, matching the
    'num'-then-'cat' order used when the preprocessor was built.

    Parameters
    ----------
    model : object
        Exposes ``predict(batch)``; called with a batch of one window and
        expected to return one row of ``len(features)`` values.
    preprocessor : object
        Fitted transformer exposing ``transform`` and
        ``named_transformers_['num']`` / ``named_transformers_['cat']``.
    last_sequence : array-like, 2-D
        Unscaled recent observations with columns ``time_features + features``.
    station : str
        Station id written into the ``StationId`` column.
    future_datetime : datetime
        Timestamp assigned to the first forecast; later ones follow hourly.
    num_hours : int, default 24
        Number of forecast steps.
    seq_len : int, optional
        Window length. Defaults to the notebook-level ``seq_length``.

    Returns
    -------
    tuple
        ``(predictions, datetimes)``: an array with one row per step and one
        column per entry of ``features``, mapped back through the numeric
        scaler's ``inverse_transform``, and the list of matching timestamps.

    Raises
    ------
    ValueError
        If ``last_sequence`` holds fewer than ``seq_len`` rows.
    """
    if seq_len is None:
        seq_len = seq_length  # window length defined with the training cells

    n_time, n_feat = len(time_features), len(features)
    future_datetimes = [future_datetime + timedelta(hours=i) for i in range(num_hours)]
    if not future_datetimes:
        return np.empty((0, n_feat)), future_datetimes

    last_sequence_df = pd.DataFrame(last_sequence, columns=time_features + features)
    last_sequence_df['StationId'] = station
    history = np.asarray(preprocessor.transform(last_sequence_df), dtype=float)
    if len(history) < seq_len:
        raise ValueError(
            f"need at least {seq_len} rows of history, got {len(history)}"
        )
    window = history[-seq_len:]

    num_scaler = preprocessor.named_transformers_['num']
    # The station encoding is identical at every step, so compute it once.
    station_cols = np.asarray(preprocessor.named_transformers_['cat'].transform([[station]]))[0]
    # Scale the time parts of every forecast step in one call; the pollutant
    # slots are padding so the row width matches what the scaler was fitted on.
    raw_time_rows = [
        [dt.month, dt.year, dt.hour, dt.minute] + [0] * n_feat
        for dt in future_datetimes
    ]
    scaled_times = np.asarray(num_scaler.transform(raw_time_rows))[:, :n_time]

    future_predictions = []
    for step in range(len(future_datetimes)):
        # Predict from a complete window, exactly as during training.
        prediction = np.asarray(model.predict(window[np.newaxis, :, :]))[0]
        future_predictions.append(prediction)
        # Slide the window: drop the oldest row, append the forecast row.
        next_row = np.concatenate([scaled_times[step], prediction, station_cols])
        window = np.vstack([window[1:], next_row])

    future_predictions = np.array(future_predictions)
    # The scaler expects time + pollutant columns; pad the time slots with
    # zeros and keep only the pollutant part of the inverse transform.
    padded = np.column_stack((np.zeros((len(future_predictions), n_time)), future_predictions))
    future_predictions_inv = num_scaler.inverse_transform(padded)[:, -n_feat:]

    return future_predictions_inv, future_datetimes


In [None]:
# Station to forecast for and the timestamp of the first forecast step.
station = 'AP001'
future_date, future_time = '2024-09-27', '12:00:00'

# Join date and time with a single space and parse them together.
future_datetime = datetime.strptime(" ".join((future_date, future_time)), "%Y-%m-%d %H:%M:%S")

In [None]:
# Reload the raw readings and rebuild the columns predict_future expects.
# A freshly read file has neither the derived time columns nor imputed
# pollutant values, so selecting `time_features + features` from it directly
# raises KeyError and any remaining NaNs would flow into the forecast.
df = pd.read_csv("station_hour.csv")
df['Datetime'] = pd.to_datetime(df['Datetime'])
df['Month'] = df['Datetime'].dt.month
df['Year'] = df['Datetime'].dt.year
df['Hour'] = df['Datetime'].dt.hour
df['Minute'] = df['Datetime'].dt.minute
# Same whole-file mean imputation that was applied before training.
df[features] = df[features].fillna(df[features].mean())

station_history = df[df['StationId'] == station].sort_values('Datetime')
if len(station_history) < seq_length:
    raise ValueError(
        f"station {station!r} has {len(station_history)} rows; "
        f"{seq_length} are needed to seed the forecast"
    )
# Most recent seq_length observations, unscaled, as a plain array.
last_sequence = station_history.iloc[-seq_length:]
last_sequence = last_sequence[time_features + features].values


In [None]:
# Forecast with the object that was actually compiled, trained and saved
# (`itex_model`); the earlier call passed the pre-optimisation `model`.
future_predictions, prediction_datetimes = predict_future(itex_model, preprocessor, last_sequence, station, future_datetime)

In [None]:
# One row per forecast timestamp, one column per pollutant.
future_df = pd.DataFrame(future_predictions, columns=features, index=prediction_datetimes)

In [None]:
# Full forecast table first.
print(future_df)

# Then the single row for the requested timestamp, one pollutant per line.
specific_prediction = future_df.loc[future_datetime]
print(f"\nPrediction for {future_datetime} at station {station}:")
for feature, value in zip(specific_prediction.index, specific_prediction.to_numpy()):
    print(f"{feature}: {value:.2f}")