In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pandas as pd


In [None]:
# Load the data
data = pd.read_csv('AQI.csv')
data['time'] = pd.to_datetime(data['time'])
# NOTE: 'time' deliberately stays a regular column. The previous
# `data.set_index('time')` call discarded its result (a no-op), and the later
# cells read `train['time']`, which requires the column to exist.

fold = 3

# Split into training, validation, and test sets according to time.
# fold -> ((train start, train end), (validation start, validation end));
# every bound is inclusive. The training window grows by one year per fold.
FOLD_RANGES = {
    1: (('2018-01-01 00:00:00', '2018-12-31 23:00:00'),
        ('2019-01-01 00:00:00', '2019-12-31 23:00:00')),
    2: (('2018-01-01 00:00:00', '2019-12-31 23:00:00'),
        ('2020-01-01 00:00:00', '2020-12-31 23:00:00')),
    3: (('2018-01-01 00:00:00', '2020-12-31 23:00:00'),
        ('2021-01-01 00:00:00', '2021-12-31 23:00:00')),
}
# The test period is the same for every fold.
TEST_RANGE = ('2022-01-01 00:00:00', '2022-12-31 23:00:00')


def select_period(frame, start, end):
    """Return the rows of ``frame`` whose 'time' lies in ``[start, end]`` (inclusive)."""
    stamps = frame['time']
    return frame[(stamps >= start) & (stamps <= end)]


# Fail here with a clear message instead of a NameError on `train` further down.
if fold not in FOLD_RANGES:
    raise ValueError(f'Unknown fold {fold!r}; expected one of {sorted(FOLD_RANGES)}')

train_range, validation_range = FOLD_RANGES[fold]
train = select_period(data, *train_range)
validation = select_period(data, *validation_range)
test = select_period(data, *TEST_RANGE)

print(f'Train shape: {train.shape}')
print(f'Validation shape: {validation.shape}')
print(f'Test shape: {test.shape}')


In [None]:
# Standardize the model inputs. Every column except 'time' is used as a
# feature, so the PM2.5 column is part of the input matrix as well.
# The scaler statistics come from the training split only and are then
# applied unchanged to the validation and test splits.
train_feature_frame = train.drop(columns=['time'])
validation_feature_frame = validation.drop(columns=['time'])
test_feature_frame = test.drop(columns=['time'])

feature_scaler = StandardScaler()
train_features_scaled = feature_scaler.fit_transform(train_feature_frame)
validation_features_scaled = feature_scaler.transform(validation_feature_frame)
test_features_scaled = feature_scaler.transform(test_feature_frame)

# Standardize the target (PM2.5) with its own scaler so that predictions can
# be mapped back to original units later via target_scaler.inverse_transform.
target_column = ['PM2.5']
target_scaler = StandardScaler()
train_target_scaled = target_scaler.fit_transform(train[target_column])
validation_target_scaled = target_scaler.transform(validation[target_column])
test_target_scaled = target_scaler.transform(test[target_column])

# Prepare the data for LSTM
def create_sequences(features, target, seq_length, pred_length):
    """Cut aligned feature/target arrays into sliding windows.

    Window ``i`` pairs ``features[i:i + seq_length]`` with the ``pred_length``
    rows of ``target`` that immediately follow it.

    Args:
        features: Sliceable sequence of input rows.
        target: Sliceable sequence of target rows, aligned with ``features``.
        seq_length: Number of rows in each input window.
        pred_length: Number of target rows to predict per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding the stacked input windows and
        the stacked target windows. Both are empty when the input is too short
        to produce a single window.
    """
    window_count = len(features) - seq_length - pred_length + 1
    starts = range(window_count)
    inputs = [features[s:s + seq_length] for s in starts]
    outputs = [target[s + seq_length:s + seq_length + pred_length] for s in starts]
    return np.array(inputs), np.array(outputs)

# Window sizes, in rows of the input data: the model sees `seq_length` rows
# and predicts the following `pred_length` target rows.
seq_length = 120
pred_length = 12

X_train, y_train = create_sequences(train_features_scaled, train_target_scaled, seq_length, pred_length)
X_val, y_val = create_sequences(validation_features_scaled, validation_target_scaled, seq_length, pred_length)
X_test, y_test = create_sequences(test_features_scaled, test_target_scaled, seq_length, pred_length)

# create_sequences already returns X as [samples, time steps, features], so no
# reshape is needed (the previous reshape calls reshaped each array to its own
# shape). What can go wrong is a split that is too short to yield any window:
# the stacked array is then empty and 1-D, so report that explicitly instead
# of failing later with an IndexError on `.shape[2]`.
for split_name, windows in (('train', X_train), ('validation', X_val), ('test', X_test)):
    if windows.ndim != 3:
        raise ValueError(
            f'{split_name} split is too short to build a window with '
            f'seq_length={seq_length} and pred_length={pred_length}'
        )

print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_val shape: {X_val.shape}, y_val shape: {y_val.shape}')

In [None]:
# Model Definition
# Two stacked LSTM layers followed by dropout and a dense head that emits all
# `pred_length` forecast steps at once.
model = Sequential([
    Input(shape=(seq_length, X_train.shape[2])),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(300, activation='softsign', return_sequences=True),
    # The second LSTM returns only its final output, which feeds the dense head.
    LSTM(300, activation='softsign', return_sequences=False),
    Dropout(0.5),
    Dense(pred_length, kernel_regularizer=tf.keras.regularizers.l2(0.03))
])

# Compile the model with MAE as an additional metric
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error', metrics=['mae'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()


In [None]:
# Early stopping and learning rate reduction callbacks.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights of the last epoch, i.e. the
# ones produced after `patience` epochs without improvement.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1, min_lr=1e-10)

# Measure the running time with a monotonic clock. Importing only
# perf_counter avoids rebinding the name `time`, which the data-loading cell
# uses for the timestamp column.
from time import perf_counter
start = perf_counter()

# Training the Model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr])

# Making Predictions
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

# NOTE: the measured interval covers training plus the three predict() calls.
end = perf_counter()
running_time = end - start
print(f'Running time: {running_time}')

In [None]:
# Persist the predictions and the matching targets as .npy files. The file
# names carry the window configuration so runs with different settings do not
# overwrite each other.
npy_suffix = str(seq_length)+'_'+str(pred_length)+'.npy'

np.save('y_pred_train_' + npy_suffix, y_pred_train)
np.save('y_pred_val_' + npy_suffix, y_pred_val)
np.save('y_pred_test_' + npy_suffix, y_pred_test)

np.save('y_train_' + npy_suffix, y_train)
np.save('y_val_' + npy_suffix, y_val)
np.save('y_test_' + npy_suffix, y_test)

In [None]:
# Reload the saved predictions for the current window configuration.
# Only the predictions are reloaded here; the targets are not.
npy_suffix = str(seq_length)+'_'+str(pred_length)+'.npy'

y_pred_train = np.load('y_pred_train_' + npy_suffix)
y_pred_val = np.load('y_pred_val_' + npy_suffix)
y_pred_test = np.load('y_pred_test_' + npy_suffix)


In [None]:
# Flatten each target array to 2-D using its first two dimensions
# (samples, forecast steps), matching the layout of the model output.
y_train = y_train.reshape(y_train.shape[:2])
y_val = y_val.reshape(y_val.shape[:2])
y_test = y_test.reshape(y_test.shape[:2])

# Report the resulting shapes
print(f'y_train shape: {y_train.shape}')
print(f'y_val shape: {y_val.shape}')
print(f'y_test shape: {y_test.shape}')

In [None]:
# Map targets and predictions from standardized values back to the original
# PM2.5 scale.
# NOTE: this cell overwrites its own inputs, so running it a second time
# without re-running the cells above applies the inverse transform twice.
to_original_scale = target_scaler.inverse_transform

y_train = to_original_scale(y_train)
y_val = to_original_scale(y_val)
y_test = to_original_scale(y_test)

y_pred_train = to_original_scale(y_pred_train)
y_pred_val = to_original_scale(y_pred_val)
y_pred_test = to_original_scale(y_pred_test)

In [None]:
# Attach timestamps to the predictions. Row i is labelled with the timestamp
# at position seq_length + i of its split, i.e. the first forecast step of
# window i; the remaining columns are the following forecast steps.
train_pred_times = train['time'].iloc[seq_length:seq_length + len(y_pred_train)]
val_pred_times = validation['time'].iloc[seq_length:seq_length + len(y_pred_val)]
test_pred_times = test['time'].iloc[seq_length:seq_length + len(y_pred_test)]

y_pred_train = pd.DataFrame(y_pred_train, index=train_pred_times)
y_pred_val = pd.DataFrame(y_pred_val, index=val_pred_times)
y_pred_test = pd.DataFrame(y_pred_test, index=test_pred_times)


In [None]:
# Attach timestamps to the targets, using the same labelling as for the
# predictions: row i carries the timestamp at position seq_length + i.
train_target_times = train['time'].iloc[seq_length:seq_length + len(y_train)]
val_target_times = validation['time'].iloc[seq_length:seq_length + len(y_val)]
test_target_times = test['time'].iloc[seq_length:seq_length + len(y_test)]

y_train = pd.DataFrame(y_train, index=train_target_times)
y_val = pd.DataFrame(y_val, index=val_target_times)
y_test = pd.DataFrame(y_test, index=test_target_times)

In [None]:
# Evaluate the model with RMSE and MAE on the training and validation splits.
def _rmse_and_mae(actual, predicted):
    """Return ``(RMSE, MAE)`` of ``predicted`` against ``actual``."""
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    return rmse, mae

train_rmse, train_mae = _rmse_and_mae(y_train, y_pred_train)
val_rmse, val_mae = _rmse_and_mae(y_val, y_pred_val)

# When calculating RMSE and MAE for the test set, missing values are excluded.
# Load the per-row missing-value flags.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
missing_indexes = pd.read_csv('C:/Users/zhang/Desktop/Thesis/Data/missing_indexes.csv')
# Only use the test set: keep the rows labelled 35064 and above.
# 35064 = (3 * 365 + 366) * 24, the number of hourly rows in 2018-2021;
# presumably the file has one row per hour starting 2018-01-01 — TODO confirm.
missing_indexes = missing_indexes.loc[35064:]
# Drop the timestamp column so only the flag column remains.
missing_indexes = missing_indexes.drop('time', axis=1)


def create_missing_windows(flags, pred_length, seq_length):
    """Window the missing-value flags exactly like the forecast targets.

    Window ``i`` holds ``flags[i + seq_length:i + seq_length + pred_length]``,
    so it lines up element by element with row ``i`` of the target windows.
    Named differently from ``create_sequences`` on purpose: reusing that name
    replaced the 4-argument version and broke re-running the windowing cell.

    Args:
        flags: Array of flags with one row per time step.
        pred_length: Number of forecast steps per window.
        seq_length: Number of input steps preceding each window.

    Returns:
        NumPy array of the stacked flag windows.
    """
    window_count = len(flags) - seq_length - pred_length + 1
    return np.array([
        flags[i + seq_length:i + seq_length + pred_length]
        for i in range(window_count)
    ])


missing_indexes_sequence = create_missing_windows(missing_indexes.to_numpy(), pred_length, seq_length)
# Flatten to (windows, forecast steps)
missing_indexes_sequence = missing_indexes_sequence.reshape(missing_indexes_sequence.shape[0], missing_indexes_sequence.shape[1])

# Mask missing values element-wise on plain NumPy arrays. Indexing a DataFrame
# with a 2-D boolean ndarray does not mask elements (pandas treats the array
# as a row indexer), so the arrays are extracted first.
# Assumes the flags are booleans or 0/1 with truthy = missing — TODO confirm.
valid_mask = ~missing_indexes_sequence.astype(bool)
y_test_values = np.asarray(y_test)
y_pred_test_values = np.asarray(y_pred_test)
if valid_mask.shape != y_test_values.shape:
    raise ValueError(
        f'missing-value mask shape {valid_mask.shape} does not match '
        f'test target shape {y_test_values.shape}'
    )

test_errors = y_test_values[valid_mask] - y_pred_test_values[valid_mask]
# float() keeps the metrics plain scalars so the format specs below work.
test_rmse = float(np.sqrt(np.mean(test_errors ** 2)))
test_mae = float(np.mean(np.abs(test_errors)))

print(f'Train RMSE: {train_rmse:.2f}, Train MAE: {train_mae:.2f}')
print(f'Validation RMSE: {val_rmse:.2f}, Validation MAE: {val_mae:.2f}')
print(f'Test RMSE: {test_rmse:.2f}, Test MAE: {test_mae:.2f}')

In [None]:
# Append the metrics of this run to results.csv
import csv
# newline='' is required by the csv module; without it the writer's own line
# endings get translated again and blank rows appear on Windows.
with open('results.csv', mode='a', newline='') as file:
    writer = csv.writer(file)
    # Write the header only when the file is still empty (position 0 in append mode)
    if file.tell() == 0:
        writer.writerow(['seq_length', 'pred_length', 'train_rmse', 'train_mae', 'val_rmse', 'val_mae', 'test_rmse', 'test_mae', 'running_time'])
    writer.writerow([seq_length, pred_length, train_rmse, train_mae, val_rmse, val_mae, test_rmse, test_mae, running_time])


In [None]:
# Collapse every forecast window to a single value: the mean over its
# forecast steps (axis=1), giving one value per window.
y_pred_train_avg = pd.DataFrame({'PM2.5': y_pred_train.mean(axis=1)})
y_pred_val_avg = pd.DataFrame({'PM2.5': y_pred_val.mean(axis=1)})
y_pred_test_avg = pd.DataFrame({'PM2.5': y_pred_test.mean(axis=1)})

In [None]:
# Same reduction for the true values: mean over the forecast steps of each
# window (axis=1), giving one value per window.
y_train_avg = pd.DataFrame({'PM2.5': y_train.mean(axis=1)})
y_val_avg = pd.DataFrame({'PM2.5': y_val.mean(axis=1)})
y_test_avg = pd.DataFrame({'PM2.5': y_test.mean(axis=1)})


In [None]:
# Trend plot (train): window-averaged true values against predictions.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_train_avg, label='True')
ax.plot(y_pred_train_avg, 'r', label='Predicted')
ax.set_title('AQI training Prediction')
legend = ax.legend(loc='upper left', shadow=True, fontsize='x-large')

# Write the figure to disk; the file name encodes the window configuration.
fig.savefig('train_trend_' + str(seq_length) + '_' + str(pred_length) + '.png')

In [None]:
# Trend plot (validation): window-averaged true values against predictions,
# drawn as lines.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_val_avg, label='True')
ax.plot(y_pred_val_avg, 'r', label='Predicted')
ax.set_title('AQI validation Prediction')
ax.legend()

# Write the figure to disk; the file name encodes the window configuration.
fig.savefig('val_trend_' + str(seq_length) + '_' + str(pred_length) + '.png')

In [None]:
# Trend plot (test): window-averaged true values against predictions,
# drawn as lines.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_test_avg, label='True')
ax.plot(y_pred_test_avg, 'r', label='Predicted')
ax.set_title('AQI test Prediction')
ax.legend()

# Write the figure to disk; the file name encodes the window configuration.
fig.savefig('test_trend_' + str(seq_length) + '_' + str(pred_length) + '.png')

In [None]:
# Loss curve: training and validation loss per epoch, as recorded by fit().
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set_title('Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
# Write the figure to disk; the file name encodes the window configuration.
fig.savefig('loss_curve_' + str(seq_length) + '_' + str(pred_length) + '.png')