# COMP809 Assignment2-Part B
# Prediction of PM2.5 with Multi-layer Perceptron (MLP) and Long Short-Term Memory (LSTM)



*    Group 43
*    Yize(Serena) Wang, Student ID: 23198583
*    GuangLiang(Ricky) Yang, Student ID: 23205919






This notebook only contains code, please refer to the running results:

1.  Full dataset with linear interpolation
https://drive.google.com/file/d/1CSe6pAywdErIWvxe9iS2EmOzyStftXNl/view?usp=sharing

2. Full dataset with KNN interpolation
https://drive.google.com/file/d/188dLOxdTJKS_VN1HGgyPWTXC6rNsB8fM/view?usp=drive_link

3. High-quality dataset with linear interpolation
https://drive.google.com/file/d/1axAkZ6PKk0mNjPIxWoovJBLoC2zgArjx/view?usp=drive_link


# **Project Workflow**




1. **Pre-Processing**
    * Data Exploration
    * Data Cleaning
    * Extract New Features
    * Converting Categorical Data Types
    * Final Dataset

2. **Feature Selection**
    * Correlation Analysis
    * Visualization

3. **Experimental Methods**
    * Normalization Data
    * Data Segment

4. **Multilayer Perceptron (MLP)**
    * MLP Description
    * Learning Rate Analysis
    * Neurons Distribution Analysis
    * MLP Conclusion

5. **Long Short-Term Memory (LSTM)**
    * LSTM Introduction
    * Cost Function Analysis
    * Best Epoch Analysis
    * Batch Size Analysis
    * Number of Neurons in Hidden Layer Analysis
    * LSTM Conclusion

6. **Model Comparison**
    * Visual Prediction Comparison
    * Performance Comparison


In [None]:
# Mount Google Drive so the dataset folder is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Base folder of the CSV exports; the loading cell appends a file name to it.
path = '/content/drive/MyDrive/Colab Notebooks/DataMining/report-dataMining/data/'

# **Step1: Pre-Processing**

## **1.1 Data Exploration.**

In [None]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from tabulate import tabulate

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import  StratifiedKFold,KFold
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report, mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

# For confusion_matrix plot
import itertools

# Keras libraries for LSTM
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from keras.optimizers import Adam
from keras.callbacks import Callback
from keras.losses import MeanSquaredError, MeanAbsoluteError, MeanSquaredLogarithmicError, Huber

from tensorflow.keras.regularizers import l1, l2


In [None]:
# Control flag deciding whether the hyperparameter computations are run.
# NOTE(review): the only reference visible in this part of the notebook is a
# commented-out `if CALCU_HYPER_PARAMETERS:` guard - confirm where it is honoured.
CALCU_HYPER_PARAMETERS = True  # toggle between True and False by hand
# How many times the model is run to train
MODEL_RUN_COUNT = 30
# LSTM sliding-window length; also used to offset the test-set time index
# in the train/test split cell.
N_STEPS = 2

### Import Dataset

In [None]:
# Load the raw export; swap the file name to switch between the two datasets below.
data=pd.read_csv(path+'BulkExport-7-20240606201329-clean-excel-01.csv')
# BulkExport-7-20240606201329-clean-excel-01.csv : full dataset, not cleaned
# BulkExport-7-20240606201329-clean-excel-02.csv : cleaned subset, 2022/3/18 - 2022/6/6

In [None]:
# Print column dtypes and non-null counts of the raw data
data.info()

In [None]:
# Preview the first rows of the raw data
data.head()

### exploration

In [None]:
def show_time_series_hist_plot(df, columns_to_plot=None):
    """Draw one histogram per requested column of ``df`` on a 2x4 grid.

    Args:
        df: DataFrame holding the columns to plot; NaN values are dropped
            per column before binning.
        columns_to_plot: Column names to plot. Defaults to the eight measured
            variables. Only the first eight names fit on the grid; any further
            names are ignored.
    """
    if columns_to_plot is None:
        columns_to_plot = ["PM2.5", "SO2", "NO", "NO2", "Temp", "Humidity", "Wind_Dir", "Wind_Speed"]

    # Create the subplots
    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
    fig.suptitle('Data Distribution', fontsize=16)

    # Flatten the axes array for easier iteration
    axes = axes.flatten()

    # Define units for each column
    units = {
        "PM2.5": "μg/m3",
        "SO2": "μg/m3",
        "NO": "μg/m3",
        "NO2": "μg/m3",
        "Temp": "°C",
        "Humidity": "%",
        "Wind_Dir": "Degrees",
        "Wind_Speed": "m/s"
    }

    # Plot each column from the frame that was passed in (not a global),
    # so the function shows whatever dataset the caller hands over.
    for ax, column in zip(axes, columns_to_plot):
        ax.hist(df[column].dropna(), bins=30, edgecolor='yellow', alpha=0.7)
        ax.set_title(f'Distribution of {column}')
        unit = units.get(column)
        # Columns without a known unit are labelled by name only
        ax.set_xlabel(f'{column} ({unit})' if unit else column)
        ax.set_ylabel('Count')

    # Hide grid cells that received no histogram
    for ax in axes[len(columns_to_plot):]:
        ax.set_visible(False)

    # Adjust layout
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

show_time_series_hist_plot(data)

### Data Type convert

In [None]:

# Build the working frame `df` from the loaded `data`
df = pd.DataFrame(data)

# Parse both timestamp columns (layout: YYYY/MM/DD HH:MM)
for time_col in ('start_time', 'end_time'):
    df[time_col] = pd.to_datetime(df[time_col], format='%Y/%m/%d %H:%M')

# Measurement columns that must end up as floats
columns_to_convert = ['Temp', 'Humidity', 'Wind_Speed', 'Wind_Dir', 'NO', 'NO2', 'SO2', 'PM2.5']

# Coerce unparseable entries to NaN, then force a float dtype, in one pass
df[columns_to_convert] = (
    df[columns_to_convert]
    .apply(pd.to_numeric, errors='coerce')
    .astype(float)
)

# Resulting dtypes, followed by the NaN count per converted column
print(df.info())
print(df[columns_to_convert].isna().sum())

### Data Check

In [None]:
# Basic sanity checks on the working frame
print("Number of observation: ", df.shape[0])  # row count
print("Any NA value:", df.isnull().values.any())  # any missing value at all?
print("Any row duplictaes:",df.duplicated().any())  # any fully duplicated row?


# Human-readable description of the validity rule for each column
error_checks = {
    'Temp': "Temperature should be between -20°C and 40°C",
    'Humidity': "Humidity should be between 0% and 100%",
    'Wind_Speed': "Wind speed should be between 0 and 60 m/s",
    'Wind_Dir': "Wind direction should be between 0 and 360 degrees",
    'NO': "NO should be non-negative",
    'NO2': "NO2 should be non-negative",
    'SO2': "SO2 should be non-negative",
    'PM2.5': "PM2.5 should be non-negative"
}

# Vectorised predicates: each returns a boolean mask that is True where the rule is violated
error_functions = {
    'Temp': lambda x: (x < -20) | (x > 40),
    'Humidity': lambda x: (x < 0) | (x > 100),
    'Wind_Speed': lambda x: (x < 0) | (x > 60),
    'Wind_Dir': lambda x: (x < 0) | (x >= 360),
    'NO': lambda x: x < 0,
    'NO2': lambda x: x < 0,
    'SO2': lambda x: x < 0,
    'PM2.5': lambda x: x < 0
}

# One entry per rule, in the order the rules are declared
columns = list(error_checks)
rules = list(error_checks.values())
error_counts = [error_functions[name](df[name]).sum() for name in columns]
percentages = [(count / len(df)) * 100 for count in error_counts]

# Assemble the violation report
error_data = pd.DataFrame({
    'Column Name': columns,
    'Rule': rules,
    'Error Count': error_counts,
    'Percentage': percentages
})

print(error_data)



### **Missing Values.**

In [None]:
# Missing values per column, as a count and as a share of all rows
missing_values_count = df.isnull().sum()
missing_values_percentage = (missing_values_count / len(df)) * 100

# Report table, columns with the most gaps first
missing_data = pd.DataFrame({
    'Missing Values': missing_values_count,
    'Percentage': missing_values_percentage
}).sort_values(by='Missing Values', ascending=False)

print(missing_data)


### **Invalid Data**

In [None]:
# Function to replace error values with NaN based on specified conditions
def replace_with_nan(df, column, condition):
    """Blank out, in place, every value of ``df[column]`` that fails ``condition``.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column to clean.
        condition: Scalar predicate called once per value; a truthy result
            keeps the value, a falsy result replaces it with NaN.

    Returns:
        None. ``df[column]`` is reassigned on the frame that was passed in.
    """
    keep = df[column].map(condition).astype(bool)
    # `where` keeps the column numeric even when every value is rejected;
    # substituting None per element can leave an object-dtype column behind.
    df[column] = df[column].where(keep)

# Define the error conditions for each column.
# Each lambda returns True for a VALID scalar value; NaN fails every
# comparison, so values that are already missing stay missing.
error_conditions = {
    'Temp': lambda x: -20 <= x <= 40,
    'Humidity': lambda x: 0 <= x <= 100,
    'Wind_Speed': lambda x: 0 <= x <= 60,
    'Wind_Dir': lambda x: 0 <= x < 360,
    'NO': lambda x: x >= 0,
    'NO2': lambda x: x >= 0,
    'SO2': lambda x: x >= 0,
    'PM2.5': lambda x: x >= 0,
}

# Replace error values with NaN for each column (mutates df in place)
for column, condition in error_conditions.items():
    replace_with_nan(df, column, condition)

# Forward fill is left commented out; missing values are imputed in the
# interpolation / KNN cell further down.
# df.fillna(method='ffill', inplace=True)

# Display the number of error values for each column after replacement.
# Every count should now be 0, because NaN never satisfies `<`, `>` or `>=`.
print("Number of error values in 'Temp':", ((df['Temp'] < -20) | (df['Temp'] > 40)).sum())
print("Number of error values in 'Humidity':", ((df['Humidity'] < 0) | (df['Humidity'] > 100)).sum())
print("Number of error values in 'Wind_Speed':", ((df['Wind_Speed'] < 0) | (df['Wind_Speed'] > 60)).sum())
print("Number of error values in 'Wind_Dir':", ((df['Wind_Dir'] < 0) | (df['Wind_Dir'] >= 360)).sum())
print("Number of error values in 'NO':", (df['NO'] < 0).sum())
print("Number of error values in 'NO2':", (df['NO2'] < 0).sum())
print("Number of error values in 'SO2':", (df['SO2'] < 0).sum())
print("Number of error values in 'PM2.5':", (df['PM2.5'] < 0).sum())


# Last expression of the cell: NaN count per column after the replacement
df.isnull().sum()

### **Outliers**

In [None]:

def analyze_outliers(df):
    """Report and plot IQR-based outliers for the numeric columns of ``df``.

    Prints a table with the outlier count and percentage per column, draws a
    box plot of all numeric columns, then draws one time-series panel per
    column with the lower/upper outlier limits overlaid. A value is an outlier
    when it lies more than 1.5 * IQR below the first or above the third
    quartile. ``df`` is not modified.

    Args:
        df: DataFrame with a ``start_time`` column (x-axis of the line plots)
            and the measurement columns. ``start_time`` and ``end_time`` are
            excluded from the statistics when present.
    """

    def plot_parameters(frame, parameters, lower_bound, upper_bound):
        """Draw one time-series panel per parameter with its outlier limits."""
        parameters = list(parameters)
        if not parameters:
            return  # nothing to draw

        cols = 2
        rows = (len(parameters) + 1) // cols
        fig, axes = plt.subplots(rows, cols, figsize=(14, 5 * rows), sharex=True)
        axes = axes.flatten()  # Flatten the 2D array of axes to 1D

        for ax, parameter in zip(axes, parameters):
            # Parameter trend over time with upper and lower limits
            ax.plot(frame["start_time"], frame[parameter], label=parameter, color='blue')
            ax.axhline(y=upper_bound[parameter], color='red', linewidth=1, label='Upper Limit')
            ax.axhline(y=lower_bound[parameter], color='green', linewidth=1, label='Lower Limit')
            ax.set_ylabel(f"{parameter} (unit)")  # Change the unit according to the parameter
            ax.legend()

        # Remove the spare panel left over when the parameter count is odd
        for ax in axes[len(parameters):]:
            fig.delaxes(ax)

        plt.xlabel("Time")
        plt.suptitle("Line plot of measured parameters with extreme values thresholds")
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()

    # Remove the time columns for analysis if they exist
    time_columns = ['start_time', 'end_time']
    df_without_time = df.drop(columns=[col for col in time_columns if col in df.columns])
    # Keep numeric columns only: the quartile summary covers just those, so
    # any other column would have no bounds to be compared against.
    df_without_time = df_without_time.select_dtypes(include='number')

    # Statistical summary (named `summary` so the scipy `stats` import is not shadowed)
    summary = df_without_time.describe().T

    # Interquartile range and the resulting outlier limits per column
    IQR = summary['75%'] - summary['25%']
    lower_bound = summary['25%'] - 1.5 * IQR
    upper_bound = summary['75%'] + 1.5 * IQR

    # Number and share of outliers for each column
    columns = list(df_without_time.columns)
    outlier_counts = [
        ((df_without_time[name] < lower_bound[name]) | (df_without_time[name] > upper_bound[name])).sum()
        for name in columns
    ]
    percentages = [(count / len(df)) * 100 for count in outlier_counts]

    outlier_data = pd.DataFrame({
        'Column Name': columns,
        'Outlier Count': outlier_counts,
        'Percentage': percentages
    })

    # Print the outlier data
    print(outlier_data)

    # Generate the boxplot for all parameters
    fig, ax = plt.subplots(figsize=(15, 6))
    df_without_time.boxplot(ax=ax)

    # Set the title and labels of the plot
    plt.title("Boxplot of Air Quality and Meteorological Data", fontsize=16, fontweight='bold')
    plt.xlabel("Measured Parameters", fontsize=14, fontweight='bold')
    plt.ylabel("Observed Values", fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, fontsize=12)
    plt.yticks(fontsize=12)

    # Display the plot
    plt.show()

    # Generate combined plot for all parameters
    plot_parameters(df, columns, lower_bound, upper_bound)


# show
analyze_outliers(df)

In [None]:

# Summary statistics of the measurement columns (time columns excluded),
# one row per column
df_without_time = df.drop(columns=['start_time','end_time'])
df_without_time.describe().T



## **1.2 Data Cleaning.**


In [None]:
# Define invalid data conditions
# (vectorised: each lambda takes a whole column and returns True where the value is INVALID)
invalid_conditions = {
    'Temp': lambda x: (x < -20) | (x > 40),
    'Humidity': lambda x: (x < 0) | (x > 100),
    'Wind_Speed': lambda x: (x < 0) | (x > 60),
    'Wind_Dir': lambda x: (x < 0) | (x >= 360),
    'NO': lambda x: x < 0,
    'NO2': lambda x: x < 0,
    'SO2': lambda x: x < 0,
    'PM2.5': lambda x: x < 0
}

# Replace invalid data with NaN
for col, condition in invalid_conditions.items():
    df.loc[condition(df[col]), col] = np.nan

# Remove the time columns for analysis.
# This is a snapshot taken BEFORE the outlier replacement below; the outlier
# masks are evaluated against it.
df_without_time = df.drop(columns=['start_time', 'end_time'])

# Describe the DataFrame to get the statistical summary
# NOTE: this rebinds the name `stats`, which the import cell bound to scipy.stats.
stats = df_without_time.describe().T

# Calculate the IQR (Interquartile Range) for each column
IQR = stats['75%'] - stats['25%']

# Define the lower and upper bounds for outliers
lower_bound = stats['25%'] - 1.5 * IQR
upper_bound = stats['75%'] + 1.5 * IQR

# Replace outliers with NaN.
# The rule is applied to every non-time column, Wind_Dir included.
for column in df_without_time.columns:
    df.loc[(df_without_time[column] < lower_bound[column]) | (df_without_time[column] > upper_bound[column]), column] = np.nan

# Verify the updated dataset
print(df.info())


### Result after replacing invalid data with NaN

In [None]:

# Box plot of the frame after invalid values and outliers were set to NaN
fig, ax = plt.subplots(figsize=(15, 6))
df.boxplot(ax=ax)

# Title and axis labels share the same bold weight
bold = {'fontweight': 'bold'}
plt.title("Boxplot of Air Quality and Meteorological Data", fontsize=16, **bold)
plt.xlabel("Measured Parameters", fontsize=14, **bold)
plt.ylabel("Observed Values", fontsize=14, **bold)

# Tick styling: rotate the x labels so the column names stay readable
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

plt.show()

### Generate a complete hourly frequency data source

In [None]:

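# Title: Checking for Continuous Time Series and Inserting Missing Time Points
# Description: This step should check whether the 'start_time' values in the dataframe are continuous
# and, if they are not, insert the missing time points at hourly frequency.
# NOTE: the implementation is not included in this cell; gaps in 'start_time' are only
# guarded against later, when the lag features are validated against their timestamps.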

In [None]:

# Summary statistics of the measurement columns after cleaning (time columns excluded)
df_without_time = df.drop(columns=['start_time','end_time'])
df_without_time.describe().T


### Data Imputation Method Comparison

In [None]:


# Function to show time series plot
def show_time_series_plot(df, title):
    """Plot the eight measured variables against ``start_time`` as stacked panels.

    Args:
        df: DataFrame with a ``start_time`` column and the measurement columns.
        title: Figure title.
    """
    columns_to_plot = ['NO', 'NO2', 'SO2', 'Temp', 'Humidity', 'Wind_Speed', 'Wind_Dir', 'PM2.5']
    fig, axes = plt.subplots(nrows=len(columns_to_plot), ncols=1, figsize=(12, 10), sharex=True)

    # One panel per variable, all sharing the time axis
    for panel, name in zip(axes, columns_to_plot):
        panel.plot(df['start_time'], df[name], label=name)
        panel.set_ylabel(name, fontsize=12)
        panel.legend(loc='upper right')

    # Only the bottom panel carries the x-axis label
    axes[-1].set_xlabel('Time', fontsize=12)
    fig.suptitle(title, fontsize=16, fontweight='bold')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Handle Wind_Dir using circular statistics
def impute_wind_direction(df):
    """Fill gaps in ``df['Wind_Dir']`` by interpolating on the unit circle.

    The angle (degrees) is split into sine and cosine components, each
    component is linearly interpolated in both directions, and the angle is
    rebuilt with ``arctan2``. Interpolating the components rather than the raw
    angle means a gap between 350 and 10 degrees is filled near 0, not near 180.

    Args:
        df: DataFrame with a ``Wind_Dir`` column in degrees. Only that column
            is rewritten; no helper columns are added to or removed from ``df``.

    Returns:
        The same DataFrame object, with ``Wind_Dir`` in the range [0, 360).
    """
    radians = np.deg2rad(df['Wind_Dir'])

    # Interpolate the sine and cosine components as local series
    sin_part = np.sin(radians).interpolate(method='linear', limit_direction='both')
    cos_part = np.cos(radians).interpolate(method='linear', limit_direction='both')

    # Convert back to degrees and wrap negative angles into [0, 360)
    df['Wind_Dir'] = np.rad2deg(np.arctan2(sin_part, cos_part)) % 360
    return df

# Show initial time series plot
show_time_series_plot(df, 'Time Series Data of Air Quality and Meteorological (Original)')

# Linear interpolation to repair the missing values.
# Wind_Dir is left out of this list because it is circular; it is handled
# separately by impute_wind_direction.
columns_to_impute = ['Temp', 'Humidity', 'Wind_Speed', 'NO', 'NO2', 'SO2', 'PM2.5']

# Work on a copy so `df` keeps its NaNs for the second method
df_interpolated = df.copy()
# limit_direction='both' also fills gaps at the very start and end of the series
df_interpolated[columns_to_impute] = df_interpolated[columns_to_impute].interpolate(method='linear', limit_direction='both')
df_interpolated = impute_wind_direction(df_interpolated)

# Show time series plot after linear interpolation
show_time_series_plot(df_interpolated, 'Time Series Data of Air Quality and Meteorological (Linear Interpolation)')

# KNN to repair the missing values
# (the imputer is fitted on the raw, unscaled columns at this point of the notebook)
imputer = KNNImputer(n_neighbors=5)
df_knn = df.copy()
df_knn[columns_to_impute] = imputer.fit_transform(df_knn[columns_to_impute])
# Wind direction still uses the circular interpolation, not KNN
df_knn = impute_wind_direction(df_knn)

# Show time series plot after KNN interpolation
show_time_series_plot(df_knn, 'Time Series Data of Air Quality and Meteorological (KNN Interpolation)')

# Calculate errors
def calculate_errors(original, imputed):
    """Return ``(mse, mae)`` between a reference series and an imputed series."""
    return (
        mean_squared_error(original, imputed),
        mean_absolute_error(original, imputed),
    )

# Reference frame for the comparison: gaps filled by backfill, then forward fill.
# NOTE: the reference is itself an imputation, so the figures below measure how
# far each method departs from bfill/ffill, not the error against true values.
# Wind_Dir is compared as plain degrees, without wrap-around.
# `.bfill().ffill()` replaces the deprecated `fillna(method=...)` spelling.
df_original = df.bfill().ffill()
errors = []

for col in columns_to_impute + ['Wind_Dir']:
    mse_linear, mae_linear = calculate_errors(df_original[col], df_interpolated[col])
    mse_knn, mae_knn = calculate_errors(df_original[col], df_knn[col])
    errors.append([col, mse_linear, mae_linear, mse_knn, mae_knn])

# Create a DataFrame for error comparison
errors_df = pd.DataFrame(errors, columns=['Column', 'MSE_Linear', 'MAE_Linear', 'MSE_KNN', 'MAE_KNN'])

# Print the errors DataFrame
print(errors_df)

# Visualize the comparison
errors_df.set_index('Column').plot(kind='bar', figsize=(12, 6))
plt.title('Error Comparison of Imputation Methods')
plt.ylabel('Error')
plt.xlabel('Columns')
plt.xticks(rotation=45)
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()


### After Data Imputation

In [None]:
# Choose which imputed dataset feeds the rest of the pipeline.
# 'linear' is the choice stated in this notebook; set to 'knn' to reproduce
# the KNN run instead.
IMPUTATION_METHOD = 'linear'
imputed_frames = {'linear': df_interpolated, 'knn': df_knn}

# Copy, so the feature-engineering cells below do not write new columns
# into the frames used for the imputation comparison.
df = imputed_frames[IMPUTATION_METHOD].copy()

analyze_outliers(df)

## **1.3 Extract New Features.**


In [None]:
# Creating lag1 (1 hour before) and lag2 (2 hours before) for PM2.5.
# The shifted timestamps are kept alongside so each lag can be checked
# against the real time difference below.
df['lag1'] = df['PM2.5'].shift(1)
df['lag2'] = df['PM2.5'].shift(2)
df['lag1_time'] = df['start_time'].shift(1)
df['lag2_time'] = df['start_time'].shift(2)

# Setting the target to be the PM2.5 value next 1 hours into the future
# (currently disabled: the target stays the PM2.5 value of the same row)
#df['PM2.5'] = df['PM2.5'].shift(-1)
#df['target_time'] = df['start_time'].shift(-1)

# Ensuring the time shifts are accurate: a lag is kept only when the shifted
# row is exactly 1 (or 2) hours earlier; otherwise it is set to NaN.
df['lag1'] = np.where((df['start_time'] - pd.Timedelta(hours=1)) != df['lag1_time'], np.nan, df['lag1'])
df['lag2'] = np.where((df['start_time'] - pd.Timedelta(hours=2)) != df['lag2_time'], np.nan, df['lag2'])
#df['PM2.5'] = np.where((df['start_time'] + pd.Timedelta(hours=1)) != df['target_time'], np.nan, df['PM2.5'])

# Dropping the temporary time columns used for validation
df = df.drop(columns=['lag1_time', 'lag2_time']) #, 'target_time'

# Create Month and Day_Of_Week columns
df['Month'] = df['start_time'].dt.month
df['Day_Of_Week'] = df['start_time'].dt.dayofweek

# Drop rows where lag1 or lag2 are NaN (PM2.5 itself is not part of the subset)
df = df.dropna(subset=['lag1', 'lag2']) #'PM2.5',

# Verify the updated DataFrame
print(df.info())
print(df.head())


## **1.4 Converting Categorical Data Types.**


In [None]:
# 16-point compass rose, clockwise from north; each sector spans 22.5 degrees
_COMPASS_POINTS = ("N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE",
                   "S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW")


# Define a function to categorize wind direction
def categorize_wind_direction(num):
    """Map a bearing in degrees to the nearest of the 16 compass points.

    Args:
        num: Wind direction in degrees. Values outside [0, 360), including
            negative bearings, are wrapped onto the circle first.

    Returns:
        The compass label, e.g. ``"N"`` for 0 and ``"E"`` for 90.

    Raises:
        ValueError: If ``num`` is NaN (it cannot be converted to an integer).
    """
    # Wrap first so the value is non-negative; int() truncates toward zero
    # and would otherwise pick the wrong sector for negative bearings.
    sector = int((num % 360) / 22.5 + .5)
    return _COMPASS_POINTS[sector % 16]

# Encode on a copy so `df` keeps the raw Wind_Dir / Month / Day_Of_Week columns
df2 = df.copy()


# Apply the function to create a categorical wind direction column in the copy
df2['Wind_Type'] = df2['Wind_Dir'].apply(categorize_wind_direction)

# Apply one-hot encoding for Wind_Type, Month, and Day_Of_Week with custom prefixes.
# get_dummies replaces these three columns with their indicator columns.
df2 = pd.get_dummies(df2, columns=['Wind_Type', 'Month', 'Day_Of_Week'],
                     prefix=['Wind_Type', 'Month', 'Day'])

# Convert only the dummy columns to float64 (selected by their name prefix)
dummy_columns = [col for col in df2.columns if 'Wind_Type_' in col or 'Month_' in col or 'Day_' in col]
df2[dummy_columns] = df2[dummy_columns].astype('float64')

# Drop the numeric Wind_Dir column; it is now represented by the Wind_Type_* indicators.
# (Month and Day_Of_Week were already removed by get_dummies above.)
df2 = df2.drop(columns=['Wind_Dir'])

# Rename the columns to remove '.0' (literal match, not a regex)
df2.columns = df2.columns.str.replace('.0', '', regex=False)
# Print the information of the modified copy to confirm
df2.info()
df2.head()




In [None]:
# Choice made: use the linear interpolation result.
# NOTE: the result of the next expression is not displayed, because it is not
# the last expression of the cell.
df2.isnull().sum()

df2.info()
df2.head()

## **1.5 Final Dataset.**


In [None]:
# Final modelling frame: a new object indexed by start_time, without end_time
df = df2.set_index('start_time').drop(columns=['end_time'])

# Display the updated DataFrame
print(df.head())
df.info()

show_time_series_hist_plot(df,columns_to_plot = ["PM2.5", "SO2", "NO", "NO2", "Temp", "Humidity",  "Wind_Speed"])


In [None]:

#df_without_time = df.drop(columns=['start_time','end_time'])
# NOTE(review): `df_without_time` is not refreshed here, so this shows the frame
# an earlier cell left behind (built before imputation and feature extraction),
# not the final dataset - confirm whether `df.describe().T` was intended.
df_without_time.describe().T

# **Step 2. Feature Selection**




## **2.1 Correlation Analysis.**

In [None]:

# Assuming df is the DataFrame after final preprocessing

def _rank_by_abs_correlation(corr_matrix, value_column, target='PM2.5'):
    """Return ``(correlations with target, table sorted by |correlation| descending)``."""
    target_corr = corr_matrix.loc[target].drop(target)
    ranked = pd.DataFrame({
        'Parameters': target_corr.index,
        value_column: target_corr.values,
    })
    # Sort on a temporary absolute-value column so the signed values are kept
    ranked['_abs'] = ranked[value_column].abs()
    ranked = ranked.sort_values(by='_abs', ascending=False).drop(columns='_abs')
    return target_corr, ranked


def _report_correlations(ranked, value_column, label):
    """Print the ranked table, draw its bar plot, and return the table text."""
    table = tabulate(ranked, headers='keys', tablefmt='plain', showindex=False)
    print(f"{label} Correlation Table:")
    print(table)

    plt.figure(figsize=(10, 8))
    sns.barplot(x=value_column, y='Parameters', data=ranked, palette='coolwarm')
    plt.title(f'{label} Correlation with PM2.5')
    plt.show()
    return table


def _plot_correlation_heatmap(corr_matrix, columns, title):
    """Draw an annotated heatmap of ``corr_matrix`` restricted to ``columns``."""
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix.loc[columns, columns], annot=True, cmap='coolwarm', fmt='.2f', annot_kws={"size": 10}, cbar_kws={'shrink': .5})
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.title(title, fontsize=20)
    plt.show()


# Min-max scale a copy of the data (time columns excluded if present)
scaler = MinMaxScaler()
df_scaled = df.copy()
columns_to_scale = df_scaled.columns.difference(['start_time', 'end_time'])
df_scaled[columns_to_scale] = scaler.fit_transform(df_scaled[columns_to_scale])

# Spearman correlations with PM2.5: table and bar plot
spearman_corr = df_scaled.corr(method='spearman')
spearman_corr_pm25, spearman_corr_df = _rank_by_abs_correlation(spearman_corr, 'Spearman_Correlation')
spearman_table = _report_correlations(spearman_corr_df, 'Spearman_Correlation', 'Spearman')

# Pearson correlations with PM2.5: table and bar plot
pearson_corr = df_scaled.corr(method='pearson')
pearson_corr_pm25, pearson_corr_df = _rank_by_abs_correlation(pearson_corr, 'Pearson_Correlation')
pearson_table = _report_correlations(pearson_corr_df, 'Pearson_Correlation', 'Pearson')

# Select top 8 parameters based on absolute Spearman correlation with PM2.5
top_8_columns = spearman_corr_df.head(8)['Parameters'].tolist()

# Heatmaps of both correlation matrices, restricted to the top 8 features plus the target
heatmap_columns = top_8_columns + ['PM2.5']
_plot_correlation_heatmap(spearman_corr, heatmap_columns, 'Spearman Correlation Matrix (Top 8 Features)')
_plot_correlation_heatmap(pearson_corr, heatmap_columns, 'Pearson Correlation Matrix (Top 8 Features)')

# Provide summary statistics of PM2.5 and the chosen predictors
summary_stats = df_scaled[['PM2.5'] + top_8_columns].describe()
print(summary_stats)

# Scatter plots on a random sample of up to 100 data points.
# The sample size is capped at the number of rows so that short datasets
# do not make `sample` raise.
df_sampled = df_scaled.sample(n=min(100, len(df_scaled)), random_state=42)
fig, axs = plt.subplots(4, 2, figsize=(20, 20))
flat_axes = axs.flatten()

for ax, feature in zip(flat_axes, top_8_columns):
    sns.scatterplot(x=df_sampled[feature], y=df_sampled['PM2.5'], ax=ax)
    ax.set_title(f'Scatter plot between PM2.5 and {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('PM2.5 (µg/m³)')

# Remove empty subplots
for ax in flat_axes[len(top_8_columns):]:
    fig.delaxes(ax)

# Adjust the layout
plt.tight_layout()
plt.show()

# Multiple Linear Regression Analysis using scaled data.
# statsmodels is imported here because no other visible cell of the notebook
# binds the name `sm` that this cell relies on.
import statsmodels.api as sm

X = df_scaled[top_8_columns]
y = df_scaled['PM2.5']
X = sm.add_constant(X)  # Adds a constant term to the predictors

# Ordinary least squares fit; predictions are the in-sample fitted values
model = sm.OLS(y, X).fit()
predictions = model.predict(X)

print('Regression Analysis for PM2.5 with Multiple Features')
print(model.summary())

# Residual plot
plt.figure(figsize=(10, 6))
sns.residplot(x=predictions, y=model.resid, lowess=True, color='g')
plt.title('Residual Plot for PM2.5 with Multiple Features')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()

# QQ plot
# NOTE(review): the residuals are not standardised, so the 45-degree reference
# line may sit far from the points - confirm whether line='s' was intended.
sm.qqplot(model.resid, line='45')
plt.title('QQ Plot for PM2.5 with Multiple Features')
plt.show()


In [None]:
# Summary statistics of the final dataset, one row per column
df.describe().T


In [None]:

# Predictors used by the scaling and modelling cells below.
# The list is fixed by hand; the line that derived it from a correlation
# table is left commented out.
#selected_features = corr_df['Parameters'][:5].tolist()
selected_features = ['lag1', 'lag2', 'NO', 'NO2', 'SO2']
print("Selected Features:", selected_features)

In [None]:
# Assuming df is the preprocessed DataFrame and start_time is the index

# Step 1: PM2.5 concentration over time (line plot)
plt.figure(figsize=(15, 7))
plt.plot(df.index, df['PM2.5'], color='blue')
plt.title('PM2.5 Concentration Over Time')
plt.xlabel('Time')
plt.ylabel('PM2.5 Concentration (µg/m³)')
plt.show()

# Step 2: distribution of PM2.5 per calendar year (box plot).
# The year is taken from the datetime index and stored as a new column of df.
df['Year'] = df.index.year

plt.figure(figsize=(15, 7))
sns.boxplot(x='Year', y='PM2.5', data=df)
plt.title('Yearly PM2.5 Concentration Distribution')
plt.xlabel('Year')
plt.ylabel('PM2.5 Concentration (µg/m³)')
plt.show()

# Step 3: descriptive statistics of the target
pm25_summary_stats = df['PM2.5'].describe()
print("PM2.5 Concentration Summary Statistics:")
print(pm25_summary_stats)

# Step 4: descriptive statistics of the selected predictors
predictor_summary_stats = df[selected_features].describe()
print("Top Predictors Summary Statistics:")
print(predictor_summary_stats)

# Same statistics as a table: one row per predictor, relabelled columns,
# restricted to the statistics that are reported
summary_table = (
    predictor_summary_stats
    .transpose()
    .rename(columns={'mean': 'Mean', '50%': 'Median', 'std': 'Std', 'min': 'Min', 'max': 'Max'})
    [['Mean', 'Median', 'Std', 'Min', 'Max', '25%', '75%']]
)

# Print the table
print(tabulate(summary_table, headers='keys', tablefmt='plain'))


# **Step 3. Experimental Methods**



    * Normalization Data.

    * Data segment

Use 70% of the data for training and the rest for testing the MLP and LSTM models. Use a Workflow diagram to illustrate the process of predicting PM concentrations using the MLP and LSTM models.

In [None]:

# Columns fed to the scaler: the predictors first, the target last
scaled_columns = selected_features + ['PM2.5']

# Chronological 70/30 split point (rows are kept in time order, no shuffling)
train_size = int(len(df) * 0.7)

# Initialize MinMaxScaler and fit it on the training rows only, so that the
# minimum and maximum of the test period never leak into training.
# Test values can therefore fall slightly outside [0, 1].
my_scaler = MinMaxScaler(feature_range=(0, 1))
my_scaler.fit(df[scaled_columns].iloc[:train_size])

# Transform the whole dataset with the training-period scaling
scaled_df = my_scaler.transform(df[scaled_columns])
scaled_df = pd.DataFrame(scaled_df, columns=scaled_columns)

# Extract features and target values
X_scaler = scaled_df[selected_features].values
y_scaler = scaled_df['PM2.5'].values


# Split the data into training and testing sets based on time order
X_train, X_test = X_scaler[:train_size], X_scaler[train_size:]
y_train, y_test = y_scaler[:train_size], y_scaler[train_size:]


# Get the corresponding time indices for the train and test sets
time_indices = df.index
# Adjust time_test for sliding window: the first N_STEPS test timestamps are
# skipped, so time_test is N_STEPS entries shorter than y_test.
time_test = time_indices[train_size + N_STEPS:]


#===


# Display the normalized data
print(scaled_df.describe())

#print data shape
print("X_train = ", X_train.shape)
print("X_test = ", X_test.shape)
print("y_train = ", y_train.shape)
print("y_test = ", y_test.shape)


# **Step 4. Multilayer Perceptron (MLP)**

## 4.2 Learning Rate Analysis

In [None]:

# Define a function to evaluate the model
def evaluate_model(learning_rate):
    """Train a one-hidden-layer MLP (25 neurons) and score it on the test split.

    Args:
        learning_rate: Initial learning rate passed to ``MLPRegressor``.

    Returns:
        Tuple ``(mse, rmse, mae, r2)`` computed on ``X_test`` / ``y_test``.
    """
    regressor = MLPRegressor(
        hidden_layer_sizes=(25,),
        learning_rate_init=learning_rate,
        max_iter=1000,
        random_state=42,
    )
    regressor.fit(X_train, y_train)

    predictions = regressor.predict(X_test)
    mse_test = mean_squared_error(y_test, predictions)

    return (
        mse_test,
        np.sqrt(mse_test),
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Candidate learning rates, each scored on the test split
learning_rates = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0]
results = []

for lr in learning_rates:
    # One row per learning rate: (lr, mse, rmse, mae, r2)
    results.append((lr, *evaluate_model(lr)))

# Tabulate the sweep
results_df = pd.DataFrame(results, columns=['Learning Rate', 'MSE', 'RMSE', 'MAE', 'R²'])

# The best learning rate is the row with the lowest MSE
best_row_label = results_df['MSE'].idxmin()
best_lr = results_df.loc[best_row_label]
My_Learning_Rate = best_lr['Learning Rate']

print("The best learning rate is: ", best_lr['Learning Rate'])
print(f"Results for the best learning rate ({best_lr['Learning Rate']}):")
print(f"MSE: {best_lr['MSE']}")
print(f"RMSE: {best_lr['RMSE']}")
print(f"MAE: {best_lr['MAE']}")
print(f"R²: {best_lr['R²']}")

# Full sweep for reference
print(results_df)


The best learning rate for the MLPRegressor model, based on the tested values, is 0.001, which resulted in the lowest MSE of 3.171270. Additionally, this learning rate provided an RMSE of 1.780806, an MAE of 1.318530, and an R² score of 0.311761. This rate should be used for the highest performance on the testing dataset.

In [None]:

# The CALCU_HYPER_PARAMETERS guard below is commented out, so this plot is always drawn
#if CALCU_HYPER_PARAMETERS:
# Plot MSE, RMSE and MAE from results_df against the learning rate (R² is not plotted)
plt.figure(figsize=(12, 6))
plt.plot(results_df['Learning Rate'], results_df['MSE'], label='MSE')
plt.plot(results_df['Learning Rate'], results_df['RMSE'], label='RMSE')
plt.plot(results_df['Learning Rate'], results_df['MAE'], label='MAE')
plt.xlabel('Learning Rate')
plt.ylabel('Error')
plt.title('Error Metrics vs Learning Rate')
plt.legend()
plt.xscale('log')  # To better visualize the differences across learning rates
plt.show()

## 4.3 Neurons Distribution Analysis

In [None]:
# Total number of neurons shared between the two hidden layers
k = 25
# Collects one (layer 1, layer 2, MSE, RMSE, MAE, R²) tuple per split; rebinds the
# `results` name used by the learning-rate cell
results = []

# Try every split of k neurons into two non-empty layers: (k-1, 1), (k-2, 2), ..., (1, k-1)
for i in range(1, k):  # i ranges from 1 to k-1
    neurons_layer1 = k - i
    neurons_layer2 = i

    # Create the MLPRegressor model with the specified neurons in two hidden layers,
    # using the learning rate selected in the previous analysis (My_Learning_Rate)
    mlp = MLPRegressor(hidden_layer_sizes=(neurons_layer1, neurons_layer2), learning_rate_init=My_Learning_Rate, max_iter=1000, random_state=42)

    # Train the model
    mlp.fit(X_train, y_train)

    # Predict on the testing set
    y_pred = mlp.predict(X_test)

    # Calculate the error metrics on the testing set
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append((neurons_layer1, neurons_layer2, mse, rmse, mae, r2))

# Convert results to a DataFrame for better visualization
# (this overwrites the results_df produced by the learning-rate analysis)
results_df = pd.DataFrame(results, columns=['Neurons in Layer 1', 'Neurons in Layer 2', 'MSE', 'RMSE', 'MAE', 'R²'])

# Select the split with the lowest MSE
# NOTE(review): the row is returned as a Series mixing counts and metrics, so the
# neuron counts are probably printed as floats (e.g. 15.0) - confirm
best_config = results_df.loc[results_df['MSE'].idxmin()]
print("The best configuration is:")
print(f"Layer 1 neurons: {best_config['Neurons in Layer 1']}, Layer 2 neurons: {best_config['Neurons in Layer 2']}")
print(f"MSE: {best_config['MSE']}")
print(f"RMSE: {best_config['RMSE']}")
print(f"MAE: {best_config['MAE']}")
print(f"R²: {best_config['R²']}")

# Display the results
print(results_df)

# Plot MSE, RMSE and MAE against the number of neurons in the first layer
# (the second layer always holds the remaining k - layer1 neurons)
plt.figure(figsize=(12, 6))
plt.plot(results_df['Neurons in Layer 1'], results_df['MSE'], label='MSE')
plt.plot(results_df['Neurons in Layer 1'], results_df['RMSE'], label='RMSE')
plt.plot(results_df['Neurons in Layer 1'], results_df['MAE'], label='MAE')
plt.xlabel('Neurons in Layer 1')
plt.ylabel('Error')
plt.title('Error Metrics vs Neurons in Layer 1')
plt.legend()
plt.show()

# **Step 5. Long Short-Term Memory (LSTM)**

## 5.2 Best Cost Function Analysis

In [None]:


def create_lstm_datasets(x_scaler, y_scaler, n_steps=N_STEPS):
    """Build sliding-window sequences and split them chronologically 70/30.

    Window ``i`` holds rows ``i .. i + n_steps - 1`` of ``x_scaler`` and is
    paired with the label ``y_scaler[i + n_steps]``, i.e. the value that
    directly follows the window.

    Args:
        x_scaler: Indexable feature data (sliced by row); despite the name it
            is used as data here, not as a scaler object.
        y_scaler: Indexable labels, aligned with ``x_scaler``.
        n_steps: Number of consecutive rows per window.

    Returns:
        Tuple ``(X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm,
        my_input_shape)`` where ``my_input_shape`` is taken from axes 1 and 2
        of the training windows.
    """
    window_count = len(x_scaler) - n_steps
    X_sequences = np.array(
        [x_scaler[start:start + n_steps] for start in range(window_count)]
    )
    y_sequences = np.array(
        [y_scaler[start + n_steps] for start in range(window_count)]
    )

    # Adding n_steps back gives the original row count, so the cut falls at the
    # same index as the MLP split (int(len(data) * 0.7)) - very important
    split_index = int((len(X_sequences) + n_steps) * 0.7)

    # Chronological split: earlier windows train, later windows test
    X_train_lstm = X_sequences[:split_index]
    X_test_lstm = X_sequences[split_index:]
    y_train_lstm = y_sequences[:split_index]
    y_test_lstm = y_sequences[split_index:]

    # Input shape for the first LSTM layer: axes 1 and 2 of the training windows
    my_input_shape = X_train_lstm.shape[1:3]

    return X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm, my_input_shape


# Build the default LSTM datasets from X_scaler / y_scaler using the default n_steps;
# these module-level arrays are used by the LSTM analysis cells below
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm, my_input_shape = create_lstm_datasets(X_scaler, y_scaler)





# Print the resulting shapes so they can be checked against time_test
print("time_test shape:", time_test.shape)
print("X_train_lstm shape:", X_train_lstm.shape)
print("X_test_lstm shape:", X_test_lstm.shape)
print("y_train_lstm shape:", y_train_lstm.shape)
print("y_test_lstm shape:", y_test_lstm.shape)
print("Input shape for LSTM:", my_input_shape)



# Custom callback to measure epoch time
class TimeHistory(Callback):
    """Callback that records the wall-clock duration of every epoch.

    ``self.times`` is reset in ``on_train_begin`` and receives one
    elapsed-seconds value (measured with ``time.time()``) per
    ``on_epoch_begin`` / ``on_epoch_end`` pair.
    """

    # `logs` defaults to None instead of a shared mutable dict; it is not used here.
    def on_train_begin(self, logs=None):
        self.times = []

    # NOTE: the first positional argument keeps its original name `batch`;
    # Keras documents this hook as receiving the epoch index.
    def on_epoch_begin(self, batch, logs=None):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs=None):
        self.times.append(time.time() - self.epoch_time_start)


# Candidate loss functions, keyed by the short name used to refer to them
loss_functions = dict(
    MSE=MeanSquaredError(),
    MAE=MeanAbsoluteError(),
    MSLE=MeanSquaredLogarithmicError(),
    Huber=Huber(),
)


# Build the LSTM model


# Functions that build the LSTM model variants (none of them enables stateful mode)


def build_custom_lstm_model(input_shape_custom,
                            learning_rate_custom=0.01,
                            loss_function=MeanAbsoluteError(),
                            neurons_num=1):
    """Build and compile a single-LSTM-layer regression model.

    Layer order: LSTM -> Dropout(0.2) -> Dense(relu, L1 0.01) -> Dropout(0.2)
    -> Dense(1).

    Args:
        input_shape_custom: ``input_shape`` passed to the LSTM layer.
        learning_rate_custom: Learning rate of the Adam optimizer.
        loss_function: Loss passed to ``compile``. The default instance is
            created once, when the function is defined.
        neurons_num: Units of the LSTM layer and of the hidden Dense layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        # return_sequences=False: only the last time step's output is passed on
        LSTM(neurons_num, return_sequences=False, input_shape=input_shape_custom),
        Dropout(0.2),
        Dense(neurons_num, activation='relu', kernel_regularizer=l1(0.01)),  # L1 regularization
        Dropout(0.2),
        Dense(1),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(learning_rate=learning_rate_custom), loss=loss_function)
    return network



def build_custom_lstm_model1(input_shape_custom,
                             learning_rate_custom=0.01,
                             loss_function=Huber(),
                             neurons_num=1):
    """Build and compile a two-LSTM-layer regression model.

    Layer order: LSTM (returns sequences) -> LSTM -> Dense(relu, L1 0.01)
    -> Dropout(0.1) -> Dense(relu, L2 0.01) -> Dense(1, linear).

    Args:
        input_shape_custom: ``input_shape`` passed to the first LSTM layer.
        learning_rate_custom: Learning rate of the Adam optimizer.
        loss_function: Loss passed to ``compile``. The default instance is
            created once, when the function is defined.
        neurons_num: Units of every LSTM and hidden Dense layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        LSTM(neurons_num, return_sequences=True, input_shape=input_shape_custom),
        LSTM(neurons_num),
        Dense(neurons_num, activation='relu', kernel_regularizer=l1(0.01)),  # L1 regularization
        Dropout(0.1),
        Dense(neurons_num, activation='relu', kernel_regularizer=l2(0.01)),  # L2 regularization
        Dense(1, activation='linear'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(learning_rate=learning_rate_custom), loss=loss_function)
    return network

def build_custom_lstm_model2(input_shape_custom,
                             learning_rate_custom=0.01,
                             loss_function=MeanSquaredError(),
                             neurons_num=1):
    """Build and compile a three-LSTM-layer regression model.

    Each LSTM layer is followed by Dropout(0.2); the first two LSTM layers
    return sequences, and a final Dense(1) produces the output.

    Args:
        input_shape_custom: ``input_shape`` passed to the first LSTM layer.
        learning_rate_custom: Learning rate of the Adam optimizer.
        loss_function: Loss passed to ``compile``. The default instance is
            created once, when the function is defined.
        neurons_num: Units of every LSTM layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        LSTM(units=neurons_num, return_sequences=True, input_shape=input_shape_custom),
        Dropout(0.2),
        LSTM(units=neurons_num, return_sequences=True),
        Dropout(0.2),
        LSTM(units=neurons_num),
        Dropout(0.2),
        Dense(1),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(learning_rate=learning_rate_custom), loss=loss_function)
    return network

def build_custom_lstm_model3(input_shape):
    """Build and compile a fixed-size (64-unit) two-LSTM-layer model.

    Layer order: LSTM(64, returns sequences) -> LSTM(64) -> Dense(64, relu,
    L1 0.01) -> Dropout(0.1) -> Dense(64, relu, L2 0.01) -> Dense(8, linear).
    Unlike the other builders, the output layer has 8 units. The model is
    compiled with the 'adam' optimizer and 'mse' loss.

    Args:
        input_shape: ``input_shape`` passed to the first LSTM layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        LSTM(64, return_sequences=True, input_shape=input_shape),
        LSTM(64),
        Dense(64, activation='relu', kernel_regularizer=l1(0.01)),  # L1 regularization
        Dropout(0.1),
        Dense(64, activation='relu', kernel_regularizer=l2(0.01)),  # L2 regularization
        Dense(8, activation='linear'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='mse')
    return network

#  select_continuous_sub_data for lstm
def select_continuous_sub_data(X_train_lstm, y_train_lstm, X_test_lstm, y_test_lstm,
                               train_length=240, test_length=12):
    """Pick one random contiguous window from the train data and one from the test data.

    The start offsets are drawn with ``np.random.randint`` (train first, then
    test), so results follow NumPy's global random state.

    Args:
        X_train_lstm: Training inputs (sliced along the first axis).
        y_train_lstm: Training labels, aligned with ``X_train_lstm``.
        X_test_lstm: Test inputs (sliced along the first axis).
        y_test_lstm: Test labels, aligned with ``X_test_lstm``.
        train_length: Number of consecutive training samples to return.
        test_length: Number of consecutive test samples to return.

    Returns:
        Tuple ``(X_train_segment, y_train_segment, X_test_segment,
        y_test_segment)``.

    Raises:
        ValueError: If a split holds fewer samples than the requested length.
    """
    def _random_start(available, length, split_name):
        # Draw a start offset such that [start, start + length) fits in the data.
        if length > available:
            raise ValueError(
                f"{split_name} data has {available} samples, "
                f"cannot select {length} consecutive ones"
            )
        # randint's upper bound is exclusive: the +1 makes the last full window
        # selectable and keeps the call valid when available == length.
        return np.random.randint(0, available - length + 1)

    start_train = _random_start(len(X_train_lstm), train_length, "train")
    start_test = _random_start(len(X_test_lstm), test_length, "test")

    X_train_segment = X_train_lstm[start_train:start_train + train_length]
    y_train_segment = y_train_lstm[start_train:start_train + train_length]
    X_test_segment = X_test_lstm[start_test:start_test + test_length]
    y_test_segment = y_test_lstm[start_test:start_test + test_length]

    return X_train_segment, y_train_segment, X_test_segment, y_test_segment



# Inverse transform function
def inverse_transform_predictions(predictions, scaler, X_data):
    """Map scaled target values back through ``scaler.inverse_transform``.

    The values are placed as the last column of a matrix whose other columns
    are zeros (one per feature of ``X_data``); the matrix is passed to
    ``scaler.inverse_transform`` and only the last column is kept.
    NOTE(review): this presumes the scaler was fitted on the features followed
    by the target as the last column - confirm against where it is fitted.

    Args:
        predictions: Array of scaled values; any shape, restored on return.
        scaler: Object exposing ``inverse_transform``.
        X_data: Feature array; only its shape is read. The feature count is
            taken from axis 1 for 2-D input and from axis 2 otherwise.

    Returns:
        Array of inverse-transformed values with the shape of ``predictions``.
    """
    predictions_flat = predictions.reshape(-1, 1)
    # Accept 2-D feature matrices as well as 3-D windowed input
    n_features = X_data.shape[1] if X_data.ndim == 2 else X_data.shape[2]
    padded = np.concatenate([np.zeros((predictions_flat.shape[0], n_features)), predictions_flat], axis=1)
    predictions_actual_flat = scaler.inverse_transform(padded)[:, -1]
    return predictions_actual_flat.reshape(predictions.shape)

In [None]:
# Predict with LSTM
#model, lstm_predictions_actual, y_test_lstm_actual = train_lstm(X_train_lstm, y_train_lstm, X_test_lstm, y_test_lstm, best_epoch, best_batch_size, best_neuron_count)


## 5.3 Best Epoch Analysis

In [None]:
MODEL_RUN_COUNT = 10
# Train one epoch at a time and track the Huber loss after each epoch
def fit_and_evaluate_epochs(model, X_train_segment, y_train_segment, X_test_segment, y_test_segment, epochs, batch_size, scaler):
    """Fit ``model`` epoch by epoch, recording Huber loss and fit time per epoch.

    After every single-epoch ``fit`` call the model predicts on both segments;
    predictions and targets are passed through ``inverse_transform_predictions``
    before the Huber loss is computed.

    Args:
        model: Compiled model exposing ``fit`` and ``predict``.
        X_train_segment: Training inputs.
        y_train_segment: Training targets.
        X_test_segment: Test inputs.
        y_test_segment: Test targets.
        epochs: Number of single-epoch fit calls to make.
        batch_size: Batch size forwarded to ``model.fit``.
        scaler: Scaler forwarded to ``inverse_transform_predictions``.

    Returns:
        Tuple ``(train_huber, test_huber, epoch_times)`` of lists with one
        entry per epoch; ``epoch_times`` covers the ``fit`` call only.
    """
    huber = Huber()
    train_huber, test_huber, epoch_times = [], [], []

    def _huber_after_inverse_transform(features, targets):
        # Huber loss between inverse-transformed targets and predictions
        predicted = inverse_transform_predictions(model.predict(features, verbose=0), scaler, features)
        observed = inverse_transform_predictions(targets, scaler, features)
        return np.mean(huber(observed, predicted).numpy())

    for _ in range(epochs):
        fit_started = time.time()
        model.fit(X_train_segment, y_train_segment, epochs=1, batch_size=batch_size, verbose=0)
        epoch_times.append(time.time() - fit_started)

        train_huber.append(_huber_after_inverse_transform(X_train_segment, y_train_segment))
        test_huber.append(_huber_after_inverse_transform(X_test_segment, y_test_segment))

    return train_huber, test_huber, epoch_times

# Helpers and entry point for evaluating the model with different parameter configurations
def _plot_mean_huber_curves(all_train_huber, all_test_huber, epochs, param_name, param):
    """Plot the per-epoch mean Huber loss across runs with a +/- 1 std band."""
    epoch_axis = range(1, epochs + 1)
    mean_train_huber = np.mean(all_train_huber, axis=0)
    mean_test_huber = np.mean(all_test_huber, axis=0)
    std_train_huber = np.std(all_train_huber, axis=0)
    std_test_huber = np.std(all_test_huber, axis=0)

    plt.figure(figsize=(14, 7))
    plt.plot(epoch_axis, mean_train_huber, label='Mean Train Huber', linewidth=2, color='blue')
    plt.plot(epoch_axis, mean_test_huber, label='Mean Test Huber', linewidth=2, color='orange')
    plt.fill_between(epoch_axis, mean_train_huber - std_train_huber, mean_train_huber + std_train_huber, alpha=0.2, color='blue')
    plt.fill_between(epoch_axis, mean_test_huber - std_test_huber, mean_test_huber + std_test_huber, alpha=0.2, color='orange')

    plt.xlabel('Epoch')
    plt.ylabel('Huber Loss')
    plt.title(f'Mean Train and Test Huber Loss Over {epochs} Epochs for {param_name} = {param}')

    # At most 10 evenly spaced ticks, all within the plotted range 1..epochs and
    # without duplicates when there are fewer than 10 epochs
    x_ticks = np.unique(np.linspace(1, epochs, num=min(10, epochs), dtype=int))
    plt.xticks(ticks=x_ticks, labels=x_ticks, rotation=45)

    plt.legend()
    plt.show()


def _plot_param_boxplot(samples, param_arr, param_name, ylabel, title):
    """Draw one box per tested parameter value for the given samples."""
    plt.figure(figsize=(14, 7))
    plt.boxplot(samples, labels=[str(param) for param in param_arr])
    plt.xlabel(param_name)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()


def evaluate_model_with_parameters(param_arr, param_name, param_func, fixed_epochs=None, fixed_batch_size=None):
    """Sweep one hyper-parameter and report test Huber loss and training time.

    For every value in ``param_arr`` the model is rebuilt and trained
    ``MODEL_RUN_COUNT`` times, each time on a freshly drawn random segment of
    the module-level LSTM train/test arrays. A loss-curve figure is shown per
    value; a summary table and two box plots are produced at the end.

    How the swept value is interpreted:
      * ``fixed_epochs is None``: the value is the number of epochs; the batch
        size is ``fixed_batch_size`` (``None`` lets ``model.fit`` use its
        default). Previously the swept value was also used as the batch size,
        so batch size changed together with the epoch count.
      * ``fixed_epochs`` given, ``fixed_batch_size is None``: the value is the
        batch size.
      * both given: the value is only passed to ``param_func``.

    Args:
        param_arr: Values to test.
        param_name: Label used in printed output, table column and plots.
        param_func: Callable taking the tested value and returning a fresh
            compiled model.
        fixed_epochs: Number of epochs, or ``None`` to sweep it.
        fixed_batch_size: Batch size, or ``None`` (see above).

    Returns:
        None. Results are printed and plotted.
    """
    all_test_huber_summary = []
    all_test_huber_combined = []
    all_times_combined = []

    for param in param_arr:
        print(f"Testing with {param_name}: {param}")

        if fixed_epochs is None:
            # Epoch sweep: only the epoch count varies
            epochs = param
            batch_size = fixed_batch_size
        else:
            epochs = fixed_epochs
            batch_size = fixed_batch_size if fixed_batch_size is not None else param

        # One row per run, one column per epoch
        all_train_huber = np.zeros((MODEL_RUN_COUNT, epochs))
        all_test_huber = np.zeros((MODEL_RUN_COUNT, epochs))
        all_epoch_times = np.zeros((MODEL_RUN_COUNT, epochs))

        for run in range(MODEL_RUN_COUNT):
            model = param_func(param)

            # Randomly select continuous sub-data segments
            X_train_segment, y_train_segment, X_test_segment, y_test_segment = select_continuous_sub_data(X_train_lstm, y_train_lstm, X_test_lstm, y_test_lstm)

            # Fit model and calculate Huber loss for each epoch
            train_huber, test_huber, epoch_times = fit_and_evaluate_epochs(model, X_train_segment, y_train_segment, X_test_segment, y_test_segment, epochs, batch_size, my_scaler)
            all_train_huber[run] = train_huber
            all_test_huber[run] = test_huber
            all_epoch_times[run] = epoch_times

        # Total training time of each run
        run_times = np.sum(all_epoch_times, axis=1)

        all_test_huber_combined.append(all_test_huber.flatten())
        all_times_combined.append(run_times)

        # Summary statistics: loss figures describe the final epoch across runs,
        # time figures describe the total time per run
        all_test_huber_summary.append({
            param_name: param,
            'Test Huber Mean': np.mean(all_test_huber, axis=0)[-1],
            'Test Huber Std Dev': np.std(all_test_huber, axis=0)[-1],
            'Test Huber Min': np.min(all_test_huber, axis=0)[-1],
            'Test Huber Max': np.max(all_test_huber, axis=0)[-1],
            'Time Mean': np.mean(run_times),
            'Time Std Dev': np.std(run_times),
            'Time Min': np.min(run_times),
            'Time Max': np.max(run_times)
        })

        # Plot the Huber loss scores for each epoch
        _plot_mean_huber_curves(all_train_huber, all_test_huber, epochs, param_name, param)

    # Combine all summaries into a single DataFrame and display
    all_test_huber_summary_df = pd.DataFrame(all_test_huber_summary)
    print(f"Summary of Test Huber Loss and Time for all {param_name} configurations:\n", all_test_huber_summary_df)

    # Box plot of the test Huber loss (all runs and epochs) per tested value
    _plot_param_boxplot(
        all_test_huber_combined, param_arr, param_name,
        'Test Huber Loss',
        f'Test Huber Loss Distribution for Different {param_name} Configurations',
    )

    # Box plot of the total training time per run for each tested value
    _plot_param_boxplot(
        all_times_combined, param_arr, param_name,
        'Time (seconds)',
        f'Time Distribution for Different {param_name} Configurations',
    )

# Main logic: epoch sweep, executed only when CALCU_HYPER_PARAMETERS is truthy
if CALCU_HYPER_PARAMETERS:
    # Epoch counts to test
    epochs_arr = [20, 60,100,200,500]  # Example epochs to test
    # The lambda ignores its argument, so every run builds the model with the
    # builder's default settings; neither fixed_epochs nor fixed_batch_size is passed
    evaluate_model_with_parameters(epochs_arr, 'Epochs', lambda x: build_custom_lstm_model(my_input_shape))


## 5.4 Batch Size Analysis

In [None]:

# Main logic: batch-size sweep, executed only when CALCU_HYPER_PARAMETERS is truthy
if CALCU_HYPER_PARAMETERS:
    # Batch sizes to test; the number of epochs is fixed at 200
    batch_sizes = [4, 8, 16, 32, 64,128,256]  # Example batch sizes to test
    # The lambda ignores its argument, so the model architecture is the same for every batch size
    evaluate_model_with_parameters(batch_sizes, 'Batch Size', lambda x: build_custom_lstm_model(my_input_shape), fixed_epochs=200)


## 5.5 Number of Neurons in Hidden Layer Analysis

In [None]:

# Main logic: neuron-count sweep, executed only when CALCU_HYPER_PARAMETERS is truthy
if CALCU_HYPER_PARAMETERS:
    # Neuron counts to test; epochs and batch size are fixed at 200 and 256
    neuron_counts = [1, 2, 4, 8,16,32,64,128,256]  # Example neuron counts to test
    # Here the tested value is forwarded to the builder as neurons_num
    evaluate_model_with_parameters(neuron_counts, 'Neuron Count', lambda x: build_custom_lstm_model(input_shape_custom=my_input_shape, neurons_num=x), fixed_epochs=200, fixed_batch_size=256)


# **Step 6. Model Comparison**

In [None]:

# Define the best hyperparameters
# (hard-coded values; they are not read from the analysis cells above)

# LSTM Replace with the best hyperparameters found
best_epoch = 200
best_batch_size = 64
best_neuron_count = 8

# MLP Replace with the best hyperparameters found
# These rebind neurons_layer1 / neurons_layer2, which the 4.3 loop also assigns
neurons_layer1 = 15
neurons_layer2 = 10
# NOTE(review): the text in section 4.2 reports 0.001 as the best learning rate and
# My_Learning_Rate holds the computed value - confirm that 0.01 is intended here
learning_rate = 0.01

# Fit the two-hidden-layer MLP and return its inverse-transformed test output
def train_mlp(X_train, y_train, X_test, y_test, neurons_layer1, neurons_layer2, learning_rate):
    """Train an MLPRegressor and inverse-transform its test predictions.

    Predictions and test targets are passed through
    ``inverse_transform_predictions`` with the module-level ``my_scaler``.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.
        neurons_layer1: Size of the first hidden layer.
        neurons_layer2: Size of the second hidden layer.
        learning_rate: Value passed as ``learning_rate_init``.

    Returns:
        Tuple ``(mlp, mlp_predictions_actual, y_test_actual)``.
    """
    regressor = MLPRegressor(
        hidden_layer_sizes=(neurons_layer1, neurons_layer2),
        learning_rate_init=learning_rate,
        max_iter=1000,
        random_state=42,
    )
    regressor.fit(X_train, y_train)
    scaled_predictions = regressor.predict(X_test)
    return (
        regressor,
        inverse_transform_predictions(scaled_predictions, my_scaler, X_test),
        inverse_transform_predictions(y_test, my_scaler, X_test),
    )

# Create and train the LSTM model
def train_lstm(X_train, y_train, X_test, y_test, epochs, batch_size, neuron_count):
    """Train the LSTM model and inverse-transform its test predictions.

    The model is built with ``build_custom_lstm_model`` and trained with
    ``shuffle=False``. Predictions and test targets are passed through
    ``inverse_transform_predictions`` with the module-level ``my_scaler``.

    Args:
        X_train: Training windows; axes 1 and 2 define the model input shape.
        y_train: Training targets.
        X_test: Test windows.
        y_test: Test targets.
        epochs: Number of training epochs.
        batch_size: Batch size forwarded to ``model.fit``.
        neuron_count: Forwarded to the builder as ``neurons_num``.

    Returns:
        Tuple ``(model, lstm_predictions_actual, y_test_actual)``.
    """
    # Take the input shape from the data actually passed in rather than from the
    # global my_input_shape, so the function works for any windowed dataset
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = build_custom_lstm_model(input_shape_custom=input_shape, neurons_num=neuron_count)
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, shuffle=False)
    lstm_predictions = model.predict(X_test)
    print("Shape of X_train:", X_train.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of X_test:", X_test.shape)
    print("Shape of lstm_predictions:", lstm_predictions.shape)
    # This line prints the values, not the shape, so it is labelled accordingly
    print("lstm_predictions:", lstm_predictions)
    lstm_predictions_actual = inverse_transform_predictions(lstm_predictions, my_scaler, X_test)
    y_test_actual = inverse_transform_predictions(y_test, my_scaler, X_test)
    return model, lstm_predictions_actual, y_test_actual

# Compute the regression metrics used to compare the models
def calculate_metrics(actual, predicted):
    """Return ``(mse, rmse, mae, r2)`` for the given actual and predicted values.

    RMSE is the square root of the MSE; the other values come from
    ``mean_squared_error``, ``mean_absolute_error`` and ``r2_score``.
    """
    squared_error = mean_squared_error(actual, predicted)
    return (
        squared_error,
        np.sqrt(squared_error),
        mean_absolute_error(actual, predicted),
        r2_score(actual, predicted),
    )

# Draw the actual series and both models' predictions on one time axis
def plot_metrics(y_test_actual, mlp_predictions_actual, lstm_predictions_actual, time_test):
    """Plot actual PM2.5 together with the MLP and LSTM predictions.

    Args:
        y_test_actual: Actual values, plotted in blue.
        mlp_predictions_actual: MLP predictions, plotted in orange.
        lstm_predictions_actual: LSTM predictions, plotted in green.
        time_test: Shared x-axis values for all three series.
    """
    series_specs = (
        (y_test_actual, {'label': 'Actual PM2.5', 'color': 'blue'}),
        (mlp_predictions_actual, {'label': 'Predicted PM2.5 (MLP)', 'color': 'orange', 'alpha': 0.7}),
        (lstm_predictions_actual, {'label': 'Predicted PM2.5 (LSTM)', 'color': 'green', 'alpha': 0.7}),
    )

    plt.figure(figsize=(14, 7))
    for values, line_options in series_specs:
        plt.plot(time_test, values, **line_options)

    plt.title('PM2.5 Real and Prediction Values in MLP and LSTM')
    plt.xlabel('Time')
    plt.ylabel('PM2.5 (µg/m³)')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Draw grouped bars comparing the MLP and LSTM metrics
def plot_comparison_bar_chart(mlp_metrics, lstm_metrics):
    """Show RMSE, MAE and R² of both models as side-by-side bars.

    Args:
        mlp_metrics: Three values in the order RMSE, MAE, R² for the MLP.
        lstm_metrics: Three values in the same order for the LSTM.
    """
    metric_names = ['RMSE', 'MAE', 'R²']
    positions = np.arange(len(metric_names))  # one group per metric
    bar_width = 0.35

    figure, axes = plt.subplots(figsize=(10, 6))
    mlp_bars = axes.bar(positions - bar_width / 2, mlp_metrics, bar_width, label='MLP')
    lstm_bars = axes.bar(positions + bar_width / 2, lstm_metrics, bar_width, label='LSTM')

    axes.set_xlabel('Metrics')
    axes.set_ylabel('Scores')
    axes.set_title('Performance Comparison of MLP and LSTM')
    axes.set_xticks(positions)
    axes.set_xticklabels(metric_names)
    axes.legend()

    # Write each bar's value (two decimals) just above the bar's height
    for bar in [*mlp_bars, *lstm_bars]:
        value = bar.get_height()
        axes.annotate(
            f'{value:.2f}',
            xy=(bar.get_x() + bar.get_width() / 2, value),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

    figure.tight_layout()
    plt.show()

# Undo the scaling of target values (works with 2-D and 3-D feature arrays)
def inverse_transform_predictions(predictions, scaler, X_data):
    """Pass scaled target values back through ``scaler.inverse_transform``.

    The values become the last column of a matrix whose leading columns are
    zeros (one per feature of ``X_data``); after the inverse transform only
    that last column is kept.

    Args:
        predictions: Array of scaled values; its shape is restored on return.
        scaler: Object exposing ``inverse_transform``.
        X_data: Feature array; only its shape is read. The feature count is
            axis 1 when ``X_data`` is 2-D, otherwise axis 2.

    Returns:
        Array of inverse-transformed values shaped like ``predictions``.
    """
    value_column = predictions.reshape(-1, 1)
    if X_data.ndim == 2:
        feature_count = X_data.shape[1]
    else:
        feature_count = X_data.shape[2]

    zero_features = np.zeros((value_column.shape[0], feature_count))
    padded = np.concatenate([zero_features, value_column], axis=1)
    restored_column = scaler.inverse_transform(padded)[:, -1]
    return restored_column.reshape(predictions.shape)

# Chronological 70/30 split of the scaled data for the MLP
train_size = int(len(X_scaler) * 0.7)
X_train, X_test = X_scaler[:train_size], X_scaler[train_size:]
y_train, y_test = y_scaler[:train_size], y_scaler[train_size:]

# Index labels of df starting N_STEPS rows after the split point; used as the
# x-axis of the comparison plot
time_test = df.index[train_size + N_STEPS:]

# Train both models with the hyper-parameters defined above; each call returns the
# fitted model plus inverse-transformed predictions and targets
mlp, mlp_predictions_actual, y_test_actual = train_mlp(X_train, y_train, X_test, y_test, neurons_layer1, neurons_layer2, learning_rate)
model, lstm_predictions_actual, y_test_lstm_actual = train_lstm(X_train_lstm, y_train_lstm, X_test_lstm, y_test_lstm, best_epoch, best_batch_size, best_neuron_count)


In [None]:


# Flatten the LSTM predictions to one dimension
lstm_predictions_actual = lstm_predictions_actual.reshape(-1)

# Print the shapes of targets and predictions so they can be compared
print('Shape of y_test_actual:', y_test_actual.shape)
print('Shape of mlp_predictions_actual:', mlp_predictions_actual.shape)
print('Shape of y_test_lstm_actual:', y_test_lstm_actual.shape)
print('Shape of lstm_predictions_actual:', lstm_predictions_actual.shape)



In [None]:
# Bare expression: the notebook displays the flattened LSTM predictions
lstm_predictions_actual

In [None]:

# Calculate metrics
# The MLP test split starts N_STEPS samples earlier than the LSTM test targets (each
# LSTM target follows a window of N_STEPS rows). Dropping the first N_STEPS MLP
# samples scores both models on the same samples, matching the alignment used for
# the plot below.
mlp_actual_aligned = y_test_actual[N_STEPS:]
mlp_predictions_aligned = mlp_predictions_actual[N_STEPS:]

mlp_mse, mlp_rmse, mlp_mae, mlp_r2 = calculate_metrics(mlp_actual_aligned, mlp_predictions_aligned)
lstm_mse, lstm_rmse, lstm_mae, lstm_r2 = calculate_metrics(y_test_lstm_actual, lstm_predictions_actual)

# Plot actual and predicted PM2.5 values
plot_metrics(mlp_actual_aligned, mlp_predictions_aligned, lstm_predictions_actual, time_test)

# Plot performance metrics comparison
plot_comparison_bar_chart([mlp_rmse, mlp_mae, mlp_r2], [lstm_rmse, lstm_mae, lstm_r2])

# Print performance metrics for MLP
print("MLP Performance:")
print(f"RMSE: {mlp_rmse}")
print(f"MAE: {mlp_mae}")
print(f"R²: {mlp_r2}")

# Print performance metrics for LSTM
print("\nLSTM Performance:")
print(f"RMSE: {lstm_rmse}")
print(f"MAE: {lstm_mae}")
print(f"R²: {lstm_r2}")

# Print model summaries
print("\nMLP Model Summary:")
print(f"Hidden Layers: {mlp.hidden_layer_sizes}")
print(f"Number of iterations: {mlp.n_iter_}")
print(f"Learning rate: {mlp.learning_rate_init}")

print("\nLSTM Model Summary:")
model.summary()