# Import necessary modules

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


# Load the dataset and split it into training and test sets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset from the mounted Google Drive
df = pd.read_csv("/content/drive/MyDrive/processed_data1.csv")

# Columns holding categorical labels that are replaced by integer codes
categorical_columns = ['Region', 'Day_period', 'Season', 'Weekday_or_weekend', 'Regular_day_or_holiday']

# Fit one LabelEncoder per column and keep every fitted encoder.
# Reusing a single encoder would overwrite its classes_ on each fit, losing the
# label <-> code mapping of all but the last column; with this dict the codes
# of any column can still be decoded, e.g.
# label_encoders['Region'].inverse_transform(codes).
# After the loop `label_encoder` refers to the encoder of the last column,
# which is what the single shared encoder used to hold.
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# Define features and target including encoded columns
features = df[['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene',
               'Region', 'Day_period', 'Month_encoded', 'Season', 'Weekday_or_weekend', 'Regular_day_or_holiday']]
target = df['AQI']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Print the shapes of the splits to ensure consistency
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

Mounted at /content/drive
X_train shape: (2517657, 17)
X_test shape: (1078996, 17)
y_train shape: (2517657,)
y_test shape: (1078996,)


# Initialize the model

In [None]:
# Random forest configuration: 100 trees, fixed seed for reproducible results
forest_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**forest_settings)

# Fit the forest on the training split
rf_model.fit(X_train, y_train)


# Predict and evaluate using various metrics

In [None]:
from math import sqrt
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)

# Predictions of the trained forest for the held-out test features
y_pred = rf_model.predict(X_test)

# Classification-only metrics do not apply to a regressor; kept as None placeholders
recall = None
accuracy = None

# Regression metrics computed on the test split
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)  # RMSE is the square root of the mean squared error
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report every metric as "<label> <value>", one per line
report = [
    ("R² (Coefficient of Determination):", r2),
    ("Recall (Sensitivity):", recall),
    ("Accuracy:", accuracy),
    ("MAPE (Mean Absolute Percentage Error):", mape),
    ("MAE (Mean Absolute Error):", mae),
    ("RMSE (Root Mean Square Error):", rmse),
]
for label, value in report:
    print(label, value)

# Plot graph

In [None]:
import matplotlib.pyplot as plt

# Metrics to visualise, keyed by display name (insertion order is the plotting order)
metrics = dict(zip(['R²', 'MAPE', 'MAE', 'RMSE'], [r2, mape, mae, rmse]))

# Horizontal bar chart with one bar per metric
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(list(metrics), list(metrics.values()), color='skyblue')
ax.set_xlabel('Metric Value')
ax.set_title('Model Performance Metrics')
ax.invert_yaxis()  # first metric at the top, last at the bottom
plt.show()

# Textual summary of the same values
for name in metrics:
    print(f"The model's {name} is {metrics[name]}.")


# Model Improvement: Hyperparameter Tuning using Grid Search
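Grid search with cross-validation (`GridSearchCV`) is used to find hyperparameters that give better predictions. A random forest is normally trained with one fixed set of hyperparameters; in hyperparameter tuning we instead supply several candidate values for each parameter in a parameter grid and evaluate the model on every combination. With 3 × 4 × 3 × 3 candidate values across the four parameters, 108 different random forest configurations are evaluated. We can then pick the best-performing combination and use it to obtain better predictions.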

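When calling `GridSearchCV`, we pass the estimator and the parameter grid. `cv` is the number of cross-validation folds, i.e. how many times each parameter combination is trained and validated on a different split of the training data. With `cv=5`, each of the 108 configurations is trained 5 times (540 fits in total). `n_jobs=-1` runs the fits in parallel on all available CPU cores so that the search finishes faster.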

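`RandomizedSearchCV` is faster because it evaluates only a randomly chosen subset (e.g. 10, 20 or 30) of the 108 possible configurations instead of all of them; the number of sampled configurations is set with `n_iter`. However, it is not guaranteed to find the best combination.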

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Search space: explicit candidate lists for the forest size and depth,
# integer distributions for the split/leaf thresholds
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=randint(2, 11),  # random integers 2..10 (upper bound is exclusive)
    min_samples_leaf=randint(1, 5),    # random integers 1..4 (upper bound is exclusive)
)

# Sample 100 configurations, score each with 5-fold cross-validation on
# negative MSE, and run the fits in parallel on all available cores
random_search = RandomizedSearchCV(
    rf_model,
    param_dist,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the winning estimator and its hyperparameters
best_rf_model = random_search.best_estimator_
best_params = random_search.best_params_
