In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import joblib  # To save the model

In [3]:
# Read the raw records from disk.
data = pd.read_csv("your_data_with_aqi.csv")  # Update this path to your actual CSV file

# Separate the pollutant columns (model inputs) from the AQI column (target).
pollutant_columns = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3']
X = data[pollutant_columns]
y = data['AQI']

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [6]:


# Candidate regressors to compare, keyed by the label used when reporting.
# Insertion order matters: it is the order results are evaluated and printed.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Decision Tree", DecisionTreeRegressor(random_state=42)),
        ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    The estimator is fitted in place, so the caller's ``model`` object is
    trained once this function returns.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : feature data for the training and test splits.
    y_train, y_test : target values matching ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(train_mae, train_rmse, train_r2, test_mae, test_rmse, test_r2)``.
    """

    def _score(y_true, y_hat):
        # MAE, RMSE (square root of the MSE) and R² for one split.
        return (
            mean_absolute_error(y_true, y_hat),
            np.sqrt(mean_squared_error(y_true, y_hat)),
            r2_score(y_true, y_hat),
        )

    model.fit(X_train, y_train)

    # Predict on both splits first, then score them.
    fitted_values = model.predict(X_train)
    held_out_values = model.predict(X_test)

    return _score(y_train, fitted_values) + _score(y_test, held_out_values)

# Fit and score every candidate, collecting one metrics dict per model label.
split_metric_labels = (
    "Train MAE",
    "Train RMSE",
    "Train R²",
    "Test MAE",
    "Test RMSE",
    "Test R²",
)
results = {}

for name, model in models.items():
    # evaluate_model returns its six scores in the same order as the labels above.
    split_scores = evaluate_model(model, X_train, X_test, y_train, y_test)

    # R² for each of 5 cross-validation folds, computed on the training split only.
    fold_r2 = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

    summary = dict(zip(split_metric_labels, split_scores))
    summary["Validation R² (CV)"] = np.mean(fold_r2)
    results[name] = summary




In [7]:
# Display results for each model
print("Model Performance Metrics:")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

# Identify the best model from the cross-validated R², not the Test R².
# Choosing the winner on the test split would use that split for model
# selection, so the test metrics reported afterwards would no longer be an
# unbiased estimate of performance on unseen data.
selection_metric = "Validation R² (CV)"
best_model_name = max(results, key=lambda name: results[name][selection_metric])
best_model = models[best_model_name]  # already fitted on the training split
print(f"\nBest Model: {best_model_name}")


Model Performance Metrics:

Linear Regression:
  Train MAE: 31.0893
  Train RMSE: 39.1477
  Train R²: 0.7398
  Test MAE: 30.8219
  Test RMSE: 38.6233
  Test R²: 0.7362
  Validation R² (CV): 0.7392

Decision Tree:
  Train MAE: 0.0000
  Train RMSE: 0.0000
  Train R²: 1.0000
  Test MAE: 0.6744
  Test RMSE: 4.2126
  Test R²: 0.9969
  Validation R² (CV): 0.9972

Random Forest:
  Train MAE: 0.1717
  Train RMSE: 1.2729
  Train R²: 0.9997
  Test MAE: 0.5029
  Test RMSE: 2.6723
  Test R²: 0.9987
  Validation R² (CV): 0.9977

Best Model: Random Forest


In [8]:
# Build a fresh Random Forest regressor and fit it on the training split.
rf_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)


In [9]:
# Score the trained forest on the held-out test split.
y_pred = rf_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
mae, rmse = mean_absolute_error(y_test, y_pred), np.sqrt(test_mse)

In [10]:
# Report the test-split errors, one labelled line per metric.
for label, value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Root Mean Squared Error (RMSE):", rmse),
):
    print(label, value)

Mean Absolute Error (MAE): 0.5029209386970005
Root Mean Squared Error (RMSE): 2.6723390159876312


In [11]:
# Persist the fitted forest to disk so it can be reloaded without retraining.
model_path = 'random_forest_aqi_model.pkl'
joblib.dump(rf_model, model_path)
print("Model saved as 'random_forest_aqi_model.pkl'")

Model saved as 'random_forest_aqi_model.pkl'
