# 🌬️ Project Vaayu - Phase 1

**Air Quality Index (AQI) Prediction Using ML Models**

### 1. Import Required Libraries

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import seaborn as sns
    

### 2. Load and Preprocess Dataset

In [None]:

# Replace this with your actual dataset path
# NOTE(review): the steps below assume every column is numeric (KNNImputer and
# StandardScaler both require it) - confirm against the real dataset.
df = pd.read_csv("your_dataset.csv")

# Drop rows with no recorded AQI: imputing the target would fabricate labels
# that the models are then trained and scored against.
df = df.dropna(subset=['AQI'])

# Feature-target split. Done before imputation so the target column is never
# used to choose the neighbours that fill in missing feature values.
X = df.drop('AQI', axis=1)
y = df['AQI']

# Train-test split happens BEFORE any transformer is fitted, so the held-out
# rows cannot influence the imputer or the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# KNN imputation for missing feature values (fitted on training rows only)
imputer = KNNImputer(n_neighbors=3)
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Normalize features (fitted on training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

# Full feature matrix pushed through the train-fitted imputer and scaler,
# kept for code that works on the complete dataset via `X_scaled` and `y`.
X_scaled = scaler.transform(imputer.transform(X))

# No SMOTE step: SMOTE oversamples minority *classes* and is not applicable to
# a continuous regression target such as AQI. Resampling before the split
# would additionally place synthetic points in the test set.
    

### 3. Define Model Training & Evaluation Function

In [None]:

def train_and_evaluate(model, model_name):
    """Fit ``model`` on the training split and report hold-out performance.

    The estimator is fitted on the module-level ``X_train``/``y_train`` and
    used to predict ``X_test``. MAE, RMSE and R² against ``y_test`` are
    printed, and a scatter plot of actual versus predicted values is shown.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        model_name: Label used in the printed report and the plot title.

    Returns:
        None. Results are reported through stdout and the displayed figure.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    abs_err = mean_absolute_error(y_test, predicted)
    # RMSE is the square root of the mean squared error.
    root_sq_err = np.sqrt(mean_squared_error(y_test, predicted))
    fit_quality = r2_score(y_test, predicted)

    # One print call, one line per metric.
    print(
        f"\n{model_name} Performance:",
        f"MAE: {abs_err:.4f}",
        f"RMSE: {root_sq_err:.4f}",
        f"R²: {fit_quality:.4f}",
        sep="\n",
    )

    # Plot actual vs predicted on a fresh figure.
    plt.figure(figsize=(6,4))
    axes = sns.scatterplot(x=y_test, y=predicted, alpha=0.6)
    axes.set_title(f"{model_name}: Actual vs Predicted AQI")
    axes.set_xlabel("Actual AQI")
    axes.set_ylabel("Predicted AQI")
    axes.grid(True)
    plt.show()
    

### 4. Train and Evaluate All ML Models

In [None]:

# Build every estimator up front; the names are module-level so that other
# cells can reuse the same model objects.
rf_model = RandomForestRegressor(n_estimators=10, random_state=42)
cat_model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, silent=True)
xgb_model = XGBRegressor(n_estimators=10, learning_rate=0.1, verbosity=0)
bag_model = BaggingRegressor(n_estimators=1000, random_state=42)

# Fit and report each model in turn: Random Forest, CatBoost, XGBoost, Bagging.
for estimator, display_name in (
    (rf_model, "Random Forest Regressor"),
    (cat_model, "CatBoost Regressor"),
    (xgb_model, "XGBoost Regressor"),
    (bag_model, "Bagging Regressor"),
):
    train_and_evaluate(estimator, display_name)
    

### 5. Optional: 5-Fold Cross-Validation Scores

In [None]:

def cross_validate(model, name):
    """Print 5-fold cross-validated R² scores for ``model``.

    Imputation and scaling are placed in a pipeline with the estimator, so
    they are re-fitted on the training portion of every fold. Scoring on a
    matrix that was scaled once with statistics from all rows would let each
    validation fold influence its own preprocessing.

    Args:
        model: Unfitted (or refittable) scikit-learn compatible regressor.
        name: Label used in the printed report.

    Returns:
        None. The per-fold scores and their mean are printed.
    """
    # Local import: the file's import cell does not bring in the pipeline helper.
    from sklearn.pipeline import make_pipeline

    # Same preprocessing settings as the notebook's preprocessing cell, but
    # fitted per fold. Uses the unscaled module-level feature frame `X`.
    pipeline = make_pipeline(KNNImputer(n_neighbors=3), StandardScaler(), model)
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
    print(f"\n{name} 5-Fold R² Scores: {scores}")
    print(f"Average R²: {np.mean(scores):.4f}")

# Cross-validate each model in turn: Random Forest, CatBoost, XGBoost, Bagging.
for cv_estimator, cv_label in (
    (rf_model, "Random Forest"),
    (cat_model, "CatBoost"),
    (xgb_model, "XGBoost"),
    (bag_model, "Bagging"),
):
    cross_validate(cv_estimator, cv_label)
    