In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the cleaned, feature-engineered dataset (Colab runtime path).
DATA_PATH = "/content/Cleaned_Feature-Engineered_Pollution_Data.csv"

# Load the full dataset into memory; later cells split it into features/target.
cleaned_data = pd.read_csv(DATA_PATH)

In [2]:
# Separate features and target
X = cleaned_data.drop(columns=['Pollution_Severity'])

# Recent XGBoost releases require integer class labels 0..n_classes-1 and
# raise on string targets, so encode the severity levels as ordinal codes
# (mildest level = 0). Levels that do not occur in the data are skipped so the
# codes stay contiguous.
SEVERITY_ORDER = ['Low', 'Moderate', 'High', 'Very High']
severity = cleaned_data['Pollution_Severity']
observed_levels = set(severity.unique())
unexpected_levels = observed_levels - set(SEVERITY_ORDER)
if unexpected_levels:
    # Fail loudly (this also catches missing targets) instead of silently
    # mapping unknown values to NaN.
    raise ValueError(
        f"Unexpected Pollution_Severity values: {sorted(map(str, unexpected_levels))}"
    )
severity_levels = [level for level in SEVERITY_ORDER if level in observed_levels]
# `y` stays a pandas Series (same index and name) holding integer codes.
y = severity.map({level: code for code, level in enumerate(severity_levels)}).astype(int)

# Stratified Train-Test Split: keep the class proportions identical in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# NOTE: the feature columns are assumed to be numerically encoded upstream;
# nothing in this cell verifies that.
print("X_train shape:", X_train.shape)
# Report the distribution with the human-readable level names rather than the codes.
print(
    "y_train distribution:\n",
    y_train.map(dict(enumerate(severity_levels))).value_counts(normalize=True),
)


X_train shape: (160995, 14)
y_train distribution:
 Pollution_Severity
Low          0.823386
Moderate     0.136812
High         0.031262
Very High    0.008541
Name: proportion, dtype: float64


In [3]:
# Initialize the XGBoost Classifier
# 'multi:softprob' trains the same softmax model as 'multi:softmax' but exposes
# per-class probabilities, which predict_proba()/ROC analysis relies on;
# predict() still returns the arg-max class.
xgb = XGBClassifier(
    objective='multi:softprob',  # Multiclass classification with probabilities
    num_class=y.nunique(),       # Number of classes
    random_state=42
)

# Define hyperparameter grid (3^5 = 243 candidates).
# `scale_pos_weight` is intentionally not tuned: it only rebalances the
# positive class of binary objectives and is ignored by the multiclass softmax
# objective, so including it would triple the search cost without changing
# any fitted model. Multiclass imbalance has to be handled through per-sample
# weights passed to fit() instead.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}


In [None]:
# Set up StratifiedKFold so every fold keeps the original class proportions
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV for hyperparameter tuning.
# Macro-averaged F1 gives every class the same weight. 'f1_weighted' weights
# classes by support, so with a heavily skewed target the majority class would
# dominate model selection.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1_macro',  # Focus on balanced performance across all classes
    cv=skf,
    verbose=1,
    n_jobs=-1
)

# "Balanced" per-sample weights: n_samples / (n_classes * class_count), so rare
# classes contribute as much to the training loss as the common ones.
class_counts = y_train.value_counts()
class_weights = len(y_train) / (len(class_counts) * class_counts)
train_sample_weight = y_train.map(class_weights).to_numpy()

# Perform the grid search; GridSearchCV slices sample_weight per CV fold and
# forwards it to the estimator's fit().
grid_search.fit(X_train, y_train, sample_weight=train_sample_weight)

# Best parameters and model (refit on the whole training set by GridSearchCV)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_


In [None]:
# Evaluate the tuned model on the held-out test set
y_pred = best_model.predict(X_test)

# Classification Report: per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# Pin the label order to the model's classes so the matrix rows/columns are
# guaranteed to line up with the tick labels, even if some class never shows
# up in y_test or y_pred.
class_labels = best_model.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
# Render explicitly so the cell shows the figure rather than a Text(...) repr.
plt.show()



In [None]:
# One-vs-rest ROC curves, drawn only when the model has more than two classes
if len(best_model.classes_) > 2:
    from sklearn.metrics import roc_curve, auc
    from sklearn.preprocessing import label_binarize

    # One indicator column per class, in the same order as the model's classes
    y_test_bin = label_binarize(y_test, classes=best_model.classes_)
    # Per-class probability scores for the test set
    y_pred_probs = best_model.predict_proba(X_test)

    # Walk the classes together with their matching truth/score columns
    per_class = zip(best_model.classes_, y_test_bin.T, y_pred_probs.T)
    for class_name, class_truth, class_scores in per_class:
        fpr, tpr, _ = roc_curve(class_truth, class_scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{class_name} (AUC = {roc_auc:.2f})')

    # Diagonal reference line: a classifier with no discriminative power
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve for Each Class")
    plt.legend()
    plt.show()
