In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, recall_score
import pickle

In [29]:
# Load the training and testing CSV files
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')

In [30]:
# Separate features and labels for the training set
X_train = train_df.drop('Air Quality', axis=1)
y_train = train_df['Air Quality']

# Separate features and labels for the testing set
X_test = test_df.drop('Air Quality', axis=1)
y_test = test_df['Air Quality']

In [31]:
# Display the shapes to verify
print("Training features shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Testing features shape:", X_test.shape)
print("Testing labels shape:", y_test.shape)

Training features shape: (3000, 9)
Training labels shape: (3000,)
Testing features shape: (1000, 9)
Testing labels shape: (1000,)


In [34]:
# 2. Decision Tree Model Training and Evaluation
# Define hyperparameter grid for Decision Tree
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define StratifiedKFold for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define metric for scoring (weighted recall)
weighted_recall = make_scorer(recall_score, average='weighted')

# Define GridSearchCV object
dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=dt_param_grid,
    scoring=weighted_recall,
    cv=cv,
    n_jobs=-1,  # Use all available cores
    verbose=1,
    return_train_score=True
)

# Run the grid search
dt_grid.fit(X_train, y_train)

# Display best model parameters and score
print("Best Decision Tree Parameters:", dt_grid.best_params_)
print("Weighted Mean Training Score:", dt_grid.cv_results_['mean_train_score'][dt_grid.best_index_])
print("Weighted Mean Cross-Validation Score:", dt_grid.best_score_)

# Evaluate on training data
dt_train_preds = dt_grid.predict(X_train)
print("Decision Tree - Training Data:")
print(confusion_matrix(y_train, dt_train_preds))
print(classification_report(y_train, dt_train_preds))


Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Weighted Mean Training Score: 0.9856666666666667
Weighted Mean Cross-Validation Score: 0.9333333333333333
Decision Tree - Training Data:
[[1200    0    0    0]
 [   0  289    0   11]
 [   0    0  882   18]
 [   0    7   10  583]]
              precision    recall  f1-score   support

        Good       1.00      1.00      1.00      1200
   Hazardous       0.98      0.96      0.97       300
    Moderate       0.99      0.98      0.98       900
        Poor       0.95      0.97      0.96       600

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000



In [36]:
# 3. SVM Model Training and Evaluation
# Define pipeline for SVM (scaling + SVC)
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(class_weight='balanced', random_state=42))
])

# Define hyperparameter grid for SVM
svm_param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__kernel': ['rbf', 'poly', 'sigmoid'],
    'svc__gamma': ['scale', 'auto', 0.1, 1],
    'svc__degree': [2, 3]  # Only relevant for polynomial kernel
}

# Define GridSearchCV object for SVM
svm_grid = GridSearchCV(
    svm_pipeline,
    param_grid=svm_param_grid,
    scoring=weighted_recall,
    cv=cv,
    n_jobs=-1,  # Use all available cores
    verbose=1,
    return_train_score=True
)

# Run the grid search
svm_grid.fit(X_train, y_train)

# Display best model parameters and score
print("Best SVM Parameters:", svm_grid.best_params_)
print("Weighted Mean Training Score:", svm_grid.cv_results_['mean_train_score'][svm_grid.best_index_])
print("Weighted Mean Cross-Validation Score:", svm_grid.best_score_)

# Evaluate on training data
svm_train_preds = svm_grid.predict(X_train)
print("SVM - Training Data:")
print(confusion_matrix(y_train, svm_train_preds))
print(classification_report(y_train, svm_train_preds))

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best SVM Parameters: {'svc__C': 1, 'svc__degree': 2, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
Weighted Mean Training Score: 0.9616666666666667
Weighted Mean Cross-Validation Score: 0.9443333333333334
SVM - Training Data:
[[1199    0    1    0]
 [   0  276    0   24]
 [   2    0  875   23]
 [   0   36   30  534]]
              precision    recall  f1-score   support

        Good       1.00      1.00      1.00      1200
   Hazardous       0.88      0.92      0.90       300
    Moderate       0.97      0.97      0.97       900
        Poor       0.92      0.89      0.90       600

    accuracy                           0.96      3000
   macro avg       0.94      0.95      0.94      3000
weighted avg       0.96      0.96      0.96      3000



### Performance Summary

#### Decision Tree

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}

Weighted Mean Training Score: 0.9857

Weighted Mean Cross-Validation Score: 0.9333

Training Accuracy: 0.98

#### SVM

Best Parameters: {'svc__C': 1, 'svc__degree': 2, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}

Weighted Mean Training Score: 0.9617

Weighted Mean Cross-Validation Score: 0.9443

Training Accuracy: 0.96

### Overfitting/Underfitting

Decision Tree: The decision tree has a high training score (0.9857) but a lower cross-validation score (0.9333). This suggests overfitting, meaning the model has memorized the training data and might not generalize well to new, unseen data.

SVM: The SVM has a lower training score (0.9617) but a higher cross-validation score (0.9443) compared to the decision tree. The training score is closer to the cross-validation score, suggesting less overfitting. The SVM generalizes better than the decision tree.

### Important Metrics

Based on the Project Use Case, the most important metric is recall, especially for the "Hazardous" and "Poor" air quality classes. The goal of this project is to minimize false negatives. The confusion matrices and classification reports show details of the models.

Decision Tree:

"Hazardous" recall: 0.96 (289/300)

"Poor" recall: 0.97 (583/600)

SVM:

"Hazardous" recall: 0.92 (276/300)

"Poor" recall: 0.89 (534/600)

The Decision Tree has a higher recall for both "Hazardous" and "Poor" classes.

### Additional Observations

The confusion matrix for the decision tree shows perfect classification for the "Good" class, but has more misclassifications in the "Hazardous", "Moderate", and "Poor" classes.

The SVM's confusion matrix also shows good performance in the "Good" class, but compared to the decision tree, it misclassifies more instances in the "Hazardous" and "Poor" categories.

## Choosing the Best Model

Considering all factors, the Decision Tree is the better model due to a higher recall score in the "Hazardous" class. Therefore, it is more desirable for this particular use case to focus on capturing all instances where the air quality is "Hazardous."

In [40]:
# 4. Model Comparison and Selection (manual selection based on recall for 'Hazardous' class)
# Based on analysis, the Decision Tree is preferred for its higher recall on critical classes

best_model = dt_grid.best_estimator_
best_model_name = "Decision Tree"

print(f"Best Model (selected based on recall for 'Hazardous' class): {best_model_name}")

# Save the best model to a file
filename = 'best_air_quality_model.pkl'
pickle.dump(best_model, open(filename, 'wb'))
print(f"Saved best model to {filename}")

Best Model (selected based on recall for 'Hazardous' class): Decision Tree
Saved best model to best_air_quality_model.pkl


In [41]:
# 5. Final Model Evaluation on Test Data
# Load the model from the pickle file
loaded_model = pickle.load(open(filename, 'rb'))

# Make predictions on the test data
test_preds = loaded_model.predict(X_test)

# Display evaluation metrics
print("Test Data Metrics:")
print(confusion_matrix(y_test, test_preds))
print(classification_report(y_test, test_preds))

Test Data Metrics:
[[399   0   1   0]
 [  0  74   0  26]
 [  4   0 285  11]
 [  0  19  15 166]]
              precision    recall  f1-score   support

        Good       0.99      1.00      0.99       400
   Hazardous       0.80      0.74      0.77       100
    Moderate       0.95      0.95      0.95       300
        Poor       0.82      0.83      0.82       200

    accuracy                           0.92      1000
   macro avg       0.89      0.88      0.88      1000
weighted avg       0.92      0.92      0.92      1000

