<a href="https://colab.research.google.com/github/erandimalk-glitch/CMP7005_S1_25/blob/main/CMP7005_PRAC1_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CMP7005 – Programming for Data Analysis
## PRAC1 – From Data to Application Development
#
#
## Main analysis notebook:
### - Task 1: Data handling (Import, Merge)
### - Task 2: EDA (Fundamental Understanding, Preprocessing, Statistics + Visualisation)
### - Task 3: Model building (regression + classification)
#
#
#### NOTE: This assignment assumes that all India air quality CSV files are stored in a folder called "data/" and that each file is named "<City>_data.csv" with consistent columns.



In [None]:
import os
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    classification_report,
    confusion_matrix,
)
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Global plotting defaults for the notebook.
# `sns.set_theme` is the preferred spelling; `sns.set` is only kept as an alias for it.
sns.set_theme(style="whitegrid", context="notebook")
plt.rcParams["figure.figsize"] = (10, 6)


In [None]:
# Folder that holds the per-city CSV files.
DATA_DIR = Path("data")


def load_city_csvs(data_dir: Path = DATA_DIR):
    """Return a sorted list of all ``*_data.csv`` files directly inside *data_dir*.

    Parameters
    ----------
    data_dir:
        Directory to search (a ``Path`` or a plain string). Only the
        directory itself is searched, not its sub-directories.

    Returns
    -------
    list of Path
        Matching files in ascending path order; empty when nothing matches.
    """
    # Path(...) is a no-op for Path inputs and lets callers pass a string too.
    return sorted(Path(data_dir).glob("*_data.csv"))


def parse_city_name(path: Path) -> str:
    """Extract the city name from a filename like 'Delhi_data.csv' -> 'Delhi'.

    Only a trailing ``_data`` is removed from the file stem, so a city whose
    name itself contains ``_data`` is left intact. A stem without the suffix
    is returned unchanged.

    Parameters
    ----------
    path:
        Path (or path string) of a city CSV file.

    Returns
    -------
    str
        The file stem with a trailing ``_data`` stripped.
    """
    suffix = "_data"
    stem = Path(path).stem
    if stem.endswith(suffix):
        return stem[: -len(suffix)]
    return stem

In [None]:
# Task 1: discover the per-city CSV files and merge them into one DataFrame.
csv_files = load_city_csvs()
print("Found CSV files:")
for f in csv_files:
    print("  -", f.name)

# Fail early with an actionable message; pd.concat on an empty list would
# otherwise raise a less helpful "No objects to concatenate" error.
if not csv_files:
    raise FileNotFoundError(
        f"No '*_data.csv' files found in {DATA_DIR.resolve()}"
    )

frames = []
for path in csv_files:
    city_name = parse_city_name(path)
    df_city = pd.read_csv(path)
    # Tag every row with the city taken from the filename.
    df_city["City"] = city_name
    frames.append(df_city)

# ignore_index=True gives the merged frame a fresh 0..n-1 index.
df = pd.concat(frames, ignore_index=True)

print("\nMerged DataFrame shape:", df.shape)
df.head()

In [None]:
# Task 2a: structural overview of the merged frame.
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])
print("\nColumn names:\n", df.columns.tolist())

print("\nData types:")
print(df.dtypes)

print("\nSample of data:")
display(df.head())

print("\nMissing values per column:")
print(df.isna().sum())

print("\nBasic descriptive statistics (numeric columns):")
display(df.describe(include="number"))

print("\nUnique cities:", df["City"].nunique())
print("Cities:", sorted(df["City"].unique()))

# Unparseable dates become NaT because of errors="coerce".
# NOTE(review): dayfirst=True presumes day-before-month date strings — confirm
# against the actual CSV date format.
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")
# Order rows chronologically within each city.
df = df.sort_values(["City", "Date"]).reset_index(drop=True)

print("\nCheck Date conversion:")
display(df[["City", "Date"]].head())

In [None]:
# Pollutant measurement columns used for cleaning, EDA and modelling.
pollutant_cols = [
    "PM2.5",
    "PM10",
    "NO",
    "NO2",
    "NOx",
    "NH3",
    "CO",
    "SO2",
    "O3",
    "Benzene",
    "Toluene",
    "Xylene",
]

# 1) Remove exact duplicate rows.
num_duplicates = df.duplicated().sum()
print(f"Number of exact duplicate rows: {num_duplicates}")
df = df.drop_duplicates().reset_index(drop=True)

# 2) Rows without a date or a city cannot be placed in time or space.
df = df.dropna(subset=["Date", "City"]).reset_index(drop=True)

# 3) Drop rows that carry no information: every pollutant AND the AQI missing.
mask_all_pollutants_nan = df[pollutant_cols].isna().all(axis=1)
mask_aqi_nan = df["AQI"].isna()
rows_dropped = df[mask_all_pollutants_nan & mask_aqi_nan].shape[0]
print("Rows dropped because all pollutants and AQI are NaN:", rows_dropped)
df = df[~(mask_all_pollutants_nan & mask_aqi_nan)].reset_index(drop=True)

# 4) Impute remaining gaps with the per-city median. If a city has no
#    readings for a column, its median is NaN, so fall back to the median
#    of all observed values. Otherwise NaNs would survive into the features.
# NOTE(review): these medians use every row, including the rows that later
# form the test set — consider fitting the imputation on training data only.
for col in pollutant_cols + ["AQI"]:
    overall_median = df[col].median()  # computed before any filling
    df[col] = (
        df.groupby("City")[col]
        .transform(lambda s: s.fillna(s.median()))
        .fillna(overall_median)
    )


def aqi_to_bucket(aqi: float) -> str:
    """Map a numeric AQI value to its category label.

    Missing values map to "Unknown". Otherwise the first upper bound that
    is greater than or equal to ``aqi`` decides the label, and anything
    above the last bound (400) is "Severe".
    """
    if pd.isna(aqi):
        return "Unknown"

    # (inclusive upper bound, label) pairs in ascending order.
    upper_bounds = (
        (50, "Good"),
        (100, "Satisfactory"),
        (200, "Moderate"),
        (300, "Poor"),
        (400, "Very Poor"),
    )
    for limit, label in upper_bounds:
        if aqi <= limit:
            return label
    return "Severe"


# Keep the bucket labels already present; derive a label from the numeric AQI
# only where the label is missing.
df["AQI_Bucket"] = df["AQI_Bucket"].fillna(df["AQI"].apply(aqi_to_bucket))

print("\nMissing data after preprocessing:")
print(df.isna().sum())

# Calendar parts taken from the parsed date.
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day


def month_to_season(m: int) -> str:
    """Return the season label for month number ``m``.

    Dec-Feb -> "Winter", Mar-May -> "Spring", Jun-Aug -> "Summer";
    every other value falls through to "Autumn".
    """
    season_months = (
        ("Winter", (12, 1, 2)),
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
    )
    for season, months in season_months:
        if m in months:
            return season
    return "Autumn"


# Season label derived from the month number.
df["Season"] = df["Month"].map(month_to_season)

# Ratio of fine to coarse particulate matter. Division by zero yields +/-inf,
# which is treated as missing and then filled with the median ratio.
pm_ratio = df["PM2.5"].div(df["PM10"]).replace([np.inf, -np.inf], np.nan)
df["PM_ratio"] = pm_ratio.fillna(pm_ratio.median())

# Snapshot of the fully preprocessed data, used by the EDA and modelling cells.
df_clean = df.copy()
display(df_clean.head())


In [None]:
def _finish_figure(title, *, rotate_xticks=False):
    """Apply the shared closing steps of each plot: title, tight layout, show."""
    if rotate_xticks:
        plt.xticks(rotation=45)
    plt.title(title)
    plt.tight_layout()
    plt.show()


# 1) Overall AQI distribution.
plt.figure()
sns.histplot(df_clean["AQI"], kde=True, bins=40)
plt.xlabel("AQI")
plt.ylabel("Count")
_finish_figure("Distribution of AQI")

# 2) PM2.5 spread for the six cities with the most rows.
top_cities = df_clean["City"].value_counts().head(6).index.tolist()
in_top_cities = df_clean["City"].isin(top_cities)
plt.figure()
sns.boxplot(data=df_clean[in_top_cities], x="City", y="PM2.5")
_finish_figure("PM2.5 distribution for top 6 cities", rotate_xticks=True)

# 3) Row count per AQI bucket, most frequent first.
bucket_order = df_clean["AQI_Bucket"].value_counts().index
plt.figure()
sns.countplot(data=df_clean, x="AQI_Bucket", order=bucket_order)
_finish_figure("Counts by AQI bucket", rotate_xticks=True)

# 4) AQI against PM2.5 on a reproducible sample of at most 5000 rows.
sample = df_clean.sample(min(5000, len(df_clean)), random_state=42)
plt.figure()
sns.scatterplot(
    data=sample, x="PM2.5", y="AQI", hue="City", alpha=0.4, legend=False
)
_finish_figure("AQI vs PM2.5 (sample)")

# 5) Mean AQI per city, highest first.
avg_aqi_city = (
    df_clean.groupby("City")["AQI"].mean().sort_values(ascending=False)
)
plt.figure()
sns.barplot(x=avg_aqi_city.values, y=avg_aqi_city.index)
plt.xlabel("Average AQI")
plt.ylabel("City")
_finish_figure("Average AQI by city")

# 6) Mean AQI per season for the same six cities.
avg_aqi_season = (
    df_clean.groupby(["City", "Season"])["AQI"].mean().reset_index()
)
season_rows = avg_aqi_season[avg_aqi_season["City"].isin(top_cities)]
plt.figure()
sns.lineplot(data=season_rows, x="Season", y="AQI", hue="City", marker="o")
_finish_figure("Seasonal average AQI for top cities")

# Pairwise correlation between the pollutants, AQI and the derived PM ratio.
corr_cols = pollutant_cols + ["AQI", "PM_ratio"]
corr = df_clean[corr_cols].corr()

plt.figure(figsize=(12, 10))
# Fixed colour limits of -1..1 keep zero correlation at the colormap centre.
sns.heatmap(corr, annot=False, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation matrix – pollutants and AQI")
plt.tight_layout()
plt.show()

print("\nCorrelation of AQI with key pollutants:")
print(corr["AQI"].sort_values(ascending=False))


In [None]:
# Model inputs: every pollutant plus the engineered PM ratio.
feature_cols = [*pollutant_cols, "PM_ratio"]

# Split by time: years before the cutoff train the models, the cutoff year
# and later evaluate them.
cutoff_year = 2019
year = df_clean["Year"]
train_df = df_clean.loc[year < cutoff_year]
test_df = df_clean.loc[year >= cutoff_year]

print("Training rows:", len(train_df))
print("Test rows:", len(test_df))


def _split_xy(frame):
    """Return (feature matrix, AQI target, AQI_Bucket target) as arrays."""
    return (
        frame[feature_cols].values,
        frame["AQI"].values,
        frame["AQI_Bucket"].values,
    )


X_train, y_train_reg, y_train_clf = _split_xy(train_df)
X_test, y_test_reg, y_test_clf = _split_xy(test_df)


def evaluate_regression_model(name: str, y_true, y_pred):
    """Print MAE, RMSE and R^2 for a set of regression predictions.

    Parameters
    ----------
    name:
        Label shown in the report header.
    y_true:
        Ground-truth target values.
    y_pred:
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # Take the square root of the MSE explicitly: the `squared=False` keyword
    # of mean_squared_error was deprecated and then removed in newer
    # scikit-learn releases, so this form works across versions.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = r2_score(y_true, y_pred)
    print(f"\n[{name}] Regression performance")
    print("-" * 40)
    print(f"MAE  : {mae:.3f}")
    print(f"RMSE : {rmse:.3f}")
    print(f"R^2  : {r2:.3f}")

In [None]:
# All features are numeric, so a single scaling step covers the whole matrix.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# X_train / X_test are plain arrays, so columns are addressed by position.
feature_positions = list(range(len(feature_cols)))
preprocessor = ColumnTransformer(
    [("num", numeric_transformer, feature_positions)],
    remainder="drop",
)

# Baseline regressor: scaled features fed into ordinary least squares.
linreg_model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", LinearRegression()),
    ]
)

y_pred_linreg = linreg_model.fit(X_train, y_train_reg).predict(X_test)
evaluate_regression_model("Linear Regression", y_test_reg, y_pred_linreg)

In [None]:
# KNN regressor on the scaled features (5 nearest neighbours).
knn_reg_model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", KNeighborsRegressor(n_neighbors=5)),
    ]
)

y_pred_knn_reg = knn_reg_model.fit(X_train, y_train_reg).predict(X_test)
evaluate_regression_model("KNN Regression (k=5)", y_test_reg, y_pred_knn_reg)

# Actual vs predicted scatter; the dashed red diagonal marks a perfect fit.
aqi_span = [y_test_reg.min(), y_test_reg.max()]
plt.figure()
plt.scatter(y_test_reg, y_pred_knn_reg, alpha=0.3)
plt.plot(aqi_span, aqi_span, color="red", linestyle="--")
plt.xlabel("Actual AQI")
plt.ylabel("Predicted AQI (KNN)")
plt.title("Actual vs Predicted AQI – KNN Regression")
plt.tight_layout()
plt.show()

In [None]:
# KNN classifier for the AQI bucket on the scaled features (7 neighbours).
knn_clf_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", KNeighborsClassifier(n_neighbors=7)),
    ]
)

knn_clf_model.fit(X_train, y_train_clf)
y_pred_knn_clf = knn_clf_model.predict(X_test)

print("\n[KNN Classification] AQI_Bucket performance")
print("-" * 40)
print(classification_report(y_test_clf, y_pred_knn_clf))

# Use one explicit label list for both the matrix and the tick labels.
# By default confusion_matrix covers every label seen in y_true OR y_pred,
# so ticks taken from y_test alone can be too few and mislabel the heatmap
# when the model predicts a bucket that is absent from the test set.
knn_labels = np.unique(np.concatenate([y_test_clf, y_pred_knn_clf]))
cm = confusion_matrix(y_test_clf, y_pred_knn_clf, labels=knn_labels)
plt.figure()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=knn_labels,
    yticklabels=knn_labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion matrix – KNN classifier for AQI_Bucket")
plt.tight_layout()
plt.show()

In [None]:
# Random-forest classifier for the AQI bucket; the fixed random_state keeps
# runs reproducible and n_jobs=-1 uses all available cores.
rf_clf_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=150,
            random_state=42,
            n_jobs=-1,
        )),
    ]
)

rf_clf_model.fit(X_train, y_train_clf)
y_pred_rf = rf_clf_model.predict(X_test)

print("\n[RandomForest Classification] AQI_Bucket performance")
print("-" * 40)
print(classification_report(y_test_clf, y_pred_rf))

# Use one explicit label list for both the matrix and the tick labels.
# By default confusion_matrix covers every label seen in y_true OR y_pred,
# so ticks taken from y_test alone can be too few and mislabel the heatmap
# when the model predicts a bucket that is absent from the test set.
rf_labels = np.unique(np.concatenate([y_test_clf, y_pred_rf]))
cm_rf = confusion_matrix(y_test_clf, y_pred_rf, labels=rf_labels)
plt.figure()
sns.heatmap(
    cm_rf,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=rf_labels,
    yticklabels=rf_labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion matrix – RandomForest classifier for AQI_Bucket")
plt.tight_layout()
plt.show()

In [None]:
import joblib

# Persist the cleaned data, the feature list and two fitted models to disk.
ARTIFACT_DIR = Path("artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True)

# File name -> object to serialise.
artifacts = {
    "df_clean.pkl": df_clean,
    "feature_cols.pkl": feature_cols,
    "knn_reg_model.pkl": knn_reg_model,
    "rf_clf_model.pkl": rf_clf_model,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, ARTIFACT_DIR / filename)

print("\nSaved artefacts to:", ARTIFACT_DIR.resolve())