# 02 - Baseline Model

Baseline regression for `main_aqi` with a preprocessing pipeline and a time-based validation split.


In [11]:
from math import sqrt
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- Locate the training parquet (same path logic as in 01_eda.ipynb) ---
# The file is looked up relative to the current working directory and,
# failing that, relative to its parent directory.

cwd = Path.cwd()
print("CWD in notebook:", cwd)

candidate_paths = [
    root / "data" / "raw" / "training" / "training_all_cities_until_2024_06_30.parquet"
    for root in (cwd, cwd.parent)
]

DATA_PATH = None
for candidate in candidate_paths:
    print("Checking:", candidate.resolve(), "exists:", candidate.exists())
    if not candidate.exists():
        continue
    # First existing candidate wins; later ones are not checked.
    DATA_PATH = candidate
    break
else:
    # The loop ran to completion without a break: no candidate exists.
    raise FileNotFoundError(
        "training_all_cities_until_2024_06_30.parquet not found in expected locations.\n"
        f"CWD: {cwd}\n"
        "Make sure you started Jupyter from the smogguard_pk project folder."
    )

print("Using data file:", DATA_PATH.resolve())
df = pd.read_parquet(DATA_PATH)
df.head()


CWD in notebook: /Users/faisalimran/Desktop/ML_PROJ/smogguard_pk/notebooks
Checking: /Users/faisalimran/Desktop/ML_PROJ/smogguard_pk/notebooks/data/raw/training/training_all_cities_until_2024_06_30.parquet exists: False
Checking: /Users/faisalimran/Desktop/ML_PROJ/smogguard_pk/data/raw/training/training_all_cities_until_2024_06_30.parquet exists: True
Using data file: /Users/faisalimran/Desktop/ML_PROJ/smogguard_pk/data/raw/training/training_all_cities_until_2024_06_30.parquet


Unnamed: 0,datetime,main_aqi,components_co,components_no,components_no2,components_o3,components_so2,components_pm2_5,components_pm10,components_nh3,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,surface_pressure,wind_speed_10m,wind_direction_10m,shortwave_radiation,city
0,2021-08-24 00:00:00,5,1228.33,0.0,27.76,40.41,6.02,66.96,87.07,14.69,29.7,55.0,19.7,0.0,943.4,10.5,74.0,0,Islamabad
1,2021-08-24 01:00:00,5,1134.87,0.0,24.33,46.49,6.2,64.5,82.37,14.57,29.4,56.0,19.7,0.0,943.2,10.8,92.0,0,Islamabad
2,2021-08-24 02:00:00,5,1361.85,0.94,39.41,30.76,6.91,64.21,80.38,16.21,28.9,58.0,19.8,0.0,943.0,10.1,107.0,0,Islamabad
3,2021-08-24 03:00:00,5,1682.28,8.38,51.41,27.18,9.06,64.75,79.55,17.73,28.4,60.0,19.8,0.0,942.8,10.5,106.0,0,Islamabad
4,2021-08-24 04:00:00,5,1054.76,3.07,27.08,91.55,21.7,59.86,71.18,13.3,28.1,62.0,20.1,0.0,942.9,9.4,97.0,0,Islamabad


In [4]:
# Apply the basic cleaning rules from EDA, in order:
#   1. parse `datetime` (unparseable values become NaT) and strip any timezone,
#   2. drop rows whose timestamp could not be parsed,
#   3. keep a single row per (city, datetime) pair,
#   4. sort chronologically within each city.
parsed_datetime = pd.to_datetime(df['datetime'], errors='coerce').dt.tz_localize(None)
df = (
    df.assign(datetime=parsed_datetime)
    .dropna(subset=['datetime'])
    .drop_duplicates(subset=['city', 'datetime'])
    .sort_values(['city', 'datetime'])
)

# Model inputs, grouped by column-name prefix; the concatenation order below
# is the order in which the numeric features are used.
component_cols = [
    'components_co', 'components_no', 'components_no2', 'components_o3',
    'components_so2', 'components_pm2_5', 'components_pm10', 'components_nh3',
]
weather_cols = [
    'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
    'precipitation', 'surface_pressure', 'wind_speed_10m',
    'wind_direction_10m', 'shortwave_radiation',
]
feature_cols = component_cols + weather_cols
target_col = 'main_aqi'

# No rows are dropped for missing feature values here. If the critical nulls
# identified during EDA should be removed, apply
# df.dropna(subset=['components_pm2_5', 'components_pm10', 'temperature_2m'])
# before building X and y.

# `city` is carried along with the numeric features as the only categorical input.
X = df[[*feature_cols, 'city']]
y = df[target_col]


In [5]:
# Column groups routed to each preprocessing branch.
numeric_features = feature_cols
categorical_features = ['city']

# Numeric branch: median-impute missing values, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(categorical_steps)

# Apply each branch to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Baseline estimator: 200-tree random forest, fixed seed, all available cores.
model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

# End-to-end estimator (preprocessing + model). It is named `clf`, but the
# final step is a regressor.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])


In [6]:
# Time-based split to avoid leakage: every row at or before the cutoff goes to
# training, every later row goes to validation. A single cutoff is computed
# over the timestamps of all cities together.
TRAIN_FRACTION = 0.8  # share of rows (by timestamp quantile) used for training

# quantile() does not depend on row order, so no sort is needed first.
cutoff_date = df['datetime'].quantile(TRAIN_FRACTION)
train_mask = df['datetime'] <= cutoff_date
val_mask = df['datetime'] > cutoff_date

# Sanity checks: both sides must be non-empty, and together they must cover
# every row (a missing timestamp would satisfy neither comparison).
assert train_mask.any() and val_mask.any(), "time-based split produced an empty train or validation set"
assert int(train_mask.sum()) + int(val_mask.sum()) == len(df), "some rows fall in neither split (missing datetime?)"

X_train, X_val = X.loc[train_mask], X.loc[val_mask]
y_train, y_val = y.loc[train_mask], y.loc[val_mask]

print('Train period:', df.loc[train_mask, 'datetime'].min(), '→', df.loc[train_mask, 'datetime'].max())
print('Val period:', df.loc[val_mask, 'datetime'].min(), '→', df.loc[val_mask, 'datetime'].max())


Train period: 2021-08-24 00:00:00 → 2023-12-05 22:00:00
Val period: 2023-12-05 23:00:00 → 2024-06-30 23:00:00


In [10]:
# Fit the full pipeline (preprocessing + random forest) on the training period
# only, then score it on the later validation period.
# All metric functions, including r2_score, come from the import cell at the
# top of the notebook.
clf.fit(X_train, y_train)

y_pred = clf.predict(X_val)

# Regression metrics on the raw (unrounded) predictions.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("Baseline MSE:", mse)
print("Baseline RMSE:", rmse)
print("Baseline MAE:", mae)
print("Baseline R^2:", r2)


Baseline MSE: 0.10664764254385965
Baseline RMSE: 0.3265695064513214
Baseline MAE: 0.191871751137102
Baseline R^2: 0.906937308564132


In [13]:
# Turn the continuous predictions into discrete classes on the 1–5 scale:
# round to the nearest integer, then clamp into [1, 5].
# Note: np.round sends exact .5 ties to the nearest even integer
# (e.g. 2.5 -> 2, 3.5 -> 4).
nearest_class = np.round(y_pred)
y_pred_rounded = nearest_class.clip(1, 5).astype(int)

# Share of validation rows whose rounded prediction equals the true class.
y_val_int = y_val.astype(int)
acc = accuracy_score(y_val_int, y_pred_rounded)
print("Exact class accuracy (rounded):", acc)


Exact class accuracy (rounded): 0.8408463287849253


## Baseline results & learnability
- Model: RandomForestRegressor with preprocessing pipeline (median imputation, scaling, one-hot for city).
- Split: time-based — the oldest 80% of rows (by timestamp) are used for training and the most recent 20% for validation.
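- Metrics: MSE, RMSE, MAE and R² are printed above (RMSE ≈ 0.33, MAE ≈ 0.19, R² ≈ 0.91 on the validation period), together with the exact-class accuracy of the rounded predictions (≈ 0.84).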
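- Interpretation: if RMSE is much smaller than the 1–5 range of `main_aqi`, the task looks learnable; otherwise we will add horizon-based targets, lag features, and improved models later. Note that the features are taken from the same timestamp as the target, so this baseline measures nowcasting rather than forecasting skill.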
- We do not persist this model; it serves as a baseline for understanding.
