In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns

from scipy.stats import zscore
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import classification_report, cohen_kappa_score, confusion_matrix, mean_absolute_error
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from collections import Counter


In [None]:
# All input CSVs live in the same Google Drive project folder.
DATA_DIR = '/content/drive/MyDrive/softwarica/machine-learning/air-quality-prediction-classification'

# MODEL TRAINING & TESTING DATA
aq_path = DATA_DIR + '/kathmandu_aq_us_embassy_till_2025_4.csv'                     # Air Quality Data
mt_path = DATA_DIR + '/kathmandu_weather_2020_1_to_2025_4.csv'                      # Meteorological Data
hd_path = DATA_DIR + "/nepal_holidays_2020_1_to_2025_4_with_weekends.csv"           # Holiday Data

# MODEL TESTING IN PRODUCTION DATA (swap in these three paths instead of the ones above)
# aq_path = DATA_DIR + '/kathmandu_aq_bhaisepati_till_2025_5.csv'                   # Air Quality Data
# mt_path = DATA_DIR + '/kathmandu_weather_2025_4_to_2025_5.csv'                    # Meteorological Data
# hd_path = DATA_DIR + "/nepal_holidays_2025_4_to_2025_5_with_weekends.csv"         # Holiday Data

aq_df = pd.read_csv(aq_path)
mt_df = pd.read_csv(mt_path)
hd_df = pd.read_csv(hd_path)

In [None]:
# Clean & prepare the air-quality data.
aq_df = (
    aq_df
    .rename(columns=str.strip)      # header names may carry leading/trailing spaces
    .drop(columns=['pm10'])         # pm10 is not used
)
# Dates that do not match the expected format become NaT instead of raising.
aq_df['date'] = pd.to_datetime(aq_df['date'], format='%Y/%m/%d', errors='coerce')

# comment out the next two lines for cleaning production dataset
in_date_window = aq_df['date'].between('2020-01-01', '2025-04-25')  # inclusive on both ends
aq_df = aq_df.loc[in_date_window].copy()

# Non-numeric pm25 entries are coerced to NaN.
aq_df['pm25'] = pd.to_numeric(aq_df['pm25'], errors='coerce')
aq_df = aq_df.sort_values('date')

In [None]:
def classify_pm25(pm):
    """Map a PM2.5 value to a categorical air-quality label.

    Upper bounds are inclusive:
        pm <= 35.4   -> 'Low'             (Good + Moderate merged)
        pm <= 55.4   -> 'Moderate'        (the "UGS" band)
        pm <= 150.4  -> 'Unhealthy'
        pm <= 250.4  -> 'Very Unhealthy'
        otherwise    -> 'Hazardous'

    Args:
        pm: Numeric PM2.5 value; may be NaN/None when the reading is missing.

    Returns:
        The label string, or NaN when ``pm`` is missing. Every comparison
        against NaN is False, so without the explicit check a missing reading
        would fall through to 'Hazardous'.
    """
    if pd.isna(pm):
        return np.nan

    # (inclusive upper bound, label), ordered from cleanest to most polluted.
    bands = (
        (35.4, 'Low'),
        (55.4, 'Moderate'),
        (150.4, 'Unhealthy'),
        (250.4, 'Very Unhealthy'),
    )
    for upper_bound, label in bands:
        if pm <= upper_bound:
            return label
    return 'Hazardous'

# pm25 is parsed with errors='coerce', so unparseable readings are NaN here.
# Drop those rows before labelling: a day without a reading has no valid
# target class and must not end up in the modelling dataset.
aq_df = aq_df.dropna(subset=['pm25']).copy()
aq_df['pm25_class'] = aq_df['pm25'].apply(classify_pm25)

In [None]:
# Clean & prepare the meteorological data.
# Conversions are vectorised (no per-element apply) and leave missing readings
# as NaN instead of raising on them.
mt_df = mt_df.drop(columns=["precipitation"])               # Drop precipitation as it has all 0 values
mt_df['timestamp'] = pd.to_datetime(mt_df['timestamp'])     # Convert timestamp to datetime

mt_df['temperature'] = ((mt_df['temperature'] - 32) * 5/9).round()   # Fahrenheit -> Celsius
mt_df['dew_point'] = ((mt_df['dew_point'] - 32) * 5/9).round()       # Fahrenheit -> Celsius
mt_df['wind_speed'] = (mt_df['wind_speed'] * 1.60934).round()        # miles per hour -> km/h
mt_df['wind_gust'] = (mt_df['wind_gust'] * 1.60934).round()          # miles per hour -> km/h

# inHg -> hPa. Kept as a whole-valued float rather than cast with int(), because
# int() raises on NaN; the column is only averaged afterwards.
mt_df['pressure'] = (mt_df['pressure'] * 33.8639).round()

mt_df['wind'] = mt_df['wind'].bfill()     # Backward-fill missing wind values from the next available row
mt_df = mt_df.sort_values('timestamp')

In [None]:
# Encode the holiday type: 1 when `type` mentions "optional" (case-insensitive),
# 2 for every other row. na=False treats a missing `type` as non-optional;
# without it str.contains yields NaN for such rows and the integer cast fails.
is_optional = hd_df['type'].str.contains('optional', case=False, na=False)
hd_df['holiday'] = is_optional.astype(int).replace(0, 2)
hd_df['date'] = pd.to_datetime(hd_df['date'])  # Convert date to datetime

In [None]:
def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if there is none."""
    modes = values.mode()
    return np.nan if modes.empty else modes.iloc[0]


# Collapse the timestamped readings to one row per calendar day:
# numeric columns are averaged, categorical columns take their daily mode.
mt_df['date'] = mt_df['timestamp'].dt.date
daily_aggregations = {
    'temperature': 'mean',
    'dew_point': 'mean',
    'humidity': 'mean',
    'wind_speed': 'mean',
    'pressure': 'mean',
    'condition': _most_frequent,
    'wind': _most_frequent,
}
mt_daily_df = mt_df.groupby('date').agg(daily_aggregations).reset_index()
mt_daily_df['date'] = pd.to_datetime(mt_daily_df['date'])  # date objects -> datetime64 for merging

In [None]:
# Keep a single holiday row per date. Within each date the rows are ordered by
# descending holiday code, so the first (kept) row carries the highest code.
hd_df_sorted = hd_df.sort_values(by=['date', 'holiday'], ascending=[True, False])
is_first_for_date = ~hd_df_sorted.duplicated(subset='date')
hd_unique_df = hd_df_sorted.loc[is_first_for_date, ['date', 'holiday']]

In [None]:
# Attach the holiday code to every daily weather row (left join keeps all
# weather days); days absent from the holiday table get code 0.
mthd_daily_df = (
    pd.merge(mt_daily_df, hd_unique_df, how='left', on='date')
    .assign(holiday=lambda daily: daily['holiday'].fillna(0).astype(int))
)

In [None]:
# Join the PM2.5 class labels onto the daily weather/holiday features.
# The inner join keeps only dates present in both tables.
aq_df['date'] = pd.to_datetime(aq_df['date'])
aq_labels = aq_df.loc[:, ['date', 'pm25_class']]
final_df = aq_labels.merge(mthd_daily_df, how='inner', on='date')

In [None]:
# Label-encode the target variable 'pm25_class' against a FIXED vocabulary.
# LabelEncoder assigns codes by sorted class order, so fitting on whichever
# labels a particular dataset happens to contain gives different codes for the
# same label (e.g. a file with only three classes would map them to 0..2).
# Fitting on all five labels keeps the mapping stable across datasets:
#   Hazardous=0, Low=1, Moderate=2, Unhealthy=3, Very Unhealthy=4
# which is the same mapping fit_transform produces on data containing all five.
PM25_CLASSES = ['Hazardous', 'Low', 'Moderate', 'Unhealthy', 'Very Unhealthy']

le = LabelEncoder()
le.fit(PM25_CLASSES)

# Rows without a label cannot be encoded; drop them (no-op when none are missing).
final_df = final_df.dropna(subset=['pm25_class']).copy()
final_df['pm25_class'] = le.transform(final_df['pm25_class'])

In [None]:
# Display the merged dataset: date, encoded pm25_class, daily weather aggregates and holiday code.
final_df

Unnamed: 0,date,pm25_class,temperature,dew_point,humidity,wind_speed,pressure,condition,wind,holiday
0,2020-01-01,4,9.608696,3.608696,69.565217,4.130435,873.282609,Fog,VAR,0
1,2020-01-02,4,10.409091,5.136364,72.977273,4.022727,873.431818,Fog,VAR,0
2,2020-01-03,4,8.333333,7.000000,91.250000,4.666667,869.694444,Fog,VAR,0
3,2020-01-04,3,8.543478,7.434783,93.086957,3.347826,867.304348,Fog,VAR,0
4,2020-01-05,3,8.673913,4.826087,79.521739,6.173913,866.239130,Fair,VAR,0
...,...,...,...,...,...,...,...,...,...,...
1813,2025-02-28,3,14.666667,10.583333,79.020833,6.270833,868.750000,Mostly Cloudy,VAR,1
1814,2025-03-01,3,14.562500,9.895833,77.520833,6.687500,867.270833,Mostly Cloudy,CALM,2
1815,2025-03-02,3,15.291667,7.000000,65.104167,6.770833,862.937500,Fair,VAR,2
1816,2025-03-03,3,16.510638,8.574468,63.042553,7.234043,863.425532,Fair,VAR,0


In [None]:
# Write the compiled dataset to the project's `compiled` folder (row index omitted).
COMPILED_DIR = '/content/drive/MyDrive/softwarica/machine-learning/air-quality-prediction-classification/compiled'

compiled_path = COMPILED_DIR + '/kathmandu_pm25_class_2020_1_to_2025_4_dataset.csv'
# Alternative output name for the 2025_4_to_2025_5 dataset:
# compiled_path = COMPILED_DIR + '/kathmandu_pm25_class_2025_4_to_2025_5_dataset.csv'

final_df.to_csv(compiled_path, index=False)

In [None]:
# Load a compiled dataset back from disk.
# NOTE(review): this reads the 2025_4_to_2025_5 file, while the active to_csv call
# in this notebook writes the 2020_1_to_2025_4 file. On a fresh run this cell
# therefore depends on a file produced by a separate run — confirm this is intended.
df = pd.read_csv('/content/drive/MyDrive/softwarica/machine-learning/air-quality-prediction-classification/compiled/kathmandu_pm25_class_2025_4_to_2025_5_dataset.csv')

In [None]:
# Display the dataset loaded from the compiled CSV.
df

Unnamed: 0,date,pm25_class,temperature,dew_point,humidity,wind_speed,pressure,condition,wind,holiday
0,2025-04-26,2,24.520833,7.0,36.583333,6.770833,862.9375,Fair,E,2
1,2025-04-27,1,20.916667,11.25,56.4375,8.416667,865.791667,Fair,E,1
2,2025-04-28,0,17.270833,12.8125,77.395833,6.458333,865.1875,Mostly Cloudy,VAR,0
3,2025-04-29,0,19.021277,13.510638,73.617021,7.425532,864.893617,Mostly Cloudy,CALM,0
4,2025-04-30,0,20.666667,14.083333,67.3125,9.270833,866.145833,Mostly Cloudy,S,0
5,2025-05-01,0,19.208333,13.958333,73.604167,8.229167,868.083333,Mostly Cloudy,E,2
6,2025-05-02,0,19.479167,13.604167,70.875,6.083333,866.708333,Fair,E,0
7,2025-05-03,0,19.958333,15.5625,76.9375,7.770833,864.229167,Mostly Cloudy,VAR,2
8,2025-05-04,0,19.020833,16.083333,84.5,5.1875,864.145833,Mostly Cloudy,VAR,1
9,2025-05-05,0,20.4375,15.125,73.583333,8.229167,864.0625,Mostly Cloudy,VAR,0
