In [1]:
# Cell 1: imports and paths

import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, export_text

# Resolve the project root relative to the working directory
# (".." steps one folder up from ML_model).
BASE_DIR = Path("..").resolve()

# Dataset folders used by the rest of the notebook.
RAW_DIR = BASE_DIR.joinpath("dataset", "raw")
UCI_DIR = BASE_DIR.joinpath("dataset", "UCI_reference")
MERGED_DIR = BASE_DIR.joinpath("dataset", "merged")

# Create the merged-output folder up front; a no-op when it already exists.
MERGED_DIR.mkdir(exist_ok=True, parents=True)

(RAW_DIR, UCI_DIR, MERGED_DIR)


(WindowsPath('C:/Users/cheta/OneDrive/Desktop/AirQualityMonitoringSystem/dataset/raw'),
 WindowsPath('C:/Users/cheta/OneDrive/Desktop/AirQualityMonitoringSystem/dataset/UCI_reference'),
 WindowsPath('C:/Users/cheta/OneDrive/Desktop/AirQualityMonitoringSystem/dataset/merged'))

In [2]:
# Cell 2: read the ESP32 master log and keep the training columns

log_file = RAW_DIR.joinpath("master_log.csv")
print("Loading:", log_file)

# Replace whatever header the CSV carries with the names used in this notebook.
log_columns = ["timestamp","mq135_raw","temperature","humidity","aqi_category"]
log_df = pd.read_csv(log_file)
log_df.columns = log_columns

# Tag every row with its origin.
log_df['source'] = 'esp32'

# Everything except the timestamp goes forward.
esp_columns = ['mq135_raw','temperature','humidity','aqi_category','source']
esp_df = log_df[esp_columns]

esp_df.head()


Loading: C:\Users\cheta\OneDrive\Desktop\AirQualityMonitoringSystem\dataset\raw\master_log.csv


Unnamed: 0,mq135_raw,temperature,humidity,aqi_category,source
0,209,25.0,50.0,Moderate,esp32
1,211,25.0,50.0,Moderate,esp32
2,208,25.0,50.0,Moderate,esp32
3,209,25.0,50.0,Moderate,esp32
4,4095,25.0,50.0,Unhealthy,esp32


In [3]:
# Cell 3: read indoor_data.csv from the UCI reference folder

uci_path = UCI_DIR.joinpath("indoor_data.csv")

# Stop with an explicit message when the file is absent.
if uci_path.exists():
    df = pd.read_csv(uci_path)
else:
    raise FileNotFoundError(f"{uci_path} not found. Make sure indoor_data.csv is in dataset/UCI_reference/")

# Print the column index as a sanity check, then preview the first rows.
print(df.columns)
df.head()


Index(['created_at', 'entry_id', 'field1', 'field2', 'field3', 'field4',
       'field5', 'field6', 'field7', 'latitude', 'longitude', 'elevation',
       'status'],
      dtype='object')


Unnamed: 0,created_at,entry_id,field1,field2,field3,field4,field5,field6,field7,latitude,longitude,elevation,status
0,2024-04-06 12:00:27+05:30,1256,163,42.0,33.8,49.0,520.0,18.0,21.0,,,,
1,2024-04-06 12:01:30+05:30,1257,162,37.0,33.8,49.0,637.0,36.0,9.0,,,,
2,2024-04-06 12:02:32+05:30,1258,173,47.0,33.7,50.0,679.0,42.0,3.0,,,,
3,2024-04-06 12:03:34+05:30,1259,168,37.0,33.3,51.0,539.0,21.0,0.0,,,,
4,2024-04-06 12:04:36+05:30,1260,168,37.0,33.3,52.0,697.0,45.0,0.0,,,,


In [4]:
# Cell 3b: give the generic fieldN columns descriptive names

sensor_names = {
    'field1': 'mq7',
    'field2': 'mq135',
    'field3': 'temperature',
    'field4': 'humidity',
    'field5': 'eco2',
    'field6': 'tvoc',
    'field7': 'dust',
}
df = df.rename(columns=sensor_names)

# Preview only the columns that were just renamed.
df[list(sensor_names.values())].head()


Unnamed: 0,mq7,mq135,temperature,humidity,eco2,tvoc,dust
0,163,42.0,33.8,49.0,520.0,18.0,21.0
1,162,37.0,33.8,49.0,637.0,36.0,9.0
2,173,47.0,33.7,50.0,679.0,42.0,3.0
3,168,37.0,33.3,51.0,539.0,21.0,0.0
4,168,37.0,33.3,52.0,697.0,45.0,0.0


In [5]:
# Cell 3c: keep only the sensor columns, discarding everything else
sensor_cols = ['mq7','mq135','temperature','humidity','eco2','tvoc','dust']
df = df[sensor_cols]

df.head()


Unnamed: 0,mq7,mq135,temperature,humidity,eco2,tvoc,dust
0,163,42.0,33.8,49.0,520.0,18.0,21.0
1,162,37.0,33.8,49.0,637.0,36.0,9.0
2,173,47.0,33.7,50.0,679.0,42.0,3.0
3,168,37.0,33.3,51.0,539.0,21.0,0.0
4,168,37.0,33.3,52.0,697.0,45.0,0.0


In [6]:
# Cell 3d: create AQI categories from dust (interpreted as PM2.5)

def categorize_aqi_from_pm(pm):
    """Map a particulate reading to an AQI category label.

    The reading is compared against a fixed ladder of inclusive upper bounds
    (12.0, 35.4, 55.4, 150.4, 250.4); anything above the last bound is
    'Hazardous'.
    NOTE(review): these look like the US EPA PM2.5 breakpoints -- confirm the
    'dust' sensor actually reports PM2.5 in the matching unit.

    Parameters
    ----------
    pm : float or None
        Particulate reading for one row.

    Returns
    -------
    str or None
        One of 'Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy',
        'Very Unhealthy', 'Hazardous'; None when the reading is missing.
    """
    # Every comparison with NaN is False, so a missing reading would fall
    # through the whole ladder and be labelled 'Hazardous'. Return None
    # instead so the row is treated as unlabelled (pandas counts None as NA).
    if pm is None or pm != pm:
        return None

    breakpoints = (
        (12.0, 'Good'),
        (35.4, 'Moderate'),
        (55.4, 'Unhealthy for Sensitive'),
        (150.4, 'Unhealthy'),
        (250.4, 'Very Unhealthy'),
    )
    for upper_bound, label in breakpoints:
        if pm <= upper_bound:
            return label
    return 'Hazardous'

# Label every row from its dust reading.
df['aqi_category'] = df['dust'].apply(categorize_aqi_from_pm)

# Class balance first (printed), then the preview as the cell's final
# expression: a bare expression in the middle of a cell is evaluated and
# thrown away, so the preview only renders when it comes last.
print(df['aqi_category'].value_counts())
df[['mq135','temperature','humidity','dust','aqi_category']].head()


aqi_category
Good                       1202
Moderate                     66
Hazardous                    27
Unhealthy                    25
Unhealthy for Sensitive       9
Very Unhealthy                4
Name: count, dtype: int64


In [7]:
# Cell 3e: assemble the indoor_data training frame

# Select the training columns and rename the gas reading to 'mq135_raw'.
training_cols = ['mq135','temperature','humidity','aqi_category']
uci_df = df[training_cols].rename(columns={'mq135': 'mq135_raw'})

# Record where these rows came from.
uci_df['source'] = 'indoor_data'

uci_df.head()


Unnamed: 0,mq135_raw,temperature,humidity,aqi_category,source
0,42.0,33.8,49.0,Moderate,indoor_data
1,37.0,33.8,49.0,Good,indoor_data
2,47.0,33.7,50.0,Good,indoor_data
3,37.0,33.3,51.0,Good,indoor_data
4,37.0,33.3,52.0,Good,indoor_data


In [8]:
# Cell 4: stack the esp32 and indoor_data frames into one DataFrame

# ignore_index=True renumbers the rows 0..n-1 across both sources.
frames = [esp_df, uci_df]
all_df = pd.concat(frames, ignore_index=True)

print("Total rows:", len(all_df))
all_df.head()


Total rows: 1354


Unnamed: 0,mq135_raw,temperature,humidity,aqi_category,source
0,209.0,25.0,50.0,Moderate,esp32
1,211.0,25.0,50.0,Moderate,esp32
2,208.0,25.0,50.0,Moderate,esp32
3,209.0,25.0,50.0,Moderate,esp32
4,4095.0,25.0,50.0,Unhealthy,esp32


In [9]:
# Cell 5: basic cleaning + label normalisation

# Rows missing any feature or the label cannot be used for training.
all_df = all_df.dropna(subset=['mq135_raw', 'temperature', 'humidity', 'aqi_category'])

# Known spellings of each category -> canonical label.
mapping = {
    'GOOD': 'Good',
    'good': 'Good',
    'Good': 'Good',
    'MODERATE': 'Moderate',
    'Moderate': 'Moderate',
    'USG': 'Unhealthy for Sensitive',
    'Unhealthy for Sensitive': 'Unhealthy for Sensitive',
    'UNHEALTHY SENSITIVE': 'Unhealthy for Sensitive',
    'Unhealthy': 'Unhealthy',
    'UNHEALTHY': 'Unhealthy',
    'Very Unhealthy': 'Very Unhealthy',
    'VERY UNHEALTHY': 'Very Unhealthy',
    'Hazardous': 'Hazardous',
    'HAZARDOUS': 'Hazardous'
}

valid_labels = [
    'Good',
    'Moderate',
    'Unhealthy for Sensitive',
    'Unhealthy',
    'Very Unhealthy',
    'Hazardous'
]

# Match labels case- and whitespace-insensitively. An exact-match replace only
# recognises the spellings listed above, so e.g. 'moderate' or ' Good' would
# pass through unchanged and then be silently discarded by the filter below.
label_lookup = {alias.upper(): canonical for alias, canonical in mapping.items()}
label_keys = all_df['aqi_category'].astype(str).str.strip().str.upper()
all_df = all_df.assign(aqi_category=label_keys.map(label_lookup))

# Anything still unrecognised maps to NaN and is removed here.
rows_before = len(all_df)
all_df = all_df[all_df['aqi_category'].isin(valid_labels)]
rows_dropped = rows_before - len(all_df)
if rows_dropped:
    # Surface the loss instead of dropping data silently.
    print("Dropped rows with unrecognised labels:", rows_dropped)

print("Label counts:")
print(all_df['aqi_category'].value_counts())
all_df.head()


Label counts:
aqi_category
Good                       1202
Moderate                     79
Unhealthy                    33
Hazardous                    27
Unhealthy for Sensitive       9
Very Unhealthy                4
Name: count, dtype: int64


Unnamed: 0,mq135_raw,temperature,humidity,aqi_category,source
0,209.0,25.0,50.0,Moderate,esp32
1,211.0,25.0,50.0,Moderate,esp32
2,208.0,25.0,50.0,Moderate,esp32
3,209.0,25.0,50.0,Moderate,esp32
4,4095.0,25.0,50.0,Unhealthy,esp32


In [10]:
# Cell 6: build the feature matrix / label vector and split them

feature_cols = ['mq135_raw', 'temperature', 'humidity']
X, y = all_df[feature_cols].values, all_df['aqi_category'].values

# Hold out 25% for testing, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

(len(X_train), len(X_test))


(1015, 339)

In [11]:
# Cell 7 - train model on RAW values (not scaled)

# Depth-limited tree with a fixed seed so the fit is reproducible.
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Some classes are never predicted, which makes their precision undefined.
# zero_division=0 reports them as 0.00 (the same value the default falls back
# to) without emitting an UndefinedMetricWarning for each metric.
print(classification_report(y_test, y_pred, zero_division=0))

# Pin the class order explicitly and print it, so the matrix can be read
# without guessing which row/column belongs to which category.
print("Confusion matrix (rows = true, columns = predicted), class order:", list(clf.classes_))
print(confusion_matrix(y_test, y_pred, labels=clf.classes_))


                         precision    recall  f1-score   support

                   Good       0.91      1.00      0.95       301
              Hazardous       0.00      0.00      0.00         7
               Moderate       1.00      0.25      0.40        20
              Unhealthy       1.00      0.38      0.55         8
Unhealthy for Sensitive       0.00      0.00      0.00         2
         Very Unhealthy       0.00      0.00      0.00         1

               accuracy                           0.91       339
              macro avg       0.48      0.27      0.32       339
           weighted avg       0.89      0.91      0.88       339

[[301   0   0   0   0   0]
 [  7   0   0   0   0   0]
 [ 15   0   5   0   0   0]
 [  5   0   0   3   0   0]
 [  2   0   0   0   0   0]
 [  1   0   0   0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
# Cell 8: dump the fitted tree as plain-text rules

# feature_cols supplies the names shown in each split condition.
tree_rules = export_text(decision_tree=clf, feature_names=feature_cols)
print(tree_rules)


|--- mq135_raw <= 49.00
|   |--- mq135_raw <= 43.50
|   |   |--- humidity <= 66.50
|   |   |   |--- temperature <= 33.05
|   |   |   |   |--- class: Good
|   |   |   |--- temperature >  33.05
|   |   |   |   |--- class: Good
|   |   |--- humidity >  66.50
|   |   |   |--- mq135_raw <= 35.50
|   |   |   |   |--- class: Good
|   |   |   |--- mq135_raw >  35.50
|   |   |   |   |--- class: Good
|   |--- mq135_raw >  43.50
|   |   |--- humidity <= 53.50
|   |   |   |--- temperature <= 33.35
|   |   |   |   |--- class: Good
|   |   |   |--- temperature >  33.35
|   |   |   |   |--- class: Good
|   |   |--- humidity >  53.50
|   |   |   |--- temperature <= 31.00
|   |   |   |   |--- class: Good
|   |   |   |--- temperature >  31.00
|   |   |   |   |--- class: Good
|--- mq135_raw >  49.00
|   |--- mq135_raw <= 441.00
|   |   |--- humidity <= 52.00
|   |   |   |--- class: Moderate
|   |   |--- humidity >  52.00
|   |   |   |--- class: Good
|   |--- mq135_raw >  441.00
|   |   |--- class: Unheal