In [7]:
import os
import json
import numpy as np
import pandas as pd
import sklearn as sk

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib

In [8]:
print('sk', sk.__version__)
print('joblib', joblib.__version__) 
print('numpy', np.__version__)


sk 1.7.2
joblib 1.5.3
numpy 1.23.5


In [5]:
DATA_PATH = "azureml://subscriptions/ff00accc-1e70-418e-9fd2-5a2bf77f0654/resourcegroups/ruap_projekt/workspaces/ruap-projekt-airquality/datastores/workspaceblobstore/paths/UI/2026-02-03_171052_UTC/AirQualityUCI.csv"
df = pd.read_csv(
    DATA_PATH,
    sep=";",
    decimal=".",
    engine="python"
)

Class DeploymentTemplateOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
  mlflow.mismatch._check_version_mismatch()
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


In [9]:
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
df.columns = [c.strip() for c in df.columns]
print("Shape:", df.shape)
df.head()

Shape: (9471, 15)


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,7578
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,7255
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,7502
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,7867
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,7888


In [10]:
df = df.replace(-200, np.nan)
df = df.drop(columns=[c for c in df.columns if c.strip() == ""], errors="ignore")

In [11]:
TARGET_COL = "NO2(GT)"
drop_cols = {"Date", "Time"}
candidate_features = [c for c in df.columns if c not in drop_cols and c != TARGET_COL]

In [12]:
for c in candidate_features + [TARGET_COL]:
    df[c] = pd.to_numeric(df[c], errors="coerce")


In [14]:
df_model = df.dropna(subset=[TARGET_COL]).copy()
q1, q2 = df_model[TARGET_COL].quantile([0.33, 0.66]).values

def no2_to_class(x):
    if x <= q1:
        return "Good"
    elif x <= q2:
        return "Moderate"
    return "Bad"

df_model["air_quality_class"] = df_model[TARGET_COL].apply(no2_to_class)
FEATURES = candidate_features

X = df_model[FEATURES].copy()
y = df_model["air_quality_class"].copy()

print("Features:", len(FEATURES))
print("Class distribution:\n", y.value_counts(normalize=True).round(3))
df_model[[TARGET_COL, "air_quality_class"]].head()



Features: 12
Class distribution:
 Bad         0.339
Good        0.335
Moderate    0.326
Name: air_quality_class, dtype: float64


Unnamed: 0,NO2(GT),air_quality_class
0,113.0,Moderate
1,92.0,Moderate
2,114.0,Moderate
3,122.0,Moderate
4,116.0,Moderate


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [16]:
numeric_features = FEATURES

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ],
    remainder="drop"
)

In [18]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, n_jobs=None),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=15)
}

results = []

for name, clf in models.items():
    pipe = Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", clf)
    ])

    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, pred)
    f1m = f1_score(y_test, pred, average="macro")

    results.append({"model": name, "accuracy": acc, "macro_f1": f1m})

results_df = pd.DataFrame(results).sort_values(["macro_f1", "accuracy"], ascending=False)
display(results_df)



Unnamed: 0,model,accuracy,macro_f1
2,RandomForest,0.818535,0.817611
4,KNN,0.809462,0.808518
3,GradientBoosting,0.797797,0.796665
1,DecisionTree,0.766688,0.765382
0,LogisticRegression,0.740117,0.738733


In [22]:
best_name = results_df.iloc[0]["model"]
best_clf = models[best_name]

best_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", best_clf)
])

best_pipe.fit(X_train, y_train)
best_pred = best_pipe.predict(X_test)

display("Best model:", best_name)
display("Accuracy:", accuracy_score(y_test, best_pred))
display("Macro F1:", f1_score(y_test, best_pred, average="macro"))
display("\nClassification report:\n", classification_report(y_test, best_pred))

cm = confusion_matrix(y_test, best_pred, labels=["Good", "Moderate", "Bad"])
display("Confusion matrix (rows=true, cols=pred) [Good, Moderate, Bad]:\n", cm)



'Best model:'
'RandomForest'
'Accuracy:'
0.8185353208036293
'Macro F1:'
0.8176113067597175
'\nClassification report:\n'
'              precision    recall  f1-score   support\n\n         Bad       0.82      0.86      0.84       523\n        Good       0.90      0.87      0.89       517\n    Moderate       0.73      0.72      0.73       503\n\n    accuracy                           0.82      1543\n   macro avg       0.82      0.82      0.82      1543\nweighted avg       0.82      0.82      0.82      1543\n'
'Confusion matrix (rows=true, cols=pred) [Good, Moderate, Bad]:\n'
array([[450,  59,   8],
       [ 49, 363,  91],
       [  0,  73, 450]])

In [23]:
os.makedirs("artifacts", exist_ok=True)

MODEL_PATH = "artifacts/model.pkl"
FEATURES_PATH = "artifacts/features.json"
META_PATH = "artifacts/metadata.json"

joblib.dump(best_pipe, MODEL_PATH)

with open(FEATURES_PATH, "w", encoding="utf-8") as f:
    json.dump(FEATURES, f, ensure_ascii=False, indent=2)

with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(
        {
            "target_definition": f"NO2(GT) terciles: Good <= {q1:.3f}, Moderate <= {q2:.3f}, Bad > {q2:.3f}",
            "target_column": TARGET_COL,
            "classes": ["Good", "Moderate", "Bad"],
            "best_model": best_name
        },
        f,
        ensure_ascii=False,
        indent=2
    )

print("Saved:", MODEL_PATH, FEATURES_PATH, META_PATH)

Saved: artifacts/model.pkl artifacts/features.json artifacts/metadata.json
