# Weather Type Classification — Notebook 2: Modeling & Selection 

Quy trình: chọn siêu tham số trên validation (Macro-F1), refit trên train+val, đánh giá trên test

In [1]:
import os, json, random, numpy as np, pandas as pd, joblib, datetime, pathlib
# Single seed shared by Python's `random` module and NumPy's legacy global RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
def resolve_root(start=None, max_parents=3):
    """Locate the project root by searching upwards for the raw dataset file.

    The root is the first directory, starting at ``start`` and then walking up
    through at most ``max_parents`` parent directories, that contains
    ``data/raw/weather_classification_data.csv``.

    Parameters
    ----------
    start : path-like, optional
        Directory to start from. Defaults to the current working directory.
    max_parents : int, default 3
        How many parent directories above ``start`` are inspected.

    Returns
    -------
    pathlib.Path
        The resolved root directory. If the dataset is not found, the resolved
        start directory is returned (same fallback as before) and a warning is
        emitted so the problem is visible before output folders are created.
    """
    import warnings  # function-scope import: `warnings` is not imported by the visible cells

    marker = pathlib.Path('data', 'raw', 'weather_classification_data.csv')
    here = (pathlib.Path.cwd() if start is None else pathlib.Path(start)).resolve()
    for candidate in (here, *list(here.parents)[:max_parents]):
        # is_file() instead of exists(): a directory that merely has the
        # dataset's name must not be accepted as the marker.
        if (candidate / marker).is_file():
            return candidate
    warnings.warn(
        f"'{marker}' not found in {here} or its {max_parents} nearest parents; "
        "falling back to the start directory.",
        stacklevel=2,
    )
    return here
# Every path used by the notebook is derived from the resolved project root.
BASE_DIR = resolve_root()
DATA_PATH = BASE_DIR.joinpath('data', 'raw', 'weather_classification_data.csv')
OUT_TBL = BASE_DIR.joinpath('reports', 'tables')
OUT_FIG = BASE_DIR.joinpath('Images', 'reports', 'figures')
OUT_MDL = BASE_DIR.joinpath('models')

# Create the output folders up front so later cells can write without checks.
for out_dir in (OUT_TBL, OUT_FIG, OUT_MDL):
    out_dir.mkdir(parents=True, exist_ok=True)

print('Resolved ROOT:', BASE_DIR)

Resolved ROOT: C:\Users\HAD\Desktop\Machine Learning\Weather-type-prediction-on-tabular-dataset


In [None]:
import numpy as np
from typing import List, Optional
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SKPipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

def make_preprocessor(
    numeric_features: List[str],
    categorical_features: List[str],
    log1p_features: Optional[List[str]] = None,
) -> ColumnTransformer:
    """Build the (unfitted) column-wise preprocessing step.

    Columns are routed to up to three branches; a branch is only added when it
    has at least one column:

    * ``num_log1p`` -- columns in ``log1p_features``: ``np.log1p`` followed by
      ``StandardScaler``.
    * ``num`` -- the remaining ``numeric_features``: ``StandardScaler`` only.
    * ``cat`` -- ``categorical_features``: dense one-hot encoding with
      ``handle_unknown='ignore'``.

    Parameters
    ----------
    numeric_features : list of str
        Names of the numeric columns.
    categorical_features : list of str
        Names of the categorical columns.
    log1p_features : list of str, optional
        Columns that receive the log1p transform before scaling.

    Returns
    -------
    ColumnTransformer
        The assembled transformer.
    """
    log_cols = list(log1p_features) if log1p_features else []
    plain_cols = [col for col in numeric_features if col not in log_cols]

    log_then_scale = SKPipeline(
        steps=[
            ('log1p', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
            ('scaler', StandardScaler()),
        ]
    )
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # Declared in the order the branches must appear in the transformer.
    candidates = [
        ('num_log1p', log_then_scale, log_cols),
        ('num', StandardScaler(), plain_cols),
        ('cat', one_hot, categorical_features),
    ]
    # Drop branches that have no columns assigned to them.
    active = [branch for branch in candidates if branch[2]]
    return ColumnTransformer(active)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the raw table and declare the role of each column.
df = pd.read_csv(DATA_PATH)
label_col = "Weather Type"
cat_features = ["Cloud Cover", "Season", "Location"]
# Every column that is neither categorical nor the label is treated as numeric.
num_features = [c for c in df.columns if c not in cat_features + [label_col]]
log1p_features = ["Wind Speed", "Precipitation (%)", "UV Index", "Visibility (km)"]

# Fail early with a readable message instead of deep inside a pipeline fit.
missing_cols = [c for c in [label_col, *cat_features, *log1p_features] if c not in df.columns]
assert not missing_cols, f"Columns missing from the dataset: {missing_cols}"
# log1p is only finite for values > -1; this comparison also rejects NaN.
assert (df[log1p_features] > -1).all().all(), "log1p features must be non-missing and > -1"

X = df.drop(columns=[label_col]); y = df[label_col]
# Stratified 80/10/10 split: hold out 20 %, then halve it into validation and test.
# SEED (defined in the first cell) replaces the repeated literal 42.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp)

# Fit the label encoder on the training labels only, then reuse it for val/test.
le = LabelEncoder(); y_train_enc = le.fit_transform(y_train); y_val_enc = le.transform(y_val); y_test_enc = le.transform(y_test)
print("Classes:", le.classes_)

Classes: ['Cloudy' 'Rainy' 'Snowy' 'Sunny']


In [None]:
preprocessor = make_preprocessor(num_features, cat_features, log1p_features)
# Check the encoder setting on the object itself. Matching against
# repr(preprocessor) is fragile: scikit-learn abbreviates long estimator reprs
# and the repr content depends on the `print_changed_only` display setting.
cat_encoder = next(trans for name, trans, _ in preprocessor.transformers if name == "cat")
assert cat_encoder.handle_unknown == "ignore", "OneHotEncoder must ignore unseen categories"
print(preprocessor)

ColumnTransformer(transformers=[('num_log1p',
                                 Pipeline(steps=[('log1p',
                                                  FunctionTransformer(feature_names_out='one-to-one',
                                                                      func=<ufunc 'log1p'>)),
                                                 ('scaler', StandardScaler())]),
                                 ['Wind Speed', 'Precipitation (%)', 'UV Index',
                                  'Visibility (km)']),
                                ('num', StandardScaler(),
                                 ['Temperature', 'Humidity',
                                  'Atmospheric Pressure']),
                                ('cat',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse_output=False),
                                 ['Cloud Cover', 'Season', 'Location'])])


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd, numpy as np, json, datetime

# Base (untuned) estimators; the grid values below are applied on top of these.
# `multi_class="multinomial"` is no longer passed to LogisticRegression: the
# parameter is deprecated in recent scikit-learn, and with the lbfgs solver on
# a multiclass target the multinomial loss is already what gets used.
# SEED comes from the first cell and replaces the repeated literal 42.
MODELS = {
    "logreg": LogisticRegression(penalty="l2", solver="lbfgs", max_iter=200, random_state=SEED),
    "svm": SVC(kernel="rbf", probability=True, random_state=SEED),
    "xgb": XGBClassifier(objective="multi:softprob", num_class=len(le.classes_), random_state=SEED, n_jobs=1, tree_method="hist", eval_metric="mlogloss"),
}
# Hand-picked candidate configurations per model. `config_id` is the label
# written to the result tables; `params` are set on the classifier step.
GRIDS = {
    "logreg": [{"config_id": k, "params": {"C": v}} for k,v in [("C0.3",0.3),("C1",1.0),("C3",3.0),("C10",10.0),("C30",30.0)]],
    "svm": [
        {"config_id": "C0.3_gscale", "params": {"C": 0.3, "gamma": "scale"}},
        {"config_id": "C1_gscale",   "params": {"C": 1.0, "gamma": "scale"}},
        {"config_id": "C3_gauto",    "params": {"C": 3.0, "gamma": "auto"}},
        {"config_id": "C3_gscale",   "params": {"C": 3.0, "gamma": "scale"}},
        {"config_id": "C10_gscale",  "params": {"C": 10.0, "gamma": "scale"}},
    ],
    "xgb": [
        {"config_id": "A", "params": {"n_estimators": 200, "max_depth": 3, "learning_rate": 0.1, "subsample": 1.0, "colsample_bytree": 1.0}},
        {"config_id": "B", "params": {"n_estimators": 300, "max_depth": 5, "learning_rate": 0.1, "subsample": 1.0, "colsample_bytree": 1.0}},
        {"config_id": "C", "params": {"n_estimators": 500, "max_depth": 4, "learning_rate": 0.05,"subsample": 0.8, "colsample_bytree": 1.0}},
        {"config_id": "D", "params": {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.05,"subsample": 0.8, "colsample_bytree": 0.8}},
        {"config_id": "E", "params": {"n_estimators": 400, "max_depth": 3, "learning_rate": 0.03,"subsample": 1.0, "colsample_bytree": 0.8}},
    ],
}

In [6]:
def eval_on_val(model_key, base_estimator, grid):
    """Fit one pipeline per grid entry on the training split and score it on validation.

    Parameters
    ----------
    model_key : str
        Label stored in the ``model`` column of the result.
    base_estimator : estimator
        Unfitted classifier template; it is cloned for every configuration and
        never fitted itself.
    grid : list of dict
        Entries with a ``config_id`` and a ``params`` dict; the params are set
        on the ``clf`` step of the pipeline.

    Returns
    -------
    pandas.DataFrame
        One row per configuration with columns ``model``, ``config_id``,
        ``params_json``, ``macro_f1_val`` and ``accuracy_val``.

    Notes
    -----
    Reads the notebook globals ``preprocessor``, ``X_train``, ``y_train_enc``,
    ``X_val`` and ``y_val_enc``.
    """
    from sklearn.base import clone  # function-scope import: `clone` is not imported by the visible cells

    rows = []
    for cfg in grid:
        params = dict(cfg["params"])
        # Clone both steps so every configuration gets fresh, independent
        # objects; the shared global `preprocessor` is no longer refitted (and
        # thereby mutated) as a side effect of each fit.
        pipe = Pipeline(steps=[("prep", clone(preprocessor)), ("clf", clone(base_estimator))])
        pipe.set_params(**{f"clf__{k}": v for k, v in params.items()})
        pipe.fit(X_train, y_train_enc)
        y_pred = pipe.predict(X_val)
        rows.append({
            "model": model_key, "config_id": cfg["config_id"], "params_json": json.dumps(params),
            "macro_f1_val": f1_score(y_val_enc, y_pred, average="macro"),
            "accuracy_val": accuracy_score(y_val_enc, y_pred),
        })
    return pd.DataFrame(rows)

# Evaluate every candidate configuration of every model on the validation split.
frames = [eval_on_val(name, MODELS[name], GRIDS[name]) for name in MODELS]
val_grid = pd.concat(frames, ignore_index=True)

# Persist the complete grid so the selection step can be audited later.
val_grid_path = OUT_TBL / "val_grid_results.csv"
val_grid.to_csv(val_grid_path, index=False)
print("Saved", val_grid_path)

# Show up to five configurations per model, best Macro-F1 first.
ranked = val_grid.sort_values(by=["model", "macro_f1_val"], ascending=[True, False])
display(ranked.groupby("model").head(5))







Saved C:\Users\HAD\Desktop\Machine Learning\Weather-type-prediction-on-tabular-dataset\reports\tables\val_grid_results.csv


Unnamed: 0,model,config_id,params_json,macro_f1_val,accuracy_val
3,logreg,C10,"{""C"": 10.0}",0.87003,0.869697
4,logreg,C30,"{""C"": 30.0}",0.87003,0.869697
1,logreg,C1,"{""C"": 1.0}",0.869994,0.869697
2,logreg,C3,"{""C"": 3.0}",0.868499,0.868182
0,logreg,C0.3,"{""C"": 0.3}",0.868491,0.868182
9,svm,C10_gscale,"{""C"": 10.0, ""gamma"": ""scale""}",0.912211,0.912121
8,svm,C3_gscale,"{""C"": 3.0, ""gamma"": ""scale""}",0.904807,0.904545
5,svm,C0.3_gscale,"{""C"": 0.3, ""gamma"": ""scale""}",0.903192,0.902273
7,svm,C3_gauto,"{""C"": 3.0, ""gamma"": ""auto""}",0.901962,0.901515
6,svm,C1_gscale,"{""C"": 1.0, ""gamma"": ""scale""}",0.901933,0.901515


In [None]:
from sklearn.base import clone
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline

# Best configuration per model: highest validation Macro-F1, ties broken by accuracy.
best = (val_grid.sort_values(["macro_f1_val","accuracy_val"], ascending=False)
                .groupby("model", as_index=False).head(1))
best.to_csv(OUT_TBL / "best_val_by_model.csv", index=False); display(best)

# Refit each winner on train + validation before the final test evaluation.
X_tr_all = pd.concat([X_train, X_val], axis=0)
y_tr_all = np.concatenate([y_train_enc, y_val_enc], axis=0)
fitted = {}
for _, r in best.iterrows():
    mk = r["model"]; params = json.loads(r["params_json"])
    est = clone(MODELS[mk])
    # Each pipeline gets its own copy of the preprocessor. Sharing the single
    # global instance means any later fit of that object (e.g. re-running the
    # validation cell) silently changes every pipeline stored in `fitted`.
    pipe = Pipeline(steps=[("prep", clone(preprocessor)), ("clf", est)])
    pipe.set_params(**{f"clf__{k}": v for k,v in params.items()})
    pipe.fit(X_tr_all, y_tr_all)
    fitted[mk] = pipe
    joblib.dump(pipe, OUT_MDL / f"{mk}_best.joblib")
    # Explicit encoding keeps the file independent of the platform default.
    with open(OUT_TBL / f"{mk}_best_params.json", "w", encoding="utf-8") as f:
        json.dump({"model": mk, "config_id": r["config_id"], "params": params}, f, indent=2)
print("Saved models/params.")

Unnamed: 0,model,config_id,params_json,macro_f1_val,accuracy_val
11,xgb,B,"{""n_estimators"": 300, ""max_depth"": 5, ""learnin...",0.916824,0.916667
9,svm,C10_gscale,"{""C"": 10.0, ""gamma"": ""scale""}",0.912211,0.912121
3,logreg,C10,"{""C"": 10.0}",0.87003,0.869697


Saved models/params.




In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt, seaborn as sns, numpy as np, pandas as pd

def plot_and_save_confusion(cm, classes, title, path):
    """Draw a confusion matrix as an annotated heatmap and save it to ``path``.

    ``cm`` is rendered with integer annotations, ``classes`` labels both axes,
    and the figure is closed after saving so nothing is shown inline.
    """
    fig, ax = plt.subplots(figsize=(5.2, 4.2))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_xlabel("Dự đoán")
    ax.set_ylabel("Thực tế")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(path, dpi=150)
    plt.close(fig)

def roc_pr_ovr(y_true_enc, y_proba, classes, prefix):
    """Save one-vs-rest ROC and precision-recall plots for a classifier.

    For class index ``i`` the binary target is ``y_true_enc == i`` and the
    score is column ``i`` of ``y_proba``. Two files are written to ``OUT_FIG``:
    ``{prefix}_test_roc.png`` and ``{prefix}_test_pr.png``.
    """
    # ROC curves, one line per class plus the chance diagonal.
    roc_fig, roc_ax = plt.subplots(figsize=(5.2, 4.2))
    for idx, name in enumerate(classes):
        is_class = y_true_enc == idx
        fpr, tpr, _ = roc_curve(is_class, y_proba[:, idx])
        roc_auc = auc(fpr, tpr)
        roc_ax.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel("FPR")
    roc_ax.set_ylabel("TPR")
    roc_ax.set_title(f"ROC OvR — {prefix}")
    roc_ax.legend()
    roc_fig.tight_layout()
    roc_fig.savefig(OUT_FIG / f"{prefix}_test_roc.png", dpi=150)
    plt.close(roc_fig)

    # Precision-recall curves, one line per class.
    pr_fig, pr_ax = plt.subplots(figsize=(5.2, 4.2))
    for idx, name in enumerate(classes):
        is_class = y_true_enc == idx
        scores = y_proba[:, idx]
        precision, recall, _ = precision_recall_curve(is_class, scores)
        ap = average_precision_score(is_class, scores)
        pr_ax.plot(recall, precision, label=f"{name} (AP={ap:.3f})")
    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.set_title(f"PR OvR — {prefix}")
    pr_ax.legend()
    pr_fig.tight_layout()
    pr_fig.savefig(OUT_FIG / f"{prefix}_test_pr.png", dpi=150)
    plt.close(pr_fig)

# Imported once here; previously this import ran inside the loop on every iteration.
from sklearn.metrics import f1_score, accuracy_score

# Loop-invariant label information, computed once.
class_names = list(le.classes_)
class_ids = np.arange(len(class_names))

rows = []
for mk, pipe in fitted.items():
    y_pred = pipe.predict(X_test)
    # Probabilities are only needed for the ROC / PR plots.
    proba = pipe.predict_proba(X_test) if hasattr(pipe.named_steps["clf"], "predict_proba") else None

    # `digits` is not passed: classification_report ignores it when output_dict=True.
    rep = classification_report(y_test_enc, y_pred, target_names=class_names, output_dict=True)
    pd.DataFrame(rep).T.to_csv(OUT_TBL / f"{mk}_test_report.csv")

    cm = confusion_matrix(y_test_enc, y_pred, labels=class_ids)
    plot_and_save_confusion(cm, le.classes_, f"Confusion Matrix — {mk.upper()} (test)", OUT_FIG / f"{mk}_test_confusion.png")
    if proba is not None:
        roc_pr_ovr(y_test_enc, proba, le.classes_, mk)

    rows.append({"model": mk, "macro_f1_test": f1_score(y_test_enc, y_pred, average="macro"),
                 "accuracy_test": accuracy_score(y_test_enc, y_pred)})

# One row per model, best test Macro-F1 first.
pd.DataFrame(rows).sort_values("macro_f1_test", ascending=False).to_csv(OUT_TBL / "test_summary.csv", index=False)
print("Saved reports, figures, test_summary.csv")

Saved reports, figures, test_summary.csv


In [9]:
# Closing reminders; these lines only print text and do not verify anything.
print(
    "Sanity checks:",
    " - No explicit transform on test outside Pipeline.",
    f" - Label order: {list(le.classes_)}",
    sep="\n",
)

Sanity checks:
 - No explicit transform on test outside Pipeline.
 - Label order: ['Cloudy', 'Rainy', 'Snowy', 'Sunny']
