# Weather Type Classification — Notebook 1: Tiền xử lý & EDA
**Mục tiêu.** Xây dựng *quy trình* tiền xử lý nhất quán: `log1p` + z-score cho các thuộc tính lệch phân phối, z-score cho các thuộc tính số còn lại, One-Hot cho thuộc tính phân loại (`handle_unknown='ignore'`), và tách dữ liệu thành train/validation/test theo tỉ lệ 8/1/1 (phân tầng theo nhãn).

In [1]:
import os, random, numpy as np, pandas as pd, json, pathlib, matplotlib.pyplot as plt
# Seed both the stdlib and the NumPy global generators so reruns are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
def resolve_root(start=None, marker=('data', 'raw', 'weather_classification_data.csv'), max_up=3):
    """Locate the project root by searching upwards for a marker file.

    The search checks ``start`` itself and then its nearest ancestors, closest
    first, and returns the first directory that contains ``marker``.

    Parameters
    ----------
    start : str or pathlib.Path, optional
        Directory where the search begins. Defaults to the current working
        directory.
    marker : sequence of str
        Path components, relative to a candidate root, of the file whose
        presence identifies the root.
    max_up : int
        Maximum number of ancestor directories to inspect in addition to
        ``start``. Negative values are treated as 0.

    Returns
    -------
    pathlib.Path
        The resolved directory containing the marker, or the resolved
        ``start`` directory when no inspected candidate contains it.
    """
    origin = (pathlib.Path.cwd() if start is None else pathlib.Path(start)).resolve()
    marker_path = pathlib.Path(*marker)
    # list(...) keeps the slice working on Python versions where
    # Path.parents does not support slicing directly.
    ancestors = list(origin.parents)[:max(max_up, 0)]
    for candidate in [origin, *ancestors]:
        if (candidate / marker_path).exists():
            return candidate
    # Nothing found: fall back to the starting directory, as before.
    return origin
# Anchor every path on the detected project root so the notebook does not
# depend on the directory it was launched from.
BASE_DIR = resolve_root()
DATA_PATH = BASE_DIR.joinpath('data', 'raw', 'weather_classification_data.csv')
OUT_TBL = BASE_DIR.joinpath('reports', 'tables')
OUT_FIG = BASE_DIR.joinpath('Images', 'reports', 'figures')

# Create the output folders up front; later cells write into them directly.
for out_dir in (OUT_TBL, OUT_FIG):
    out_dir.mkdir(parents=True, exist_ok=True)

print('Paths ready:', OUT_TBL, OUT_FIG, sep='\n')
print('Resolved ROOT:', BASE_DIR)

Paths ready:
C:\Users\HAD\Desktop\Machine Learning\Weather-type-prediction-on-tabular-dataset\reports\tables
C:\Users\HAD\Desktop\Machine Learning\Weather-type-prediction-on-tabular-dataset\Images\reports\figures
Resolved ROOT: C:\Users\HAD\Desktop\Machine Learning\Weather-type-prediction-on-tabular-dataset


In [None]:
import numpy as np
from typing import List, Optional
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SKPipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

def make_preprocessor(
    numeric_features: List[str],
    categorical_features: List[str],
    log1p_features: Optional[List[str]] = None,
) -> ColumnTransformer:
    """Assemble the column-wise preprocessing step for the tabular features.

    Up to three transformers are registered, in this order, and each one is
    skipped when its column list is empty:

    * ``'num_log1p'`` -- ``np.log1p`` followed by ``StandardScaler`` on the
      columns listed in ``log1p_features``.
    * ``'num'`` -- ``StandardScaler`` on the columns of ``numeric_features``
      that are not listed in ``log1p_features``.
    * ``'cat'`` -- dense ``OneHotEncoder`` with ``handle_unknown='ignore'``
      on ``categorical_features``.

    Parameters
    ----------
    numeric_features : list of str
        Names of the numeric columns.
    categorical_features : list of str
        Names of the categorical columns.
    log1p_features : list of str, optional
        Columns that get the log1p transform before scaling. ``None`` or an
        empty list disables the log branch.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer combining the branches above.
    """
    log_columns = list(log1p_features) if log1p_features else []
    plain_columns = [name for name in numeric_features if name not in log_columns]

    branches = []

    if log_columns:
        log_then_scale = SKPipeline(
            steps=[
                ('log1p', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
                ('scaler', StandardScaler()),
            ]
        )
        branches.append(('num_log1p', log_then_scale, log_columns))

    if plain_columns:
        branches.append(('num', StandardScaler(), plain_columns))

    if categorical_features:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        branches.append(('cat', one_hot, categorical_features))

    return ColumnTransformer(branches)

In [3]:
# Load the raw dataset and show its size plus the first few rows.
df = pd.read_csv(DATA_PATH)
print(df.shape)
display(df.head(3))

# Save a small schema snapshot (row/column counts and column names).
schema = {
    "n_rows": int(len(df)),
    "n_cols": int(len(df.columns)),
    "columns": df.columns.tolist(),
}
(OUT_TBL / "feature_schema.json").write_text(json.dumps(schema, indent=2), encoding="utf-8")

(13200, 11)


Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny


270

In [None]:
# Summarise missing values per column and per row, then save the table
# (and a bar chart when at least one value is missing).
n = len(df)

missing_counts = df.isna().sum().sort_values(ascending=False)
missing_summary = (
    # Name the index first so reset_index() always yields a "column" field,
    # whatever name (if any) df.columns carries.
    missing_counts.rename_axis("column")
    .to_frame("missing")
    .assign(missing_rate=lambda t: (t["missing"] / n).round(4))
    .reset_index()
)

rows_with_any_missing = int(df.isna().any(axis=1).sum())
# Guard the percentage against an empty frame (n == 0).
rows_missing_share = rows_with_any_missing / n if n else 0.0


display(missing_summary)
print(f"Số hàng có >=1 giá trị thiếu: {rows_with_any_missing}/{n} ({rows_missing_share:.2%})")


# Saving is best-effort. Only the failures named in the message below are
# tolerated (undefined output paths, matplotlib not installed, filesystem
# errors); anything else is a real bug and is allowed to surface.
try:
    import matplotlib.pyplot as plt

    (OUT_TBL).mkdir(parents=True, exist_ok=True)
    missing_summary.to_csv(OUT_TBL / "missing_summary.csv", index=False)

    if (missing_summary["missing"] > 0).any():
        ax = missing_summary.plot.bar(x="column", y="missing", figsize=(10, 4), legend=False)
        ax.set_ylabel("Số giá trị thiếu")
        ax.set_xlabel("Thuộc tính")
        ax.set_title("Thiếu dữ liệu theo thuộc tính")
        plt.tight_layout()
        (OUT_FIG).mkdir(parents=True, exist_ok=True)
        plt.savefig(OUT_FIG / "missing_bar.png", dpi=200)
        plt.show()
    else:
        print("Không phát hiện giá trị thiếu theo cột.")
except (NameError, ImportError, OSError) as e:
    print("Bỏ qua lưu CSV/hình (không có OUT_TBL/OUT_FIG hoặc thiếu matplotlib):", e)


Unnamed: 0,column,missing,missing_rate
0,Temperature,0,0.0
1,Humidity,0,0.0
2,Wind Speed,0,0.0
3,Precipitation (%),0,0.0
4,Cloud Cover,0,0.0
5,Atmospheric Pressure,0,0.0
6,UV Index,0,0.0
7,Season,0,0.0
8,Visibility (km),0,0.0
9,Location,0,0.0


Số hàng có >=1 giá trị thiếu: 0/13200 (0.00%)
Không phát hiện giá trị thiếu theo cột.


In [4]:
# Declare the target and the categorical columns; every other column is
# treated as numeric.
label_col = "Weather Type"
cat_features = ["Cloud Cover", "Season", "Location"]
num_features = [
    name for name in df.columns
    if name != label_col and name not in cat_features
]

# Numeric columns that get log1p before scaling; they must all be numeric.
log1p_features = ["Wind Speed", "Precipitation (%)", "UV Index", "Visibility (km)"]
assert set(log1p_features) <= set(num_features)

# Save the feature grouping to a JSON file in the tables folder.
meta = {
    "label_col": label_col,
    "categorical": cat_features,
    "numeric": num_features,
    "log1p": log1p_features,
}
(OUT_TBL / "feature_lists.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

print("Numeric:", num_features)
print("Categorical:", cat_features)
print("log1p:", log1p_features)

Numeric: ['Temperature', 'Humidity', 'Wind Speed', 'Precipitation (%)', 'Atmospheric Pressure', 'UV Index', 'Visibility (km)']
Categorical: ['Cloud Cover', 'Season', 'Location']
log1p: ['Wind Speed', 'Precipitation (%)', 'UV Index', 'Visibility (km)']


In [5]:
# Split 8/1/1: hold out 20% first, then halve that hold-out into validation
# and test. Both steps stratify on the label and use the notebook-wide SEED,
# so changing SEED in one place changes the split too.
from sklearn.model_selection import train_test_split

X = df.drop(columns=[label_col])
y = df[label_col]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp
)

# The three parts must account for every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "split sizes do not add up to the dataset size"

split_info = {"train": len(X_train), "val": len(X_val), "test": len(X_test)}
(OUT_TBL / "split_counts.json").write_text(json.dumps(split_info, indent=2), encoding="utf-8")
print("Split:", split_info)

Split: {'train': 10560, 'val': 1320, 'test': 1320}


In [None]:

# Save one histogram per log1p candidate column. Each plot gets its own
# figure that is closed after saving, so no empty figure is left open for
# the notebook to render at the end of the cell.
plt.rcParams["figure.figsize"] = (6,4)
for col in ["Precipitation (%)", "UV Index", "Visibility (km)", "Wind Speed"]:
    fig, ax = plt.subplots()
    df[col].hist(bins=40, ax=ax)
    ax.set_title(f"Histogram {col}")
    # Strip spaces, parentheses and '%' so the column name is a safe file name.
    fp = OUT_FIG / f"eda_hist_{col.replace(' ','_').replace('(','').replace(')','').replace('%','pct')}.png"
    fig.tight_layout()
    fig.savefig(fp, dpi=150)
    plt.close(fig)
print("Saved EDA histograms ->", OUT_FIG)

Saved EDA histograms -> C:\Users\HAD\Desktop\Machine Learning\Weather-type-prediction-on-tabular-dataset\Images\reports\figures


<Figure size 600x400 with 0 Axes>