# Experiment template (baseline feature config)

This notebook is a minimal template for model experiments.

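It loads:
- `cleaned.parquet` (falling back to `cleaned.csv`) from the last dataset folder, sorted by name, under `data/02-preprocessed/`
- `split.csv` (the train/val/test assignment for each row) from the same folder
- `config/baseline_feature_config.json`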

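It then applies the baseline drops/transforms, splits the rows into train/val/test, and builds (but does not fit) a one-hot + numeric preprocessing pipeline. Plug a model into the placeholder at the end of the last cell to fit it.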

In [3]:
from __future__ import annotations

import sys
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# --- Reproducibility ---
# Seeds NumPy's legacy global RNG so np.random-based code in this kernel is repeatable.
SEED = 42
np.random.seed(SEED)

# --- Paths ---
# Find the repo root without assuming the kernel was started inside notebooks/.
# The parent of the working directory is tried first (the notebooks/ -> repo root
# layout this template was written for), then the working directory itself, then
# the remaining ancestors. The first candidate containing `src` wins; if none
# does, fall back to the parent directory as before.
_cwd = Path.cwd().resolve()
_root_candidates = [_cwd.parent, _cwd, *list(_cwd.parents)[1:]]
REPO_ROOT = next((p for p in _root_candidates if (p / "src").exists()), _cwd.parent)

# Make `src.*` importable. The membership test keeps re-running this cell from
# stacking duplicate entries onto sys.path.
if (REPO_ROOT / "src").exists() and str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

PREPROCESSED_ROOT = REPO_ROOT / "data" / "02-preprocessed"
BASELINE_CONFIG_JSON = REPO_ROOT / "config" / "baseline_feature_config.json"

# Locate latest prepared dataset folder.
# "Latest" means last when sorted by folder name -- this assumes the folder names
# sort chronologically (e.g. a trailing ISO date); verify if the naming changes.
if not PREPROCESSED_ROOT.is_dir():
    raise FileNotFoundError(
        f"Prepared-data directory not found: {PREPROCESSED_ROOT} "
        f"(working directory: {_cwd})"
    )
prepared_dirs = sorted(
    [p for p in PREPROCESSED_ROOT.iterdir() if p.is_dir()],
    key=lambda p: p.name,
)
if not prepared_dirs:
    raise FileNotFoundError(f"No prepared datasets found under: {PREPROCESSED_ROOT}")

DATASET_DIR = prepared_dirs[-1]

# Expected files inside the chosen dataset folder (existence is checked at load time).
cleaned_parquet = DATASET_DIR / "cleaned.parquet"
cleaned_csv = DATASET_DIR / "cleaned.csv"
split_csv = DATASET_DIR / "split.csv"

print(f"Using prepared dataset: {DATASET_DIR}")
print(f"Using baseline config:  {BASELINE_CONFIG_JSON}")

# Load cleaned dataset: try the parquet file first, then the CSV fallback.
# The for/else raises only when neither candidate file exists.
for _reader, _path in ((pd.read_parquet, cleaned_parquet), (pd.read_csv, cleaned_csv)):
    if _path.exists():
        df = _reader(_path)
        break
else:
    raise FileNotFoundError("Expected cleaned.parquet or cleaned.csv")

# Split assignments live next to the cleaned data.
splits = pd.read_csv(split_csv)

# Load baseline feature config and apply it.
# NOTE: this import sits below the path setup on purpose -- it relies on the
# repo root having been added to sys.path earlier in this cell.
from src.pipelines.features import apply_baseline_feature_config, load_baseline_feature_config

cfg = load_baseline_feature_config(BASELINE_CONFIG_JSON)

# Sanity checks: the id column (used to join split.csv) and the target column
# must both be present in the cleaned data.
required_cols = {cfg.row_id_col, cfg.target_col}
missing_required = required_cols - set(df.columns)
if missing_required:
    raise KeyError(f"Missing required columns in cleaned data: {sorted(missing_required)}")

# The astype(str) cast below would turn a missing label into the literal string
# "nan", which would then be treated as a real class and could no longer be
# spotted by value_counts(dropna=False). Fail loudly instead.
n_missing_target = int(df[cfg.target_col].isna().sum())
if n_missing_target:
    raise ValueError(
        f"Target column {cfg.target_col!r} has {n_missing_target} missing value(s); "
        "drop or label these rows before building the splits"
    )

X_full = apply_baseline_feature_config(df, cfg)
y_full = df[cfg.target_col].astype(str)

print("Shapes:")
print(f"  df:     {df.shape}")
print(f"  X_full: {X_full.shape}")
print(f"  y_full: {y_full.shape}")

# Join split info.
# validate="many_to_one" makes pandas raise a MergeError (a ValueError subclass)
# if a row id appears more than once in split.csv. Without it, a duplicated id
# would silently fan out rows and df_split would no longer line up with df.
df_split = df[[cfg.row_id_col]].merge(
    splits[[cfg.row_id_col, "split"]],
    on=cfg.row_id_col,
    how="left",
    validate="many_to_one",
)
if df_split["split"].isna().any():
    raise ValueError("Some rows are missing split assignments (split.csv join failed)")
if len(X_full) != len(df_split):
    raise ValueError(
        f"Feature matrix has {len(X_full)} rows but the split table has {len(df_split)}; "
        "they must match row-for-row"
    )

mask_train = df_split["split"].eq("train")
mask_val = df_split["split"].eq("val")
mask_test = df_split["split"].eq("test")


def _select_split(mask: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
    """Return the (features, labels) rows flagged by ``mask``, re-indexed from 0.

    The merge result always carries a fresh RangeIndex, while ``X_full`` and
    ``y_full`` may carry the index of the cleaned frame. Converting the mask to
    a plain boolean array makes the selection positional, so it does not depend
    on the two indexes happening to match.
    """
    rows = mask.to_numpy()
    return (
        X_full.loc[rows].reset_index(drop=True),
        y_full.loc[rows].reset_index(drop=True),
    )


X_train, y_train = _select_split(mask_train)
X_val, y_val = _select_split(mask_val)
X_test, y_test = _select_split(mask_test)

print("Split shapes:")
print(f"  train: {X_train.shape}, labels: {y_train.shape}")
print(f"  val:   {X_val.shape}, labels: {y_val.shape}")
print(f"  test:  {X_test.shape}, labels: {y_test.shape}")

# Per-split class counts, to eyeball whether the splits are balanced.
print("\nLabel distribution by split:")
labels_by_split = {"train": y_train, "val": y_val, "test": y_test}
for name, y in labels_by_split.items():
    vc = y.value_counts(dropna=False)
    # Split name on its own line, then the counts table.
    print(f"\n{name}:", vc, sep="\n")


Using prepared dataset: C:\repos\ml-cybersecurity_attacks\data\02-preprocessed\cybersecurity_attacks_v1_2025-12-29
Using baseline config:  C:\repos\ml-cybersecurity_attacks\config\baseline_feature_config.json
Shapes:
  df:     (40000, 26)
  X_full: (40000, 25)
  y_full: (40000,)
Split shapes:
  train: (28000, 25), labels: (28000,)
  val:   (6000, 25), labels: (6000,)
  test:  (6000, 25), labels: (6000,)

Label distribution by split:

train:
Attack Type
DDoS         9400
Malware      9315
Intrusion    9285
Name: count, dtype: int64

val:
Attack Type
DDoS         2014
Malware      1996
Intrusion    1990
Name: count, dtype: int64

test:
Attack Type
DDoS         2014
Malware      1996
Intrusion    1990
Name: count, dtype: int64


In [2]:
# --- Build preprocessing for one-hot baseline ---


def _is_categorical(dtype) -> bool:
    """Return True for dtypes that should be one-hot encoded rather than treated as numeric.

    Covers object columns, every pandas string dtype, and ``category`` columns.
    Matching on the dtype's printed name misses ``category`` and string dtypes
    whose name does not start with "string"; such columns would end up in the
    numeric branch, where median imputation cannot handle text.
    """
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


cat_cols = [c for c in X_train.columns if _is_categorical(X_train[c].dtype)]
_cat_col_set = set(cat_cols)
# Everything that is not categorical goes through the numeric branch.
num_cols = [c for c in X_train.columns if c not in _cat_col_set]

preprocess = ColumnTransformer(
    transformers=[
        (
            # Categorical branch: fill gaps with the most frequent value, then one-hot encode.
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    (
                        "onehot",
                        # handle_unknown="ignore": categories not seen during fit
                        # encode as all zeros instead of raising at transform time.
                        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                    ),
                ]
            ),
            cat_cols,
        ),
        (
            # Numeric branch: median imputation only (no scaling in this baseline).
            "num",
            Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))]),
            num_cols,
        ),
    ],
    # Both lists together cover every column of X_train, so nothing is left to drop.
    remainder="drop",
)

print(f"Categorical cols: {len(cat_cols)}")
print(f"Numeric cols:     {len(num_cols)}")

# Placeholder: plug in a model below.
# Example:
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=None)
# clf = Pipeline(steps=[('preprocess', preprocess), ('model', model)])
# clf.fit(X_train, y_train)
# print('val accuracy:', clf.score(X_val, y_val))


Categorical cols: 11
Numeric cols:     14
