# Experiment: DBSCAN

Task: **clustering**

In [5]:
from __future__ import annotations

import sys
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

SEED = 42
np.random.seed(SEED)

def find_repo_root(start: Path) -> Path:
    """Find the repo root by walking upward until key markers are found."""
    start = start.resolve()
    for candidate in (start, *start.parents):
        if (candidate / 'requirements.txt').exists() and (candidate / 'src').is_dir():
            return candidate
    raise FileNotFoundError(
        f'Could not locate repo root from {start}. Expected to find requirements.txt and src/.'
    )

REPO_ROOT = find_repo_root(Path.cwd())
sys.path.insert(0, str(REPO_ROOT))

PREPROCESSED_ROOT = REPO_ROOT / 'data' / '02-preprocessed'
BASELINE_CONFIG_JSON = REPO_ROOT / 'config' / 'baseline_feature_config.json'

if not PREPROCESSED_ROOT.exists():
    raise FileNotFoundError(f'Preprocessed root not found: {PREPROCESSED_ROOT}')

prepared_dirs = sorted([p for p in PREPROCESSED_ROOT.iterdir() if p.is_dir()], key=lambda p: p.name)
if not prepared_dirs:
    raise FileNotFoundError(f'No prepared datasets found under: {PREPROCESSED_ROOT}')
DATASET_DIR = prepared_dirs[-1]

cleaned_parquet = DATASET_DIR / 'cleaned.parquet'
cleaned_csv = DATASET_DIR / 'cleaned.csv'
split_csv = DATASET_DIR / 'split.csv'

print(f'Using prepared dataset: {DATASET_DIR}')
print(f'Using baseline config:  {BASELINE_CONFIG_JSON}')

if cleaned_parquet.exists():
    df = pd.read_parquet(cleaned_parquet)
elif cleaned_csv.exists():
    df = pd.read_csv(cleaned_csv)
else:
    raise FileNotFoundError('Expected cleaned.parquet or cleaned.csv')

if not split_csv.exists():
    raise FileNotFoundError(f'Expected split.csv at: {split_csv}')
splits = pd.read_csv(split_csv)

from src.pipelines.features import apply_baseline_feature_config, load_baseline_feature_config
cfg = load_baseline_feature_config(BASELINE_CONFIG_JSON)

required_cols = {cfg.row_id_col, cfg.target_col}
missing_required = required_cols - set(df.columns)
if missing_required:
    raise KeyError(f'Missing required columns in cleaned data: {sorted(missing_required)}')

X_full = apply_baseline_feature_config(df, cfg)
y_full = df[cfg.target_col].astype(str)

df_split = df[[cfg.row_id_col]].merge(splits[[cfg.row_id_col, 'split']], on=cfg.row_id_col, how='left')
if df_split['split'].isna().any():
    raise ValueError('Some rows are missing split assignments (split.csv join failed)')

mask_train = df_split['split'].eq('train')
mask_val = df_split['split'].eq('val')
mask_test = df_split['split'].eq('test')

X_train, y_train = X_full.loc[mask_train].reset_index(drop=True), y_full.loc[mask_train].reset_index(drop=True)
X_val, y_val = X_full.loc[mask_val].reset_index(drop=True), y_full.loc[mask_val].reset_index(drop=True)
X_test, y_test = X_full.loc[mask_test].reset_index(drop=True), y_full.loc[mask_test].reset_index(drop=True)

print('Split sizes:', X_train.shape, X_val.shape, X_test.shape)


Using prepared dataset: C:\repos\ml-cybersecurity_attacks\data\02-preprocessed\cybersecurity_attacks_v1_2025-12-29
Using baseline config:  C:\repos\ml-cybersecurity_attacks\config\baseline_feature_config.json
Split sizes: (28000, 25) (6000, 25) (6000, 25)


In [8]:
# Build preprocessing: impute + one-hot for categoricals; impute (+ optional scale) for numeric

from pandas.api.types import is_bool_dtype, is_numeric_dtype

cat_cols = [
    c for c in X_train.columns
    if (not is_numeric_dtype(X_train[c])) or is_bool_dtype(X_train[c])
]
num_cols = [c for c in X_train.columns if c not in cat_cols]

cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

preprocess = ColumnTransformer(
    transformers=[('cat', cat_pipe, cat_cols), ('num', num_pipe, num_cols)],
    remainder='drop',
)

print(f'Categorical cols: {len(cat_cols)}')
print(f'Numeric cols:     {len(num_cols)}')


Categorical cols: 11
Numeric cols:     14


In [9]:
# Model definition + clustering (+ "predict-like" assignment for val/test)

from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

model = DBSCAN(eps=0.5, min_samples=10)

# Fit on train only (no leakage).
X_train_t = preprocess.fit_transform(X_train)
train_labels = model.fit_predict(X_train_t)

def assign_dbscan_labels(dbscan_model: DBSCAN, X_new: np.ndarray) -> np.ndarray:
    """Assign DBSCAN cluster labels to new points.

    scikit-learn's DBSCAN does not implement predict(). This assigns each new point
    the label of its nearest *core sample* within eps; otherwise it is labeled -1.
    """
    if not hasattr(dbscan_model, 'components_') or dbscan_model.components_.shape[0] == 0:
        return np.full(X_new.shape[0], -1, dtype=int)

    core_X = dbscan_model.components_
    core_labels = dbscan_model.labels_[dbscan_model.core_sample_indices_]

    nn = NearestNeighbors(radius=dbscan_model.eps)
    nn.fit(core_X)

    neigh_dist, neigh_ind = nn.radius_neighbors(
        X_new, return_distance=True, sort_results=True
    )

    out = np.full(X_new.shape[0], -1, dtype=int)
    for i, inds in enumerate(neigh_ind):
        if len(inds) == 0:
            continue
        out[i] = int(core_labels[inds[0]])
    return out

X_val_t = preprocess.transform(X_val)
X_test_t = preprocess.transform(X_test)

val_labels = assign_dbscan_labels(model, X_val_t)
test_labels = assign_dbscan_labels(model, X_test_t)

def summarize_labels(name: str, labels: np.ndarray) -> None:
    n_noise = int(np.sum(labels == -1))
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(f'{name}: clusters={n_clusters} | noise={n_noise} / {len(labels)}')

summarize_labels('train', train_labels)
summarize_labels('val  ', val_labels)
summarize_labels('test ', test_labels)


train: clusters=0 | noise=28000 / 28000
val  : clusters=0 | noise=6000 / 6000
test : clusters=0 | noise=6000 / 6000


In [10]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

def eval_clusters(name: str, y_true, labels) -> None:
    print(f'\n== {name} ==')
    print('ARI:', adjusted_rand_score(y_true, labels))
    print('NMI:', normalized_mutual_info_score(y_true, labels))

eval_clusters('train', y_train, train_labels)
eval_clusters('val', y_val, val_labels)
eval_clusters('test', y_test, test_labels)



== train ==
ARI: 0.0
NMI: 0.0

== val ==
ARI: 0.0
NMI: 0.0

== test ==
ARI: 0.0
NMI: 0.0
