## Random Forest 

**Cell 1: Imports, Paths, Labels, Part lists**

In [3]:
from pathlib import Path
import json
import numpy as np
import scipy.sparse as sp
from glob import glob
import re

# Paths (same layout as in the previous notebooks)
BASE = Path.home() / "Desktop" / "Malware Project"
VEC  = BASE / "data" / "behavior_vectors_paper"

# Model and result folders for the Random Forest experiments.
# NOTE(review): MODELS_DIR is reassigned in Cell 2 to "models_rf_perlabel_capped";
# the "models_rf_capped" folder created here does not appear to be used by the
# cells visible in this notebook.
MODELS_DIR  = VEC / "models_rf_capped"
RESULTS_DIR = VEC / "results_rf_capped_eval"
MODELS_DIR.mkdir(exist_ok=True, parents=True)
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Load the label names. The balanced label map is tried first, then the plain
# one; the first existing file that is a dict with a "labels" key is used.
label_map_paths = [
    VEC / "label_map_balanced.json",
    VEC / "label_map.json",
]
label_names = None
for p in label_map_paths:
    if p.exists():
        with open(p, "r") as f:
            data = json.load(f)
        if isinstance(data, dict) and "labels" in data:
            label_names = data["labels"]
            break

# Raised both when no file exists and when no file has the expected structure.
if label_names is None:
    raise FileNotFoundError("Δεν βρέθηκε ούτε label_map_balanced.json ούτε label_map.json στο VEC.")

n_labels = len(label_names)
print(f"Loaded labels: {n_labels}")
print(", ".join(label_names[:10]) + (" ..." if n_labels > 10 else ""))

# Train parts: the capped feature matrices (X) and the capped label arrays (Y).
# Both lists are sorted by path independently; later cells pair them with zip(),
# which presumably relies on both file families sharing the same name suffixes.
XTRN = sorted(map(Path, glob(str(VEC / "train_part_capped*.npz"))))
YTRN_CAP = sorted(map(Path, glob(str(VEC / "y_train_part_capped*.npy"))))

# Load TEST parts properly: match capped X with balanced Y using part indexes

def _index_from_name(p: Path) -> int:
    # Πάρε τα ψηφία στο τέλος του stem, π.χ. "test_part_capped023" -> 23
    m = re.search(r'(\d+)$', p.stem)
    if not m:
        raise ValueError(f"No trailing index found in filename: {p.name}")
    return int(m.group(1))

# Discover the capped TEST feature parts (X) and the TEST label parts (Y).
_xte_found = sorted(map(Path, glob(str(VEC / "test_part_capped*.npz"))))
all_YTE = list(map(Path, glob(str(VEC / "y_test_part_capped*.npy"))))

# Index the Y files by the trailing part number in their file name.
map_Y = {_index_from_name(p): p for p in all_YTE}

# Keep ONLY the X parts that have a Y file with the same part number, and build
# XTE / YTE together so that position k of both lists always refers to the same
# part. (Filtering only the Y list would shift every pair after the first
# missing Y file when the two lists are zipped later.)
_xte_idx = [_index_from_name(p) for p in _xte_found]
_unmatched = [p.name for p, i in zip(_xte_found, _xte_idx) if i not in map_Y]
if _unmatched:
    print(f"WARNING: {len(_unmatched)} test X part(s) have no matching Y file and are dropped: {_unmatched[:5]}")

XTE = [p for p, i in zip(_xte_found, _xte_idx) if i in map_Y]
idx_te = [i for i in _xte_idx if i in map_Y]
YTE = [map_Y[i] for i in idx_te]

print("Train parts (X):", len(XTRN), "| Train parts (Y_cap):", len(YTRN_CAP))
print("Test parts (X):", len(XTE), "| Test parts (Y):", len(YTE))


# Προαιρετικός γρήγορος έλεγχος ίδιου πλήθους rows στο part 0
def _rows_X(path): return sp.load_npz(path).shape[0]
def _rows_Y(path): return np.load(path, allow_pickle=False).shape[0]




# Sanity checks on the discovered parts, run before any heavy work starts.
# Empty lists would otherwise pass the length assert below (0 == 0) and then
# fail on the [0] lookups with an uninformative IndexError.
if not XTRN or not XTE:
    raise FileNotFoundError(
        f"No train/test parts found under {VEC} (train X: {len(XTRN)}, test X: {len(XTE)})"
    )
assert len(XTRN) == len(YTRN_CAP), "Ασυμφωνία πλήθους train X με y_train_part_capped*.npy"
assert len(XTE) == len(YTE), "Mismatch between the number of test X parts and test Y parts"

# Quick look at the first part of each split: X and Y must have the same number of rows.
print(f"[train part 0] rows X={_rows_X(XTRN[0])}, Y={_rows_Y(YTRN_CAP[0])}")
print(f"[test  part 0] rows X={_rows_X(XTE[0])},  Y={_rows_Y(YTE[0])}")


Loaded labels: 64
adware, antiav, antifw, autorun, backdoor, banker, bho, binder, blocker, bundler ...
Train parts (X): 551 | Train parts (Y_cap): 551
Test parts (X): 45 | Test parts (Y): 45
[train part 0] rows X=2851, Y=2851
[test  part 0] rows X=12,  Y=12


**Cell 2: Config & Helpers**

In [4]:
import re
import json
import gc
from math import ceil
import numpy as np
import scipy.sparse as sp
from pathlib import Path

# Sampling constants (same constants as before)
NEG_POS_RATIO = 3     # how many negatives are sampled per positive (per label)
VAL_FRACTION = 0.10   # 10% of each per-label dataset is held out for validation

# Where the RF models and thresholds are saved.
# This rebinds the MODELS_DIR that Cell 1 pointed at "models_rf_capped".
MODELS_DIR = VEC / "models_rf_perlabel_capped"
MODELS_DIR.mkdir(exist_ok=True, parents=True)

THRESH_PATH = VEC / "rf_per_label_thresholds.json"  # written at the end of the training loop

def model_path_for(label_idx: int) -> Path:
    """Path of the joblib file holding the RF model of `label_idx` inside MODELS_DIR."""
    file_name = "rf_label_{:03d}.joblib".format(label_idx)
    return MODELS_DIR.joinpath(file_name)

# Κατανομή POS ανά part (όπως στο LGBM)

import numpy as np

def plan_per_part_for_label(label_idx: int, cap_pos: "int | None"):
    """
    Decide how many positive rows to keep from each training part for one label.

    The positives of column `label_idx` are counted in every label file listed
    in YTRN_CAP. The total number kept is min(total positives, cap_pos) and is
    split over the parts proportionally to their positive counts; the units lost
    to flooring are handed out, one each, to the parts with the largest remaining
    proportional share.

    Parameters
    ----------
    label_idx : column index of the label inside the Y arrays.
    cap_pos   : maximum number of positives to keep overall; None means "no cap"
                (every positive is kept).

    Returns
    -------
    np.ndarray (int64) with one entry per element of YTRN_CAP: the number of
    positives to keep from that part. All zeros when the label has no positives
    or when the cap is not positive.
    """
    # Every Y part is read once per call just to count the positives of this label.
    pos_counts = np.array(
        [int(np.load(yp, allow_pickle=False)[:, label_idx].sum()) for yp in YTRN_CAP],
        dtype=np.int64,
    )

    total_pos = int(pos_counts.sum())
    if total_pos == 0:
        return np.zeros(len(YTRN_CAP), dtype=np.int64)

    # cap_pos=None would make min() raise a TypeError, so treat it as "no cap".
    keep_total = total_pos if cap_pos is None else min(total_pos, cap_pos)
    if keep_total <= 0:
        return np.zeros(len(YTRN_CAP), dtype=np.int64)

    frac = pos_counts / total_pos
    plan = np.floor(frac * keep_total).astype(np.int64)
    short = int(keep_total - plan.sum())

    if short > 0:
        # Give the remainder to the parts whose floored share is furthest below
        # their proportional share.
        order = np.argsort(-(frac - plan / keep_total))
        plan[order[:short]] += 1

    return plan  # positives to keep, per part


**Cell 3: Build dataset per label**

In [5]:
from numpy.random import default_rng

def build_dataset_for_label(label_idx: int, cap_pos: int, rng_seed: int = 12345):
    """
    Assemble the per-label training and validation sets from the training parts.

    For every part, plan_per_part_for_label() says how many positives to keep.
    From each part with a positive plan the function takes those positives plus
    up to NEG_POS_RATIO negatives per kept positive, stacks everything, shuffles
    it and splits off the first VAL_FRACTION of the rows as validation.

    The random generator is local and seeded with rng_seed + label_idx, so
    calling the function again with the same arguments (training, calibration,
    beta tuning) reproduces the same sample and the same split.

    Parameters
    ----------
    label_idx : column index of the label inside the Y arrays.
    cap_pos   : cap on the number of positives, forwarded to plan_per_part_for_label().
    rng_seed  : base seed of the per-label random generator.

    Returns
    -------
    (X_tr, y_tr, X_val, y_val) with X as sparse CSR and y as int8 arrays.
    (None, None, None, None) when no part contributes any row.
    (X_all, y_all, None, None) when there are too few rows to hold out a
    validation set while keeping at least one training row.
    """
    rng = default_rng(rng_seed + int(label_idx))

    # How many positives to keep from each part.
    plan = plan_per_part_for_label(label_idx, cap_pos)

    X_list = []
    y_list = []

    for xp, yp, k_pos in zip(XTRN, YTRN_CAP, plan):
        # Nothing planned for this part: skip it before reading anything from disk.
        # No random numbers are drawn for skipped parts, so the random stream is
        # the same as if the part had been loaded and then skipped.
        if k_pos <= 0:
            continue

        # Only the label column is needed; X is loaded once we know rows are taken.
        y_col = np.load(yp, allow_pickle=False)[:, label_idx].astype(np.int8)
        pos_idx = np.flatnonzero(y_col == 1)
        if len(pos_idx) == 0:
            continue
        neg_idx = np.flatnonzero(y_col == 0)

        # Positives: a random subset when the part has more than planned.
        if len(pos_idx) > k_pos:
            pos_pick = rng.choice(pos_idx, size=k_pos, replace=False)
        else:
            pos_pick = pos_idx

        # Negatives: NEG_POS_RATIO per kept positive, limited by what the part has.
        k_neg = min(len(neg_idx), NEG_POS_RATIO * len(pos_pick))
        if k_neg > 0:
            neg_pick = rng.choice(neg_idx, size=k_neg, replace=False)
        else:
            neg_pick = np.array([], dtype=np.int64)

        pick = np.concatenate([pos_pick, neg_pick])

        X = sp.load_npz(xp)
        X_list.append(X[pick])
        y_list.append(y_col[pick])

        # Release the full part before loading the next one.
        del X
        gc.collect()

    if not X_list:
        return None, None, None, None

    X_all = sp.vstack(X_list).tocsr()
    y_all = np.concatenate(y_list).astype(np.int8)

    # Deterministic shuffle.
    order = rng.permutation(X_all.shape[0])
    X_all = X_all[order]
    y_all = y_all[order]

    # Train/validation split. At least one row always stays in the training set;
    # without the cap a one-row dataset would put its only row in validation and
    # return an empty training set.
    n_rows = X_all.shape[0]
    n_val = min(int(ceil(VAL_FRACTION * n_rows)), n_rows - 1)
    if n_val <= 0:
        return X_all, y_all, None, None

    X_val = X_all[:n_val]
    y_val = y_all[:n_val]
    X_tr = X_all[n_val:]
    y_tr = y_all[n_val:]

    return X_tr, y_tr, X_val, y_val


**Cell 4: Training with Random Forest + threshold tuning**

In [6]:
# ==== Cell 4: Train one label with RandomForest + threshold tuning on validation ====
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
import joblib
import numpy as np

def train_one_label_rf(label_idx: int, cap_pos: int = 7500):
    """
    Train a RandomForest for one label and tune its decision threshold.

    The data come from build_dataset_for_label() (capped positives plus sampled
    negatives). The fitted model is saved with joblib under
    model_path_for(label_idx) right after fitting. The threshold is then chosen
    on the validation split from a fixed grid (0.05 ... 0.95) as the value with
    the highest F1; on ties the lowest such threshold is kept.

    Parameters
    ----------
    label_idx : index of the label to train.
    cap_pos   : cap on the number of positives, forwarded to build_dataset_for_label().

    Returns
    -------
    float : the selected threshold (0.5 when there is no validation split), or
    None  : when no training data are available for the label (nothing is saved).
    """
    name = label_names[label_idx]
    print(f"[{label_idx:03d}] {name} | cap_pos={cap_pos}")

    X_tr, y_tr, X_val, y_val = build_dataset_for_label(label_idx, cap_pos)
    if X_tr is None or y_tr is None:
        print(f"[{label_idx:03d}] {name} -> no data, skipped")
        return None

    # RandomForest with balanced class weights, using all cores.
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight="balanced",   # compensates the class imbalance
        n_jobs=-1,
        random_state=42
    )

    rf.fit(X_tr, y_tr)

    # Save once, immediately after fitting, so that a failure during threshold
    # tuning cannot lose the trained model.
    joblib.dump(rf, model_path_for(label_idx))

    # No validation split (rare): keep the default threshold.
    if X_val is None or y_val is None or len(y_val) == 0:
        print(f"[{label_idx:03d}] saved model (no val); thr=0.50 (default)")
        return 0.5

    # Probability of the positive class on validation. The column is looked up
    # through rf.classes_ because predict_proba has a single column when the
    # training split contained only one class, and [:, 1] would then fail.
    classes = list(rf.classes_)
    if 1 in classes:
        val_prob = rf.predict_proba(X_val)[:, classes.index(1)]
    else:
        val_prob = np.zeros(X_val.shape[0], dtype=np.float64)

    # Threshold tuning: maximise F1 on the validation split.
    thr_grid = np.linspace(0.05, 0.95, 19)  # 0.05, 0.10, ..., 0.95
    best_thr = 0.5
    best_f1 = -1.0

    for thr in thr_grid:
        y_pred = (val_prob >= thr).astype(np.int8)
        _, _, f1, _ = precision_recall_fscore_support(
            y_val, y_pred, average="binary", zero_division=0
        )
        if f1 > best_f1:
            best_f1 = f1
            best_thr = float(thr)

    print(f"[{label_idx:03d}] best_thr={best_thr:.2f} | val_F1={best_f1:.3f}")
    return best_thr


**Cell 5: Full training loop + save thresholds**

In [10]:
# Thresholds selected per label, keyed by the label index as a string
# (only labels for which train_one_label_rf returned a threshold are stored).
per_label_thr = {}

# Cap on the number of positives per label, passed to every training call
DEFAULT_CAP_POS = 7500

for li in range(n_labels):
    thr = train_one_label_rf(li, cap_pos=DEFAULT_CAP_POS)
    if thr is not None:
        per_label_thr[str(li)] = thr

# Write the thresholds (together with the label names) to JSON.
# The file is written only after the whole loop has finished.
with open(THRESH_PATH, "w") as f:
    json.dump({"thresholds": per_label_thr, "labels": label_names}, f, indent=2)

print("Saved per-label thresholds ->", THRESH_PATH)


[000] adware | cap_pos=7500
[000] best_thr=0.35 | val_F1=0.914
[001] antiav | cap_pos=7500
[001] best_thr=0.30 | val_F1=0.948
[002] antifw | cap_pos=7500
[002] best_thr=0.20 | val_F1=0.996
[003] autorun | cap_pos=7500
[003] best_thr=0.30 | val_F1=0.905
[004] backdoor | cap_pos=7500
[004] best_thr=0.35 | val_F1=0.748
[005] banker | cap_pos=7500
[005] best_thr=0.35 | val_F1=0.903
[006] bho | cap_pos=7500
[006] best_thr=0.30 | val_F1=0.965
[007] binder | cap_pos=7500
[007] best_thr=0.35 | val_F1=0.929
[008] blocker | cap_pos=7500
[008] best_thr=0.30 | val_F1=0.878
[009] bundler | cap_pos=7500
[009] best_thr=0.35 | val_F1=0.879
[010] clicker | cap_pos=7500
[010] best_thr=0.30 | val_F1=0.891
[011] coinminer | cap_pos=7500
[011] best_thr=0.35 | val_F1=0.958
[012] constructor | cap_pos=7500
[012] best_thr=0.40 | val_F1=0.879
[013] cryptor | cap_pos=7500
[013] best_thr=0.35 | val_F1=0.886
[014] ddos | cap_pos=7500
[014] best_thr=0.30 | val_F1=0.881
[015] dialer | cap_pos=7500
[015] best_thr=0.

**Cell 6: Test evaluation (without calibration)**

In [11]:
# ==== Cell 6: Evaluate on TEST with per-label thresholds (Random Forest) ====
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import joblib
import numpy as np
import pandas as pd

# Load the per-label decision thresholds written by the training loop.
with open(THRESH_PATH, "r") as f:
    thr_info = json.load(f)
thr_map = {int(k): float(v) for k, v in thr_info["thresholds"].items()}

# zip() would silently truncate mismatched lists and pair X with the wrong Y.
if len(XTE) == 0 or len(XTE) != len(YTE):
    raise ValueError(
        f"Need matching, non-empty TEST lists; got {len(XTE)} X parts and {len(YTE)} Y parts"
    )

# Read every TEST part exactly once and stack the parts, so that each saved
# forest is loaded from disk a single time (n_labels loads) instead of once per
# (part, label) pair. Each row is scored independently of the others, so scoring
# the stacked matrix gives the same values as scoring part by part.
X_parts, y_true_all = [], []
for i, (xp, yp) in enumerate(zip(XTE, YTE), start=1):
    X = sp.load_npz(xp)
    Y = np.load(yp, allow_pickle=False).astype(np.int8)   # (rows, n_labels)
    if X.shape[0] != Y.shape[0]:
        raise ValueError(f"[test] part {i}: X has {X.shape[0]} rows but Y has {Y.shape[0]}")
    X_parts.append(X)
    y_true_all.append(Y)

X_test = sp.vstack(X_parts).tocsr()
y_true = np.vstack(y_true_all)
del X_parts

# Columns of labels without a saved model (e.g. skipped in training) stay at 0.
y_prob = np.zeros((X_test.shape[0], n_labels), dtype=np.float32)
y_pred = np.zeros((X_test.shape[0], n_labels), dtype=np.int8)

for li in range(n_labels):
    try:
        rf = joblib.load(model_path_for(li))
    except FileNotFoundError:
        # No model for this label: keep the zero column.
        continue

    # Positive-class column looked up via classes_: a forest trained on a single
    # class has only one probability column.
    classes = list(rf.classes_)
    if 1 in classes:
        p = rf.predict_proba(X_test)[:, classes.index(1)]
    else:
        p = np.zeros(X_test.shape[0], dtype=np.float64)

    y_prob[:, li] = p
    thr = thr_map.get(li, 0.5)
    y_pred[:, li] = (p >= thr).astype(np.int8)

    del rf  # only one forest is kept in memory at a time
    print(f"[test] label {li + 1}/{n_labels} done")

# ---- overall metrics ----
p_micro, r_micro, f1_micro, _ = precision_recall_fscore_support(
    y_true, y_pred, average="micro", zero_division=0
)
p_macro, r_macro, f1_macro, _ = precision_recall_fscore_support(
    y_true, y_pred, average="macro", zero_division=0
)
p_w, r_w, f1_w, _ = precision_recall_fscore_support(
    y_true, y_pred, average="weighted", zero_division=0
)

# AUCs (roc_auc_score raises ValueError when a label has a single class in the test set)
try:
    auc_micro = roc_auc_score(y_true, y_prob, average="micro")
except ValueError:
    auc_micro = np.nan

# Macro AUC = mean of the per-label AUCs over the labels where AUC is defined
auc_per_label = []
for li in range(n_labels):
    y = y_true[:, li]
    s = y_prob[:, li]
    if np.unique(y).size == 2:
        try:
            auc_per_label.append(roc_auc_score(y, s))
        except ValueError:
            pass
auc_macro = float(np.nanmean(auc_per_label)) if auc_per_label else np.nan

try:
    auc_weighted = roc_auc_score(y_true, y_prob, average="weighted")
except ValueError:
    auc_weighted = np.nan

summary = pd.DataFrame(
    {
        "Micro":    [p_micro, r_micro, f1_micro, auc_micro],
        "Macro":    [p_macro, r_macro, f1_macro, auc_macro],
        "Weighted": [p_w, r_w, f1_w, auc_weighted],
    },
    index=["Precision", "Recall", "F1-score", "AUC"],
).round(4)

display(summary)

# save
OUT_DIR = RESULTS_DIR
OUT_DIR.mkdir(exist_ok=True, parents=True)
summary.to_csv(OUT_DIR / "overall_metrics_perlabel_capped_RF.csv")
summary.to_json(OUT_DIR / "overall_metrics_perlabel_capped_RF.json", orient="index", indent=2)
print("Saved RF test summary ->", OUT_DIR / "overall_metrics_perlabel_capped_RF.*")


[test] part 1/45 done
[test] part 2/45 done
[test] part 3/45 done
[test] part 4/45 done
[test] part 5/45 done
[test] part 6/45 done
[test] part 7/45 done
[test] part 8/45 done
[test] part 9/45 done
[test] part 10/45 done
[test] part 11/45 done
[test] part 12/45 done
[test] part 13/45 done
[test] part 14/45 done
[test] part 15/45 done
[test] part 16/45 done
[test] part 17/45 done
[test] part 18/45 done
[test] part 19/45 done
[test] part 20/45 done
[test] part 21/45 done
[test] part 22/45 done
[test] part 23/45 done
[test] part 24/45 done
[test] part 25/45 done
[test] part 26/45 done
[test] part 27/45 done
[test] part 28/45 done
[test] part 29/45 done
[test] part 30/45 done
[test] part 31/45 done
[test] part 32/45 done
[test] part 33/45 done
[test] part 34/45 done
[test] part 35/45 done
[test] part 36/45 done
[test] part 37/45 done
[test] part 38/45 done
[test] part 39/45 done
[test] part 40/45 done
[test] part 41/45 done
[test] part 42/45 done
[test] part 43/45 done
[test] part 44/45 do



Unnamed: 0,Micro,Macro,Weighted
Precision,0.1625,0.2279,0.2887
Recall,0.4318,0.4312,0.4318
F1-score,0.2362,0.2573,0.3064
AUC,0.8098,0.8128,0.7749


Saved RF test summary -> /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_rf_capped_eval/overall_metrics_perlabel_capped_RF.*


**Cell 7: Platt calibration per label + new threshold tuning**

**Cell 8: TEST evaluation with calibrated probs + calibrated thresholds**

In [16]:
# === Cell 0: Config & helpers ===
import json, joblib, numpy as np, pandas as pd
from pathlib import Path
import scipy.sparse as sp
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, precision_recall_curve

# --- Paths (adjust if needed) ---
# RESULTS_DIR is resolved first because the calibration folder defaults to a
# sub-folder of it.
try:
    RESULTS_DIR = Path(RESULTS_DIR)
except NameError:
    RESULTS_DIR = Path("./results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

try:
    CAL_DIR = Path(CAL_DIR)
except NameError:
    # Default to <RESULTS_DIR>/calibration_rf, the location shown in the recorded
    # output of this notebook, instead of a folder relative to the current
    # working directory.
    CAL_DIR = RESULTS_DIR / "calibration_rf"
CAL_DIR.mkdir(parents=True, exist_ok=True)

# --- VEC: reuse the notebook's base vector directory when it is already defined ---
try:
    VEC = Path(VEC)
except NameError:
    VEC = Path(".")  # <-- change this if the data lives in another base directory

# Folder holding the saved per-label RF models
# (the same folder name the training cells use for MODELS_DIR).
MODEL_DIR = VEC / "models_rf_perlabel_capped"
MODEL_DIR.mkdir(parents=True, exist_ok=True)


def model_path_for(label_idx: int) -> Path:
    """
    Return the path of the saved RF model for `label_idx` (for LGBM models the
    file name would be lgb_label_XXX.joblib instead).

    The path is returned whether or not the file exists, exactly like the
    definition in Cell 2, so the training function can still use it as the
    target of joblib.dump(). Loading a missing model with
    joblib.load(model_path_for(i)) still raises FileNotFoundError.
    """
    return MODEL_DIR / f"rf_label_{label_idx:03d}.joblib"


# --- labels ---
try:
    _ = n_labels
except NameError:
    # Infer the number of labels from the saved model files.
    # NOTE(review): this undercounts if some labels were skipped during training.
    model_files = sorted(MODEL_DIR.glob("rf_label_*.joblib"))
    if not model_files:
        raise RuntimeError("Δεν μπορώ να συναγάγω n_labels: βάλε n_labels=... ή βεβαιώσου ότι υπάρχουν μοντέλα rf_label_*.joblib")
    n_labels = len(model_files)

try:
    _ = label_names
except NameError:
    label_names = [f"label_{i:03d}" for i in range(n_labels)]

# --- Other settings ---
try:
    DEFAULT_CAP_POS
except NameError:
    # Must equal the cap used when the models were trained (7500 in the training
    # cells): build_dataset_for_label() only reproduces the training/validation
    # split when it is called with the same cap.
    DEFAULT_CAP_POS = 7500


In [17]:
# === Cell 1: Validation loader → (y_val, p_cal_val) per label ===
from typing import Tuple, Optional

def get_val_probs_for_label_calibrated(label_idx: int, cap_pos=None) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """
    Return (y_val, p_val) for one label, computed on its validation split.

    The validation split comes from build_dataset_for_label(label_idx, cap_pos).
    The saved RF model of the label scores it, and if a Platt model
    (platt_label_XXX.joblib in CAL_DIR) exists its calibrated probability is
    returned; otherwise the raw RF probability is returned.

    Parameters
    ----------
    label_idx : index of the label.
    cap_pos   : cap on positives, forwarded to build_dataset_for_label().
                NOTE(review): the split only matches the one used for training
                when this equals the cap used at training time.

    Returns
    -------
    (y_val as int32 1-D array, probabilities as float32 1-D array), or
    (None, None) when the label has no validation split.

    Raises
    ------
    RuntimeError : when build_dataset_for_label is not defined in the notebook.
    """
    # Only the name lookup is guarded. A NameError raised *inside* the builder
    # (e.g. a missing global it depends on) must surface as itself instead of
    # being reported as "the function is missing".
    try:
        build_fn = build_dataset_for_label
    except NameError:
        raise RuntimeError(
            "Δεν βρέθηκε build_dataset_for_label(). Πρόσθεσε loader για validation εδώ (XVAL/YVAL) ή ορίσε τη συνάρτηση."
        )

    _, _, X_val, y_val = build_fn(label_idx, cap_pos)
    if X_val is None or y_val is None or len(y_val) == 0:
        return None, None

    # Raw probabilities of the positive class from the saved forest.
    rf = joblib.load(model_path_for(label_idx))
    p_raw = rf.predict_proba(X_val)[:, 1].ravel()

    # Platt scaling, if a calibrator has already been trained for this label.
    platt_fp = CAL_DIR / f"platt_label_{label_idx:03d}.joblib"
    if platt_fp.exists():
        lr = joblib.load(platt_fp)
        p_cal = lr.predict_proba(p_raw.reshape(-1, 1))[:, 1].ravel()
    else:
        p_cal = p_raw

    return y_val.astype(np.int32).ravel(), p_cal.astype(np.float32).ravel()


In [18]:
# === Cell 2: Train Platt (Logistic) per label on validation ===
trained, skipped = 0, 0
for li in range(n_labels):
    # Build the validation split ONCE per label. (Going through
    # get_val_probs_for_label_calibrated() first only to decide whether to skip
    # rebuilt the dataset and re-scored the forest a second time.)
    X_tr, y_tr, X_val, y_val = build_dataset_for_label(li, DEFAULT_CAP_POS)

    # Skip labels without a validation split or with a single class in it:
    # a logistic model cannot be fitted on one class.
    if X_val is None or y_val is None or len(y_val) == 0 or np.unique(y_val).size < 2:
        skipped += 1
        continue

    # Platt scaling is fitted on the RAW forest probabilities of the validation split.
    rf = joblib.load(model_path_for(li))
    p_raw = rf.predict_proba(X_val)[:, 1].ravel()

    lr = LogisticRegression(solver="lbfgs")
    lr.fit(p_raw.reshape(-1, 1), y_val.astype(np.int32).ravel())
    joblib.dump(lr, CAL_DIR / f"platt_label_{li:03d}.joblib")
    trained += 1

print(f"✅ Platt trained for {trained} labels; skipped {skipped} (μονοταξικά ή χωρίς val).")


✅ Platt trained for 64 labels; skipped 0 (μονοταξικά ή χωρίς val).


In [19]:
# === Cell 3: Collect val (y,p_cal) for ALL labels + Beta-tuning ===
def thresholds_for_beta(y_val_list, p_val_list, beta: float):
    """
    Pick, for every label, the threshold that maximises F-beta on validation.

    y_val_list / p_val_list hold one entry per label (true labels and scores);
    an entry may be None. Labels with a missing entry or with fewer than two
    classes in the true labels get the default threshold 0.5. For the others the
    F-beta score is evaluated at every point of the precision-recall curve and
    the threshold of the best point is returned (the index is clamped to the
    range of the threshold array, which is one element shorter than the curve).

    Returns a dict {label index: threshold as float}.
    """
    beta_sq = beta**2
    chosen = {}
    for label, (truth, scores) in enumerate(zip(y_val_list, p_val_list)):
        usable = truth is not None and scores is not None and np.unique(truth).size >= 2
        if not usable:
            chosen[label] = 0.5
            continue

        precision, recall, cutoffs = precision_recall_curve(truth, scores)
        f_beta = (1 + beta_sq) * precision * recall / (beta_sq * precision + recall + 1e-12)

        if len(cutoffs) == 0 or np.all(np.isnan(f_beta)):
            chosen[label] = 0.5
            continue

        best_pos = int(np.nanargmax(f_beta))
        last_pos = len(cutoffs) - 1
        chosen[label] = float(cutoffs[max(0, min(best_pos, last_pos))])
    return chosen

def eval_macro_on_val(y_val_list, p_val_list, thr_map):
    """
    Macro-averaged precision, recall and F1 on validation for given thresholds.

    Each label's scores are binarised with its threshold from thr_map (0.5 when
    the label has no entry) and scored with binary precision/recall/F1. Labels
    whose entry is None or whose true labels contain fewer than two classes are
    left out of the average.

    Returns (macro precision, macro recall, macro F1); each value is NaN when no
    label could be scored.
    """
    scored = []
    for label, (truth, scores) in enumerate(zip(y_val_list, p_val_list)):
        if truth is None or scores is None:
            continue
        if np.unique(truth).size < 2:
            continue
        cutoff = thr_map.get(label, 0.5)
        decisions = (scores >= cutoff).astype(np.int32)
        precision, recall, f1, _ = precision_recall_fscore_support(
            truth, decisions, average="binary", zero_division=0
        )
        scored.append((precision, recall, f1))

    if not scored:
        return np.nan, np.nan, np.nan

    precisions, recalls, f1_scores = (list(column) for column in zip(*scored))
    return float(np.mean(precisions)), float(np.mean(recalls)), float(np.mean(f1_scores))

# --- Collect (y_val, calibrated probabilities) per label from validation ---
# NOTE(review): this is the same per-label validation split the Platt models
# were fitted on (same build_dataset_for_label(li, DEFAULT_CAP_POS) call), so the
# scores printed below are measured on data the calibrators have already seen.
y_val_list, p_val_list = [], []
for li in range(n_labels):
    # produce p_cal with the Platt model trained in the previous cell
    X_tr, y_tr, X_val, y_val = build_dataset_for_label(li, DEFAULT_CAP_POS)
    if X_val is None or y_val is None or len(y_val) == 0 or np.unique(y_val).size < 2:
        # keep the list positions aligned with the label indices
        y_val_list.append(None); p_val_list.append(None)
        continue
    rf = joblib.load(model_path_for(li))
    p_raw = rf.predict_proba(X_val)[:, 1].ravel()

    # Use the calibrated probability when a Platt model exists, else the raw one
    platt_fp = CAL_DIR / f"platt_label_{li:03d}.joblib"
    if platt_fp.exists():
        lr = joblib.load(platt_fp)
        p_cal = lr.predict_proba(p_raw.reshape(-1,1))[:,1].ravel()
    else:
        p_cal = p_raw

    y_val_list.append(y_val.astype(np.int32).ravel())
    p_val_list.append(p_cal.astype(np.float32).ravel())

# --- baseline: every threshold = 0.5 (or from a previous calibration if you have one) ---
cal_thr_map = {li: 0.5 for li in range(n_labels)}
baseP, baseR, baseF = eval_macro_on_val(y_val_list, p_val_list, cal_thr_map)
print(f"[VAL baseline] MacroP={baseP:.4f} | MacroR={baseR:.4f} | MacroF1={baseF:.4f}")

# --- grid search over beta ---
candidates = np.linspace(0.6, 1.6, 11)  # 0.6,0.7,...,1.6
best = None       # (beta, thresholds) of the best candidate so far
best_rec = None   # (macro P, macro R, macro F1) of that candidate
for beta in candidates:
    thr_map = thresholds_for_beta(y_val_list, p_val_list, beta=beta)
    mP, mR, mF = eval_macro_on_val(y_val_list, p_val_list, thr_map)
    # requirement: macro precision AND macro recall must not fall below the baseline
    if (np.isnan(mP) or np.isnan(mR) or np.isnan(mF)): 
        continue
    if (mP >= baseP - 1e-9) and (mR >= baseR - 1e-9):
        # among the admissible betas keep the one with the highest macro F1
        if best is None or (mF > best_rec[2]):
            best = (beta, thr_map)
            best_rec = (mP, mR, mF)

# fallback: if no beta satisfied the constraint, take the one with the highest macro F1
if best is None:
    for beta in candidates:
        thr_map = thresholds_for_beta(y_val_list, p_val_list, beta=beta)
        mP, mR, mF = eval_macro_on_val(y_val_list, p_val_list, thr_map)
        if best is None or (mF > best_rec[2]):
            best = (beta, thr_map)
            best_rec = (mP, mR, mF)
    print("[VAL] Δεν βρέθηκε beta που να ανεβάζει και P&R. Επιλέγω beta με μέγιστο Macro-F1.")

beta_star, tuned_thr_map = best
print(f"[VAL] Επιλογή beta={beta_star:.2f} | MacroP={best_rec[0]:.4f} | MacroR={best_rec[1]:.4f} | MacroF1={best_rec[2]:.4f}")

# --- save to calibrated_thresholds.json ---
cal_info = {
    "beta": float(beta_star),
    "thresholds": {str(k): float(v) for k, v in tuned_thr_map.items()},
    # optional: store the Platt model paths for transparency (absolute paths,
    # written for every label index whether or not the file exists)
    "models": {f"{li}": str((CAL_DIR / f"platt_label_{li:03d}.joblib").resolve()) for li in range(n_labels)}
}
with open(CAL_DIR / "calibrated_thresholds.json", "w") as f:
    json.dump(cal_info, f, indent=2)
print("✅ Saved tuned thresholds to", CAL_DIR / "calibrated_thresholds.json")

[VAL baseline] MacroP=0.9474 | MacroR=0.8560 | MacroF1=0.8977
[VAL] Επιλογή beta=0.60 | MacroP=0.9515 | MacroR=0.8649 | MacroF1=0.9042
✅ Saved tuned thresholds to /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_rf_capped_eval/calibration_rf/calibrated_thresholds.json


In [20]:
# === Cell 4: Integrity check & pairing for TEST chunks (must be 1-to-1) ===
import re
from glob import glob
from collections import defaultdict

# Αν έχεις ήδη XTE, YTE λίστες, αυτό το cell απλά ελέγχει· αλλιώς βρίσκει μόνο του.
def _idx(p: Path) -> int:
    m = re.search(r'(\d+)(?=\.\w+$)', p.name)
    return int(m.group(1)) if m else -1

try:
    # Reuse the lists prepared by the earlier cells when they exist.
    XTE  # noqa
    YTE  # noqa
    X_candidates = [Path(p) for p in XTE]
    Y_candidates = [Path(p) for p in YTE]
except NameError:
    # XTE/YTE are not defined: discover the TEST parts on disk.
    # The notebook's VEC folder is searched when it is defined. The global VEC is
    # NOT overwritten here, because other paths (e.g. the models folder) are
    # derived from it.
    try:
        _test_dir = Path(VEC)
    except NameError:
        _test_dir = Path("./vectors")  # <-- adjust the folder
    x_patterns = ["test_part_capped*.npz", "X_test_part_capped*.npz", "X_te_part_*.npz"]
    y_patterns = ["y_test_part_capped*.npy", "y_test_part_*.npy", "y_te_part_*.npy"]
    # Sets remove files matched by more than one pattern
    # ("y_test_part_capped*" files also match "y_test_part_*").
    X_candidates = sorted({Path(f) for pat in x_patterns for f in glob(str(_test_dir / pat))})
    Y_candidates = sorted({Path(f) for pat in y_patterns for f in glob(str(_test_dir / pat))})

# X parts ordered by part index; files without an index are ignored.
X_candidates = sorted([p for p in X_candidates if _idx(p) >= 0], key=_idx)

# Group the Y candidates by part index (several naming schemes may share an index).
Y_by_idx = defaultdict(list)
for yp in Y_candidates:
    i = _idx(yp)
    if i >= 0:
        Y_by_idx[i].append(yp)

def _pri(p: Path) -> int:
    n = p.name
    if "y_test_part_capped" in n: return 0
    if "y_test_part_" in n:       return 1
    if "y_te_part_" in n:         return 2
    return 9

# Pair every X part with a Y file of the same part index AND the same row count.
# `bad` collects the X parts for which no such Y file exists.
XTE_new, YTE_new, bad = [], [], []
for xp in X_candidates:
    i = _idx(xp)
    Xn = sp.load_npz(xp).shape[0]
    # Y candidates with this index, preferred naming scheme first
    cands = sorted(Y_by_idx.get(i, []), key=_pri)
    chosen = None
    for yp in cands:
        Yn = np.load(yp, allow_pickle=False).shape[0]
        if Yn == Xn:
            chosen = yp; break
    if chosen is None:
        # remember index, X name, X rows and "name:rows" of every rejected Y candidate
        bad.append((i, xp.name, Xn, [f"{p.name}:{np.load(p, allow_pickle=False).shape[0]}" for p in cands]))
    else:
        XTE_new.append(xp); YTE_new.append(chosen)

print(f"✅ Paired {len(XTE_new)} X parts with matching Y out of {len(X_candidates)} candidates.")
if bad:
    print("⚠️ Unpaired indices:")
    # only the first 10 unpaired parts are listed
    for i, xname, Xn, ys in bad[:10]:
        print(f"  idx {i:03d} | X={xname} ({Xn}) | Y-cands -> {ys if ys else 'NONE'}")

# Use the paired lists from here on (this rebinds the notebook-level XTE / YTE;
# unpaired X parts are dropped).
XTE, YTE = XTE_new, YTE_new

# Fail-fast check: every pair must have the same number of rows
for j, (xp, yp) in enumerate(zip(XTE, YTE), start=1):
    nx = sp.load_npz(xp).shape[0]
    ny = np.load(yp, allow_pickle=False).shape[0]
    assert nx == ny, f"[TEST misaligned] Part {j}: X rows {nx} != Y rows {ny}."

print("✅ All paired TEST parts are 1-to-1 aligned and ready.")


✅ Paired 45 X parts with matching Y out of 45 candidates.
✅ All paired TEST parts are 1-to-1 aligned and ready.


In [22]:
# === Cell 5: Overall TEST metrics for RF + Calibration + Beta-Tuned thresholds ===
import numpy as np
import pandas as pd
import joblib, json
import scipy.sparse as sp
from pathlib import Path
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

# --- Paths & config assumed από προηγούμενα κελιά ---
# CAL_DIR, RESULTS_DIR, n_labels, XTE, YTE, model_path_for(...)

cal_file = CAL_DIR / "calibrated_thresholds.json"
assert cal_file.exists(), f"Missing {cal_file}"
with open(cal_file, "r") as f:
    cal_info = json.load(f)

# Thresholds and beta written by the beta-tuning cell
cal_thr_map = {int(k): float(v) for k, v in cal_info["thresholds"].items()}
beta_star   = float(cal_info.get("beta", 1.0))

# Optional: Platt model paths stored in the JSON (only those that exist on disk)
platt_map = {}
for k, v in cal_info.get("models", {}).items():
    p = Path(v)
    if p.exists():
        platt_map[int(k)] = p


def platt_path_for(li: int) -> Path:
    """Path of the Platt model of label `li`: the JSON entry if usable, else the CAL_DIR naming pattern."""
    return platt_map.get(li, CAL_DIR / f"platt_label_{li:03d}.joblib")


print(f"[TEST] Using beta*={beta_star} and per-label thresholds from {cal_file.name}")

# zip() would silently truncate mismatched lists and pair X with the wrong Y.
if len(XTE) == 0 or len(XTE) != len(YTE):
    raise ValueError(
        f"Need matching, non-empty TEST lists; got {len(XTE)} X parts and {len(YTE)} Y parts"
    )

# Read every TEST part exactly once and stack the parts, so that each forest and
# each Platt model is loaded from disk a single time (n_labels loads) instead of
# once per (part, label) pair. Each row is scored independently of the others,
# so scoring the stacked matrix gives the same values as scoring part by part.
X_parts, y_true_all = [], []
for i, (xp, yp) in enumerate(zip(XTE, YTE), start=1):
    X = sp.load_npz(xp)
    Y = np.load(yp, allow_pickle=False).astype(np.int8)

    # --- MUST be 2D: (rows, n_labels) ---
    assert Y.ndim == 2 and Y.shape[1] == n_labels, (
        f"Part {i}: Y must be 2D with shape (*,{n_labels}), got {Y.shape}"
    )
    assert X.shape[0] == Y.shape[0], (
        f"Part {i}: X {X.shape[0]} vs Y {Y.shape[0]} rows mismatch"
    )
    X_parts.append(X)
    y_true_all.append(Y)

X_test = sp.vstack(X_parts).tocsr()
y_true = np.vstack(y_true_all)          # (N, L)
del X_parts

# Columns of labels without a saved model (e.g. skipped in training) stay at 0.
y_prob = np.zeros((X_test.shape[0], n_labels), dtype=np.float32)   # (N, L)
y_pred = np.zeros((X_test.shape[0], n_labels), dtype=np.int8)      # (N, L)

for li in range(n_labels):
    # Load the RF model; a missing file leaves the zero column in place.
    try:
        rf = joblib.load(model_path_for(li))
    except FileNotFoundError:
        continue

    # Raw positive-class probability. The column is looked up via classes_
    # because a forest trained on a single class has only one probability column.
    classes = list(rf.classes_)
    if 1 in classes:
        p_raw = rf.predict_proba(X_test)[:, classes.index(1)].reshape(-1, 1)
    else:
        p_raw = np.zeros((X_test.shape[0], 1), dtype=np.float64)

    # Platt calibration when a calibrator exists, otherwise the raw probability
    pp = platt_path_for(li)
    if pp.exists():
        lr = joblib.load(pp)
        p_cal = lr.predict_proba(p_raw)[:, 1]
    else:
        p_cal = p_raw.ravel()

    y_prob[:, li] = p_cal.astype(np.float32)

    thr = cal_thr_map.get(li, 0.5)
    y_pred[:, li] = (p_cal >= thr).astype(np.int8)

    del rf  # only one forest is kept in memory at a time
    print(f"[TEST RF+CAL+BETA] label {li + 1}/{n_labels} done")

# --- Overall metrics ---
p_micro, r_micro, f1_micro, _ = precision_recall_fscore_support(
    y_true, y_pred, average="micro", zero_division=0
)
p_macro, r_macro, f1_macro, _ = precision_recall_fscore_support(
    y_true, y_pred, average="macro", zero_division=0
)
p_w, r_w, f1_w, _ = precision_recall_fscore_support(
    y_true, y_pred, average="weighted", zero_division=0
)

# AUCs (roc_auc_score raises ValueError when a label has a single class in the test set)
try:
    auc_micro = roc_auc_score(y_true, y_prob, average="micro")
except ValueError:
    auc_micro = np.nan

# Macro AUC: mean of the per-label AUCs over the labels where AUC is defined
auc_per_label = []
for li in range(n_labels):
    y = y_true[:, li]
    s = y_prob[:, li]
    if np.unique(y).size == 2:
        try:
            auc_per_label.append(roc_auc_score(y, s))
        except ValueError:
            pass
auc_macro = float(np.nanmean(auc_per_label)) if auc_per_label else np.nan

try:
    auc_weighted = roc_auc_score(y_true, y_prob, average="weighted")
except ValueError:
    auc_weighted = np.nan

summary_test = pd.DataFrame(
    {
        "Micro":    [p_micro, r_micro, f1_micro, auc_micro],
        "Macro":    [p_macro, r_macro, f1_macro, auc_macro],
        "Weighted": [p_w, r_w, f1_w, auc_weighted],
    },
    index=["Precision", "Recall", "F1-score", "AUC"],
).round(4)

display(summary_test)

# Save
out_csv  = RESULTS_DIR / "overall_metrics_RF_CAL_BETA.csv"
out_json = RESULTS_DIR / "overall_metrics_RF_CAL_BETA.json"
summary_test.to_csv(out_csv)
summary_test.to_json(out_json, orient="index", indent=2)

print("✅ Saved overall metrics ->", out_csv, " & ", out_json)


[TEST] Using beta*=0.6 and per-label thresholds from calibrated_thresholds.json
[TEST RF+CAL+BETA] part 1/45 done
[TEST RF+CAL+BETA] part 2/45 done
[TEST RF+CAL+BETA] part 3/45 done
[TEST RF+CAL+BETA] part 4/45 done
[TEST RF+CAL+BETA] part 5/45 done
[TEST RF+CAL+BETA] part 6/45 done
[TEST RF+CAL+BETA] part 7/45 done
[TEST RF+CAL+BETA] part 8/45 done
[TEST RF+CAL+BETA] part 9/45 done
[TEST RF+CAL+BETA] part 10/45 done
[TEST RF+CAL+BETA] part 11/45 done
[TEST RF+CAL+BETA] part 12/45 done
[TEST RF+CAL+BETA] part 13/45 done
[TEST RF+CAL+BETA] part 14/45 done
[TEST RF+CAL+BETA] part 15/45 done
[TEST RF+CAL+BETA] part 16/45 done
[TEST RF+CAL+BETA] part 17/45 done
[TEST RF+CAL+BETA] part 18/45 done
[TEST RF+CAL+BETA] part 19/45 done
[TEST RF+CAL+BETA] part 20/45 done
[TEST RF+CAL+BETA] part 21/45 done
[TEST RF+CAL+BETA] part 22/45 done
[TEST RF+CAL+BETA] part 23/45 done
[TEST RF+CAL+BETA] part 24/45 done
[TEST RF+CAL+BETA] part 25/45 done
[TEST RF+CAL+BETA] part 26/45 done
[TEST RF+CAL+BETA] 



Unnamed: 0,Micro,Macro,Weighted
Precision,0.2678,0.3474,0.4235
Recall,0.3373,0.3485,0.3373
F1-score,0.2986,0.2889,0.3251
AUC,0.7997,0.8128,0.7749


✅ Saved overall metrics -> /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_rf_capped_eval/overall_metrics_RF_CAL_BETA.csv  &  /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_rf_capped_eval/overall_metrics_RF_CAL_BETA.json
