## XGBoost 

**Cell 1: Imports, Paths, Labels, Part files**

In [12]:
from pathlib import Path
import json
import numpy as np
import scipy.sparse as sp
from glob import glob
import re

# Project locations (same layout as in the earlier notebooks).
BASE = Path.home().joinpath("Desktop", "Malware Project")
VEC = BASE.joinpath("data", "behavior_vectors_paper")

# Output folders for the XGBoost models and their evaluation results.
MODELS_DIR = VEC / "models_xgb_capped"
RESULTS_DIR = VEC / "results_xgb_capped_eval"
for out_dir in (MODELS_DIR, RESULTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Label names: label_map_balanced.json is preferred, label_map.json is the fallback.
label_map_paths = [
    VEC / file_name
    for file_name in ("label_map_balanced.json", "label_map.json")
]

label_names = None
for candidate in label_map_paths:
    if not candidate.exists():
        continue
    with open(candidate, "r") as fh:
        payload = json.load(fh)
    # Only a JSON object with a "labels" entry is accepted; anything else
    # falls through to the next candidate file.
    if isinstance(payload, dict) and "labels" in payload:
        label_names = payload["labels"]
        break

if label_names is None:
    raise FileNotFoundError("Δεν βρέθηκε ούτε label_map_balanced.json ούτε label_map.json στο VEC.")

n_labels = len(label_names)
print(f"Loaded labels: {n_labels}")
label_preview = ", ".join(label_names[:10])
print(label_preview + (" ..." if n_labels > 10 else ""))

# Train parts: capped feature matrices plus the label arrays read from
# y_train_part_capped*.npy. Both lists are sorted by path and paired by position.
XTRN = sorted(Path(hit) for hit in glob(str(VEC / "train_part_capped*.npz")))
YTRN_CAP = sorted(Path(hit) for hit in glob(str(VEC / "y_train_part_capped*.npy")))

# Load TEST parts: match capped X with balanced Y using part indexes
def _index_from_name(p: Path) -> int:
    m = re.search(r'(\d+)$', p.stem)
    if not m:
        raise ValueError(f"No trailing index found in filename: {p.name}")
    return int(m.group(1))

# Test parts: pair every capped X part with the Y part carrying the same
# trailing index in its file name.
XTE = sorted(map(Path, glob(str(VEC / "test_part_capped*.npz"))))

all_YTE = list(map(Path, glob(str(VEC / "y_test_part_capped*.npy"))))
map_Y = {_index_from_name(p): p for p in all_YTE}

# X parts without a Y file are dropped from BOTH lists. Dropping them only from
# YTE (as a plain filter on the Y side would) shifts every later pair, so that
# XTE[k] and YTE[k] would no longer describe the same part.
unmatched_X = [p.name for p in XTE if _index_from_name(p) not in map_Y]
if unmatched_X:
    print(f"WARNING: {len(unmatched_X)} test X part(s) have no matching Y file and are skipped: {unmatched_X}")
XTE = [p for p in XTE if _index_from_name(p) in map_Y]

idx_te = [_index_from_name(p) for p in XTE]
YTE = [map_Y[i] for i in idx_te]

print("Train parts (X):", len(XTRN), "| Train parts (Y_cap):", len(YTRN_CAP))
print("Test  parts (X):", len(XTE), "| Test  parts (Y):", len(YTE))

# Έλεγχος row counts
def _rows_X(path): return sp.load_npz(path).shape[0]
def _rows_Y(path): return np.load(path, allow_pickle=False).shape[0]

# Train X and Y parts are paired by position, so both lists must be equally long.
assert len(XTRN) == len(YTRN_CAP), "Αναντιστοιχία train X με y_train_part_capped*.npy"

# Spot-check: print the row counts of the first part of each split.
train_rows_x, train_rows_y = _rows_X(XTRN[0]), _rows_Y(YTRN_CAP[0])
print(f"[train part 0] rows X={train_rows_x}, Y={train_rows_y}")
test_rows_x, test_rows_y = _rows_X(XTE[0]), _rows_Y(YTE[0])
print(f"[test  part 0] rows X={test_rows_x}, Y={test_rows_y}")


Loaded labels: 64
adware, antiav, antifw, autorun, backdoor, banker, bho, binder, blocker, bundler ...
Train parts (X): 551 | Train parts (Y_cap): 551
Test  parts (X): 45 | Test  parts (Y): 45
[train part 0] rows X=2851, Y=2851
[test  part 0] rows X=12, Y=12


**Cell 2: Assemble X_train, Y_train multilabel (SPARSE)**

In [None]:
import numpy as np
import scipy.sparse as sp

# Train X: stack the per-part sparse matrices row-wise and keep the result in CSR form.
X_train = sp.vstack([sp.load_npz(p) for p in XTRN]).tocsr()

# Train Y: concatenate the per-part label arrays along the first axis
# (multi-label, expected shape = [N, n_labels]; checked below).
Y_train = np.concatenate([np.load(p, allow_pickle=False) for p in YTRN_CAP])
print("X_train:", X_train.shape, "| Y_train:", Y_train.shape)

# Test parts stay separate; they are evaluated part by part later on.
X_test_parts = [sp.load_npz(p).tocsr() for p in XTE]
Y_test_parts = [np.load(p, allow_pickle=False) for p in YTE]
print("Loaded", len(X_test_parts), "test X parts &", len(Y_test_parts), "test Y parts")

# Sanity checks.
# Features and labels must describe the same samples; without this check a
# mismatch between the train X and Y parts only surfaces later, inside fit().
assert X_train.shape[0] == Y_train.shape[0], (
    f"Train row mismatch: X has {X_train.shape[0]} rows, Y has {Y_train.shape[0]}"
)
assert Y_train.shape[1] == len(label_names)
# zip() stops at the shorter list, so unequal part counts would go unnoticed.
assert len(X_test_parts) == len(Y_test_parts), (
    f"Test part mismatch: {len(X_test_parts)} X parts vs {len(Y_test_parts)} Y parts"
)
for Xt, Yt in zip(X_test_parts, Y_test_parts):
    assert Xt.shape[0] == Yt.shape[0]
    assert Yt.shape[1] == len(label_names)

print(" Multi-label data loaded correctly.")


X_train: (1233020, 2381) | Y_train: (1233020, 64)
Loaded 45 test X parts & 45 test Y parts
✅ Multi-label data loaded correctly.


**Cell 3: Train one XGBoost model per label**

In [None]:
from xgboost import XGBClassifier
from collections import Counter
import joblib
import time

# One binary model is trained per label column of Y_train.
n_labels = Y_train.shape[1]
print("Training", n_labels, "label-wise XGBoost models...")

# Hyperparameters shared by every per-label XGBClassifier.
base_params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "tree_method": "hist",
    "eval_metric": "logloss",
    "n_jobs": -1,
    "random_state": 42,
    "n_estimators": 200,
    "max_depth": 4,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
}

def model_path_for(li: int):
    """Return the path inside MODELS_DIR of the saved model for label index `li`.

    The file name is ``xgb_label_NN.joblib``, with the index zero-padded to
    at least two digits.
    """
    file_name = f"xgb_label_{li:02d}.joblib"
    return MODELS_DIR / file_name

total_start = time.time()
trained = skipped = 0

for li in range(n_labels):
    targets = Y_train[:, li].astype(int)
    classes_present = np.unique(targets)

    if len(classes_present) >= 2:
        print(f"[label {li:02d}] '{label_names[li]}' counts:", Counter(targets))

        model = XGBClassifier(**base_params)
        t0 = time.time()
        model.fit(X_train, targets)
        dt = time.time() - t0

        # One joblib file per label; the evaluation cells look models up by this path.
        out_path = model_path_for(li)
        joblib.dump(model, out_path)
        print(f"    saved -> {out_path.name} ({dt:.1f}s)")
        trained += 1
    else:
        # A label column with a single class cannot be used to fit a binary classifier.
        print(f"[label {li:02d}] only class {classes_present[0]} -> skip")
        skipped += 1

print(f"\n Trained {trained} models, skipped {skipped}.")
elapsed_total = time.time() - total_start
print(f"Total training time: {elapsed_total:.1f} sec")


Training 64 label-wise XGBoost models...
[label 00] 'adware' counts: Counter({np.int64(0): 1160488, np.int64(1): 72532})
    saved -> xgb_label_00.joblib (82.2s)
[label 01] 'antiav' counts: Counter({np.int64(0): 1175085, np.int64(1): 57935})
    saved -> xgb_label_01.joblib (80.5s)
[label 02] 'antifw' counts: Counter({np.int64(0): 1195844, np.int64(1): 37176})
    saved -> xgb_label_02.joblib (100.8s)
[label 03] 'autorun' counts: Counter({np.int64(0): 1211925, np.int64(1): 21095})
    saved -> xgb_label_03.joblib (83.9s)
[label 04] 'backdoor' counts: Counter({np.int64(0): 1166086, np.int64(1): 66934})
    saved -> xgb_label_04.joblib (80.5s)
[label 05] 'banker' counts: Counter({np.int64(0): 1177651, np.int64(1): 55369})
    saved -> xgb_label_05.joblib (83.4s)
[label 06] 'bho' counts: Counter({np.int64(0): 1190143, np.int64(1): 42877})
    saved -> xgb_label_06.joblib (87.1s)
[label 07] 'binder' counts: Counter({np.int64(0): 1229527, np.int64(1): 3493})
    saved -> xgb_label_07.joblib

**Cell 4: Evaluate on Test with per-label thresholds (XGBoost)**

In [23]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import json
import numpy as np
import pandas as pd
import joblib

# ---- thresholds ----
USE_THRESHOLDS = False  # set to True when a thresholds file from tuning is available

if not USE_THRESHOLDS:
    # An empty map makes every label fall back to the 0.5 default at lookup time.
    thr_map = {}
    print("Using default threshold 0.5 for all labels.")
else:
    # NOTE(review): the tuning cell further down saves its thresholds as
    # xgb_label_thresholds_beta{BETA:.1f}.json, which differs from this name;
    # adjust the path if that is the file meant here.
    THRESH_PATH = RESULTS_DIR / "xgb_label_thresholds.json"
    with open(THRESH_PATH, "r") as f:
        thr_info = json.load(f)
    # JSON object keys are strings; convert them back to integer label indices.
    thr_map = dict(
        (int(label_key), float(value))
        for label_key, value in thr_info["thresholds"].items()
    )
    print("Loaded thresholds from", THRESH_PATH)

def model_path_for(li: int):
    """Build the joblib path of the per-label model with index `li`.

    Files live in MODELS_DIR and are named ``xgb_label_NN.joblib`` (index
    zero-padded to at least two digits).
    """
    file_name = f"xgb_label_{li:02d}.joblib"
    return MODELS_DIR / file_name

# ---- collect predictions across all test parts ----
# Load every available per-label model exactly once. Loading inside the
# part loop would re-read each joblib file for every test part
# (parts x labels loads) although the models never change between parts.
label_models = {}
for li in range(n_labels):
    model_file = model_path_for(li)
    if model_file.exists():
        label_models[li] = joblib.load(model_file)

y_true_all, y_prob_all, y_pred_all = [], [], []

for i, (Xt, Yt) in enumerate(zip(X_test_parts, Y_test_parts), start=1):
    Xt = Xt.tocsr()
    Yt = Yt.astype(np.int8)

    # Labels without a saved model keep probability 0 and prediction 0.
    probs = np.zeros_like(Yt, dtype=np.float32)
    preds = np.zeros_like(Yt, dtype=np.int8)

    for li, xgb_li in label_models.items():
        p = xgb_li.predict_proba(Xt)[:, 1]  # column 1 = probability of the positive class
        probs[:, li] = p

        # Per-label threshold when one was loaded, otherwise 0.5.
        thr = thr_map.get(li, 0.5)
        preds[:, li] = (p >= thr).astype(np.int8)

    y_true_all.append(Yt)
    y_prob_all.append(probs)
    y_pred_all.append(preds)
    print(f"[test] part {i}/{len(X_test_parts)} done")

# Stack the per-part blocks into single [n_samples, n_labels] arrays.
y_true = np.vstack(y_true_all)
y_prob = np.vstack(y_prob_all)
y_pred = np.vstack(y_pred_all)

# ---- overall multi-label metrics ----
# Precision / recall / F1 for each averaging mode; the fourth returned value
# (support) is not used.
prf_by_average = {
    avg: precision_recall_fscore_support(
        y_true, y_pred, average=avg, zero_division=0
    )[:3]
    for avg in ("micro", "macro", "weighted")
}
p_micro, r_micro, f1_micro = prf_by_average["micro"]
p_macro, r_macro, f1_macro = prf_by_average["macro"]
p_w, r_w, f1_w = prf_by_average["weighted"]

# AUCs (careful with labels that have a single class): a ValueError from
# roc_auc_score is recorded as NaN instead of stopping the cell.
auc_by_average = {}
for avg in ("micro", "weighted"):
    try:
        auc_by_average[avg] = roc_auc_score(y_true, y_prob, average=avg)
    except ValueError:
        auc_by_average[avg] = np.nan
auc_micro = auc_by_average["micro"]
auc_weighted = auc_by_average["weighted"]

# Macro AUC = mean of the per-label AUCs over the labels where it is defined,
# i.e. where both classes occur in y_true.
auc_per_label = []
for li in range(n_labels):
    label_truth = y_true[:, li]
    if np.unique(label_truth).size != 2:
        continue
    try:
        auc_per_label.append(roc_auc_score(label_truth, y_prob[:, li]))
    except ValueError:
        pass
auc_macro = float(np.nanmean(auc_per_label)) if auc_per_label else np.nan

metric_columns = {
    "Micro":    [p_micro, r_micro, f1_micro, auc_micro],
    "Macro":    [p_macro, r_macro, f1_macro, auc_macro],
    "Weighted": [p_w,     r_w,     f1_w,     auc_weighted],
}
summary = pd.DataFrame(
    metric_columns,
    index=["Precision", "Recall", "F1-score", "AUC"],
).round(4)

display(summary)

# Save (same style as the RF notebook).
OUT_DIR = RESULTS_DIR
OUT_DIR.mkdir(parents=True, exist_ok=True)
summary.to_csv(OUT_DIR / "overall_metrics_xgb_capped_multi_label.csv")
summary.to_json(
    OUT_DIR / "overall_metrics_xgb_capped_multi_label.json",
    orient="index",
    indent=2,
)

print("Saved XGB test summary -> overall_metrics_xgb_capped_multi_label.(csv/json)")


Using default threshold 0.5 for all labels.
[test] part 1/45 done
[test] part 2/45 done
[test] part 3/45 done
[test] part 4/45 done
[test] part 5/45 done
[test] part 6/45 done
[test] part 7/45 done
[test] part 8/45 done
[test] part 9/45 done
[test] part 10/45 done
[test] part 11/45 done
[test] part 12/45 done
[test] part 13/45 done
[test] part 14/45 done
[test] part 15/45 done
[test] part 16/45 done
[test] part 17/45 done
[test] part 18/45 done
[test] part 19/45 done
[test] part 20/45 done
[test] part 21/45 done
[test] part 22/45 done
[test] part 23/45 done
[test] part 24/45 done
[test] part 25/45 done
[test] part 26/45 done
[test] part 27/45 done
[test] part 28/45 done
[test] part 29/45 done
[test] part 30/45 done
[test] part 31/45 done
[test] part 32/45 done
[test] part 33/45 done
[test] part 34/45 done
[test] part 35/45 done
[test] part 36/45 done
[test] part 37/45 done
[test] part 38/45 done
[test] part 39/45 done
[test] part 40/45 done
[test] part 41/45 done
[test] part 42/45 done



Unnamed: 0,Micro,Macro,Weighted
Precision,0.5912,0.4352,0.5082
Recall,0.256,0.2382,0.256
F1-score,0.3573,0.2694,0.2904
AUC,0.8238,0.8164,0.7892


Saved XGB test summary -> overall_metrics_xgb_capped_multi_label.(csv/json)


**Cell 5: Calibration set & per-label β-threshold tuning**


In [None]:
import numpy as np
import json
from sklearn.metrics import precision_recall_fscore_support
import joblib

# Use part of the training data as the calibration set for threshold tuning.
# NOTE(review): these rows come from the same X_train the per-label models were
# fitted on, so precision/recall measured here are in-sample; consider a
# held-out split if the tuned thresholds should generalise to the test parts.
np.random.seed(42)

n_train_rows = X_train.shape[0]
cal_size = min(300000, n_train_rows)   # can be changed if desired
cal_indices = np.random.choice(n_train_rows, size=cal_size, replace=False)

X_cal, Y_cal = X_train[cal_indices], Y_train[cal_indices]

print("Calibration set:", X_cal.shape, Y_cal.shape)

n_labels = Y_train.shape[1]
BETA = 1.5   # beta > 1 gives more weight to recall

def model_path_for(li: int):
    """Return where the model for label index `li` is stored.

    The path is MODELS_DIR joined with ``xgb_label_NN.joblib`` (index
    zero-padded to at least two digits).
    """
    file_name = f"xgb_label_{li:02d}.joblib"
    return MODELS_DIR / file_name

def f_beta(prec, rec, beta):
    """Compute the F-beta score from a precision and a recall value.

    F_beta = (1 + beta^2) * prec * rec / (beta^2 * prec + rec)

    Args:
        prec: precision.
        rec: recall.
        beta: weight of recall relative to precision (beta > 1 favours recall).

    Returns:
        The F-beta score, or 0.0 when the denominator is zero. That covers
        prec == rec == 0 and also beta == 0 with rec == 0, which would
        otherwise divide by zero.
    """
    b2 = beta * beta
    denominator = b2 * prec + rec
    if denominator == 0:
        return 0.0
    return (1 + b2) * prec * rec / denominator

thr_grid = np.linspace(0.05, 0.95, 19)  # 0.05, 0.10, ..., 0.95

best_thresholds = {}
report_rows = []

for li in range(n_labels):
    model_file = model_path_for(li)
    if not model_file.exists():
        # No saved model for this label (it was skipped during training).
        continue

    cal_targets = Y_cal[:, li].astype(int)
    if np.unique(cal_targets).size < 2:
        # The calibration sample lacks one of the two classes -> keep the default 0.5.
        best_thresholds[li] = 0.5
        report_rows.append((li, label_names[li], 0.5, 0.0, 0.0, 0.0))
        continue

    cal_scores = joblib.load(model_file).predict_proba(X_cal)[:, 1]

    # Score every grid threshold as (F_beta, threshold, precision, recall).
    candidates = []
    for grid_thr in thr_grid:
        cal_pred = (cal_scores >= grid_thr).astype(int)
        prec, rec, _, _ = precision_recall_fscore_support(
            cal_targets, cal_pred, average="binary", zero_division=0
        )
        candidates.append(
            (f_beta(prec, rec, BETA), float(grid_thr), float(prec), float(rec))
        )

    # max() returns the first maximal item, so ties go to the lowest threshold,
    # exactly like updating the best only on a strictly greater score.
    best_fb, best_thr, best_p, best_r = max(candidates, key=lambda cand: cand[0])

    best_thresholds[li] = best_thr
    report_rows.append((li, label_names[li], best_thr, best_p, best_r, best_fb))

# Save thresholds. JSON object keys must be strings, so label indices are stringified.
thresholds_by_key = {}
for label_idx, chosen_thr in best_thresholds.items():
    thresholds_by_key[str(label_idx)] = chosen_thr
thr_info = {"beta": BETA, "thresholds": thresholds_by_key}

THRESH_PATH = RESULTS_DIR / f"xgb_label_thresholds_beta{BETA:.1f}.json"
THRESH_PATH.write_text(json.dumps(thr_info, indent=2))

print(f"\n Saved per-label thresholds -> {THRESH_PATH}")

# Small table to look at the first 10 labels.
import pandas as pd
report_columns = ["label_id", "label_name", "best_thr", "P", "R", f"F_beta_{BETA:.1f}"]
thr_df = pd.DataFrame(report_rows, columns=report_columns)
display(thr_df.head(10))


Calibration set: (300000, 2381) (300000, 64)

✅ Saved per-label thresholds -> /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_xgb_capped_eval/xgb_label_thresholds_beta1.5.json


Unnamed: 0,label_id,label_name,best_thr,P,R,F_beta_1.5
0,0,adware,0.25,0.820513,0.924771,0.889976
1,1,antiav,0.15,0.906283,0.908536,0.907841
2,2,antifw,0.75,0.998014,0.999889,0.999312
3,3,autorun,0.15,0.648714,0.838584,0.769303
4,4,backdoor,0.15,0.539407,0.671996,0.624745
5,5,banker,0.25,0.816415,0.870838,0.853335
6,6,bho,0.25,0.914827,0.924727,0.921658
7,7,binder,0.2,0.756632,0.767251,0.763952
8,8,blocker,0.15,0.619695,0.691141,0.667463
9,9,bundler,0.15,0.67619,0.78022,0.744956


**Cell 6: Re-evaluate the TEST with tuned thresholds**


In [None]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import numpy as np
import pandas as pd
import json
import joblib

# Load the thresholds written by the previous cell (THRESH_PATH is set there).
thr_info = json.loads(THRESH_PATH.read_text())

beta_used = thr_info["beta"]
# JSON object keys are strings; convert them back to integer label indices.
thr_map = dict(
    (int(label_key), float(value))
    for label_key, value in thr_info["thresholds"].items()
)
print(f"Loaded thresholds (beta={beta_used}) from {THRESH_PATH.name}")

n_labels = Y_train.shape[1]

def model_path_for(li: int):
    """Map a label index to the joblib file of its model inside MODELS_DIR.

    The file is named ``xgb_label_NN.joblib`` (index zero-padded to at least
    two digits).
    """
    file_name = f"xgb_label_{li:02d}.joblib"
    return MODELS_DIR / file_name

# Load every available per-label model exactly once. Loading inside the
# part loop would re-read each joblib file for every test part
# (parts x labels loads) although the models never change between parts.
label_models = {}
for li in range(n_labels):
    model_file = model_path_for(li)
    if model_file.exists():
        label_models[li] = joblib.load(model_file)

y_true_all, y_prob_all, y_pred_all = [], [], []

for i, (Xt, Yt) in enumerate(zip(X_test_parts, Y_test_parts), start=1):
    Xt = Xt.tocsr()
    Yt = Yt.astype(np.int8)

    # Labels without a saved model keep probability 0 and prediction 0.
    probs = np.zeros_like(Yt, dtype=np.float32)
    preds = np.zeros_like(Yt, dtype=np.int8)

    for li, xgb_li in label_models.items():
        p = xgb_li.predict_proba(Xt)[:, 1]  # column 1 = probability of the positive class
        probs[:, li] = p

        # Tuned per-label threshold; labels missing from the map use 0.5.
        thr = thr_map.get(li, 0.5)
        preds[:, li] = (p >= thr).astype(np.int8)

    y_true_all.append(Yt)
    y_prob_all.append(probs)
    y_pred_all.append(preds)
    print(f"[test+thr] part {i}/{len(X_test_parts)} done")

# Stack the per-part blocks into single [n_samples, n_labels] arrays.
y_true = np.vstack(y_true_all)
y_prob = np.vstack(y_prob_all)
y_pred = np.vstack(y_pred_all)

# ---- overall metrics ----
# Precision / recall / F1 for each averaging mode; the fourth returned value
# (support) is not used.
prf_by_average = {
    avg: precision_recall_fscore_support(
        y_true, y_pred, average=avg, zero_division=0
    )[:3]
    for avg in ("micro", "macro", "weighted")
}
p_micro, r_micro, f1_micro = prf_by_average["micro"]
p_macro, r_macro, f1_macro = prf_by_average["macro"]
p_w, r_w, f1_w = prf_by_average["weighted"]

# AUCs (same handling as in the untuned evaluation): a ValueError from
# roc_auc_score is recorded as NaN.
auc_by_average = {}
for avg in ("micro", "weighted"):
    try:
        auc_by_average[avg] = roc_auc_score(y_true, y_prob, average=avg)
    except ValueError:
        auc_by_average[avg] = np.nan
auc_micro = auc_by_average["micro"]
auc_weighted = auc_by_average["weighted"]

# Macro AUC = mean of the per-label AUCs over labels with both classes in y_true.
auc_per_label = []
for li in range(n_labels):
    label_truth = y_true[:, li]
    if np.unique(label_truth).size != 2:
        continue
    try:
        auc_per_label.append(roc_auc_score(label_truth, y_prob[:, li]))
    except ValueError:
        pass
auc_macro = float(np.nanmean(auc_per_label)) if auc_per_label else np.nan

metric_columns = {
    "Micro":    [p_micro, r_micro, f1_micro, auc_micro],
    "Macro":    [p_macro, r_macro, f1_macro, auc_macro],
    "Weighted": [p_w,     r_w,     f1_w,     auc_weighted],
}
summary_thr = pd.DataFrame(
    metric_columns,
    index=["Precision", "Recall", "F1-score", "AUC"],
).round(4)

display(summary_thr)

# Save tuned results; the beta used for tuning is part of the file names.
OUT_DIR = RESULTS_DIR
OUT_DIR.mkdir(parents=True, exist_ok=True)
csv_path  = OUT_DIR / f"overall_metrics_xgb_capped_multi_label_beta{beta_used:.1f}.csv"
json_path = OUT_DIR / f"overall_metrics_xgb_capped_multi_label_beta{beta_used:.1f}.json"

summary_thr.to_csv(csv_path)
summary_thr.to_json(json_path, orient="index", indent=2)

for saved_path in (csv_path, json_path):
    print("Saved tuned XGB test summary ->", saved_path)


Loaded thresholds (beta=1.5) from xgb_label_thresholds_beta1.5.json
[test+thr] part 1/45 done
[test+thr] part 2/45 done
[test+thr] part 3/45 done
[test+thr] part 4/45 done
[test+thr] part 5/45 done
[test+thr] part 6/45 done
[test+thr] part 7/45 done
[test+thr] part 8/45 done
[test+thr] part 9/45 done
[test+thr] part 10/45 done
[test+thr] part 11/45 done
[test+thr] part 12/45 done
[test+thr] part 13/45 done
[test+thr] part 14/45 done
[test+thr] part 15/45 done
[test+thr] part 16/45 done
[test+thr] part 17/45 done
[test+thr] part 18/45 done
[test+thr] part 19/45 done
[test+thr] part 20/45 done
[test+thr] part 21/45 done
[test+thr] part 22/45 done
[test+thr] part 23/45 done
[test+thr] part 24/45 done
[test+thr] part 25/45 done
[test+thr] part 26/45 done
[test+thr] part 27/45 done
[test+thr] part 28/45 done
[test+thr] part 29/45 done
[test+thr] part 30/45 done
[test+thr] part 31/45 done
[test+thr] part 32/45 done
[test+thr] part 33/45 done
[test+thr] part 34/45 done
[test+thr] part 35/45 d



Unnamed: 0,Micro,Macro,Weighted
Precision,0.3508,0.3632,0.405
Recall,0.3206,0.2884,0.3206
F1-score,0.335,0.2804,0.3093
AUC,0.8238,0.8164,0.7892


Saved tuned XGB test summary -> /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_xgb_capped_eval/overall_metrics_xgb_capped_multi_label_beta1.5.csv
Saved tuned XGB test summary -> /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_xgb_capped_eval/overall_metrics_xgb_capped_multi_label_beta1.5.json
