## TabNet

**Cell 1: Imports, Paths, Labels and Part files**

In [2]:
from pathlib import Path
from glob import glob
import json
import re
import time

import numpy as np
import scipy.sparse as sp
import pandas as pd

import torch
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    fbeta_score,
)

RANDOM_STATE = 42

# --- Paths (ίδια λογική με τα άλλα notebooks) ---
BASE = Path.home() / "Desktop" / "Malware Project"
VEC  = BASE / "data" / "behavior_vectors_paper"

MODELS_DIR  = VEC / "models_tabnet_capped"
RESULTS_DIR = VEC / "results_tabnet_capped_eval"
MODELS_DIR.mkdir(exist_ok=True, parents=True)
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

print("BASE:", BASE)
print("VEC:", VEC)
print("MODELS_DIR:", MODELS_DIR)
print("RESULTS_DIR:", RESULTS_DIR)

# --- Load label names (προτιμάμε balanced map αν υπάρχει) ---
label_names = None
for p in [VEC / "label_map_balanced.json", VEC / "label_map.json"]:
    if p.exists():
        with open(p, "r") as f:
            data = json.load(f)
        if isinstance(data, dict) and "labels" in data:
            label_names = data["labels"]
            print("Loaded labels from:", p.name)
            break

if label_names is None:
    raise FileNotFoundError(
        "Δεν βρέθηκε ούτε label_map_balanced.json ούτε label_map.json στο VEC."
    )

n_labels = len(label_names)
print(f"n_labels = {n_labels}")
print("First labels:", ", ".join(label_names[:10]) + (" ..." if n_labels > 10 else ""))

# --- Part files: X capped + Y capped (όπως στα άλλα capped runs) ---
XTRN = sorted(map(Path, glob(str(VEC / "train_part_capped*.npz"))))
YTRN_CAP = sorted(map(Path, glob(str(VEC / "y_train_part_capped*.npy"))))

XTE = sorted(map(Path, glob(str(VEC / "test_part_capped*.npz"))))

def _index_from_name(p: Path) -> int:
    m = re.search(r"(\d+)$", p.stem)
    if not m:
        raise ValueError(f"No trailing index found in filename: {p.name}")
    return int(m.group(1))

idx_te = [_index_from_name(p) for p in XTE]
all_YTE = list(map(Path, glob(str(VEC / "y_test_part_capped*.npy"))))
map_Y = {_index_from_name(p): p for p in all_YTE}
YTE = [map_Y[i] for i in idx_te if i in map_Y]

print("Train parts  X:", len(XTRN), "| Y:", len(YTRN_CAP))
print("Test  parts   X:", len(XTE),  "| Y:", len(YTE))

assert len(XTRN) == len(YTRN_CAP), "Αναντιστοιχία #train X parts με #train Y parts"

# Γρήγορος sanity check στις πρώτες παρτίδες
def _rows_X(path): return sp.load_npz(path).shape[0]
def _rows_Y(path): return np.load(path, allow_pickle=False).shape[0]

if XTRN and YTRN_CAP:
    print(f"[train part 0] rows X={_rows_X(XTRN[0])}, Y={_rows_Y(YTRN_CAP[0])}")
if XTE and YTE:
    print(f"[test  part 0] rows X={_rows_X(XTE[0])}, Y={_rows_Y(YTE[0])}")

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


BASE: /Users/georgektenas/Desktop/Malware Project
VEC: /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper
MODELS_DIR: /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/models_tabnet_capped
RESULTS_DIR: /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_tabnet_capped_eval
Loaded labels from: label_map_balanced.json
n_labels = 64
First labels: adware, antiav, antifw, autorun, backdoor, banker, bho, binder, blocker, bundler ...
Train parts  X: 551 | Y: 551
Test  parts   X: 45 | Y: 45
[train part 0] rows X=2851, Y=2851
[test  part 0] rows X=12, Y=12
Using device: cpu


**Cell 2: Load full Train/Test matrices**

In [3]:
# Φόρτωση όλων των parts σε ενιαία matrices

# Train
X_train_parts = [sp.load_npz(p) for p in XTRN]
Y_train_parts = [np.load(p, allow_pickle=False) for p in YTRN_CAP]

X_train = sp.vstack(X_train_parts).tocsr()
Y_train = np.concatenate(Y_train_parts, axis=0)

# Test
X_test_parts = [sp.load_npz(p) for p in XTE]
Y_test_parts = [np.load(p, allow_pickle=False) for p in YTE]

X_test = sp.vstack(X_test_parts).tocsr()
Y_test = np.concatenate(Y_test_parts, axis=0)

print("X_train:", X_train.shape, "| Y_train:", Y_train.shape)
print("X_test: ", X_test.shape,  "| Y_test: ", Y_test.shape)

assert Y_train.shape[1] == n_labels, "Y_train columns != n_labels"
assert Y_test.shape[1] == n_labels,  "Y_test columns != n_labels"

print("OK: Multi-label capped train/test φορτώθηκαν σωστά.")


X_train: (1233020, 2381) | Y_train: (1233020, 64)
X_test:  (743, 2381) | Y_test:  (743, 64)
OK: Multi-label capped train/test φορτώθηκαν σωστά.


**Cell 3: Train / Val split**

In [4]:
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.10,
    random_state=RANDOM_STATE,
    shuffle=True
    # multi-label -> stratify=None
)

print("X_tr:", X_tr.shape, "| Y_tr:", Y_tr.shape)
print("X_val:", X_val.shape, "| Y_val:", Y_val.shape)


X_tr: (1109718, 2381) | Y_tr: (1109718, 64)
X_val: (123302, 2381) | Y_val: (123302, 64)


**Cell 4: Convert sparse to dense float 32 and define and train TabNet with subsample for compution cost(baseline capped, lighter)**

In [9]:
import numpy as np
import time
import gc
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier

# --- Subsample για πρακτικό training σε CPU ---
max_train_samples = 300000
max_val_samples = 60000
rng = np.random.RandomState(RANDOM_STATE)

def subsample_csr(X, Y, max_n, rng):
    if X.shape[0] > max_n:
        idx = rng.choice(X.shape[0], size=max_n, replace=False)
        return X[idx], Y[idx]
    return X, Y

X_tr_sub, Y_tr_sub = subsample_csr(X_tr, Y_tr, max_train_samples, rng)
X_val_sub, Y_val_sub = subsample_csr(X_val, Y_val, max_val_samples, rng)

print("X_tr_sub:", X_tr_sub.shape, "| Y_tr_sub:", Y_tr_sub.shape)
print("X_val_sub:", X_val_sub.shape, "| Y_val_sub:", Y_val_sub.shape)

# --- Dense μόνο για subsets + full test ---
X_tr_dense   = X_tr_sub.toarray().astype(np.float32)
X_val_dense  = X_val_sub.toarray().astype(np.float32)
X_test_dense = X_test.toarray().astype(np.float32)

Y_tr_sub = Y_tr_sub.astype(np.int32)
Y_val_sub = Y_val_sub.astype(np.int32)
Y_test = Y_test.astype(np.int32)

n_features = X_tr_dense.shape[1]
n_tasks    = Y_tr_sub.shape[1]
print("n_features:", n_features)
print("n_tasks (labels):", n_tasks)

# --- Hyperparameters (regularized, ασφαλή για overfitting/CPU) ---
tabnet_params = dict(
    n_d=16,
    n_a=16,
    n_steps=4,
    gamma=1.5,
    n_independent=2,
    n_shared=2,
    lambda_sparse=1e-3,  # πιο δυνατό sparsity -> λιγότερο overfitting
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(
        lr=2e-3,
        weight_decay=1e-5   # L2 regularization
    ),
    mask_type="sparsemax",
    seed=RANDOM_STATE,
    verbose=1
)

train_params = dict(
    max_epochs=40,
    patience=7,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    eval_metric=["auc"],
    eval_name=["val"],
    eval_set=[(X_val_dense, Y_val_sub)],
    compute_importance=False   # Κρίσιμο για να μην κάνει τεράστιο explain στο τέλος
)

tabnet_mt = TabNetMultiTaskClassifier(**tabnet_params)

print("Starting TabNet training (subset, regularized config)...")
start_time = time.time()

tabnet_mt.fit(
    X_tr_dense,
    Y_tr_sub,
    **train_params
)

train_time = time.time() - start_time
print(f"Training time (TabNet subset, regularized): {train_time:.2f} seconds")

# Αδειάζουμε train dense για μνήμη (κρατάμε val/test για evaluation)
del X_tr_dense
gc.collect()

# --- Save model για να μην το ξανατρέχεις ---
MODEL_PATH = MODELS_DIR / "tabnet_multitask_capped_subset_regularized"
tabnet_mt.save_model(str(MODEL_PATH))
print("Saved TabNet model ->", MODEL_PATH)


X_tr_sub: (300000, 2381) | Y_tr_sub: (300000, 64)
X_val_sub: (60000, 2381) | Y_val_sub: (60000, 64)
n_features: 2381
n_tasks (labels): 64
Starting TabNet training (subset, regularized config)...




epoch 0  | loss: 0.29501 | val_auc: 0.57058 |  0:03:09s
epoch 1  | loss: 0.09135 | val_auc: 0.64699 |  0:06:14s
epoch 2  | loss: 0.08474 | val_auc: 0.71021 |  0:09:19s
epoch 3  | loss: 0.081   | val_auc: 0.75194 |  0:12:23s
epoch 4  | loss: 0.07642 | val_auc: 0.78528 |  0:15:35s
epoch 5  | loss: 0.07289 | val_auc: 0.80351 |  0:18:45s
epoch 6  | loss: 0.06996 | val_auc: 0.81827 |  0:21:50s
epoch 7  | loss: 0.06768 | val_auc: 0.82669 |  0:24:54s
epoch 8  | loss: 0.06562 | val_auc: 0.84522 |  0:27:56s
epoch 9  | loss: 0.06381 | val_auc: 0.85383 |  0:31:01s
epoch 10 | loss: 0.06185 | val_auc: 0.86198 |  0:34:03s
epoch 11 | loss: 0.06012 | val_auc: 0.87191 |  0:37:05s
epoch 12 | loss: 0.05847 | val_auc: 0.87883 |  0:40:05s
epoch 13 | loss: 0.0569  | val_auc: 0.87853 |  0:43:07s
epoch 14 | loss: 0.05541 | val_auc: 0.89171 |  0:46:08s
epoch 15 | loss: 0.05383 | val_auc: 0.89682 |  0:49:10s
epoch 16 | loss: 0.05242 | val_auc: 0.90087 |  0:52:12s
epoch 17 | loss: 0.05093 | val_auc: 0.90778 |  0



**Cell 5: Evaluation with threshold=0.5**

In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

RESULTS_DIR.mkdir(exist_ok=True, parents=True)

def compute_all_metrics(y_true, y_pred, y_prob):
    metrics = {}

    for avg in ["micro", "macro", "weighted"]:
        metrics[f"{avg}_precision"] = precision_score(
            y_true, y_pred, average=avg, zero_division=0
        )
        metrics[f"{avg}_recall"] = recall_score(
            y_true, y_pred, average=avg, zero_division=0
        )
        metrics[f"{avg}_f1"] = f1_score(
            y_true, y_pred, average=avg, zero_division=0
        )

    for avg in ["micro", "macro", "weighted"]:
        try:
            metrics[f"{avg}_auc"] = roc_auc_score(
                y_true, y_prob, average=avg
            )
        except ValueError:
            metrics[f"{avg}_auc"] = np.nan

    return metrics

def make_pretty_table(metrics_dict, caption):
    index = ["Precision", "Recall", "F1-score", "AUC"]
    columns = ["Micro", "Macro", "Weighted"]

    data = [
        [
            metrics_dict["micro_precision"],
            metrics_dict["macro_precision"],
            metrics_dict["weighted_precision"],
        ],
        [
            metrics_dict["micro_recall"],
            metrics_dict["macro_recall"],
            metrics_dict["weighted_recall"],
        ],
        [
            metrics_dict["micro_f1"],
            metrics_dict["macro_f1"],
            metrics_dict["weighted_f1"],
        ],
        [
            metrics_dict["micro_auc"],
            metrics_dict["macro_auc"],
            metrics_dict["weighted_auc"],
        ],
    ]

    df = pd.DataFrame(data, index=index, columns=columns)
    display(df.round(6).style.set_caption(caption))
    return df

# --- Probabilities από TabNet ---
val_prob_list  = tabnet_mt.predict_proba(X_val_dense)
test_prob_list = tabnet_mt.predict_proba(X_test_dense)

val_proba = np.column_stack([p[:, 1] for p in val_prob_list])
test_proba = np.column_stack([p[:, 1] for p in test_prob_list])

print("val_proba shape:", val_proba.shape)
print("test_proba shape:", test_proba.shape)

# --- Threshold 0.5 ---
thr_default = 0.5
val_pred_default = (val_proba >= thr_default).astype(int)
test_pred_default = (test_proba >= thr_default).astype(int)

# --- Metrics ---
val_metrics_default = compute_all_metrics(Y_val_sub, val_pred_default, val_proba)
test_metrics_default = compute_all_metrics(Y_test,     test_pred_default, test_proba)

# Pretty tables σε μορφή όπως στο screenshot
val_table_pretty = make_pretty_table(
    val_metrics_default,
    "TabNet Multi-Task (VAL set, threshold = 0.5)"
)

test_table_pretty = make_pretty_table(
    test_metrics_default,
    "TabNet Multi-Task (TEST set, threshold = 0.5)"
)


# --- Save raw metrics και tables ---
# 1) raw dicts σε csv-friendly μορφή (μία γραμμή για VAL, μία για TEST)
val_series = pd.Series(val_metrics_default, name="VAL_TabNet_thr0.5")
test_series = pd.Series(test_metrics_default, name="TEST_TabNet_thr0.5")
summary_df = pd.concat([val_series, test_series], axis=1).T

summary_df.to_csv(
    RESULTS_DIR / "tabnet_capped_summary_thr0.5.csv",
    float_format="%.6f",
    index=True
)

# 2) pretty πίνακες χωριστά (όπως τους βλέπεις)
val_table_pretty.to_csv(
    RESULTS_DIR / "tabnet_capped_val_pretty_thr0.5.csv",
    float_format="%.6f"
)
test_table_pretty.to_csv(
    RESULTS_DIR / "tabnet_capped_test_pretty_thr0.5.csv",
    float_format="%.6f"
)

print("\nSaved TabNet metrics (thr=0.5) in:", RESULTS_DIR)


val_proba shape: (60000, 64)
test_proba shape: (743, 64)




Unnamed: 0,Micro,Macro,Weighted
Precision,0.879379,0.720007,0.852301
Recall,0.58992,0.434152,0.58992
F1-score,0.706137,0.508548,0.67776
AUC,0.980656,0.963747,0.964681


Unnamed: 0,Micro,Macro,Weighted
Precision,0.488064,0.28847,0.319541
Recall,0.220096,0.192268,0.220096
F1-score,0.30338,0.212873,0.239116
AUC,0.777607,,0.746854



Saved TabNet metrics (thr=0.5) in: /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_tabnet_capped_eval


In [14]:
# Cell 6: Evaluation (VAL & TEST), tables + save to RESULTS_DIR

import numpy as np
import pandas as pd
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

THRESHOLD = 0.5

def probs_from_tabnet_multitask(proba_list):
    """
    TabNetMultiTaskClassifier.predict_proba επιστρέφει λίστα μήκους L,
    όπου κάθε στοιχείο έχει shape (N, 2) = [p(y=0), p(y=1)] για κάθε label.
    Αυτό το helper φτιάχνει matrix (N, L) με p(y=1) ανά label.
    """
    return np.stack([p[:, 1] for p in proba_list], axis=1)

def evaluate_multilabel(y_true, y_prob, threshold=0.5):
    """
    Υπολογίζει micro/macro/weighted Precision, Recall, F1, AUC
    για multi-label binary πρόβλημα.
    """
    y_pred = (y_prob >= threshold).astype(int)

    metrics = {}

    # Precision
    metrics["micro_precision"]    = precision_score(y_true, y_pred, average="micro", zero_division=0)
    metrics["macro_precision"]    = precision_score(y_true, y_pred, average="macro", zero_division=0)
    metrics["weighted_precision"] = precision_score(y_true, y_pred, average="weighted", zero_division=0)

    # Recall
    metrics["micro_recall"]    = recall_score(y_true, y_pred, average="micro", zero_division=0)
    metrics["macro_recall"]    = recall_score(y_true, y_pred, average="macro", zero_division=0)
    metrics["weighted_recall"] = recall_score(y_true, y_pred, average="weighted", zero_division=0)

    # F1
    metrics["micro_f1"]    = f1_score(y_true, y_pred, average="micro", zero_division=0)
    metrics["macro_f1"]    = f1_score(y_true, y_pred, average="macro", zero_division=0)
    metrics["weighted_f1"] = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    # AUC
    try:
        metrics["micro_auc"]    = roc_auc_score(y_true, y_prob, average="micro")
    except ValueError:
        metrics["micro_auc"] = np.nan

    try:
        metrics["macro_auc"]    = roc_auc_score(y_true, y_prob, average="macro")
    except ValueError:
        metrics["macro_auc"] = np.nan

    try:
        metrics["weighted_auc"] = roc_auc_score(y_true, y_prob, average="weighted")
    except ValueError:
        metrics["weighted_auc"] = np.nan

    return metrics

# ---------- VAL ----------
val_proba_list = tabnet_mt.predict_proba(X_val_dense)
val_proba = probs_from_tabnet_multitask(val_proba_list)
val_metrics = evaluate_multilabel(Y_val, val_proba, threshold=THRESHOLD)

# ---------- TEST ----------
test_proba_list = tabnet_mt.predict_proba(X_test_dense)
test_proba = probs_from_tabnet_multitask(test_proba_list)
test_metrics = evaluate_multilabel(Y_test, test_proba, threshold=THRESHOLD)

# ---------- Build pretty tables (όπως στα προηγούμενα) ----------

val_table = pd.DataFrame({
    "Micro": [
        val_metrics["micro_precision"],
        val_metrics["micro_recall"],
        val_metrics["micro_f1"],
        val_metrics["micro_auc"],
    ],
    "Macro": [
        val_metrics["macro_precision"],
        val_metrics["macro_recall"],
        val_metrics["macro_f1"],
        val_metrics["macro_auc"],
    ],
    "Weighted": [
        val_metrics["weighted_precision"],
        val_metrics["weighted_recall"],
        val_metrics["weighted_f1"],
        val_metrics["weighted_auc"],
    ],
}, index=["Precision", "Recall", "F1-score", "AUC"])

test_table = pd.DataFrame({
    "Micro": [
        test_metrics["micro_precision"],
        test_metrics["micro_recall"],
        test_metrics["micro_f1"],
        test_metrics["micro_auc"],
    ],
    "Macro": [
        test_metrics["macro_precision"],
        test_metrics["macro_recall"],
        test_metrics["macro_f1"],
        test_metrics["macro_auc"],
    ],
    "Weighted": [
        test_metrics["weighted_precision"],
        test_metrics["weighted_recall"],
        test_metrics["weighted_f1"],
        test_metrics["weighted_auc"],
    ],
}, index=["Precision", "Recall", "F1-score", "AUC"])

# Round μόνο για εμφάνιση
val_table_rounded = val_table.round(4)
test_table_rounded = test_table.round(4)

print("=== TabNet Multi-Task (VAL, thr=0.5) ===")
display(val_table_rounded.style.set_table_styles(
    [{'selector': 'th', 'props': [('text-align', 'center')]},
     {'selector': 'caption', 'props': [('text-align', 'center'),
                                      ('font-size', '14px'),
                                      ('font-weight', 'bold')]}]
).set_caption("TabNet Multi-Task (VAL, thr=0.5)"))

print("\n=== TabNet Multi-Task (TEST, thr=0.5) ===")
display(test_table_rounded.style.set_table_styles(
    [{'selector': 'th', 'props': [('text-align', 'center')]},
     {'selector': 'caption', 'props': [('text-align', 'center'),
                                      ('font-size', '14px'),
                                      ('font-weight', 'bold')]}]
).set_caption("TabNet Multi-Task (TEST, thr=0.5)"))

# ---------- Save metrics to RESULTS_DIR ----------

RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# raw dicts σε JSON
pd.Series(val_metrics).to_json(RESULTS_DIR / "tabnet_multitask_capped_val_metrics_raw.json")
pd.Series(test_metrics).to_json(RESULTS_DIR / "tabnet_multitask_capped_test_metrics_raw.json")

# formatted tables σε CSV
val_table.to_csv(RESULTS_DIR / "tabnet_multitask_capped_val_summary.csv", float_format="%.6f")
test_table.to_csv(RESULTS_DIR / "tabnet_multitask_capped_test_summary.csv", float_format="%.6f")

print(f"\nSaved TabNet metrics in: {RESULTS_DIR}")




=== TabNet Multi-Task (VAL, thr=0.5) ===


Matplotlib is building the font cache; this may take a moment.


Unnamed: 0,Micro,Macro,Weighted
Precision,0.8794,0.7823,0.865
Recall,0.6034,0.4591,0.6034
F1-score,0.7157,0.54,0.6905
AUC,0.9834,0.9676,0.9685



=== TabNet Multi-Task (TEST, thr=0.5) ===


Unnamed: 0,Micro,Macro,Weighted
Precision,0.4768,0.2926,0.323
Recall,0.2093,0.185,0.2093
F1-score,0.2909,0.2062,0.2302
AUC,0.7722,,0.7272



Saved TabNet metrics in: /Users/georgektenas/Desktop/Malware Project/data/behavior_vectors_paper/results_tabnet_capped_eval
