**NEW**

**Cell 1: Imports**


In [1]:
from pathlib import Path
import json, numpy as np
from tqdm import tqdm
from ember import PEFeatureExtractor
from scipy import sparse as sp


**Cell 2: Paths that do not depend on the username**


In [2]:
# 1) Project paths, anchored at the current user's home directory so that
#    no username is hard-coded in the notebook.
BASE = Path.home().joinpath("Desktop", "Malware Project")

# Feature dumps: train_features.jsonl / test_features.jsonl
DATA = BASE.joinpath("data", "behavior")
# Behavior tag records: behavior_train.jsonl / behavior_test.jsonl
TAGS = BASE.joinpath("data", "tags")
# The newly generated files are written here.
OUT = BASE.joinpath("data", "behavior_vectors_paper")

# Create the output directory (with any missing parents); no error if it already exists.
OUT.mkdir(parents=True, exist_ok=True)


**Cell 3: Helpers**

In [3]:
# 2) helpers
def stream_jsonl(fp):
    """Lazily yield one parsed JSON value per non-blank line of a JSON Lines file.

    Args:
        fp: Path (str or path-like) of the file to read.

    Yields:
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(fp, "r", encoding="utf-8") as f:
        for line in f:
            # Whitespace-only lines (e.g. an extra newline at EOF) are not
            # records; feeding them to json.loads would raise.
            if not line.strip():
                continue
            yield json.loads(line)

def build_md5_set(tag_fp):
    """Collect the distinct, non-empty ``md5`` values found in a tags JSONL file.

    Args:
        tag_fp: Path of the JSONL file; records are read through ``stream_jsonl``.

    Returns:
        A ``set`` holding the ``md5`` value of every record where it is truthy.
        Records with a missing or falsy ``md5`` contribute nothing.
    """
    md5_values = (record.get("md5") for record in stream_jsonl(tag_fp))
    return {value for value in md5_values if value}

def _save_x_part(rows, out_dir, split_name, part_idx):
    """Stack ``rows`` into one CSR matrix and write it as part number ``part_idx``."""
    X_csr = sp.csr_matrix(np.vstack(rows))
    sp.save_npz(out_dir / f"{split_name}_part{part_idx:03d}.npz", X_csr, compressed=False)


def vectorize_split(split_name, feat_fp, keep_set, out_dir, chunk_rows=5000):
    """Vectorize the kept records of a features JSONL file into chunked sparse part files.

    Records are streamed from ``feat_fp``. A record is used only when its
    ``md5`` is in ``keep_set`` and ``PEFeatureExtractor.process_raw_features``
    succeeds on it. Each accepted vector is cast to float32, and every
    ``chunk_rows`` vectors are written to
    ``out_dir / f"{split_name}_partNNN.npz"`` as an uncompressed CSR matrix.
    A final, shorter part is written for any remainder.

    Args:
        split_name: Name used in the progress bar, the log lines and the file names.
        feat_fp: Path of the raw-features JSONL file.
        keep_set: Container of md5 values to keep (membership is tested with ``in``).
        out_dir: ``pathlib.Path`` of the output directory (joined with ``/``).
        chunk_rows: Number of rows per part file.

    Returns:
        None. Results are written to disk and a summary is printed.

    Notes:
        Records whose extraction raises are skipped so that one bad record
        does not abort a long run. They are counted and reported, because
        ``build_y_parts`` has to reproduce exactly the same filtering.
        # NOTE(review): assumes every vector has the same length so that
        # np.vstack succeeds — confirm against the ember extractor.
    """
    extractor = PEFeatureExtractor()
    X_chunk = []
    kept = 0
    failed = 0
    saved_parts = 0

    for meta in tqdm(stream_jsonl(feat_fp), desc=split_name):
        if meta.get("md5") not in keep_set:
            continue
        try:
            vec = extractor.process_raw_features(meta).astype(np.float32)
        except Exception:
            # Best-effort: skip the record, but keep track of how many were lost.
            failed += 1
            continue
        X_chunk.append(vec)
        kept += 1

        if len(X_chunk) >= chunk_rows:
            _save_x_part(X_chunk, out_dir, split_name, saved_parts)
            X_chunk = []
            saved_parts += 1

    # Flush the last, partially filled chunk (if any).
    if X_chunk:
        _save_x_part(X_chunk, out_dir, split_name, saved_parts)
        saved_parts += 1

    if failed:
        print(f"[{split_name}] WARNING: skipped {failed} record(s) whose feature extraction raised")
    print(f"[{split_name}] DONE: kept={kept}, parts={saved_parts}")

**Cell 4: Checks**

In [4]:
# 3) checks: stop early if any of the four expected input files is missing
assert DATA.joinpath("train_features.jsonl").exists(), "Λείπει train_features.jsonl"
assert DATA.joinpath("test_features.jsonl").exists(), "Λείπει test_features.jsonl"
assert TAGS.joinpath("behavior_train.jsonl").exists(), "Λείπει behavior_train.jsonl"
assert TAGS.joinpath("behavior_test.jsonl").exists(), "Λείπει behavior_test.jsonl"


**Cell 5: Build tag sets and run**

In [5]:
# 4) build the md5 keep-sets from the tag files, then vectorize both splits
keep_train = build_md5_set(tag_fp=TAGS / "behavior_train.jsonl")
keep_test = build_md5_set(tag_fp=TAGS / "behavior_test.jsonl")

vectorize_split(
    split_name="train",
    feat_fp=DATA / "train_features.jsonl",
    keep_set=keep_train,
    out_dir=OUT,
    chunk_rows=5000,
)
vectorize_split(
    split_name="test",
    feat_fp=DATA / "test_features.jsonl",
    keep_set=keep_test,
    out_dir=OUT,
    chunk_rows=5000,
)

train: 2754289it [48:10, 952.90it/s] 


[train] DONE: kept=2754289, parts=551


test: 223300it [04:00, 926.80it/s] 


[test] DONE: kept=223300, parts=45


**Cell 6: Helpers for tags/labels**

In [7]:
# -- helpers για tags/labels --

def _labels_from_ranking(rec):
    """Παίρνω τα string labels από το πεδίο 'ranking' ενός tag record."""
    labs = []
    ranking = rec.get("ranking") or []
    for it in ranking:
        if isinstance(it, (list, tuple)) and it:
            labs.append(str(it[0]).lower())
        else:
            labs.append(str(it).lower())
    return labs

def build_label_map(train_tag_fp, test_tag_fp):
    """Gather every unique behavior label from both tag files.

    Args:
        train_tag_fp: Path of the train tags JSONL file.
        test_tag_fp: Path of the test tags JSONL file.

    Returns:
        Tuple ``(all_labels, tag_to_idx)``: ``all_labels`` is the sorted list
        of unique labels, and ``tag_to_idx`` maps each label to its position
        in that list.
    """
    unique_labels = set()
    for tags_path in (train_tag_fp, test_tag_fp):
        for record in stream_jsonl(tags_path):
            unique_labels.update(_labels_from_ranking(record))

    all_labels = sorted(unique_labels)
    tag_to_idx = {label: position for position, label in enumerate(all_labels)}
    return all_labels, tag_to_idx

def load_tag_map_for(fp, keep_set):
    """Read a tags JSONL file and map each kept md5 to its list of labels.

    Only records whose ``md5`` is truthy and present in ``keep_set`` are
    retained. If an md5 occurs more than once, the last record wins.

    Args:
        fp: Path of the tags JSONL file.
        keep_set: Container of md5 values to retain.

    Returns:
        Dict ``md5 -> [labels]``, with labels produced by ``_labels_from_ranking``.
    """
    labels_by_md5 = {}
    for record in stream_jsonl(fp):
        md5 = record.get("md5")
        if md5 and md5 in keep_set:
            labels_by_md5[md5] = _labels_from_ranking(record)
    return labels_by_md5


**Cell 7: Create Y parts (same filtering as the X parts)**


In [8]:
# -- build the Y parts so that they match the X parts 1-to-1 --

def _save_y_part(rows, out_dir, split_name, part_idx):
    """Stack ``rows`` into one 2-D array and write it as label part number ``part_idx``."""
    np.save(out_dir / f"y_{split_name}_part{part_idx:03d}.npy", np.vstack(rows))


def build_y_parts(split_name, feat_fp, tags_fp, keep_set, out_dir, tag_to_idx, chunk_rows=5000):
    """Write multi-hot label parts aligned row-for-row with the X parts.

    The same features JSONL is walked again with the same filter that
    ``vectorize_split`` applies:
      - the record's md5 must be in ``keep_set``;
      - ``extractor.process_raw_features(meta)`` must not raise (same try/except).
    Every ``chunk_rows`` accepted records are written to
    ``out_dir / f"y_{split_name}_partNNN.npy"``, followed by a final shorter
    part for any remainder.

    Args:
        split_name: Name used in the progress bar, the log lines and the file names.
        feat_fp: Path of the raw-features JSONL file (the one used for the X parts).
        tags_fp: Path of the tags JSONL file that provides the labels.
        keep_set: Container of md5 values to keep.
        out_dir: ``pathlib.Path`` of the output directory (joined with ``/``).
        tag_to_idx: Mapping from label to column index. Its length sets the row width.
        chunk_rows: Rows per part file; must equal the value used for the X parts.

    Returns:
        None. Results are written to disk and a summary is printed.

    Notes:
        Each row is an int8 vector with 1 at the columns of the record's
        labels. Labels absent from ``tag_to_idx`` are ignored, and an md5 with
        no entry in the tag map produces an all-zero row.
        The extractor is run only to reproduce the X-side filter; its output
        is discarded.
    """
    extractor = PEFeatureExtractor()
    Y_chunk = []
    saved_parts = 0
    kept = 0
    failed = 0
    n_labels = len(tag_to_idx)  # loop-invariant row width

    # Fast lookup of the labels for an md5
    tag_map = load_tag_map_for(tags_fp, keep_set)  # md5 -> [labels]

    for meta in tqdm(stream_jsonl(feat_fp), desc=f"{split_name}-labels"):
        md5 = meta.get("md5")
        if md5 not in keep_set:
            continue
        # Same filter as for X: if the extractor fails, skip the record so
        # that order and count stay aligned. Failures are counted for the log.
        try:
            extractor.process_raw_features(meta)
        except Exception:
            failed += 1
            continue

        y = np.zeros(n_labels, dtype=np.int8)
        for lab in tag_map.get(md5, []):
            j = tag_to_idx.get(lab)
            if j is not None:
                y[j] = 1
        Y_chunk.append(y)
        kept += 1

        if len(Y_chunk) >= chunk_rows:
            _save_y_part(Y_chunk, out_dir, split_name, saved_parts)
            Y_chunk = []
            saved_parts += 1

    # Flush the last, partially filled chunk (if any).
    if Y_chunk:
        _save_y_part(Y_chunk, out_dir, split_name, saved_parts)
        saved_parts += 1

    if failed:
        print(f"[{split_name} labels] WARNING: skipped {failed} record(s) whose feature extraction raised")
    print(f"[{split_name} labels] DONE: kept={kept}, parts={saved_parts}")


**Cell 8: Build sets and run**

In [9]:
# build the md5 keep-sets (same as before)
keep_train = build_md5_set(tag_fp=TAGS / "behavior_train.jsonl")
keep_test = build_md5_set(tag_fp=TAGS / "behavior_test.jsonl")

# label map: sorted label list, persisted next to the part files
all_tags, tag_to_idx = build_label_map(
    train_tag_fp=TAGS / "behavior_train.jsonl",
    test_tag_fp=TAGS / "behavior_test.jsonl",
)
with open(OUT / "label_map.json", "w") as f:
    f.write(json.dumps({"labels": all_tags}, indent=2))

# y parts (same chunk_rows as the X parts!)
build_y_parts(
    split_name="train",
    feat_fp=DATA / "train_features.jsonl",
    tags_fp=TAGS / "behavior_train.jsonl",
    keep_set=keep_train,
    out_dir=OUT,
    tag_to_idx=tag_to_idx,
    chunk_rows=5000,
)

build_y_parts(
    split_name="test",
    feat_fp=DATA / "test_features.jsonl",
    tags_fp=TAGS / "behavior_test.jsonl",
    keep_set=keep_test,
    out_dir=OUT,
    tag_to_idx=tag_to_idx,
    chunk_rows=5000,
)


train-labels: 2754289it [46:32, 986.35it/s] 


[train labels] DONE: kept=2754289, parts=551


test-labels: 223300it [03:45, 990.98it/s] 

[test labels] DONE: kept=223300, parts=45





**Cell 9: Quick checks**

In [10]:
# The number of part files and the per-part row counts must match between X and Y
import glob, numpy as np, scipy.sparse as sp

train_X = sorted(glob.glob(str(OUT / "train_part*.npz")))
train_Y = sorted(glob.glob(str(OUT / "y_train_part*.npy")))
test_X  = sorted(glob.glob(str(OUT / "test_part*.npz")))
test_Y  = sorted(glob.glob(str(OUT / "y_test_part*.npy")))

print("counts: ", len(train_X), len(train_Y), "|", len(test_X), len(test_Y))
assert len(train_X) == len(train_Y)
assert len(test_X) == len(test_Y)

def _rows_X(path):
    """Return the number of rows of the sparse matrix stored in the .npz file at `path`."""
    return sp.load_npz(path).shape[0]

def _rows_Y(path):
    """Return the number of rows of the array stored in the .npy file at `path`."""
    return np.load(path).shape[0]

# Spot check the first, middle and last pair of each split (train AND test).
for split, x_parts, y_parts in (("train", train_X, train_Y), ("test", test_X, test_Y)):
    if not x_parts:
        # Nothing to index into; report it instead of failing with an IndexError.
        print(f"[{split}] no part files found in {OUT}")
        continue
    # A set removes repeated indices when a split has fewer than three parts.
    for i in sorted({0, len(x_parts) // 2, len(x_parts) - 1}):
        rx, ry = _rows_X(x_parts[i]), _rows_Y(y_parts[i])
        print(f"[{split} part {i:03d}] rows X={rx}, Y={ry}")
        assert rx == ry


counts:  551 551 | 45 45
[train part 000] rows X=5000, Y=5000
[train part 275] rows X=5000, Y=5000
[train part 550] rows X=4289, Y=4289
