In [63]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import ast
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
# fix random seed for reproducibility
tf.random.set_seed(7)

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

In [64]:
def load_data(target: str, csv_path: str) -> tuple:
    """
    Load and preprocess data from a CSV file.

    :param target: One of 'all', 'pqc', 'browser', 'os', 'algo', or 'tuple'.
    :param csv_path: Path to the CSV file to load.
    :return: Tuple (data, labels) where data is a numpy array and labels is another numpy array.
    """
    try:
        data = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Error loading CSV at '{csv_path}': {e}")
        return None, None

    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=['Unnamed: 0'])

    labels = data.pop('label').values

    # Log label counts
    unique, counts = np.unique(labels, return_counts=True)
    print("Label distribution:", dict(zip(unique, counts)))

    rows = data.to_numpy()
    filtered_rows, filtered_labels = [], []
    for row, label in zip(rows, labels):
        if target in ['browser', 'os'] and label % 2 == 0:
            continue
        filtered_rows.append([ast.literal_eval(cell) for cell in row])
        filtered_labels.append(label)

    X = np.array([np.array(r).flatten() for r in filtered_rows])
    y = np.array(filtered_labels)
    print(f"Total samples: {len(y)}")

    # Process labels based on target
    if target == 'algo':
        y_proc = y // 10
    elif target == 'tuple':
        y_proc = y.copy()
    else:
        base = y
        if target == 'pqc':
            y_proc = base % 2
        elif target == 'browser':
            y_proc = ((base // 10) % 10) * 10
        elif target == 'os':
            y_proc = (base // 100) * 100
        elif target == 'all':
            y_proc = base
        else:
            print(f"Unknown target: {target}")
            return None, None

    return X, y_proc

In [65]:
data, labels = load_data("pqc", "C:\\Users\\Eylon\\PQC\\JournalDatasets\\pqc-pob.csv")

labels = LabelEncoder().fit_transform(labels)
X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.1, random_state=42, stratify=labels
)
X_train

Label distribution: {210: 200, 211: 200, 220: 200, 221: 200, 310: 200, 311: 200, 320: 200, 321: 200, 410: 200, 411: 200, 420: 200, 421: 200}
Total samples: 2400


array([[   1,   60,    0, ...,    1,   52,  182],
       [   1,  118,    0, ...,    0,  421,  825],
       [   1,   60,    0, ...,    0,   83,  197],
       ...,
       [   1,  118,    0, ...,    0, 1420,  271],
       [   1,   60,    0, ...,    0,  224,  225],
       [   1,   64,    0, ...,    1,   52,  134]])

In [73]:
data, labels = load_data("pqc", 'JournalDatasets\\pqc-pob.csv')

seq_len = data.shape[1]           # אורך הרצף לל־LSTM
max_id = int(np.max(data))        # האינדקס המקסימלי שמופיע בקלט
vocab_size = max_id + 1   

labels = LabelEncoder().fit_transform(labels)
X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.1, random_state=42, stratify=labels
)

# create the model
embedding_dim = 60  # זה ה- output_dim: כמה מאפיינים לכל טוקן
model = Sequential()
model.add(Embedding(input_dim=vocab_size,    # גודל אוצר מילים = max_id+1
                    output_dim=embedding_dim,
                    input_length=seq_len))   # אורך הרצף
model.add(LSTM(100))
model.add(Dense(1, activation='relu'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=12, batch_size=128)

Label distribution: {210: 200, 211: 200, 220: 200, 221: 200, 310: 200, 311: 200, 320: 200, 321: 200, 410: 200, 411: 200, 420: 200, 421: 200}
Total samples: 2400


None
Epoch 1/12
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 108ms/step - accuracy: 0.5110 - loss: 2.1575 - val_accuracy: 0.5375 - val_loss: 0.6782
Epoch 2/12
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - accuracy: 0.5862 - loss: 0.6666 - val_accuracy: 0.5958 - val_loss: 0.6536
Epoch 3/12
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 96ms/step - accuracy: 0.6621 - loss: 0.6224 - val_accuracy: 0.6208 - val_loss: 0.6266
Epoch 4/12
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 98ms/step - accuracy: 0.7421 - loss: 0.5625 - val_accuracy: 0.6167 - val_loss: 0.6566
Epoch 5/12
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - accuracy: 0.7816 - loss: 0.4903 - val_accuracy: 0.6500 - val_loss: 0.6895
Epoch 6/12
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 92ms/step - accuracy: 0.8286 - loss: 0.3957 - val_accuracy: 0.7000 - val_loss: 0.8100
Epoch 7/12
[1m17/17[0m [3

<keras.src.callbacks.history.History at 0x17eeea0e120>

In [74]:

# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 90.00%


In [72]:
import ast
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras import layers, callbacks, Model

# -----------------------------
# 1) Load CSV as (T, F) sequences instead of flattening
# -----------------------------
def load_data_lstm(target: str, csv_path: str):
    """
    Returns:
      X_raw: list of 2D arrays, each (Ti, F) before padding
      y_proc: np.ndarray of labels
      feat_names: column names used as features (for debugging)
    """
    data = pd.read_csv(csv_path)
    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=['Unnamed: 0'])

    # Extract labels and feature columns
    labels = data.pop('label').values
    feat_names = list(data.columns)

    # Optional filter (kept as in your original function)
    rows = data.to_numpy()
    filtered_seqs, filtered_labels = [], []
    for row, label in zip(rows, labels):
        # If predicting browser/os you skipped even labels; keep your logic:
        if target in ['browser', 'os'] and label % 2 == 0:
            continue

        # Parse each cell (string -> list/sequence)
        # Each column is a feature with its own 1D time-series
        feature_series = [np.array(ast.literal_eval(cell), dtype=np.float32) for cell in row]

        # Align features along time: pad columns to the max length within this row
        max_len = max(len(col) for col in feature_series)
        F = len(feature_series)
        seq = np.zeros((max_len, F), dtype=np.float32)
        for j, col in enumerate(feature_series):
            L = len(col)
            seq[:L, j] = col  # right-padding with zeros
        filtered_seqs.append(seq)
        filtered_labels.append(label)

    y = np.array(filtered_labels)

    # Process labels based on target (same semantics as your code)
    if target == 'algo':
        y_proc = y // 10
    elif target == 'tuple':
        y_proc = y.copy()
    else:
        base = y
        if target == 'pqc':
            y_proc = base % 2
        elif target == 'browser':
            y_proc = ((base // 10) % 10) * 10
        elif target == 'os':
            y_proc = (base // 100) * 100
        elif target == 'all':
            y_proc = base
        else:
            raise ValueError(f"Unknown target: {target}")

    print("Label distribution (raw):", dict(zip(*np.unique(y_proc, return_counts=True))))
    return filtered_seqs, y_proc, feat_names


# -----------------------------
# 2) Pad to common length + scale features
# -----------------------------
def pad_and_scale(seqs_list, pad_to=None):
    """
    seqs_list: list of (Ti, F) arrays
    pad_to: optional fixed length; if None, use max Ti in list
    Returns X: (N, T, F), scaler (fitted on train only later)
    """
    N = len(seqs_list)
    F = seqs_list[0].shape[1]
    T = pad_to or max(s.shape[0] for s in seqs_list)
    X = np.zeros((N, T, F), dtype=np.float32)
    lengths = np.zeros(N, dtype=np.int32)
    for i, s in enumerate(seqs_list):
        L = min(s.shape[0], T)
        X[i, :L, :] = s[:L, :]
        lengths[i] = L
    return X, lengths


def scale_3d_fit_transform(X_train, mask_lengths):
    """
    Fit StandardScaler column-wise (feature-wise) using unmasked timesteps only,
    then transform both train and later test/val with the same scaler.
    """
    N, T, F = X_train.shape
    # Collect all valid rows across all sequences (exclude padded rows)
    rows = []
    for i in range(N):
        L = mask_lengths[i]
        if L > 0:
            rows.append(X_train[i, :L, :])
    all_valid = np.concatenate(rows, axis=0)  # (sum(L_i), F)

    scaler = StandardScaler()
    scaler.fit(all_valid)

    # Apply scaler to all timesteps (padded zeros will become scaled zeros-ish)
    Xs = X_train.reshape(-1, F)
    Xs = scaler.transform(Xs)
    Xs = Xs.reshape(N, T, F)
    return Xs, scaler


def scale_3d_transform(X, scaler):
    N, T, F = X.shape
    Xs = X.reshape(-1, F)
    Xs = scaler.transform(Xs)
    return Xs.reshape(N, T, F)


# -----------------------------
# 3) Build a solid LSTM model
# -----------------------------
def build_lstm_model(seq_len, feat_dim):
    inputs = layers.Input(shape=(seq_len, feat_dim))
    x = layers.Masking(mask_value=0.0)(inputs)
    x = layers.Conv1D(64, 5, padding="same", activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.SpatialDropout1D(0.2)(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(32))(x)
    x = layers.Dense(64, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(1e-5))(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="binary_crossentropy",
        metrics=[
            "accuracy",
            tf.keras.metrics.AUC(name="auc", curve="ROC"),
            tf.keras.metrics.AUC(name="prauc", curve="PR")
        ]
    )
    return model


# -----------------------------
# 4) Train
# -----------------------------
# Load and prepare
csv_path = r"C:\Users\Eylon\PQC\JournalDatasets\pqc-pob.csv"  # <- adjust if needed
target = "pqc"

seqs_list, labels_raw, feat_names = load_data_lstm(target, csv_path)

# Encode labels to {0,1} for binary
le = LabelEncoder()
labels = le.fit_transform(labels_raw)

# Train/test split (TIP: consider GroupKFold by capture/session to avoid leakage)
X_list_tr, X_list_te, y_tr, y_te = train_test_split(
    seqs_list, labels, test_size=0.1, random_state=42, stratify=labels
)

# Pad (choose a max length; you can also cap with e.g., pad_to=512 for speed)
X_tr, L_tr = pad_and_scale(X_list_tr, pad_to=None)
X_te, L_te = pad_and_scale(X_list_te, pad_to=X_tr.shape[1])  # ensure same T

# Scale features using only training timesteps
X_tr, scaler = scale_3d_fit_transform(X_tr, L_tr)
X_te = scale_3d_transform(X_te, scaler)

seq_len, feat_dim = X_tr.shape[1], X_tr.shape[2]
print(f"seq_len={seq_len}, feat_dim={feat_dim}")

# Class weights (if imbalanced)
classes = np.unique(y_tr)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_tr)
class_weight = {int(c): float(w) for c, w in zip(classes, class_weights)}
print("class_weight:", class_weight)

# Build & train
model = build_lstm_model(seq_len, feat_dim)

cbs = [
    callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=6, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", patience=3, factor=0.5, min_lr=1e-5),
    callbacks.ModelCheckpoint("best_lstm.keras", monitor="val_auc", mode="max", save_best_only=True)
]

history = model.fit(
    X_tr, y_tr,
    validation_data=(X_te, y_te),
    epochs=60,
    batch_size=64,
    callbacks=cbs,
    class_weight=class_weight
)

# Final evaluation of the model
scores = model.evaluate(X_te, y_te, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


Label distribution (raw): {0: 1200, 1: 1200}
seq_len=3, feat_dim=20
class_weight: {0: 1.0, 1: 1.0}
Epoch 1/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step - accuracy: 0.7595 - auc: 0.8335 - loss: 0.5679 - prauc: 0.8434 - val_accuracy: 0.8292 - val_auc: 0.9463 - val_loss: 0.4473 - val_prauc: 0.9558 - learning_rate: 0.0010
Epoch 2/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8847 - auc: 0.9691 - loss: 0.2262 - prauc: 0.9703 - val_accuracy: 0.8667 - val_auc: 0.9627 - val_loss: 0.3192 - val_prauc: 0.9662 - learning_rate: 0.0010
Epoch 3/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9020 - auc: 0.9769 - loss: 0.1830 - prauc: 0.9773 - val_accuracy: 0.8750 - val_auc: 0.9732 - val_loss: 0.2619 - val_prauc: 0.9756 - learning_rate: 0.0010
Epoch 4/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8896 - auc: 0.9755 - loss: 0.1805 - prauc: 