In [2]:
import os
import sys

# Make the directory two levels above the current working directory importable
# so that the `src.*` packages resolve; nothing is added when the entry is
# already on sys.path (e.g. when the cell is re-run).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
import pprint as pp

from sklearn.model_selection import train_test_split

from pathlib import Path
from src.utils.preprocessing_utils import load_dataset

from src.prototypes.torch_apk_analysis_model import (
    get_best_available_device,
    cross_val_train_nn_model,
    train_nn_model,
    extract_embeddings,
    evaluate_model_on_test_set,
    NNHyperparams,
)

from src.prototypes.torch_apk_analysis_model_io import (
    save_model_with_metadata,
    load_apk_analysis_model_from_version,
    load_apk_feature_embedder_from_version,
    load_apk_analysis_model_metadata,
)

from src.prototypes.ml_model import MLHyperparams, train_classical_models_cv
from src.prototypes.ml_model_io import save_ml_models_with_metadata

In [4]:
# Feature columns, grouped by the role they are passed under to the loaders
# and models below.

# Columns handed over as `sequence_cols`.
SEQUENCE_COLS = [
    "activities_list",
    "services_list",
    "receivers_list",
    "permissions_list",
    "api_calls_list",
]

# Column handed over as `char_cols`.
CHAR_COLS = ["fuzzy_hash"]

# Column handed over as `scalar_cols`.
SCALAR_COLS = ["file_size"]

# Per-column dimension for the vector features (presumably the expected vector
# length - confirm in load_dataset). The vector column list is derived from
# this mapping so the two can never disagree.
VECTOR_DIMS = {"opcode_counts": 768}
VECTOR_COLS = list(VECTOR_DIMS)

# All locations are derived from the kernel's working directory: the project
# root is taken to be two levels above it, so the notebook has to be started
# from its own folder for these paths to be correct.
PROJECT_ROOT = Path.cwd().parent.parent
PATH_TO_DATASET_DIR = PROJECT_ROOT.joinpath("dataset")
PATH_TO_SAVE_NN_MODEL = PROJECT_ROOT.joinpath("model_artifacts", "nn_models")
PATH_TO_SAVE_ML_MODEL = PROJECT_ROOT.joinpath("model_artifacts", "ml_models")

# Load the feature table together with its vocabularies. With load_fresh=False
# the loader reports "Loading last preprocessed dataset..."; sample_size=None
# presumably disables subsampling (confirm in load_dataset).
df, vocab_dict = load_dataset(
    PATH_TO_DATASET_DIR,
    SEQUENCE_COLS,
    CHAR_COLS,
    VECTOR_COLS,
    SCALAR_COLS,
    VECTOR_DIMS,
    sample_size=None,
    load_fresh=False,
)

# Hold out a 10% test split, stratified on the label. From this point on `df`
# refers only to the remaining 90%; the held-out rows live in `df_test`.
df, df_test = train_test_split(
    df,
    stratify=df["is_malware"],
    test_size=0.1,
    random_state=42,
)

# Device passed to the training and embedding-extraction calls below.
device = get_best_available_device()

Loading last preprocessed dataset...
Using CUDA device: NVIDIA GeForce RTX 4070 SUPER


In [4]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" to the output.
df.info()
# The last expression of the cell is rendered as a rich table.
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 17974 entries, 2247 to 5263
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   file_size         17974 non-null  int64 
 1   fuzzy_hash        17974 non-null  object
 2   activities_list   17974 non-null  object
 3   services_list     17974 non-null  object
 4   receivers_list    17974 non-null  object
 5   permissions_list  17974 non-null  object
 6   api_calls_list    17974 non-null  object
 7   opcode_counts     17974 non-null  object
 8   is_malware        17974 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 1.4+ MB


None

Unnamed: 0,file_size,fuzzy_hash,activities_list,services_list,receivers_list,permissions_list,api_calls_list,opcode_counts,is_malware
2247,6909752,"[37, 65, 42, 15, 29, 33, 35, 38, 11, 28, 62, 4...","[19048, 19079, 19053, 19081, 19052, 508, 45, 1...","[20, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[2204, 2200, 18, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[69, 27, 6, 25, 3, 19, 22, 61, 14, 9, 24, 1001...","[78, 122, 2465, 119, 106, 7158, 1202, 705, 383...","[0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.0, 0.0, ...",1
5698,9660703,"[22, 8, 25, 21, 23, 25, 17, 56, 20, 45, 3, 10,...","[139512, 3799, 34, 139502, 139528, 18, 139522,...","[23, 12423, 12421, 12422, 30, 20, 6, 26, 0, 0,...","[9139, 5, 18, 3, 67, 17, 0, 0, 0, 0, 0, 0, 0, ...","[15, 39, 11, 26, 78, 4, 2755, 5, 64, 6, 3, 9, ...","[105950, 63567, 187729, 3877, 3934, 3891, 7551...","[628.0, 7051.0, 4476.0, 0.0, 289.0, 550.0, 0.0...",1
8381,7014287,"[37, 14, 16, 40, 19, 33, 46, 52, 5, 61, 3, 51,...","[73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 8...","[129, 120, 104, 115, 108, 60, 102, 61, 96, 62,...","[62, 45, 55, 58, 54, 63, 57, 56, 53, 61, 49, 5...","[163, 43, 143, 70, 159, 84, 86, 118, 117, 4, 1...","[32224, 32071, 18370, 27218, 23182, 24664, 263...","[933.0, 10440.0, 690.0, 0.0, 304.0, 204.0, 0.0...",1
1682,5863124,"[66, 29, 4, 42, 34, 45, 20, 38, 29, 66, 14, 15...","[37, 9, 308, 224756, 315, 224757, 12, 52, 7, 2...","[139, 32, 151, 146, 35, 55, 29, 0, 0, 0, 0, 0,...","[29, 156, 39, 31, 100, 26, 40, 0, 0, 0, 0, 0, ...","[42, 22, 11, 19, 14, 6, 100, 15, 10, 46, 13, 2...","[78, 122, 2465, 119, 106, 7158, 1202, 705, 383...","[0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
18394,8197050,"[60, 62, 12, 30, 37, 58, 13, 38, 48, 30, 3, 55...","[6, 10, 4, 586, 44, 3, 42, 41, 6293, 43, 555, ...","[9, 12, 8, 6, 13, 5, 10, 11, 3, 4, 0, 0, 0, 0,...","[94, 11, 5, 10, 13, 7, 8, 4, 12, 43, 22, 6, 21...","[33, 57, 71, 55, 23, 72, 20, 56, 5, 3, 58, 44,...","[2972, 4510, 6430, 5098, 1606, 1889, 3252, 216...","[330.0, 2802.0, 4177.0, 0.0, 1029.0, 788.0, 0....",0


In [9]:
# Active hyperparameter set for the neural network. The keyword arguments are
# grouped by concern; the commented-out blocks that follow are alternative
# configurations.
nn_hyperparams = NNHyperparams(
    # --- task ---
    label_col="is_malware",
    n_classes=2,
    # --- architecture ---
    embedding_dim=64,
    hidden_dims=[64],
    seq_pooling="mean",
    dropout=0.5,
    # --- optimisation ---
    optimizer="adamw",
    max_learning_rate=1e-3,
    weight_decay=1e-5,
    grad_scaler_max_norm=1.0,
    # --- training loop ---
    epochs=20,
    batch_size=64,
    early_stopping=True,
    patience=5,
    # --- data loading ---
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    dataloader_persistent_workers=True,
)

# nn_hyperparams = NNHyperparams(
#     batch_size=16,
#     max_learning_rate=5e-3,
#     epochs=20,
#     early_stopping=True,
#     patience=5,
#     optimizer="adamw",
#     weight_decay=5e-4,
#     embedding_dim=256,
#     hidden_dims=[256, 16],
#     dropout=0.2,
#     seq_pooling="mean",
#     n_classes=2,
#     label_col="is_malware",
#     dataloader_num_workers=2,
#     dataloader_pin_memory=True,
#     dataloader_persistent_workers=True,
#     grad_scaler_max_norm=1.0,
# )

# nn_hyperparams = NNHyperparams(
#     batch_size=64,
#     max_learning_rate=6e-3,
#     epochs=20,
#     early_stopping=True,
#     patience=5,
#     optimizer="adamw",
#     weight_decay=8e-4,
#     embedding_dim=64,
#     hidden_dims=[128],
#     dropout=0.5,
#     seq_pooling="mean",
#     n_classes=2,
#     label_col="is_malware",
#     dataloader_num_workers=2,
#     dataloader_pin_memory=True,
#     dataloader_persistent_workers=True,
#     grad_scaler_max_norm=1.0,
# )

In [10]:
# Train a single network on `df`. No validation frame is supplied, so the
# helper performs its own internal train/validation split (ratio 0.8) and
# scores validation with recall.
nn_model, nn_results, fitted_scalers = train_nn_model(
    # data and feature layout
    df=df,
    vocab_dict=vocab_dict,
    sequence_cols=SEQUENCE_COLS,
    char_cols=CHAR_COLS,
    vector_cols=VECTOR_COLS,
    vector_dims=VECTOR_DIMS,
    scalar_cols=SCALAR_COLS,
    # training setup
    hyperparams=nn_hyperparams,
    train_split_ratio=0.8,
    scoring_metric="recall",
    random_seed=42,
    device=device,
)

Using device: cuda
Using recall as the primary scoring metric for validation.
Performing internal train/validation split with ratio: 0.8
Training set size: 14379, Validation set size: 3595
Training set class distribution: {0: 7200, 1: 7179}
Validation set class distribution: {0: 1800, 1: 1795}
Using class weights: [0.99854167 1.0014626 ]
Starting training...
Epoch 1, Batch 0/225, Train Loss: 0.7019, LR: 4.00e-05
Epoch 1, Batch 46/225, Train Loss: 0.6681, LR: 4.29e-05
Epoch 1, Batch 92/225, Train Loss: 0.5862, LR: 5.12e-05
Epoch 1, Batch 138/225, Train Loss: 0.5033, LR: 6.49e-05
Epoch 1, Batch 184/225, Train Loss: 0.3782, LR: 8.39e-05
Epoch 1/20 — Train Loss: 0.5300
Epoch 1 — Val Loss: 0.2918, Val Recall: 0.9432 (Acc: 0.8954, P: 0.8607, R: 0.9432, F1: 0.9001, ROC: 0.9543, PR: 0.9500)
  New best model (by val_loss) saved at epoch 1.
Epoch 2, Batch 0/225, Train Loss: 0.2597, LR: 1.05e-04
Epoch 2, Batch 46/225, Train Loss: 0.1677, LR: 1.33e-04
Epoch 2, Batch 92/225, Train Loss: 0.2564, LR:

In [6]:
# NOTE(review): this cell issues the same train_nn_model call as the cell
# before it. Its stored output carries an earlier execution count and logs
# different learning rates, so it appears to be a stale run made with another
# hyperparameter set - confirm and keep only one of the two cells.
nn_model, nn_results, fitted_scalers = train_nn_model(
    # data and feature layout
    df=df,
    vocab_dict=vocab_dict,
    sequence_cols=SEQUENCE_COLS,
    char_cols=CHAR_COLS,
    vector_cols=VECTOR_COLS,
    vector_dims=VECTOR_DIMS,
    scalar_cols=SCALAR_COLS,
    # training setup
    hyperparams=nn_hyperparams,
    train_split_ratio=0.8,
    scoring_metric="recall",
    random_seed=42,
    device=device,
)

Using device: cuda
Using recall as the primary scoring metric for validation.
Performing internal train/validation split with ratio: 0.8
Training set size: 14379, Validation set size: 3595
Training set class distribution: {0: 7200, 1: 7179}
Validation set class distribution: {0: 1800, 1: 1795}
Using class weights: [0.99854167 1.0014626 ]
Starting training...
Epoch 1, Batch 0/225, Train Loss: 0.6893, LR: 2.40e-04
Epoch 1, Batch 46/225, Train Loss: 0.3296, LR: 2.57e-04
Epoch 1, Batch 92/225, Train Loss: 0.2261, LR: 3.07e-04
Epoch 1, Batch 138/225, Train Loss: 0.2144, LR: 3.90e-04
Epoch 1, Batch 184/225, Train Loss: 0.0952, LR: 5.03e-04
Epoch 1/20 — Train Loss: 0.2349
Epoch 1 — Val Loss: 0.0808, Val Recall: 0.9721 (Acc: 0.9711, P: 0.9700, R: 0.9721, F1: 0.9711, ROC: 0.9960, PR: 0.9960)
  New best model (by val_loss) saved at epoch 1.
Epoch 2, Batch 0/225, Train Loss: 0.0695, LR: 6.30e-04
Epoch 2, Batch 46/225, Train Loss: 0.0478, LR: 7.99e-04
Epoch 2, Batch 92/225, Train Loss: 0.0383, LR:

In [11]:
# Repeated cross-validation: 5 repetitions of 2-fold CV, with recall as the
# metric used to select the best model. The results are bound to the same
# names as the single-split run (`nn_model`, `nn_results`, `fitted_scalers`),
# so running this cell replaces those objects.
nn_model, nn_results, fitted_scalers = cross_val_train_nn_model(
    # data and feature layout
    df=df,
    vocab_dict=vocab_dict,
    sequence_cols=SEQUENCE_COLS,
    char_cols=CHAR_COLS,
    vector_cols=VECTOR_COLS,
    vector_dims=VECTOR_DIMS,
    scalar_cols=SCALAR_COLS,
    # cross-validation setup
    hyperparams=nn_hyperparams,
    n_repetitions=5,
    n_folds=2,
    scoring_metric="recall",
    random_seed=42,
    device=device,
)

--- Cross-Validation Training ---
Using device: cuda
Primary scoring metric for best model selection: RECALL
Number of folds: 2, Number of repetitions: 5

--- Repetition 1/5, Fold 1/2 (Overall Fold 1) ---
Using device: cuda
Using recall as the primary scoring metric for validation.
Using explicitly provided training and validation DataFrames.
Training set size: 8987, Validation set size: 8987
Training set class distribution: {0: 4500, 1: 4487}
Validation set class distribution: {0: 4500, 1: 4487}
Using class weights: [0.99855556 1.00144863]
Starting training...
Epoch 1, Batch 0/141, Train Loss: 0.7227, LR: 4.00e-05
Epoch 1, Batch 29/141, Train Loss: 0.6691, LR: 4.30e-05
Epoch 1, Batch 58/141, Train Loss: 0.6395, LR: 5.15e-05
Epoch 1, Batch 87/141, Train Loss: 0.5995, LR: 6.55e-05
Epoch 1, Batch 116/141, Train Loss: 0.4867, LR: 8.47e-05
Epoch 1/20 — Train Loss: 0.5996
Epoch 1 — Val Loss: 0.4058, Val Recall: 0.9019 (Acc: 0.8758, P: 0.8569, R: 0.9019, F1: 0.8788, ROC: 0.9423, PR: 0.9382)


In [12]:
# Persist the trained network together with everything needed to reload and
# reproduce it: vocabularies, fitted scalers, hyperparameters and results.
# The helper's return value is kept in `save_paths`.
save_paths = save_model_with_metadata(
    save_dir=PATH_TO_SAVE_NN_MODEL,
    model=nn_model,
    scalers=fitted_scalers,
    vocab_dict=vocab_dict,
    hyperparams=nn_hyperparams,
    results=nn_results,
)

Scalers saved to c:\Users\david\Desktop\Clase\TFG\tfg_24_25\model_artifacts\nn_models\20250702_162411\scalers.joblib
Model and artifacts saved to c:\Users\david\Desktop\Clase\TFG\tfg_24_25\model_artifacts\nn_models\20250702_162411


In [13]:
# Reload the network and its scalers from disk so that the evaluation runs on
# the saved artifact. No version is given; the loader logs that it picks the
# latest one. The second and fourth return values are not needed here.
nn_model, _, fitted_scalers, _ = load_apk_analysis_model_from_version(
    base_dir=PATH_TO_SAVE_NN_MODEL,
)

# Score the reloaded model on the held-out test split.
# NOTE(review): the in-memory `nn_hyperparams` are passed, not the ones stored
# with the loaded version - confirm they match.
results = evaluate_model_on_test_set(
    model=nn_model,
    scalers=fitted_scalers,
    hyperparams=nn_hyperparams,
    df_test=df_test,
    sequence_cols=SEQUENCE_COLS,
    char_cols=CHAR_COLS,
    vector_cols=VECTOR_COLS,
    scalar_cols=SCALAR_COLS,
)

print("Test set evaluation results:")
pp.pprint(results)

Using CUDA device: NVIDIA GeForce RTX 4070 SUPER
Loading latest model version: 20250702_162411
Scalers loaded from c:\Users\david\Desktop\Clase\TFG\tfg_24_25\model_artifacts\nn_models\20250702_162411\scalers.joblib
Model loaded from c:\Users\david\Desktop\Clase\TFG\tfg_24_25\model_artifacts\nn_models\20250702_162411
Using CUDA device: NVIDIA GeForce RTX 4070 SUPER
--- Evaluating on Test Set ---


  with autocast(enabled=(device.type == "cuda")):



--- Test Set Evaluation Metrics ---
  Inference Time: 4.68 seconds
  Accuracy: 0.9810
  Precision binary: 0.9762
  Recall binary: 0.9860
  F1 binary: 0.9811
  Precision weighted: 0.9810
  Recall weighted: 0.9810
  F1 weighted: 0.9810
  Confusion Matrix:
[[976  24]
 [ 14 984]]
  Inference time: 4.6806
  Roc auc: 0.9966
  Pr auc: 0.9955
---------------------------------
Test set evaluation results:
{'accuracy': 0.980980980980981,
 'classification_report': {'0': {'f1-score': 0.9809045226130654,
                                 'precision': 0.9858585858585859,
                                 'recall': 0.976,
                                 'support': 1000.0},
                           '1': {'f1-score': 0.9810568295114656,
                                 'precision': 0.9761904761904762,
                                 'recall': 0.9859719438877755,
                                 'support': 998.0},
                           'accuracy': 0.980980980980981,
                           'm

# ML MODELS TRAINING

In [14]:
# Load the latest model version
# model, vocab_dict, used_scalers, metadata = load_apk_analysis_model_from_version(
#     base_dir=PATH_TO_SAVE_NN_MODEL,
# )

# Load just the embedder from the latest version. The fourth return value is
# discarded with `_` because `metadata` is loaded explicitly right below;
# binding it here as well was a dead store.
embedder, vocab_dict, used_scalers, _ = load_apk_feature_embedder_from_version(
    base_dir=PATH_TO_SAVE_NN_MODEL,
)

# Load just metadata to check performance metrics
metadata = load_apk_analysis_model_metadata(base_dir=PATH_TO_SAVE_NN_MODEL)
print(f"Recall Score: {metadata['summary_metrics'].get('mean_recall', 'N/A')}")

Using CUDA device: NVIDIA GeForce RTX 4070 SUPER
Loading latest model version: 20250702_162411
Scalers loaded from c:\Users\david\Desktop\Clase\TFG\tfg_24_25\model_artifacts\nn_models\20250702_162411\scalers.joblib (for context, though embedder doesn't use them directly)
Embedder loaded from c:\Users\david\Desktop\Clase\TFG\tfg_24_25\model_artifacts\nn_models\20250702_162411
Loading latest model version: 20250702_162411
Recall Score: 0.9794071762870514


In [15]:
# Run the training frame through the loaded embedder to obtain the inputs `X`
# and the labels `y` (taken from the "is_malware" column) on which the
# classical ML models are trained.
X, y = extract_embeddings(
    model=embedder,
    scalers=used_scalers,
    df=df,
    label_col="is_malware",
    sequence_cols=SEQUENCE_COLS,
    char_cols=CHAR_COLS,
    vector_cols=VECTOR_COLS,
    scalar_cols=SCALAR_COLS,
    device=device,
)

# One MLHyperparams entry per classical model to train. The dictionary keys
# are the display names that appear in the training log.
ml_hyperparams_dict = {
    # Tree ensembles
    "RandomForest": MLHyperparams(
        model_type="random_forest",
        random_state=42,
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
    "XGBoost": MLHyperparams(
        model_type="xgboost",
        random_state=42,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
    ),
    # Instance-based
    "KNN": MLHyperparams(
        model_type="knn",
        weights="uniform",
        n_neighbors=5,
    ),
    # Linear models
    "SVM": MLHyperparams(
        model_type="svm",
        random_state=42,
        kernel="linear",
        C=1.0,
        probability=False,
        verbose=True,
    ),
    "LogisticRegression": MLHyperparams(
        model_type="logistic_regression",
        random_state=42,
        solver="liblinear",
        C=1.0,
    ),
}

# Train every configured model with 5 repetitions of 2-fold cross-validation,
# using recall as the scoring metric. Returns the results and the best models.
ml_results, ml_best_models = train_classical_models_cv(
    X=X,
    y=y,
    hyperparams_dict=ml_hyperparams_dict,
    n_repetitions=5,
    n_folds=2,
    scoring_metric="recall",
    random_state=42,
)

# Write the best classical models and their cross-validation results to the
# ML artifact directory; the helper's return value is kept in `ml_saved_paths`.
ml_saved_paths = save_ml_models_with_metadata(
    save_dir=PATH_TO_SAVE_ML_MODEL,
    models=ml_best_models,
    results=ml_results,
)

Training 5 models with 5 x 2-fold cross-validation...

=== Repetition 1/5, Fold 1/2 ===
Class distribution:
  Class 0: 4500 train, 4500 test
  Class 1: 4487 train, 4487 test
Training RandomForest...
  RandomForest: Acc=0.9876, P: 0.9877, R: 0.9876, F1: 0.9876, ROC: 0.9993, PR: 0.9993, Size=1356.3KB, Time=8.14s
  ★ New best RandomForest model: recall=0.9876
Training XGBoost...
  XGBoost: Acc=0.9909, P: 0.9909, R: 0.9909, F1: 0.9909, ROC: 0.9994, PR: 0.9994, Size=228.2KB, Time=1.01s
  ★ New best XGBoost model: recall=0.9909
Training KNN...
  KNN: Acc=0.9734, P: 0.9735, R: 0.9734, F1: 0.9734, ROC: 0.9904, PR: 0.9908, Size=15833.2KB, Time=0.00s
  ★ New best KNN model: recall=0.9734
Training SVM...
[LibSVM]  SVM: Acc=0.9846, P: 0.9847, R: 0.9846, F1: 0.9846, ROC: 0.9978, PR: 0.9947, Size=1204.1KB, Time=0.44s
  ★ New best SVM model: recall=0.9846
Training LogisticRegression...
  LogisticRegression: Acc=0.9878, P: 0.9878, R: 0.9878, F1: 0.9878, ROC: 0.9985, PR: 0.9956, Size=4.2KB, Time=0.30s
