In [3]:
import numpy as np
import pandas as pd
import pprint as pp

from torch_nn_model import (
    preprocess_dataframe,
    create_vocab_for_column,
    get_best_available_device,
    cross_val_train_nn_model,
    extract_embeddings,
    NNHyperparams,
)

from torch_nn_model_io import (
    save_model_with_metadata,
    load_debrim_model_from_version,
    load_debrim_embedder_from_version,
    load_debrim_metadata,
)

from ml_model import MLHyperparams, train_classical_models_cv
from ml_model_io import (
    save_ml_models_with_metadata,
    load_ml_models_from_version,
    load_ml_model_from_version,
    load_ml_version_metadata,
    load_ml_model_metadata,
)

In [4]:
# Feature configuration, dataset loading and vocabulary construction

# Columns handed to preprocess_dataframe and used to build one vocabulary each.
FEATURE_NAMES = [
    "hw_features",
    "req_permissions",
    "app_components",
    "intents",
    "api_calls",
    "used_permissions",
    "sus_api_calls",
    "urls",
]

# No scalar columns are used in this experiment.
SCALAR_COLS = []

# Read the CSV, blank out missing values, discard the "family" column, and
# run the project's preprocessing over the feature columns.
df = preprocess_dataframe(
    pd.read_csv("test_1k.csv").replace(np.nan, "").drop(columns=["family"]),
    FEATURE_NAMES,
)

# One vocabulary per feature column, keyed by column name.
vocab_dict = {
    feature: create_vocab_for_column(df, feature)
    for feature in FEATURE_NAMES
}

# Sanity check: show the index-to-token list of one vocabulary, then the frame.
print(vocab_dict["hw_features"].get_itos())
display(df)

# Device used for training and embedding extraction in the cells below.
device = get_best_available_device()

['<PAD>', '<UNK>', '<EMPTY>', 'android.hardware.touchscreen', 'android.hardware.screen.portrait', 'android.hardware.location', 'android.hardware.location.network', 'android.hardware.location.gps', 'android.hardware.telephony', 'android.hardware.wifi', 'android.hardware.screen.landscape', 'android.hardware.camera', 'android.hardware.camera.autofocus', 'android.hardware.microphone', 'android.hardware.bluetooth', 'android.software.live_wallpaper', 'android.hardware.touchscreen.multitouch', 'android.hardware.sensor.accelerometer', 'ACCESS_COARSE_LOCATION', 'READ_SMS', 'RECEIVE_SMS', 'android.hardware.camera.flash', 'android.hardware.nfc', 'android.hardware.sensor.compass']


Unnamed: 0,sha256,hw_features,req_permissions,app_components,intents,api_calls,used_permissions,sus_api_calls,urls,malware
0,00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693...,"[android.hardware.touchscreen, android.hardwar...","[android.permission.READ_PHONE_STATE, android....","[.GameBootReceiver, .GameService, GameAlertDia...","[android.intent.action.BOOT_COMPLETED, android...","[java/net/HttpURLConnection, android/content/C...","[android.permission.INTERNET, android.permissi...","[Read/Write External Storage, getPackageInfo, ...",[http://client.go360days.com/client.php?action...,1
1,000068216bdb459df847bfdd67dd11069c3c50166db1ea...,[android.hardware.touchscreen],"[android.permission.WRITE_EXTERNAL_STORAGE, an...","[paklena.batterydiviner.Settings, paklena.batt...","[android.intent.category.HOME, android.intent....","[android/content/Context;->startService, andro...",[android.permission.VIBRATE],"[Read/Write External Storage, getSystemService]",[],0
2,0000764713b286cfe7e8e76c7038c92312977712d9c5a8...,"[android.hardware.touchscreen, android.hardwar...","[android.permission.ACCESS_FINE_LOCATION, andr...","[BootReceiver, ru.alpha.AlphaReceiver, ru.alph...","[android.intent.category.HOME, android.intent....",[android/net/ConnectivityManager;->getActiveNe...,"[android.permission.INTERNET, android.permissi...","[getDeviceId, Read/Write External Storage, Obf...","[http://m-001.net/i/, m-001.net, http://m-001....",1
3,0000962c2c34de1ca0c329b18be7847459da2d9d14b6b2...,"[android.hardware.touchscreen, android.hardwar...",[android.permission.INTERNET],"[.index, index]","[android.intent.action.MAIN, android.intent.ca...",[android/webkit/WebView],[android.permission.INTERNET],[getSystemService],[],0
4,000167f1ff061ea91440c40659c11c2af160342fd2e493...,"[android.hardware.touchscreen, android.hardwar...","[android.permission.INTERNET, android.permissi...","[NokyART, .NokyArtWidget$UpdateService, .NokyA...","[android.intent.action.MAIN, android.intent.ca...",[android/net/ConnectivityManager;->getNetworkI...,"[android.permission.INTERNET, android.permissi...",[getSystemService],[http://nokyart.n97.fr/Application/android/ind...,0
...,...,...,...,...,...,...,...,...,...,...
995,01e78d647cabf702bdbbb3af46ab70f2cbb407dfe6c6f8...,"[android.hardware.touchscreen, android.hardwar...","[android.permission.INTERNET, android.permissi...",[.EmotionSensor],"[android.intent.action.MAIN, android.intent.ca...",[android/net/ConnectivityManager;->getActiveNe...,"[android.permission.ACCESS_NETWORK_STATE, andr...","[getPackageInfo, getSystemService, Cipher(AES/...",[http://schemas.android.com/apk/lib/com.google...,0
996,01e7a30caff13ed959ec74a26ec27564dea94389200ed9...,[android.hardware.touchscreen],[android.permission.WRITE_EXTERNAL_STORAGE],[.main],"[android.intent.action.MAIN, android.intent.ca...",[android/content/ContentResolver;->openInputSt...,"[android.permission.INTERNET, android.permissi...","[Read/Write External Storage, getSystemService]",[],0
997,01e7a6ea485da0acbe7ac6f7c1dbb20df27faa3a06130f...,"[android.hardware.touchscreen, android.hardwar...","[android.permission.INTERNET, android.permissi...",[.zaragoza_872],"[android.intent.action.MAIN, android.intent.ca...",[android/telephony/TelephonyManager;->getDevic...,"[android.permission.ACCESS_FINE_LOCATION, andr...","[getDeviceId, printStackTrace, Read/Write Exte...",[],0
998,01e86ce55b5175a75ed513d452c5a7169a56d013d822f1...,"[android.hardware.touchscreen, android.hardwar...","[android.permission.INTERNET, com.android.vend...",[.MainActivity],"[android.intent.action.MAIN, android.intent.ca...","[android/app/Activity;->startActivity, android...","[android.permission.READ_CONTACTS, android.per...","[printStackTrace, getPackageInfo, getSystemSer...","[http://market.android.com/details?id%3D, http...",0


Using CUDA device: NVIDIA GeForce RTX 4070 SUPER



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\david\Desktop\Clase\TFG\tfg_24_25\prototypes\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\david\Desktop\Clase\TFG\tfg_24_25\prototypes\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\david\Desktop\Clase\TFG\tfg_24_25\prototypes\.venv\Lib\site-packages\ipykernel\

In [5]:
# Neural-network hyperparameters, grouped by concern
nn_hyperparams = NNHyperparams(
    # --- task ---
    label_col="malware",
    n_classes=2,
    # --- architecture ---
    embedding_dim=128,
    seq_pooling="mean",
    hidden_dims=[128, 64],
    dropout=0.5,
    # --- optimisation ---
    optimizer="adam",
    learning_rate=1e-3,
    weight_decay=1e-5,
    batch_size=64,
    # --- training schedule ---
    epochs=10,
    early_stopping=True,
    patience=5,
)

# Repeated k-fold training (5 repetitions x 2 folds); the call returns the
# collected results together with the best model.
nn_results, best_nn_model = cross_val_train_nn_model(
    # data
    df=df,
    vocab_dict=vocab_dict,
    scalar_cols=SCALAR_COLS,
    # model / runtime
    hyperparams=nn_hyperparams,
    device=device,
    # cross-validation protocol
    n_folds=2,
    n_repetitions=5,
    scoring_metric="recall",
    random_seed=42,
)

Using device: cuda
Using recall as the primary scoring metric

=== Repetition 1/5, Fold 1/2 ===
Training set class distribution: {0: 474, 1: 26}
Validation set class distribution: {0: 473, 1: 27}
Epoch 1: Train Loss = 0.5966, Val Loss = 0.4966, recall = 0.9500
Epoch 2: Train Loss = 0.4531, Val Loss = 0.4250, recall = 0.9520
Epoch 3: Train Loss = 0.2767, Val Loss = 0.3284, recall = 0.9120
Epoch 4: Train Loss = 0.2890, Val Loss = 0.3664, recall = 0.9360
Epoch 5: Train Loss = 0.1380, Val Loss = 0.3363, recall = 0.9400
Epoch 6: Train Loss = 0.1860, Val Loss = 0.3036, recall = 0.9080
Epoch 7: Train Loss = 0.1193, Val Loss = 0.3264, recall = 0.9380
Epoch 8: Train Loss = 0.0989, Val Loss = 0.3796, recall = 0.9480
Epoch 9: Train Loss = 0.0743, Val Loss = 0.4590, recall = 0.9520
Epoch 10: Train Loss = 0.0539, Val Loss = 0.4453, recall = 0.9460
  * New best model: RECALL=0.9460
Fold 1 results: F1=0.9514, Accuracy=0.9460, Precision=0.9606, Recall=0.9460, PR-AUC=0.7248, ROC-AUC=0.9558
Training tim

In [6]:
# Persist the best network together with its vocabularies, hyperparameters
# and cross-validation results.
save_paths = save_model_with_metadata(
    model=best_nn_model,
    vocab_dict=vocab_dict,
    hyperparams=nn_hyperparams,
    results=nn_results,
)

# Reload the full model. Called without arguments, the loader resolves to the
# latest saved version (it logs "Loading latest model version: ...").
# NOTE(review): to pin a specific version instead, pass its identifier to the
# loader -- the argument name is not visible from this notebook; check
# torch_nn_model_io before relying on it.
model, vocab_dict, metadata = load_debrim_model_from_version()

# Load only the embedder (again from the latest version); `embedder` is what
# the embedding-extraction cell below consumes.
embedder, vocab_dict, metadata = load_debrim_embedder_from_version()

# Load just the metadata to inspect the stored performance metrics.
metadata = load_debrim_metadata()
print(f"F1 Score: {metadata['all_metrics'].get('mean_f1', 'N/A')}")

Model and artifacts saved to ./model_artifacts/nn_models\20250520_001800
Using CUDA device: NVIDIA GeForce RTX 4070 SUPER
Loading latest model version: 20250520_001800
Model loaded from ./model_artifacts/nn_models\20250520_001800
Using CUDA device: NVIDIA GeForce RTX 4070 SUPER
Loading latest model version: 20250520_001800
Model loaded from ./model_artifacts/nn_models\20250520_001800
Using CUDA device: NVIDIA GeForce RTX 4070 SUPER
Loading latest model version: 20250520_001800
Embedder loaded from ./model_artifacts/nn_models\20250520_001800
Loading latest model version: 20250520_001800
F1 Score: 0.959117854641144


In [7]:
# Train classical ML models on the embeddings produced by the loaded embedder

# NOTE(review): the embedder was trained on rows of this same `df`, so the
# cross-validated scores computed on these embeddings may be optimistic
# (the test folds are not unseen by the embedder) -- confirm, or evaluate on
# a held-out split.
X, y = extract_embeddings(
    df=df,
    vocab_dict=vocab_dict,
    scalar_cols=SCALAR_COLS,
    model=embedder,
    device=device,
)


# One hyperparameter set per classical model, keyed by the display name that
# appears in the training log.
ml_hyperparams_dict = {
    "RandomForest": MLHyperparams(
        model_type="random_forest",
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
    ),
    "XGBoost": MLHyperparams(
        model_type="xgboost",
        n_estimators=100,
        max_depth=10,
        learning_rate=0.1,
        random_state=42,
    ),
    "KNeighbors": MLHyperparams(
        model_type="knn",
        n_neighbors=5,
        weights="uniform",
    ),
    "SVM": MLHyperparams(
        model_type="svm",
        kernel="linear",
        C=1.0,
        random_state=42,
    ),
    "LogisticRegression": MLHyperparams(
        model_type="logistic_regression",
        C=1.0,
        solver="lbfgs",
        random_state=42,
    ),
}

# Legacy misspelled name, kept so any cell still referring to it keeps working.
ml_hyperarams_dict = ml_hyperparams_dict

# Train every configured model with the same 5 x 2-fold protocol used for the
# neural network; returns the collected results and the best model per type.
ml_results, ml_best_models = train_classical_models_cv(
    X=X,
    y=y,
    hyperparams_dict=ml_hyperparams_dict,
    n_folds=2,
    n_repetitions=5,
    scoring_metric="recall",
    random_state=42,
)

Training 5 models with 5 x 2-fold cross-validation...

=== Repetition 1/5, Fold 1/2 ===
Class distribution:
  Class 0: 474 train, 473 test
  Class 1: 26 train, 27 test
Training RandomForest...
  RandomForest: F1=0.9375, Accuracy=0.9540, Size=201.6KB, Time=0.21s
  ★ New best RandomForest model: recall=0.9540
Training XGBoost...
  XGBoost: F1=0.9470, Accuracy=0.9540, Size=87.7KB, Time=0.41s
  ★ New best XGBoost model: recall=0.9540
Training KNeighbors...
  KNeighbors: F1=0.9516, Accuracy=0.9580, Size=4004.6KB, Time=0.00s
  ★ New best KNeighbors model: recall=0.9580
Training SVM...
  SVM: F1=0.9756, Accuracy=0.9760, Size=586.7KB, Time=0.02s
  ★ New best SVM model: recall=0.9760
Training LogisticRegression...
  LogisticRegression: F1=0.9716, Accuracy=0.9700, Size=8.7KB, Time=0.01s
  ★ New best LogisticRegression model: recall=0.9700

=== Repetition 1/5, Fold 2/2 ===
Class distribution:
  Class 0: 473 train, 474 test
  Class 1: 27 train, 26 test
Training RandomForest...
  RandomForest: F1=0

In [8]:
# Persist the best classical models alongside their cross-validation results.
ml_saved_paths = save_ml_models_with_metadata(
    models=ml_best_models,
    results=ml_results,
    save_dir="./model_artifacts/ml_models",
)

# To reload the saved models later:
#   loaded_ml_models, version_metadata = load_ml_models_from_version()

Saved 5 model(s) to ./model_artifacts/ml_models\20250520_001808
  - RandomForest
  - XGBoost
  - KNeighbors
  - SVM
  - LogisticRegression
