In [1]:
import pandas as pd
import pprint as pp
import string

from torch_nn_model_2 import (
    create_vocab_for_column,
    create_char_vocab,
    get_best_available_device,
    cross_val_train_nn_model,
    extract_embeddings,
    NNHyperparams,
)

from torch_nn_model_io_2 import (
    save_model_with_metadata,
    load_debrim_model_from_version,
    load_debrim_embedder_from_version,
    load_debrim_metadata,
)

from ml_model import MLHyperparams, train_classical_models_cv
from ml_model_io import (
    save_ml_models_with_metadata
)



In [2]:
# Define feature columns and load data
# Feature columns, grouped by the kind of input each one provides.
SEQUENCE_FEATURES = [
    "activities_list",
    "services_list",
    "receivers_list",
    "permissions_list",
    "api_calls_list",
]

CHAR_COLS = ["fuzzy_hash"]
VECTOR_COLS = ["opcode_counts"]
SCALAR_COLS = ["file_size"]

# Set the dimensions of vector features
VECTOR_DIMS = {"opcode_counts": 768}  # Adjust this based on your actual data

# Size of the random subset used for this run and the seed that makes the
# subset reproducible.
SAMPLE_SIZE = 200
SAMPLE_SEED = 42

# Load data
df = pd.read_csv("dataset/apk_analysis_dataset.csv")
# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling without replacement), so cap the request at the frame size.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

def parse_list_string(s):
    """Convert the text form of a list (e.g. "['a', 'b']") into a list of strings.

    A float NaN (how pandas represents a missing cell) becomes an empty list.
    Any other non-string value is handed back untouched. For strings, the
    surrounding brackets are stripped, the text is split on commas, and each
    piece loses its surrounding whitespace and quote characters; pieces that
    end up empty are dropped.
    """
    # Missing cell -> empty list
    if isinstance(s, float) and pd.isna(s):
        return []

    # Not text: nothing to parse, return as given
    if not isinstance(s, str):
        return s

    parsed = []
    for raw_token in s.strip('[]').split(','):
        token = raw_token.strip().strip("'").strip('"')
        if token:
            parsed.append(token)
    return parsed

# Parser for vector features stored as text
def parse_vector_string(s, dim=None):
    """Convert the text form of a numeric vector into a list of floats.

    Args:
        s: Raw cell value. A string such as "[1.0, 2.0]" is parsed. NaN and
            None are treated as missing. Any other non-string value is
            returned unchanged.
        dim: Length of the zero vector substituted for missing, empty or
            unparseable values. When omitted, VECTOR_DIMS["opcode_counts"]
            is used, which matches the previous behaviour.

    Returns:
        A list of floats, a zero vector of length ``dim`` when the value is
        missing, empty or cannot be parsed, or ``s`` itself when it is a
        non-string, non-missing value.
    """
    def zeros():
        # Resolve the fallback length lazily so an explicit ``dim`` never
        # needs the module-level VECTOR_DIMS mapping.
        size = VECTOR_DIMS["opcode_counts"] if dim is None else dim
        return [0.0] * size

    # Missing values: pandas NaN, or None from other sources
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return zeros()

    if not isinstance(s, str):
        return s  # Already a list or array

    # Remove brackets and split by comma
    body = s.strip('[]')
    try:
        vector = [float(item.strip()) for item in body.split(',') if item.strip()]
    except ValueError:
        # Fallback to zeros if parsing fails
        print(f"Warning: Failed to parse vector: {body[:30]}...")
        return zeros()

    # "[]" or "" carries no data; use the same stand-in as a missing value
    # rather than returning a zero-length vector.
    return vector if vector else zeros()

# Decode the text-encoded columns: sequence features first, then vector
# features, each with its matching parser. Columns absent from the frame
# are skipped.
for columns, parser in (
    (SEQUENCE_FEATURES, parse_list_string),
    (VECTOR_COLS, parse_vector_string),
):
    for col in columns:
        if col not in df.columns:
            continue
        df[col] = df[col].apply(parser)

In [3]:
# Preview the first five rows after parsing
df.head(n=5)

Unnamed: 0,file_size,fuzzy_hash,activities_list,services_list,receivers_list,permissions_list,api_calls_list,opcode_counts,is_malware
11945,5320340,OunNz4OqF4IWX+vRTSH9bj2VoHXevlKhJEEj1cx5jWU4za...,"[net.hscni.covidtracker.MainActivity, com.goog...",[androidx.work.impl.foreground.SystemForegroun...,[androidx.work.impl.background.systemalarm.Con...,"[android.permission.BLUETOOTH, android.permiss...","[android/accounts/Account.<init>, android/anim...","[195.0, 6971.0, 2487.0, 0.0, 503.0, 359.0, 0.0...",0
3213,5765093,zuTNYpV4C6V7y77aXXfrLTrWIn9e017p2jJe8TSKpL+8tK...,[com.tencent.midas.proxyactivity.APMidasPayPro...,[com.tencent.wns.service.WnsMain],"[com.tencent.base.os.clock.AlarmClockReceiver,...","[android.permission.GET_TASKS, android.permiss...","[QMF_PROTOCAL/QmfUpstream.display, QMF_SERVICE...","[258.0, 7494.0, 1325.0, 0.0, 307.0, 843.0, 0.0...",1
18925,4701596,sPd7OjyhfuFN9xcOxAOrexhyzI2FjzpbZ/vYU2Sg/8OdTu...,[com.google.android.gms.common.api.GoogleApiAc...,[com.google.firebase.messaging.FirebaseMessagi...,[com.dexterous.flutterlocalnotifications.Sched...,"[android.permission.ACCESS_NETWORK_STATE, andr...","[a0/k$b.clone, a0/o$b.clone, a0/o$c.clone, a0/...","[98.0, 2786.0, 1305.0, 0.0, 235.0, 229.0, 0.0,...",0
9565,2921128,4t62fBc2Po4P+xWEDY+wT/btrV3rsCWnHMukT04jBpSROO...,[com.haibian.work.activity.uploadhomework.Sett...,"[cn.jpush.android.service.DownloadService, cn....","[cn.jpush.android.service.PushReceiver, cn.jpu...","[android.permission.READ_PHONE_STATE, android....",[android/accessibilityservice/AccessibilitySer...,"[166.0, 1982.0, 2637.0, 0.0, 29.0, 55.0, 0.0, ...",1
16571,2950303,/3UhYWSFDdngfvm6M2z278tJtgoQUu9EgHlcl3S7GZAp,[io.flutter.plugins.urllauncher.WebViewActivit...,[com.google.android.datatransport.runtime.back...,[com.google.firebase.iid.FirebaseInstanceIdRec...,"[android.permission.VIBRATE, android.permissio...","[android/accounts/Account.<init>, android/acco...","[188.0, 1259.0, 1260.0, 0.0, 341.0, 218.0, 0.0...",0


In [4]:
# Build the vocabularies used by the model.
# One vocabulary per sequence feature, keyed by column name.
vocab_dict = {}
for seq_col in SEQUENCE_FEATURES:
    vocab_dict[seq_col] = create_vocab_for_column(df, seq_col)

# Character vocabulary for fuzzy_hash: ASCII letters, digits, '+' and '/'.
all_chars = string.ascii_letters + string.digits + "+/"
char_vocab = create_char_vocab(set(all_chars))
vocab_dict['fuzzy_hash'] = char_vocab

# Function to convert a string to index array
def convert_hash_to_indices(hash_str, vocab):
    """Encode a hash string as a list of character indices.

    Args:
        hash_str: The value to encode. A non-empty string is mapped
            character by character. A list or tuple is taken to be an
            already-encoded value and is returned as a list. Anything else
            (NaN, None, an empty string) is treated as missing.
        vocab: Object whose ``get_stoi()`` returns a character-to-index
            mapping containing the "<UNK>" and "<EMPTY>" entries.

    Returns:
        A list of indices. Characters absent from the mapping use the
        "<UNK>" index; a missing value yields a single "<EMPTY>" index.
    """
    stoi = vocab.get_stoi()

    # Already encoded, e.g. when the conversion cell is re-run on a column
    # that was converted before. Pass it through so real data is not
    # collapsed to <EMPTY>.
    if isinstance(hash_str, (list, tuple)):
        return list(hash_str)

    if isinstance(hash_str, str) and hash_str:
        unk_index = stoi["<UNK>"]
        return [stoi.get(c, unk_index) for c in hash_str]

    # Missing value or empty string: never return a zero-length sequence
    return [stoi["<EMPTY>"]]
        
# Encode every character-level column that is present in the frame
for col in [name for name in CHAR_COLS if name in df.columns]:
    df[col] = df[col].apply(
        lambda raw_hash: convert_hash_to_indices(raw_hash, vocab_dict[col])
    )

# Display sample data
first_seq_stoi = vocab_dict[SEQUENCE_FEATURES[0]].get_stoi()
sample_entries = list(first_seq_stoi.items())[:5]
print("Sample sequence vocabulary:", sample_entries)
print("Character vocabulary size:", len(vocab_dict['fuzzy_hash']))
display(df.head())

# Get the best available device
device = get_best_available_device()

Sample sequence vocabulary: [('com.money_design.theo.presentation.others.information.InformationMenuListActivity', 2585), ('com.tencent.tinker.loader.hotplug.ActivityStubs$STDStub_02', 117), ('cn.kuwo.player.wxapi.WXEntryActivity', 742), ('br.com.rmsolucoesmoveis.hinariocristao.SplashKt', 680), ('com.qihoo.appstore.home.LauncherActivity', 516)]
Character vocabulary size: 67


Unnamed: 0,file_size,fuzzy_hash,activities_list,services_list,receivers_list,permissions_list,api_calls_list,opcode_counts,is_malware
11945,5320340,"[29, 61, 54, 28, 66, 9, 29, 57, 20, 9, 23, 37,...","[net.hscni.covidtracker.MainActivity, com.goog...",[androidx.work.impl.foreground.SystemForegroun...,[androidx.work.impl.background.systemalarm.Con...,"[android.permission.BLUETOOTH, android.permiss...","[android/accounts/Account.<init>, android/anim...","[195.0, 6971.0, 2487.0, 0.0, 503.0, 359.0, 0.0...",0
3213,5765093,"[66, 61, 34, 28, 39, 56, 36, 9, 17, 11, 36, 12...",[com.tencent.midas.proxyactivity.APMidasPayPro...,[com.tencent.wns.service.WnsMain],"[com.tencent.base.os.clock.AlarmClockReceiver,...","[android.permission.GET_TASKS, android.permiss...","[QMF_PROTOCAL/QmfUpstream.display, QMF_SERVICE...","[258.0, 7494.0, 1325.0, 0.0, 307.0, 843.0, 0.0...",1
18925,4701596,"[59, 30, 44, 12, 29, 50, 65, 48, 46, 61, 20, 2...",[com.google.android.gms.common.api.GoogleApiAc...,[com.google.firebase.messaging.FirebaseMessagi...,[com.dexterous.flutterlocalnotifications.Sched...,"[android.permission.ACCESS_NETWORK_STATE, andr...","[a0/k$b.clone, a0/o$b.clone, a0/o$c.clone, a0/...","[98.0, 2786.0, 1305.0, 0.0, 235.0, 229.0, 0.0,...",0
9565,2921128,"[9, 60, 11, 7, 46, 16, 43, 7, 30, 55, 9, 30, 3...",[com.haibian.work.activity.uploadhomework.Sett...,"[cn.jpush.android.service.DownloadService, cn....","[cn.jpush.android.service.PushReceiver, cn.jpu...","[android.permission.READ_PHONE_STATE, android....",[android/accessibilityservice/AccessibilitySer...,"[166.0, 1982.0, 2637.0, 0.0, 29.0, 55.0, 0.0, ...",1
16571,2950303,"[4, 8, 35, 48, 39, 37, 33, 20, 18, 44, 54, 47,...",[io.flutter.plugins.urllauncher.WebViewActivit...,[com.google.android.datatransport.runtime.back...,[com.google.firebase.iid.FirebaseInstanceIdRec...,"[android.permission.VIBRATE, android.permissio...","[android/accounts/Account.<init>, android/acco...","[188.0, 1259.0, 1260.0, 0.0, 341.0, 218.0, 0.0...",0


Using CUDA device: NVIDIA GeForce RTX 4070 SUPER



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\david\Desktop\Clase\TFG\tfg_24_25\prototypes\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\david\Desktop\Clase\TFG\tfg_24_25\prototypes\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\david\Desktop\Clase\TFG\tfg_24_25\prototypes\.venv\Lib\site-packages\ipykernel\

In [5]:
# Configure hyperparameters
nn_hyperparams = NNHyperparams(
    # Optimisation settings
    optimizer="adam",
    learning_rate=1e-3,
    weight_decay=1e-5,
    batch_size=256,
    epochs=10,
    early_stopping=True,
    patience=5,
    # Network shape
    embedding_dim=128,
    hidden_dims=[128, 32],
    dropout=0.5,
    seq_pooling="mean",
    # Target
    n_classes=2,
    label_col="is_malware",
)

# Which columns feed the model, grouped by feature type
feature_config = {
    "sequence_cols": SEQUENCE_FEATURES,
    "scalar_cols": SCALAR_COLS,
    "char_cols": CHAR_COLS,
    "vector_cols": VECTOR_COLS,
    "vector_dims": VECTOR_DIMS,
}

# Cross-validated training: 2 folds, 5 repetitions, scored on recall
nn_results, best_nn_model = cross_val_train_nn_model(
    df=df,
    vocab_dict=vocab_dict,
    hyperparams=nn_hyperparams,
    n_folds=2,
    n_repetitions=5,
    scoring_metric="recall",
    device=device,
    random_seed=42,
    **feature_config,
)

Using device: cuda
Using recall as the primary scoring metric
Using class weights: [0.97087379 1.03092784]

=== Repetition 1/5, Fold 1/2 ===
Training set class distribution: {0: 51, 1: 49}
Validation set class distribution: {0: 52, 1: 48}
Created datasets for training and validation
Created loaders for training and validation
Initialized model with specified configuration
Starting training...


AttributeError: 'DebrimEmbedder' object has no attribute 'embedders'

In [None]:
from torch_nn_model_2 import DebrimEmbedder, DebrimModel

# Build a model directly from the feature configuration
model_config = dict(
    vocab_dict=vocab_dict,
    sequence_cols=SEQUENCE_FEATURES,
    scalar_cols=SCALAR_COLS,
    char_cols=CHAR_COLS,
    vector_cols=VECTOR_COLS,
    vector_dims=VECTOR_DIMS,
)
model = DebrimModel.from_config(**model_config)

# Print the model's internal structure
pp.pprint(model)

In [None]:
# Persist the best cross-validated model with its vocabularies,
# hyperparameters and results
save_paths = save_model_with_metadata(
    model=best_nn_model,
    vocab_dict=vocab_dict,
    hyperparams=nn_hyperparams,
    results=nn_results,
)

# Reload the full model from the latest version
model, vocab_dict, metadata = load_debrim_model_from_version()

# Reload only the embedder from the latest version
embedder, vocab_dict, metadata = load_debrim_embedder_from_version()

# Reload only the metadata and report the recall recorded in it
metadata = load_debrim_metadata()
summary_metrics = metadata['summary_metrics']
mean_recall = summary_metrics.get('mean_recall', 'N/A')
print(f"Recall Score: {mean_recall}")

In [None]:
# Train classical ML models on embeddings from the new model architecture
X, y = extract_embeddings(
    model=embedder,
    df=df,
    vocab_dict=vocab_dict,
    sequence_cols=SEQUENCE_FEATURES,
    scalar_cols=SCALAR_COLS,
    char_cols=CHAR_COLS,
    vector_cols=VECTOR_COLS,
    device=device,
    label_col="is_malware",
)

# Define classical ML model hyperparameters
ml_hyperparams_dict = {
    "RandomForest": MLHyperparams(
        model_type="random_forest",
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
    ),
    "XGBoost": MLHyperparams(
        model_type="xgboost",
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
    ),
    "SVM": MLHyperparams(
        model_type="svm",
        C=1.0,
        kernel="linear",
        probability=True,
        random_state=42,
    ),
    "LogisticRegression": MLHyperparams(
        model_type="logistic_regression",
        C=1.0,
        solver="liblinear",  # Changed to handle high-dimensional data better
        random_state=42,
    ),
}

# Train classical models with cross-validation
ml_results, ml_best_models = train_classical_models_cv(
    X=X,
    y=y,
    hyperparams_dict=ml_hyperparams_dict,
    n_folds=2,
    n_repetitions=5,
    scoring_metric="recall",
    random_state=42,
)

# NOTE: saving is left to the next cell, which already calls
# save_ml_models_with_metadata with the same arguments. Calling it here as
# well would persist the same models twice on every run.

In [None]:
# Persist the best classical models together with their CV results
ML_MODELS_DIR = "./model_artifacts/ml_models"
ml_saved_paths = save_ml_models_with_metadata(
    models=ml_best_models,
    results=ml_results,
    save_dir=ML_MODELS_DIR,
)

# loaded_ml_models, version_metadata = load_ml_models_from_version()