In [1]:
import json
import os
import random
import warnings
from collections import defaultdict
from math import floor

import numpy as np
import pandas as pd

In [11]:
def _split_class_files(all_files, train_ratio, val_ratio):
    """
    Deduplicate, shuffle and cut one class's file list into (train, val, test).

    The unique paths are sorted before shuffling: iterating a ``set`` of strings
    follows hash order, which changes between interpreter runs unless
    PYTHONHASHSEED is fixed, so shuffling the raw set would not be reproducible
    even with a seeded RNG.
    """
    unique_files = sorted(set(all_files))
    np.random.shuffle(unique_files)  # draws from the global NumPy RNG seeded by the caller

    n_total = len(unique_files)
    n_train = int(n_total * train_ratio)
    n_val = int(n_total * val_ratio)

    # The test part takes whatever is left, so rounding never drops a file.
    return (unique_files[:n_train],
            unique_files[n_train:n_train + n_val],
            unique_files[n_train + n_val:])


def organize_malware_data_no_overlap(train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_seed=42,
                                     csv_path='MalVis_dataset_small/true_feature_labels.csv',
                                     features_dir='../MalVis_dataset_small/features',
                                     output_file='malware_data_structure_no_overlap.json'):
    """
    Organize the malware dataset's feature-file paths by class and split them
    into train/val/test WITHOUT any file appearing in more than one split.

    Every class found in the CSV is split per class into train/val/test parts.
    The final structure then keeps, for each split, only that split's part of
    its own malware families plus that split's part of the 'benign' class:

    * train: adware, trojan, riskware, addisplay, spr + benign
    * val:   exploit, spyware + benign
    * test:  malware, downloader + benign

    Labels in the CSV that are not listed above are ignored; listed classes
    that are missing from the CSV end up as empty lists.

    Parameters
    ----------
    train_ratio, val_ratio : float
        Fractions of each class assigned to the train and validation parts
        (counts are truncated with ``int``).
    test_ratio : float
        Expected fraction of the test part. The test part always receives the
        remainder ``1 - train_ratio - val_ratio``; a warning is issued when
        this value disagrees with that remainder.
    random_seed : int
        Seed applied to the global ``numpy.random`` and ``random`` generators.
    csv_path : str
        CSV file with at least the columns ``filename`` and ``label``.
    features_dir : str
        Directory prefix written into the generated paths
        (``<features_dir>/<label>/<name>.npy``).
    output_file : str
        Destination of the JSON dump of the returned structure.

    Returns
    -------
    dict
        ``{'train' | 'val' | 'test': {class_name: [file_path, ...]}}``.

    Raises
    ------
    ValueError
        If a ratio lies outside [0, 1] or ``train_ratio + val_ratio`` exceeds 1.
    """
    # Validate before touching the RNG state or the filesystem.
    for ratio_name, ratio in (('train_ratio', train_ratio),
                              ('val_ratio', val_ratio),
                              ('test_ratio', test_ratio)):
        if not 0 <= ratio <= 1:
            raise ValueError(f"{ratio_name} must be within [0, 1], got {ratio}")
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1, got "
                         f"{train_ratio + val_ratio}")
    if abs(train_ratio + val_ratio + test_ratio - 1) > 1e-9:
        warnings.warn("train/val/test ratios do not sum to 1; the test split receives the "
                      f"remaining {1 - train_ratio - val_ratio:.4f} of each class")

    # Set random seed for reproducibility
    np.random.seed(random_seed)
    random.seed(random_seed)

    # Load CSV
    df = pd.read_csv(csv_path)

    # Group the feature-file paths by label, in order of first appearance.
    data_by_class = defaultdict(list)
    for filename, label in zip(df['filename'], df['label']):
        # Drops the last four characters of the file name before appending
        # '.npy' (presumably a 3-letter extension such as '.apk' - TODO confirm).
        data_by_class[label].append(f"{features_dir}/{label}/{filename[:-4]}.npy")

    # Malware families and the single normal class
    malware_types = ['adware', 'trojan', 'riskware', 'addisplay', 'spr',
                     'exploit', 'spyware', 'malware', 'downloader']
    normal_type = 'benign'

    # Split every class into its train/val/test parts
    split_data = {'train': {}, 'val': {}, 'test': {}}
    for class_name, all_files in data_by_class.items():
        train_files, val_files, test_files = _split_class_files(all_files, train_ratio, val_ratio)
        split_data['train'][class_name] = train_files
        split_data['val'][class_name] = val_files
        split_data['test'][class_name] = test_files

    # Malware families are disjoint across splits; 'benign' is shared by all three.
    classes_per_split = {
        'train': malware_types[:5] + [normal_type],   # adware, trojan, riskware, addisplay, spr
        'val': malware_types[5:7] + [normal_type],    # exploit, spyware
        'test': malware_types[7:] + [normal_type],    # malware, downloader
    }

    # Create final data structure (absent classes become empty lists)
    organized_data = {
        split: {class_name: split_data[split].get(class_name, []) for class_name in class_names}
        for split, class_names in classes_per_split.items()
    }

    # Save as JSON file
    with open(output_file, 'w') as f:
        json.dump(organized_data, f, indent=2)

    print(f"Saved: {output_file}")

    # Print final statistics
    print("Final data distribution:")
    for split, classes in organized_data.items():
        split_total = sum(len(files) for files in classes.values())

        print(f"\n{split.upper()}: {list(classes.keys())} (Total: {split_total} documents)")
        for cls, files in classes.items():
            print(f"  {cls}: {len(files)} documents")
    print("Done")

    return organized_data

In [12]:
# Build the non-overlapping split: 70% train / 15% val / 15% test, seeded with 42.
organized_data = organize_malware_data_no_overlap(
    train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_seed=42
)

Saved: malware_data_structure_no_overlap.json
Final data distribution:

TRAIN: ['adware', 'trojan', 'riskware', 'addisplay', 'spr', 'benign'] (Total: 40884 documents)
  adware: 8288 documents
  trojan: 7419 documents
  riskware: 6325 documents
  addisplay: 5891 documents
  spr: 5777 documents
  benign: 7184 documents

VAL: ['exploit', 'spyware', 'benign'] (Total: 3491 documents)
  exploit: 1014 documents
  spyware: 938 documents
  benign: 1539 documents

TEST: ['malware', 'downloader', 'benign'] (Total: 3052 documents)
  malware: 771 documents
  downloader: 741 documents
  benign: 1540 documents
Done
