# Combine Parser

In [2]:
# libraries
import os
import json
import pandas as pd
from tqdm import tqdm  # Import tqdm for the progress bar
import pickle
import time  # Import time to track duration
import fastparquet
import pyarrow

# 1. API Calls Parser

This code processes JSON reports generated by Cuckoo sandbox, extracting information about API calls made during the analysis. For each report, it identifies all unique API calls under the `apistats` section and creates a feature for each, prefixed with "API:". The code generates a summary for each report, marking `1` if an API call occurred and `0` otherwise. It stores these summaries in a DataFrame, with the `sample_id` as the first column and API calls as subsequent columns. The `sample_id` is sorted numerically to ensure proper order (e.g., 10001, 10002, etc.).

In [3]:
def summarize_api_calls(apistats):
    """
    Summarizes API calls from the apistats section of the report.
    Returns a dictionary with keys as feature names prefixed with "API:" and values as 1 (since the API was called).
    """
    # Find all unique API calls across all processes
    unique_api_calls = set()
    for api_dict in apistats.values():
        unique_api_calls.update(api_dict.keys())

    # Initialize a dictionary to store the summary with "API:" prefix
    summary = {f"API:{api}": 0 for api in unique_api_calls}

    # Set called API calls to 1
    for api_dict in apistats.values():
        for api in api_dict.keys():
            summary[f"API:{api}"] = 1

    return summary

def process_reports_folder(reports_folder):
    """
    Processes all JSON report files in the specified folder and returns a DataFrame
    with summarized API calls for each report.
    """
    # Initialize a list to store rows of data
    data = []
    file_ids = []

    # Loop through all JSON files in the reports folder with progress bar
    all_files = [f for f in os.listdir(reports_folder) if f.isdigit() or f.endswith('.json')]

    # Initialize tqdm progress bar
    for filename in tqdm(all_files, desc="Processing files", unit="file"):
        file_path = os.path.join(reports_folder, filename)
        with open(file_path, 'r') as file:
            report = json.load(file)
            apistats = report.get('behavior', {}).get('apistats', {})
            summary = summarize_api_calls(apistats)
            file_id = os.path.splitext(filename)[0]  # Extract file ID (e.g., 10001)
            data.append(summary)
            file_ids.append(int(file_id))  # Convert file ID to integer for sorting

    # Create a DataFrame from the list of summaries and set the index as "sample_id"
    df = pd.DataFrame(data, index=file_ids).fillna(0).astype(int).rename_axis("sample_id")

    # Sort the DataFrame by sample_id to ensure it's in order (10001, 10002, etc.)
    df.sort_index(inplace=True)

    return df

if __name__ == "__main__":
    reports_folder = "json_reports" # Folder containing Cuckoo report JSON files
    df1_api = process_reports_folder(reports_folder)
    display(df1_api)


Processing files: 100%|██████████| 4884/4884 [42:58<00:00,  1.89file/s]  


Unnamed: 0_level_0,API:GetAdaptersInfo,API:MessageBoxTimeoutW,API:NtQueryValueKey,API:NtOpenFile,API:NtProtectVirtualMemory,API:SetErrorMode,API:NtFreeVirtualMemory,API:CreateProcessInternalW,API:RegOpenKeyExW,API:NtOpenDirectoryObject,...,API:NtLoadKey2,API:NtSaveKey,API:NtCreateUserProcess,API:NtLoadDriver,API:InternetGetConnectedStateExW,API:CryptUnprotectData,API:WSARecvFrom,API:WSASendTo,API:DeleteUrlCacheEntryW,API:DnsQuery_UTF8
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,1,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,1,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15691,0,0,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
15692,0,0,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
15693,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
15694,0,0,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Functions to automate data processing

In [5]:
def rename_columns_with_numbers_index(df, start_number=1):
    """
    Rename columns (except 'sample_id') to numbers starting from the specified number.
    Args:
    - df: The input pandas DataFrame, where 'sample_id' is already set as the index.
    - start_number: The number from which the column renaming should start. Default is 1.
    
    Returns:
    - df: DataFrame with renamed columns.
    - column_map: Dictionary mapping numbers to original column names.
    """
    # Ensure that 'sample_id' is the index (no need to set it if already done)
    if 'sample_id' in df.index.name:
        # Get all columns except 'sample_id'
        columns_to_rename = df.columns
        
        # Create a dictionary mapping new column numbers to old column names
        column_map = {i + start_number: columns_to_rename[i] for i in range(len(columns_to_rename))}
        
        # Rename columns to start from start_number
        df.columns = [str(i + start_number) for i in range(len(columns_to_rename))]
        
        return df, column_map
    else:
        raise ValueError("The DataFrame does not have 'sample_id' as its index.")



def rename_columns_with_numbers(df, start_number=1):
    """
    Rename columns (except 'sample_id') to numbers starting from the specified number.
    Args:
    - df: The input pandas DataFrame.
    - start_number: The number from which the column renaming should start. Default is 1.
    
    Returns:
    - df: DataFrame with renamed columns.
    - column_map: Dictionary mapping numbers to original column names.
    """
    # Make sure 'sample_id' is the index, and rename only the columns after it
    df.set_index('sample_id', inplace=True)
    
    # Get all columns except 'sample_id'
    columns_to_rename = df.columns
    
    # Create a dictionary mapping new column numbers to old column names
    column_map = {i + start_number: columns_to_rename[i] for i in range(len(columns_to_rename))}
    
    # Rename columns to start from start_number
    df.columns = [str(i + start_number) for i in range(len(columns_to_rename))]
    
    return df, column_map

def save_column_map(column_map, filename):
    """
    Save the column mapping dictionary to a JSON file.
    Args:
    - column_map: Dictionary mapping numbers to original column names.
    - filename: The filename to save the dictionary.
    """
    with open(filename, 'w') as file:
        json.dump(column_map, file, indent=4)

def load_column_map(filename):
    """
    Load the column mapping dictionary from a JSON file.
    Args:
    - filename: The filename to load the dictionary from.
    
    Returns:
    - column_map: Dictionary mapping numbers to original column names.
    """
    with open(filename, 'r') as file:
        column_map = json.load(file)
    
    return column_map

def merge_column_map_to_df(df, column_map):
    """
    Merge the column map to the DataFrame to get the original column names.
    Args:
    - df: DataFrame with numbered columns.
    - column_map: Dictionary mapping numbers to original column names.
    
    Returns:
    - df: DataFrame with original column names restored.
    """
    # Replace numbered columns with their original names
    df = df.rename(columns={str(i): column_map[i] for i in column_map})
    return df

def print_first_and_last_10_items(dictionary):
    """
    Prints the first 10 and last 10 items of a dictionary.
    Args:
    - dictionary: The dictionary whose first and last 10 items are to be printed.
    """
    # Get the first 10 items
    first_10_items = list(dictionary.items())[:10]
    # Get the last 10 items
    last_10_items = list(dictionary.items())[-10:]

    # Print the first 10 items
    print("First 10 items:")
    for key, value in first_10_items:
        print(f"{key}: {value}")

    # Print a separator
    print("\n" + "-"*40 + "\n")

    # Print the last 10 items
    print("Last 10 items:")
    for key, value in last_10_items:
        print(f"{key}: {value}")


In [6]:
def find_constant_features(df):
    """
    Find and list the constant features (columns) in a DataFrame.
    A constant feature is a column where all values are the same.

    Args:
    - df: The input pandas DataFrame.

    Returns:
    - None: Prints a message with constant features or indicates no constant features.
    """
    # Find constant features (columns with the same value across all rows)
    constant_features = [col for col in df.columns if df[col].nunique() == 1]

    if constant_features:
        print("Constant features found:")
        for feature in constant_features:
            print(feature)
    else:
        print("No constant features found.")


In [10]:
def check_column_uniqueness(df):
    """
    Check if the column names in the DataFrame are unique or contain duplicates,
    ignoring case sensitivity by converting all column names to lowercase.

    Args:
    - df: The input pandas DataFrame.

    Returns:
    - None: Prints whether the column names are unique or contain duplicates.
    """
    # Convert all column names to lowercase
    column_names_lower = [col.lower() for col in df.columns]
    
    # Check if the length of column names set is equal to the length of original column names (case-insensitive)
    if len(column_names_lower) == len(set(column_names_lower)):
        print("All column names are unique.")
    else:
        print("There are duplicate column names.")

def list_and_count_duplicates(df):
    """
    List duplicate column names (case-insensitive) and count how many times each appears.

    Args:
    - df: The input pandas DataFrame.

    Returns:
    - None: Prints the duplicate column names and their counts.
    """
    # Convert all column names to lowercase for case-insensitive comparison
    column_names_lower = [col.lower() for col in df.columns]
    
    # Create a dictionary to count occurrences of each column name
    from collections import Counter
    column_counts = Counter(column_names_lower)
    
    # Filter to get only the columns that have more than 1 occurrence
    duplicates = {col: count for col, count in column_counts.items() if count > 1}
    
    if duplicates:
        print("Duplicate column names and their counts (case-insensitive):")
        for col, count in duplicates.items():
            print(f"{col}: {count} times")
    else:
        print("No duplicate column names found.")


In [7]:
# Call the function to find constant features
find_constant_features(df1_api)

No constant features found.


In [8]:
# Call the function to check if column names are unique
check_column_uniqueness(df1_api)

All column names are unique.


In [9]:
 # Step 1: Rename columns starting from 1
df1_api2, column_map_api = rename_columns_with_numbers_index(df1_api, start_number=1)
    
# Step 2: Save the column map
save_column_map(column_map_api, '5_mlran_dataset/1_api_feature_names_dic.json')

In [10]:
print_first_and_last_10_items(column_map_api)

display(df1_api2.head())

df1_api2.to_csv("5_mlran_dataset/1_api_dataset.csv")

First 10 items:
1: API:GetAdaptersInfo
2: API:MessageBoxTimeoutW
3: API:NtQueryValueKey
4: API:NtOpenFile
5: API:NtProtectVirtualMemory
6: API:SetErrorMode
7: API:NtFreeVirtualMemory
8: API:CreateProcessInternalW
9: API:RegOpenKeyExW
10: API:NtOpenDirectoryObject

----------------------------------------

Last 10 items:
304: API:NtLoadKey2
305: API:NtSaveKey
306: API:NtCreateUserProcess
307: API:NtLoadDriver
308: API:InternetGetConnectedStateExW
309: API:CryptUnprotectData
310: API:WSARecvFrom
311: API:WSASendTo
312: API:DeleteUrlCacheEntryW
313: API:DnsQuery_UTF8


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,304,305,306,307,308,309,310,311,312,313
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,1,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,1,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


## Saving as a Parquet file for efficient data storage and retrieval

Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides high performance compression and encoding schemes to handle complex data in bulk and is supported in many programming language and analytics tools.

References: 
1. https://parquet.apache.org/ 
2. https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html

In [11]:
df1_api2.to_parquet('5_mlran_dataset/1_api_dataset.parquet', compression='snappy')

# 2. Registry Keys Parser

This code processes Cuckoo sandbox reports to extract registry key operations such as `WRITTEN, READ, OPENED, DELETED`. It reads each JSON file in a specified folder, identifies registry operations from the behavior -> summary section (under sub-section: `regkey_written`, `regkey_read`, `regkey_opened`, `regkey_deleted`) and creates a dictionary where the registry operation features are labeled as REG:OPENED, REG:DELETED, REG:READ, and REG:WRITTEN for each key. It then generates a DataFrame, where each row represents a sample report and columns represent unique registry operations, with a value of 1 if the operation exists and 0 otherwise. The sample_id column is sorted numerically.

In [12]:
def extract_registry_operations(report_path):
    """
    Extracts registry key operations (opened, deleted, read, written) from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the key exists in this report).
    """
    with open(report_path, 'r') as f:
        report = json.load(f)

    registry_features = {}

    # Get the relevant registry operations from behavior -> summary
    behavior_summary = report.get('behavior', {}).get('summary', {})

    # Process regkey_opened
    for regkey in behavior_summary.get('regkey_opened', []):
        feature_name = f"REG:OPENED:{regkey}"
        registry_features[feature_name] = 1

    # Process regkey_deleted
    for regkey in behavior_summary.get('regkey_deleted', []):
        feature_name = f"REG:DELETED:{regkey}"
        registry_features[feature_name] = 1

    # Process regkey_read
    for regkey in behavior_summary.get('regkey_read', []):
        feature_name = f"REG:READ:{regkey}"
        registry_features[feature_name] = 1

    # Process regkey_written
    for regkey in behavior_summary.get('regkey_written', []):
        feature_name = f"REG:WRITTEN:{regkey}"
        registry_features[feature_name] = 1

    return registry_features


def create_registry_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique registry key operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    # Start timing the process
    start_time = time.time()

    # List to store data for each sample
    data = []

    # Set to collect all unique feature names (registry key operations)
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract registry operations for the current report
        registry_features = extract_registry_operations(report_path)

        # Add the current sample ID and registry features to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(registry_features)
        data.append(sample_data)

        # Add the registry features to the all_features set
        all_features.update(registry_features.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    # End timing the process
    end_time = time.time()

    # Calculate the total time taken and print it
    total_time = end_time - start_time
    print(f"\nTotal time to process and create the registry dataframe: {total_time:.2f} seconds")

    return df

# Implementing the code
reports_folder = "json_reports" # Folder containing Cuckoo report JSON files
df2_reg = create_registry_dataframe(reports_folder)
display(df2_reg)

Processing reports: 100%|██████████| 4884/4884 [24:06<00:00,  3.38file/s]  



Total time to process and create the registry dataframe: 1676.49 seconds


Unnamed: 0,sample_id,REG:DELETED:HKEY_CLASSES_ROOT\*\shell\Secure Eraser,REG:DELETED:HKEY_CLASSES_ROOT\.jod,REG:DELETED:HKEY_CLASSES_ROOT\.ppx,REG:DELETED:HKEY_CLASSES_ROOT\.prx,REG:DELETED:HKEY_CLASSES_ROOT\.qtl,REG:DELETED:HKEY_CLASSES_ROOT\.ttl,REG:DELETED:HKEY_CLASSES_ROOT\AIFCmp1.ISubclass,REG:DELETED:HKEY_CLASSES_ROOT\AIFCmp1.asxBars,REG:DELETED:HKEY_CLASSES_ROOT\AIFCmp1.asxButtonStrip,...,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\AI Internet Solutions\CSE HTML Validator v4\ValidatorEngineDLLFullPathV240-x64,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\AI Internet Solutions\CSE HTML Validator v4\ValidatorEngineDLLV240,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\AI Internet Solutions\CSE HTML Validator v4\ValidatorEngineDLLV240-x64,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\AdminTest\AdminTest,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\Microsoft\Windows\CurrentVersion\Internet Settings\Connections\DefaultConnectionSettings,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\Microsoft\Windows\CurrentVersion\Internet Settings\Connections\SavedLegacySettings,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\Microsoft\Windows\CurrentVersion\Internet Settings\ProxyEnable,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\AutoDetect,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\UNCAsIntranet,REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\Piriform\Recuva\Language
278,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3295,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3933,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,15691,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,15692,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933,15693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4041,15694,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Call the function to find constant features
find_constant_features(df2_reg)

No constant features found.


In [14]:
# Call the function to check if column names are unique
check_column_uniqueness(df2_reg)

There are duplicate column names.


In [15]:
 # Step 1: Rename columns starting from 1
df2_reg2, column_map_reg = rename_columns_with_numbers(df2_reg, start_number=314)

print_first_and_last_10_items(column_map_reg)

display(df2_reg2.head())

First 10 items:
314: REG:DELETED:HKEY_CLASSES_ROOT\*\shell\Secure Eraser
315: REG:DELETED:HKEY_CLASSES_ROOT\.jod
316: REG:DELETED:HKEY_CLASSES_ROOT\.ppx
317: REG:DELETED:HKEY_CLASSES_ROOT\.prx
318: REG:DELETED:HKEY_CLASSES_ROOT\.qtl
319: REG:DELETED:HKEY_CLASSES_ROOT\.ttl
320: REG:DELETED:HKEY_CLASSES_ROOT\AIFCmp1.ISubclass
321: REG:DELETED:HKEY_CLASSES_ROOT\AIFCmp1.asxBars
322: REG:DELETED:HKEY_CLASSES_ROOT\AIFCmp1.asxButtonStrip
323: REG:DELETED:HKEY_CLASSES_ROOT\AIFCmp1.asxFontLabel

----------------------------------------

Last 10 items:
525809: REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\AI Internet Solutions\CSE HTML Validator v4\ValidatorEngineDLLFullPathV240-x64
525810: REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\AI Internet Solutions\CSE HTML Validator v4\ValidatorEngineDLLV240
525811: REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\AI Internet Solutions\CSE HTML Validator v4\ValidatorEngineDLLV240-x64
525812: REG:WRITTEN:\REGISTRY\USER\.DEFAULT\SOFTWARE\AdminTest\AdminTest
52

Unnamed: 0_level_0,314,315,316,317,318,319,320,321,322,323,...,525809,525810,525811,525812,525813,525814,525815,525816,525817,525818
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# saving the feature names dictionary
save_column_map(column_map_reg, '5_mlran_dataset/2_reg_feature_names_dic.json')

In [17]:
# Saving the data
df2_reg2.to_csv("5_mlran_dataset/2_reg_dataset.csv")

In [18]:
df2_reg2.to_parquet('5_mlran_dataset/2_reg_dataset.parquet', compression='snappy')

# 3. File Operations Parser

This code processes Cuckoo report JSON files to extract file operations such as file creation, deletion, opening, and writing. It reads each report, identifies these operations from the behavior -> summary section (under sub-section, `file_created`, `file_recreated`, `file_opened`, `file_written`, `file_deleted`, `file_exists`, `file_failed`, `file_read`) and formats them into feature names like `FILE:CREATED:<filepath>`. A DataFrame is then created where each row represents a sample report, and columns represent unique file operations, with 1 indicating the presence of an operation and 0 otherwise. The sample_id column is used to identify each report, and the rows are sorted by this ID.

In [19]:
def extract_file_operations(report_path):
    """
    Extracts file operations (created, recreated, opened, written, deleted, exists, failed, read) from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the operation exists in this report).
    """
    file_operations = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        behavior_summary = report.get('behavior', {}).get('summary', {})

        # Mapping for file operation names to the desired format
        file_op_mapping = {
            'file_created': 'CREATED',
            'file_recreated': 'RECREATED',
            'file_opened': 'OPENED',
            'file_written': 'WRITTEN',
            'file_deleted': 'DELETED',
            'file_exists': 'EXISTS',
            'file_failed': 'FAILED',
            'file_read': 'READ'
        }

        for file_op, formatted_name in file_op_mapping.items():
            for filepath in behavior_summary.get(file_op, []):
                # Construct feature name with FILE and FORMATTED NAME in uppercase
                # Convert filepath to lowercase to ensure uniqueness
                feature_name = f"FILE:{formatted_name}:{filepath.lower()}"
                file_operations[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return file_operations

def create_file_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique file operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        file_operations = extract_file_operations(report_path)

        sample_data = {"sample_id": sample_id}
        sample_data.update(file_operations)
        data.append(sample_data)

        all_features.update(file_operations.keys())

    df = pd.DataFrame(data)

    # Ensure columns are sorted and all feature names are unique (case-insensitive)
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)
    df.fillna(0, inplace=True)
    df = df.astype(int)

    df['sample_id'] = df['sample_id'].astype(int)
    df.sort_values(by='sample_id', inplace=True)

    return df

# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df3_file = create_file_operations_dataframe(reports_folder)
    display(df3_file)

Processing reports: 100%|██████████| 4884/4884 [20:52<00:00,  3.90file/s]  


Unnamed: 0,sample_id,FILE:CREATED:c:\!!!how_to_decrypt!!!.txt,FILE:CREATED:c:\!_how_recovery_files_!.html,FILE:CREATED:c:\!help_sos.hta,FILE:CREATED:c:\!recovery_hvo.html,FILE:CREATED:c:\# decrypt my files #.html,FILE:CREATED:c:\# decrypt my files #.txt,FILE:CREATED:c:\# decrypt my files #.vbs,FILE:CREATED:c:\$recycle.bin .exe,FILE:CREATED:c:\$recycle.bin\07hsev.tmp,...,FILE:WRITTEN:z:\boot\recovery_instructions.html,FILE:WRITTEN:z:\boot\ru-ru\bootmgr.exe.mui,FILE:WRITTEN:z:\boot\sv-se\bootmgr.exe.mui,FILE:WRITTEN:z:\boot\tr-tr\bootmgr.exe.mui,FILE:WRITTEN:z:\boot\zh-cn\bootmgr.exe.mui,FILE:WRITTEN:z:\boot\zh-hk\bootmgr.exe.mui,FILE:WRITTEN:z:\boot\zh-tw\bootmgr.exe.mui,FILE:WRITTEN:z:\bootsect.bak,FILE:WRITTEN:z:\get_your_files_back.txt,FILE:WRITTEN:z:\recovery_instructions.html
278,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3295,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3933,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,15691,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,15692,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933,15693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4041,15694,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Call the function to find constant features
find_constant_features(df3_file)

No constant features found.


In [22]:
# Call the function to check if column names are unique
check_column_uniqueness(df3_file)

All column names are unique.


In [23]:
list_and_count_duplicates(df3_file)

No duplicate column names found.


In [24]:
 # Step 1: Rename columns starting from 525819
df3_file2, column_map_file = rename_columns_with_numbers(df3_file, start_number=525819)

print_first_and_last_10_items(column_map_file)

display(df3_file2.head())

First 10 items:
525819: FILE:CREATED:c:\!!!how_to_decrypt!!!.txt
525820: FILE:CREATED:c:\!_how_recovery_files_!.html
525821: FILE:CREATED:c:\!help_sos.hta
525822: FILE:CREATED:c:\!recovery_hvo.html
525823: FILE:CREATED:c:\# decrypt my files #.html
525824: FILE:CREATED:c:\# decrypt my files #.txt
525825: FILE:CREATED:c:\# decrypt my files #.vbs
525826: FILE:CREATED:c:\$recycle.bin .exe
525827: FILE:CREATED:c:\$recycle.bin\07hsev.tmp
525828: FILE:CREATED:c:\$recycle.bin\12340e9b7ca6ad46.tmp

----------------------------------------

Last 10 items:
2604096: FILE:WRITTEN:z:\boot\recovery_instructions.html
2604097: FILE:WRITTEN:z:\boot\ru-ru\bootmgr.exe.mui
2604098: FILE:WRITTEN:z:\boot\sv-se\bootmgr.exe.mui
2604099: FILE:WRITTEN:z:\boot\tr-tr\bootmgr.exe.mui
2604100: FILE:WRITTEN:z:\boot\zh-cn\bootmgr.exe.mui
2604101: FILE:WRITTEN:z:\boot\zh-hk\bootmgr.exe.mui
2604102: FILE:WRITTEN:z:\boot\zh-tw\bootmgr.exe.mui
2604103: FILE:WRITTEN:z:\bootsect.bak
2604104: FILE:WRITTEN:z:\get_your_files_b

Unnamed: 0_level_0,525819,525820,525821,525822,525823,525824,525825,525826,525827,525828,...,2604096,2604097,2604098,2604099,2604100,2604101,2604102,2604103,2604104,2604105
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# saving the feature names dictionary
save_column_map(column_map_file, '5_mlran_dataset/3_file_feature_names_dic.json')

In [26]:
# Saving the data
df3_file2.to_csv("5_mlran_dataset/3_file_dataset.csv")

In [27]:
df3_file2.to_parquet('5_mlran_dataset/3_file_dataset.parquet', compression='snappy')

# 4. Directory Operations Parser

This code extracts directory operations (such as directory creation and enumeration) from Cuckoo report JSON files. It reads each report, identifies directory-related activities from the `behavior -> summary` section, and formats them into feature names like `DIRECTORY:CREATED:<dirpath>`. It creates a DataFrame where each row represents a sample report, and the columns represent unique directory operations. A value of `1` indicates the presence of an operation, and `0` otherwise. The `sample_id` column identifies each report, and the DataFrame is sorted by `sample_id`.

In [28]:
def extract_directory_operations(report_path):
    """
    Extracts directory operations (created and enumerated) from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the operation exists in this report).
    """
    directory_operations = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        behavior_summary = report.get('behavior', {}).get('summary', {})

        # Mapping for directory operation names to the desired format
        directory_op_mapping = {
            'directory_created': 'CREATED',
            'directory_enumerated': 'ENUMERATED'
        }

        # Process directory operations
        for dir_op, formatted_name in directory_op_mapping.items():
            for dirpath in behavior_summary.get(dir_op, []):
                feature_name = f"DIRECTORY:{formatted_name}:{dirpath.lower()}"
                directory_operations[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return directory_operations

def create_directory_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique directory operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        directory_operations = extract_directory_operations(report_path)

        sample_data = {"sample_id": sample_id}
        sample_data.update(directory_operations)
        data.append(sample_data)

        all_features.update(directory_operations.keys())

    df = pd.DataFrame(data)

    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)
    df.fillna(0, inplace=True)
    df = df.astype(int)

    df['sample_id'] = df['sample_id'].astype(int)
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df4_dir = create_directory_operations_dataframe(reports_folder)
    display(df4_dir)

Processing reports: 100%|██████████| 4884/4884 [19:06<00:00,  4.26file/s]  


Unnamed: 0,sample_id,DIRECTORY:CREATED:,DIRECTORY:CREATED:\\,DIRECTORY:CREATED:\\.\,DIRECTORY:CREATED:\\.\c:,DIRECTORY:CREATED:\\.\c:\,DIRECTORY:CREATED:\\.\c:\programdata,DIRECTORY:CREATED:\\.\c:\programdata\,DIRECTORY:CREATED:\\.\c:\programdata\avast software,DIRECTORY:CREATED:\\.\c:\programdata\avast software\persistent data,...,DIRECTORY:ENUMERATED:z:\boot\pl-pl\*,DIRECTORY:ENUMERATED:z:\boot\pt-br\*,DIRECTORY:ENUMERATED:z:\boot\pt-pt\*,DIRECTORY:ENUMERATED:z:\boot\ru-ru\*,DIRECTORY:ENUMERATED:z:\boot\sv-se\*,DIRECTORY:ENUMERATED:z:\boot\tr-tr\*,DIRECTORY:ENUMERATED:z:\boot\zh-cn\*,DIRECTORY:ENUMERATED:z:\boot\zh-hk\*,DIRECTORY:ENUMERATED:z:\boot\zh-tw\*,DIRECTORY:ENUMERATED:z:\system volume information\*
278,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3295,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3933,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,15691,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,15692,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933,15693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4041,15694,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Call the function to find constant features
find_constant_features(df4_dir)

# Call the function to check if column names are unique
check_column_uniqueness(df4_dir)

list_and_count_duplicates(df4_dir)

No constant features found.
All column names are unique.
No duplicate column names found.


In [30]:
# Rename columns starting
df4_dir2, column_map_dir = rename_columns_with_numbers(df4_dir, start_number=2604137)

print_first_and_last_10_items(column_map_dir)

display(df4_dir2.head())

First 10 items:
2604137: DIRECTORY:CREATED:
2604138: DIRECTORY:CREATED:\\
2604139: DIRECTORY:CREATED:\\.\
2604140: DIRECTORY:CREATED:\\.\c:
2604141: DIRECTORY:CREATED:\\.\c:\
2604142: DIRECTORY:CREATED:\\.\c:\programdata
2604143: DIRECTORY:CREATED:\\.\c:\programdata\
2604144: DIRECTORY:CREATED:\\.\c:\programdata\avast software
2604145: DIRECTORY:CREATED:\\.\c:\programdata\avast software\persistent data
2604146: DIRECTORY:CREATED:\\.\c:\programdata\avast software\persistent data\avast

----------------------------------------

Last 10 items:
2762250: DIRECTORY:ENUMERATED:z:\boot\pl-pl\*
2762251: DIRECTORY:ENUMERATED:z:\boot\pt-br\*
2762252: DIRECTORY:ENUMERATED:z:\boot\pt-pt\*
2762253: DIRECTORY:ENUMERATED:z:\boot\ru-ru\*
2762254: DIRECTORY:ENUMERATED:z:\boot\sv-se\*
2762255: DIRECTORY:ENUMERATED:z:\boot\tr-tr\*
2762256: DIRECTORY:ENUMERATED:z:\boot\zh-cn\*
2762257: DIRECTORY:ENUMERATED:z:\boot\zh-hk\*
2762258: DIRECTORY:ENUMERATED:z:\boot\zh-tw\*
2762259: DIRECTORY:ENUMERATED:z:\system

Unnamed: 0_level_0,2604137,2604138,2604139,2604140,2604141,2604142,2604143,2604144,2604145,2604146,...,2762250,2762251,2762252,2762253,2762254,2762255,2762256,2762257,2762258,2762259
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# saving the feature names dictionary
save_column_map(column_map_dir, '5_mlran_dataset/4_dir_feature_names_dic.json')

In [32]:
# Saving the data
df4_dir2.to_csv("5_mlran_dataset/4_dir_dataset.csv")

In [33]:
df4_dir2.to_parquet('5_mlran_dataset/4_dir_dataset.parquet', compression='snappy')

# 5. Strings Parser

This code extracts unique strings from the strings section of Cuckoo report JSON files. It creates feature names in the format `STRING:<string>` and assigns a value of 1 if the string is present in the report. The function then creates a DataFrame where each row corresponds to a sample report and each column represents a unique string. A value of 1 indicates the presence of a string, and 0 indicates its absence. The DataFrame is sorted by sample_id, which identifies each report.

In [3]:
def extract_strings(report_path):
    """
    Extracts strings from a Cuckoo report without transforming them.
    Returns a dictionary with unique strings as keys and 1 as values (indicating the presence of the string).
    """
    strings_found = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        # Get the strings section from the report
        strings_section = report.get('strings', [])

        # Use a set to ensure unique strings
        unique_strings = set(strings_section)

        # Process each unique string in the strings section
        for string in unique_strings:
            string = string.lower()
            feature_name = f"STRING:{string}"
            strings_found[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return strings_found


def create_strings_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique string.
    The value is 1 if the string exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract strings for the current report
        strings_found = extract_strings(report_path)

        # Add the current sample ID and strings to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(strings_found)
        data.append(sample_data)

        # Add the strings to the all_features set
        all_features.update(strings_found.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df

# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df5_str = create_strings_dataframe(reports_folder)
    display(df5_str)

Processing reports:   0%|          | 0/4884 [00:00<?, ?file/s]

Processing reports: 100%|██████████| 4884/4884 [28:46<00:00,  2.83file/s]  


Unnamed: 0,sample_id,STRING:,STRING:.1,STRING:.2,STRING:.3,STRING:.4,STRING:.5,STRING:.6,STRING:.7,STRING:.8,...,STRING:~~~~~~~~~~~~~,STRING:~~~~~~~~~~~~~~$$$$$$$$$$$$$$$$,STRING:~~~~~~~~~~~~~~~~~~000,STRING:~~~~~~~~~~~~~~~~~~~~~,STRING:~~~~~~~~~~~~~~~~~~~~~},STRING:~~~~~~~~~~~~~~~~~~~~~~~,STRING:~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ooooooooooooo~~~~~~~oooo____q,STRING:~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:>>>>>?>4>>>&>>,STRING:~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~,STRING:~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
278,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3295,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3933,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,15691,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,15692,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933,15693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4041,15694,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Call the function to find constant features
find_constant_features(df5_str)

# Call the function to check if column names are unique
check_column_uniqueness(df5_str)

list_and_count_duplicates(df5_str)

No constant features found.
All column names are unique.
No duplicate column names found.


In [12]:
# Rename columns starting
df5_str2, column_map_str = rename_columns_with_numbers(df5_str, start_number=2762260)

print_first_and_last_10_items(column_map_str)

display(df5_str2.head())

First 10 items:
2762260: STRING:
2762261: STRING:
2762262: STRING:
2762263: STRING:
2762264: STRING:
2762265: STRING:
2762266: STRING:
2762267: STRING:
2762268: STRING:
2762269: STRING:

----------------------------------------

Last 10 items:
6394369: STRING:~~~~~~~~~~~~~
6394370: STRING:~~~~~~~~~~~~~~$$$$$$$$$$$$$$$$
6394371: STRING:~~~~~~~~~~~~~~~~~~000
6394372: STRING:~~~~~~~~~~~~~~~~~~~~~
6394373: STRING:~~~~~~~~~~~~~~~~~~~~~}
6394374: STRING:~~~~~~~~~~~~~~~~~~~~~~~
6394375: STRING:~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ooooooooooooo~~~~~~~oooo____q
6394376: STRING:~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

Unnamed: 0_level_0,2762260,2762261,2762262,2762263,2762264,2762265,2762266,2762267,2762268,2762269,...,6394369,6394370,6394371,6394372,6394373,6394374,6394375,6394376,6394377,6394378
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# saving the feature names dictionary
save_column_map(column_map_str, '5_mlran_dataset/5_str_feature_names_dic.json')

In [14]:
df5_str2.to_parquet('5_mlran_dataset/5_str_dataset.parquet', compression='snappy')

In [15]:
# Saving the data
df5_str2.to_csv("5_mlran_dataset/5_str_dataset.csv")

# 6. Network

This code extracts network-related operations such as IP connections, host connections, and DNS resolutions from Cuckoo report JSON files. For each report, it creates feature names like `NETWORK:CONNECTS_IP:<ip_address>`, `NETWORK:CONNECTS_HOST:<host>`, and `NETWORK:RESOLVES_HOST:<host>` with values of `1` indicating the presence of these operations. A DataFrame is then created where each row corresponds to a report, and each column represents a unique network operation. Missing values are filled with `0`, and the rows are sorted by `sample_id`.

In [16]:
def extract_network_operations(report_path):
    """
    Extracts network operations (connects_ip, connects_host, resolves_host)
    from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the operation exists in this report).
    """
    network_operations = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        # Get the relevant network operations from behavior -> summary
        behavior_summary = report.get('behavior', {}).get('summary', {})

        # Process each network operation category
        for net_op in ['connects_ip', 'connects_host', 'resolves_host']:
            for network_item in behavior_summary.get(net_op, []):
                network_item = network_item.lower()
                feature_name = f"NETWORK:{net_op.upper()}:{network_item}"
                network_operations[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return network_operations


def create_network_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique network operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract network operations for the current report
        network_operations = extract_network_operations(report_path)

        # Add the current sample ID and network operations to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(network_operations)
        data.append(sample_data)

        # Add the network operations to the all_features set
        all_features.update(network_operations.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df6_net = create_network_operations_dataframe(reports_folder)
    display(df6_net)

Processing reports: 100%|██████████| 4884/4884 [17:31<00:00,  4.64file/s]  


Unnamed: 0,sample_id,NETWORK:CONNECTS_HOST:,NETWORK:CONNECTS_HOST:104.131.182.103,NETWORK:CONNECTS_HOST:107.170.20.33,NETWORK:CONNECTS_HOST:109.234.35.215,NETWORK:CONNECTS_HOST:109.236.87.106,NETWORK:CONNECTS_HOST:109.237.111.179,NETWORK:CONNECTS_HOST:123informatica.com,NETWORK:CONNECTS_HOST:123unifashion.com,NETWORK:CONNECTS_HOST:139.59.166.196,...,NETWORK:RESOLVES_HOST:yahoo.com,NETWORK:RESOLVES_HOST:yahoo.opera.com,NETWORK:RESOLVES_HOST:yahoofriend.sourceforge.net,NETWORK:RESOLVES_HOST:yahoosupportaustralia.com,NETWORK:RESOLVES_HOST:yupmaster.gaijinent.com,NETWORK:RESOLVES_HOST:yyy.wuxianlequ.com,NETWORK:RESOLVES_HOST:zjbhdgxbcn,NETWORK:RESOLVES_HOST:zs.vivo.com.cn,NETWORK:RESOLVES_HOST:zs.vivoglobal.com,"NETWORK:RESOLVES_HOST:}¡w[¥u¢fuéçz!·=dèöaüçu©~è}ëí¡}ª<©<³çu©~èsu¡}ª<©égæ}¡w[¥<¼çu©~èfu¡}ª<£çu©~èau¡}ª<£çu©~è}è`u¡}ª<¯çu©~è~u¡}ª<²çu©~èwu¡}ª<®çu©~è}èju¡}ª<£çu©~è}è`u¡}ª<©çu©~è}u¡}ª<ªçu©~èyu¡}ª<¶çu©~è}èuuæçuæçuæçuæçuæçu.ówæd³ëçõpóbsuæçuâ+ðçuæçõp÷bsuæçu¢êuæçþlîøën¹4æ´b$.h9züm¦9òuæçvüãçuæ  v¦tæç<dsu®ie0ææïåçúö~ãóie0ædêu³èõpwsuæçu/_uælrymçu¢î9íøaîçu·ú""qíæ³ú$±íö»í"
278,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3295,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3933,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,15691,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,15692,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933,15693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4041,15694,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Call the function to find constant features
find_constant_features(df6_net)

# Call the function to check if column names are unique
check_column_uniqueness(df6_net)

list_and_count_duplicates(df6_net)

No constant features found.
All column names are unique.
No duplicate column names found.


In [18]:
# Rename columns starting
df6_net2, column_map_net = rename_columns_with_numbers(df6_net, start_number=6394693)

print_first_and_last_10_items(column_map_net)

display(df6_net2.head())

First 10 items:
6394693: NETWORK:CONNECTS_HOST:
6394694: NETWORK:CONNECTS_HOST:104.131.182.103
6394695: NETWORK:CONNECTS_HOST:107.170.20.33
6394696: NETWORK:CONNECTS_HOST:109.234.35.215
6394697: NETWORK:CONNECTS_HOST:109.236.87.106
6394698: NETWORK:CONNECTS_HOST:109.237.111.179
6394699: NETWORK:CONNECTS_HOST:123informatica.com
6394700: NETWORK:CONNECTS_HOST:123unifashion.com
6394701: NETWORK:CONNECTS_HOST:139.59.166.196
6394702: NETWORK:CONNECTS_HOST:146.120.110.46

----------------------------------------

Last 10 items:
6399497: NETWORK:RESOLVES_HOST:yahoo.com
6399498: NETWORK:RESOLVES_HOST:yahoo.opera.com
6399499: NETWORK:RESOLVES_HOST:yahoofriend.sourceforge.net
6399500: NETWORK:RESOLVES_HOST:yahoosupportaustralia.com
6399501: NETWORK:RESOLVES_HOST:yupmaster.gaijinent.com
6399502: NETWORK:RESOLVES_HOST:yyy.wuxianlequ.com
6399503: NETWORK:RESOLVES_HOST:zjbhdgxbcn
6399504: NETWORK:RESOLVES_HOST:zs.vivo.com.cn
6399505: NETWORK:RESOLVES_HOST:zs.vivoglobal.com
6399506: NETWORK:RESOLVES_

Unnamed: 0_level_0,6394693,6394694,6394695,6394696,6394697,6394698,6394699,6394700,6394701,6394702,...,6399497,6399498,6399499,6399500,6399501,6399502,6399503,6399504,6399505,6399506
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# saving the feature names dictionary
save_column_map(column_map_net, '5_mlran_dataset/6_net_feature_names_dic.json')

In [20]:
df6_net2.to_parquet('5_mlran_dataset/6_net_dataset.parquet', compression='snappy')

In [21]:
# Saving the data
df6_net2.to_csv("5_mlran_dataset/6_net_dataset.csv")

# 7. System

This code extracts system-related operations such as DLLs loaded, command-line executions, mutexes, and GUIDs from Cuckoo report JSON files. For each report, it generates feature names like `SYSTEM:DLL_LOADED:<dll_name>` and assigns a value of `1` if the operation is present in the report. The data is compiled into a DataFrame where each row corresponds to a report, and each column represents a unique system operation. Missing values are filled with `0`, and the DataFrame is sorted by `sample_id`.

In [22]:
def extract_system_operations(report_path):
    """
    Extracts system operations (dll_loaded, command_line, mutex, guid) from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the operation exists in this report).
    """
    system_operations = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        # Get the relevant system operations from behavior -> summary
        behavior_summary = report.get('behavior', {}).get('summary', {})

        # Process each system operation category
        for sys_op in ['dll_loaded', 'command_line', 'mutex', 'guid']:
            for system_item in behavior_summary.get(sys_op, []):
                system_item = system_item.lower()
                # Generate the feature name directly
                feature_name = f"SYSTEM:{sys_op.upper()}:{system_item}"
                system_operations[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return system_operations


def create_system_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique system operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract system operations for the current report
        system_operations = extract_system_operations(report_path)

        # Add the current sample ID and system operations to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(system_operations)
        data.append(sample_data)

        # Add the system operations to the all_features set
        all_features.update(system_operations.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df7_sys = create_system_operations_dataframe(reports_folder)
    display(df7_sys)

Processing reports: 100%|██████████| 4884/4884 [17:32<00:00,  4.64file/s]  


Unnamed: 0,sample_id,SYSTEM:COMMAND_LINE:,SYSTEM:COMMAND_LINE:þ,"SYSTEM:COMMAND_LINE: -s ""c:\\program files (x86)\\foxit software\\foxit reader\\shell extensions\\foxitthumbnailhndlr_x64.dll""","SYSTEM:COMMAND_LINE: /i ""c:\users\admini~1\appdata\local\temp\wzse0.tmp\drug calculations.msi""","SYSTEM:COMMAND_LINE: /s ""c:/program files/ultracopier\pluginloader\catchcopy-v0002\catchcopy32.dll""","SYSTEM:COMMAND_LINE: run=1 shortcut=""c:\users\administrator\appdata\local\temp\13039.exe""","SYSTEM:COMMAND_LINE: run=1 shortcut=""c:\users\administrator\appdata\local\temp\13044.exe""","SYSTEM:COMMAND_LINE: run=1 shortcut=""c:\users\administrator\appdata\local\temp\13939.exe""","SYSTEM:COMMAND_LINE: run=1 shortcut=""c:\users\administrator\appdata\local\temp\14999.exe""",...,SYSTEM:MUTEX:ªë@,SYSTEM:MUTEX:¯*@,SYSTEM:MUTEX:²ë@,SYSTEM:MUTEX:·*@,SYSTEM:MUTEX:ºë@,SYSTEM:MUTEX:¿*@,SYSTEM:MUTEX:âë@,SYSTEM:MUTEX:ÿ*@,SYSTEM:MUTEX:ကခ,SYSTEM:MUTEX:摁扯啥摰瑡牥6
278,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3295,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3933,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,15691,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,15692,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933,15693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4041,15694,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Call the function to find constant features
find_constant_features(df7_sys)

# Call the function to check if column names are unique
check_column_uniqueness(df7_sys)

list_and_count_duplicates(df7_sys)

No constant features found.
All column names are unique.
No duplicate column names found.


In [24]:
# Rename columns starting
df7_sys2, column_map_sys = rename_columns_with_numbers(df7_sys, start_number=6399507)

print_first_and_last_10_items(column_map_sys)

display(df7_sys2.head())

First 10 items:
6399507: SYSTEM:COMMAND_LINE:
6399508: SYSTEM:COMMAND_LINEþ
6399509: SYSTEM:COMMAND_LINE:  -s "c:\\program files (x86)\\foxit software\\foxit reader\\shell extensions\\foxitthumbnailhndlr_x64.dll"
6399510: SYSTEM:COMMAND_LINE:  /i "c:\users\admini~1\appdata\local\temp\wzse0.tmp\drug calculations.msi" 
6399511: SYSTEM:COMMAND_LINE:  /s "c:/program files/ultracopier\pluginloader\catchcopy-v0002\catchcopy32.dll"
6399512: SYSTEM:COMMAND_LINE:  run=1 shortcut="c:\users\administrator\appdata\local\temp\13039.exe"
6399513: SYSTEM:COMMAND_LINE:  run=1 shortcut="c:\users\administrator\appdata\local\temp\13044.exe"
6399514: SYSTEM:COMMAND_LINE:  run=1 shortcut="c:\users\administrator\appdata\local\temp\13939.exe"
6399515: SYSTEM:COMMAND_LINE:  run=1 shortcut="c:\users\administrator\appdata\local\temp\14999.exe"
6399516: SYSTEM:COMMAND_LINE: "c:\users\administrator\appdata\roaming\tencent\qq\stemp\~txqqintl~0\sysdir" "c:\users\administrator\appdata\roaming\tencent\qq\stemp\~txqq

Unnamed: 0_level_0,6399507,6399508,6399509,6399510,6399511,6399512,6399513,6399514,6399515,6399516,...,6416408,6416409,6416410,6416411,6416412,6416413,6416414,6416415,6416416,6416417
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# saving the feature names dictionary
save_column_map(column_map_sys, '5_mlran_dataset/7_sys_feature_names_dic.json')

In [26]:
df7_sys2.to_parquet('5_mlran_dataset/7_sys_dataset.parquet', compression='snappy')

In [27]:
# Saving the data
df7_sys2.to_csv("5_mlran_dataset/7_sys_dataset.csv")

# 8. Dropped File Extensions and Types

This code extracts dropped file information (extensions and types) from Cuckoo report JSON files. It identifies unique file extensions and types for each dropped file, then generates feature names in the format `DROP:EXTENSION:<extension>` and `DROP:TYPE:<file_type>`, assigning a value of `1` to indicate their presence. The data is stored in a DataFrame where each row represents a report and each column represents a unique dropped file extension or type. Missing values are filled with `0`, and the DataFrame is sorted by `sample_id`.

In [28]:
def extract_dropped_file_features(report_path):
    """
    Extracts dropped file extensions and types from the dropped section of a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1.
    The feature names start with 'DROP'.
    """
    dropped_features = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        # Get the dropped files section
        dropped_files = report.get('dropped', [])

        # Initialize sets for unique file extensions and types
        file_extension_set = set()
        file_type_set = set()

        # Process each dropped file
        for file_info in dropped_files:
            file_name = file_info.get('name', '')
            file_type = file_info.get('type', '')

            # Extract file extension
            if '.' in file_name:
                file_extension = file_name.split('.')[-1].lower()
                file_extension_set.add(file_extension)

            # Extract file type
            if file_type:
                file_type_set.add(file_type)

        # Create features for unique file extensions and types
        for ext in file_extension_set:
            dropped_features[f"DROP:EXTENSION:{ext.lower()}"] = 1
        for ftype in file_type_set:
            dropped_features[f"DROP:TYPE:{ftype.replace(' ', '_').lower()}"] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return dropped_features


def create_dropped_file_features_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique dropped file extension.
    The value is 1 if the extension exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract dropped file extensions for the current report
        dropped_extensions = extract_dropped_file_features(report_path)

        # Add the current sample ID and dropped file extensions to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(dropped_extensions)
        data.append(sample_data)

        # Add the dropped file extensions to the all_features set
        all_features.update(dropped_extensions.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df8_drop = create_dropped_file_features_dataframe(reports_folder)
    display(df8_drop)

Processing reports: 100%|██████████| 4884/4884 [17:30<00:00,  4.65file/s]  


Unnamed: 0,sample_id,DROP:EXTENSION:$$$,DROP:EXTENSION:0,DROP:EXTENSION:000,DROP:EXTENSION:00002,DROP:EXTENSION:00003,DROP:EXTENSION:00004,DROP:EXTENSION:00005,DROP:EXTENSION:00006,DROP:EXTENSION:00007,...,DROP:TYPE:xpconnect_typelib_version_1.2,"DROP:TYPE:zip_archive_data,_at_least_v1.0_to_extract","DROP:TYPE:zip_archive_data,_at_least_v2.0_to_extract","DROP:TYPE:zip_archive_data,_at_least_v4.5_to_extract",DROP:TYPE:zip_archive_data_(empty),"DROP:TYPE:zip_data_(mime_type_""application/vnd.adobe.air-application-installer-package+zip""?)","DROP:TYPE:zip_data_(mime_type_""application/vnd.adobe.xfl""?)","DROP:TYPE:zip_data_(mime_type_""k""?)",DROP:TYPE:zlib_compressed_data,DROP:TYPE:zstandard_dictionary_(id_1033944336)
278,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3295,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3933,10003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,10005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,15691,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,15692,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933,15693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4041,15694,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Call the function to find constant features
find_constant_features(df8_drop)

# Call the function to check if column names are unique
check_column_uniqueness(df8_drop)

list_and_count_duplicates(df8_drop)

No constant features found.
All column names are unique.
No duplicate column names found.


In [30]:
# Rename columns starting
df8_drop2, column_map_drop = rename_columns_with_numbers(df8_drop, start_number=6416418)

print_first_and_last_10_items(column_map_drop)

display(df8_drop2.head())

First 10 items:
6416418: DROP:EXTENSION:$$$
6416419: DROP:EXTENSION:0
6416420: DROP:EXTENSION:000
6416421: DROP:EXTENSION:00002
6416422: DROP:EXTENSION:00003
6416423: DROP:EXTENSION:00004
6416424: DROP:EXTENSION:00005
6416425: DROP:EXTENSION:00006
6416426: DROP:EXTENSION:00007
6416427: DROP:EXTENSION:00008

----------------------------------------

Last 10 items:
6468059: DROP:TYPE:xpconnect_typelib_version_1.2
6468060: DROP:TYPE:zip_archive_data,_at_least_v1.0_to_extract
6468061: DROP:TYPE:zip_archive_data,_at_least_v2.0_to_extract
6468062: DROP:TYPE:zip_archive_data,_at_least_v4.5_to_extract
6468063: DROP:TYPE:zip_archive_data_(empty)
6468064: DROP:TYPE:zip_data_(mime_type_"application/vnd.adobe.air-application-installer-package+zip"?)
6468065: DROP:TYPE:zip_data_(mime_type_"application/vnd.adobe.xfl"?)
6468066: DROP:TYPE:zip_data_(mime_type_"k"?)
6468067: DROP:TYPE:zlib_compressed_data
6468068: DROP:TYPE:zstandard_dictionary_(id_1033944336)


Unnamed: 0_level_0,6416418,6416419,6416420,6416421,6416422,6416423,6416424,6416425,6416426,6416427,...,6468059,6468060,6468061,6468062,6468063,6468064,6468065,6468066,6468067,6468068
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# saving the feature names dictionary
save_column_map(column_map_drop, '5_mlran_dataset/8_drop_feature_names_dic.json')

In [32]:
df8_drop2.to_parquet('5_mlran_dataset/8_drop_dataset.parquet', compression='snappy')

In [33]:
# Saving the data
df8_drop2.to_csv("5_mlran_dataset/8_drop_dataset.csv")

# 9. Signatures

This code extracts signature names from Cuckoo report JSON files and organizes them into a DataFrame. The function `extract_signature_operations` retrieves the signature names from each report and creates dictionary entries in the format `SIGNATURE:<signature_name>` with a value of `1` to indicate the presence of the signature. The `create_signature_operations_dataframe` function then compiles these dictionaries into a DataFrame, where each row represents a report and each column corresponds to a unique signature. Missing values are filled with `0`, and the DataFrame is sorted by `sample_id`.

In [34]:
def extract_signature_operations(report_path):
    """
    Extracts signature names from the Cuckoo report.
    Returns a dictionary with keys as signature names and values as 1 (since the signature is present).
    """
    with open(report_path, 'r') as f:
        report = json.load(f)

    signature_operations = {}

    # Get the relevant signatures from the report
    signatures = report.get('signatures', [])

    # Process each signature
    for signature in signatures:
        signature_name = f"SIGNATURE:{signature['name'].lower()}"
        signature_operations[signature_name] = 1  # Set to 1 since the signature is present

    return signature_operations


def create_signature_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique signature name.
    The value is 1 if the signature is present in that report, otherwise 0.
    """
    # List to store data for each sample
    data = []

    # Set to collect all unique feature names (signature names)
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract signature operations for the current report
        signature_operations = extract_signature_operations(report_path)

        # Add the current sample ID and signature operations to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(signature_operations)
        data.append(sample_data)

        # Add the signature names to the all_features set
        all_features.update(signature_operations.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df9_sig = create_signature_operations_dataframe(reports_folder)
    display(df9_sig)

Processing reports: 100%|██████████| 4884/4884 [17:21<00:00,  4.69file/s]  


Unnamed: 0,sample_id,SIGNATURE:allocates_execute_remote_process,SIGNATURE:allocates_rwx,SIGNATURE:antianalysis_detectfile,SIGNATURE:antiav_avast_libs,SIGNATURE:antiav_detectfile,SIGNATURE:antiav_detectreg,SIGNATURE:antiav_servicestop,SIGNATURE:antidbg_devices,SIGNATURE:antidbg_windows,...,SIGNATURE:suspicious_write_exe,SIGNATURE:sysinternals_tools_usage,SIGNATURE:terminates_remote_process,SIGNATURE:trojan_redosru,SIGNATURE:upatre,SIGNATURE:uses_windows_utilities,SIGNATURE:vir_napolar,SIGNATURE:win32_process_create,SIGNATURE:wmi_antivm,SIGNATURE:wmi_service
278,10001,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3295,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3933,10003,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,10004,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
501,10005,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,15691,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,15692,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933,15693,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4041,15694,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Call the function to find constant features
find_constant_features(df9_sig)

# Call the function to check if column names are unique
check_column_uniqueness(df9_sig)

list_and_count_duplicates(df9_sig)

No constant features found.
All column names are unique.
No duplicate column names found.


In [36]:
# Rename columns starting
df9_sig2, column_map_sig = rename_columns_with_numbers(df9_sig, start_number=6468069)

print_first_and_last_10_items(column_map_sig)

display(df9_sig2.head())

First 10 items:
6468069: SIGNATURE:allocates_execute_remote_process
6468070: SIGNATURE:allocates_rwx
6468071: SIGNATURE:antianalysis_detectfile
6468072: SIGNATURE:antiav_avast_libs
6468073: SIGNATURE:antiav_detectfile
6468074: SIGNATURE:antiav_detectreg
6468075: SIGNATURE:antiav_servicestop
6468076: SIGNATURE:antidbg_devices
6468077: SIGNATURE:antidbg_windows
6468078: SIGNATURE:antiemu_wine

----------------------------------------

Last 10 items:
6468262: SIGNATURE:suspicious_write_exe
6468263: SIGNATURE:sysinternals_tools_usage
6468264: SIGNATURE:terminates_remote_process
6468265: SIGNATURE:trojan_redosru
6468266: SIGNATURE:upatre
6468267: SIGNATURE:uses_windows_utilities
6468268: SIGNATURE:vir_napolar
6468269: SIGNATURE:win32_process_create
6468270: SIGNATURE:wmi_antivm
6468271: SIGNATURE:wmi_service


Unnamed: 0_level_0,6468069,6468070,6468071,6468072,6468073,6468074,6468075,6468076,6468077,6468078,...,6468262,6468263,6468264,6468265,6468266,6468267,6468268,6468269,6468270,6468271
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10005,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# saving the feature names dictionary
save_column_map(column_map_sig, '5_mlran_dataset/9_sig_feature_names_dic.json')

In [38]:
df9_sig2.to_parquet('5_mlran_dataset/9_sig_dataset.parquet', compression='snappy')

In [39]:
# Saving the data
df9_sig2.to_csv("5_mlran_dataset/9_sig_dataset.csv")

# MLRan: Combined Dataset

In [46]:
df1_api2 = pd.read_csv("5_mlran_dataset/1_api_dataset.csv")
print("Shape of API:", df1_api2.shape)
display(df1_api2.head())

Shape of API: (4884, 314)


Unnamed: 0,sample_id,1,2,3,4,5,6,7,8,9,...,304,305,306,307,308,309,310,311,312,313
0,10001,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,10002,0,0,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,10003,0,0,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,10004,0,0,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,10005,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [47]:
df1_api2 = pd.read_parquet('5_mlran_dataset/1_api_dataset.parquet', engine='pyarrow')  # or 'fastparquet'
df2_reg2 = pd.read_parquet('5_mlran_dataset/2_reg_dataset.parquet', engine='fastparquet')

In [48]:
df3_file2 = pd.read_parquet('5_mlran_dataset/3_file_dataset.parquet', engine='fastparquet')
df4_dir2 = pd.read_parquet('5_mlran_dataset/4_dir_dataset.parquet', engine='fastparquet')

In [14]:
#df5_str2 = pd.read_parquet('5_mlran_dataset/5_str_dataset.parquet', engine='fastparquet')
#df6_net2 = pd.read_parquet('5_mlran_dataset/6_net_dataset.parquet', engine='fastparquet')

In [15]:
#df7_sys2 = pd.read_parquet('5_mlran_dataset/7_sys_dataset.parquet', engine='fastparquet')
#df8_drop2 = pd.read_parquet('5_mlran_dataset/8_drop_dataset.parquet', engine='fastparquet')
#df9_sig2 = pd.read_parquet('5_mlran_dataset/9_sig_dataset.parquet', engine='fastparquet')

In [49]:
display('API:', df1_api2.head())
display('REG:', df2_reg2.head())
display('FILE',df3_file2.head())
display('DIR',df4_dir2.head())
display('STR',df5_str2.head())
display('NET',df6_net2.head())
display('SYS',df7_sys2.head())
display('DROP',df8_drop2.head())
display('SIG',df9_sig2.head())

'API:'

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,304,305,306,307,308,309,310,311,312,313
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,1,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,1,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


'REG:'

Unnamed: 0_level_0,314,315,316,317,318,319,320,321,322,323,...,525809,525810,525811,525812,525813,525814,525815,525816,525817,525818
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


'FILE'

Unnamed: 0_level_0,525819,525820,525821,525822,525823,525824,525825,525826,525827,525828,...,2604096,2604097,2604098,2604099,2604100,2604101,2604102,2604103,2604104,2604105
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


'DIR'

Unnamed: 0_level_0,2604137,2604138,2604139,2604140,2604141,2604142,2604143,2604144,2604145,2604146,...,2762250,2762251,2762252,2762253,2762254,2762255,2762256,2762257,2762258,2762259
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


'STR'

Unnamed: 0_level_0,2762260,2762261,2762262,2762263,2762264,2762265,2762266,2762267,2762268,2762269,...,6394369,6394370,6394371,6394372,6394373,6394374,6394375,6394376,6394377,6394378
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


'NET'

Unnamed: 0_level_0,6394693,6394694,6394695,6394696,6394697,6394698,6394699,6394700,6394701,6394702,...,6399497,6399498,6399499,6399500,6399501,6399502,6399503,6399504,6399505,6399506
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


'SYS'

Unnamed: 0_level_0,6399507,6399508,6399509,6399510,6399511,6399512,6399513,6399514,6399515,6399516,...,6416408,6416409,6416410,6416411,6416412,6416413,6416414,6416415,6416416,6416417
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


'DROP'

Unnamed: 0_level_0,6416418,6416419,6416420,6416421,6416422,6416423,6416424,6416425,6416426,6416427,...,6468059,6468060,6468061,6468062,6468063,6468064,6468065,6468066,6468067,6468068
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


'SIG'

Unnamed: 0_level_0,6468069,6468070,6468071,6468072,6468073,6468074,6468075,6468076,6468077,6468078,...,6468262,6468263,6468264,6468265,6468266,6468267,6468268,6468269,6468270,6468271
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10005,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


: 

In [None]:
# Merge all dataframes on the "sample_id" column
df_combined = df1_api2.merge(df2_reg2, on="sample_id", how="outer") \
                     .merge(df3_file2, on="sample_id", how="outer") \
                     .merge(df4_dir2, on="sample_id", how="outer") \
                     .merge(df5_str2, on="sample_id", how="outer") \
                     .merge(df6_net2, on="sample_id", how="outer") \
                     .merge(df7_sys2, on="sample_id", how="outer") \
                     .merge(df8_drop2, on="sample_id", how="outer") \
                     .merge(df9_sig2, on="sample_id", how="outer")

# Fill missing values with 0
df_combined.fillna(0, inplace=True)

# Ensure all values are integers (if relevant)
df_combined = df_combined.astype(int)

# Display the combined dataframe
display(df_combined.head())

In [40]:
# Merge the feature names 
# List of input JSON file paths
input_files = [
    '5_mlran_dataset/1_api_feature_names_dic.json', 
    '5_mlran_dataset/2_reg_feature_names_dic.json', 
    '5_mlran_dataset/3_file_feature_names_dic.json', 
    '5_mlran_dataset/4_dir_feature_names_dic.json', 
    '5_mlran_dataset/5_str_feature_names_dic.json', 
    '5_mlran_dataset/6_net_feature_names_dic.json', 
    '5_mlran_dataset/7_sys_feature_names_dic.json', 
    '5_mlran_dataset/8_drop_feature_names_dic.json', 
    '5_mlran_dataset/9_sig_feature_names_dic.json'
]

# Initialize an empty dictionary to hold the merged data
combined_data = {}

# Read each JSON file and merge its contents into the combined_data dictionary
for file in input_files:
    with open(file, 'r') as f:
        data = json.load(f)
        combined_data.update(data)  # Merging dictionaries

# Write the merged data to a new JSON file
with open('5_mlran_dataset/MLRan_feature_names_dic.json', 'w') as outfile:
    json.dump(combined_data, outfile, indent=4)

print("JSON files have been merged into 'combined.json'.")


JSON files have been merged into 'combined.json'.


In [None]:
df_combined.to_parquet('5_mlran_dataset/MLRan_dataset.parquet', compression='snappy')

In [None]:
# Saving the data
df_combined.to_csv("5_mlran_dataset/MLRan_dataset.csv", index=False)

## MLRan Labels

In [6]:
df_label = pd.read_csv("mlran_dataset_metadata.csv")
df_label.head()

Unnamed: 0,sample_id,sha256,sample_type,ransomware_family,family_label,ransomware_type,type_label,sha1,md5,extension,detections,timestamp,source
0,10001,0068712434bda717df0d783b560b312854cb7dd4cf0789...,1,cryptolocker,3,crypto,2,77d6871f350be911b2b5c3e16cc2c222c1887779,12be6e7241d2503f31fae01046e88d68,exe,30/56,2014:10:10,elderan
1,10002,00735e4ad3e9a9a5b3551dce371248327e74ef3c25edca...,1,cryptowall,12,crypto,2,96fb7d2e3ad9fe434a66abb15b26dd4e40aa5d4b,aea8ab12edf294ddb2804d6618fdd247,exe,44/55,2015:04:02,elderan
2,10003,007a81ed76ca6d6e10584d90dee399b4ab98e59707688c...,1,cryptolocker,3,crypto,2,8ecc04ce43c0c77978fa83169819e37a92110318,d5d010f8b2f145399a9638f457ff3990,exe,28/52,2014:01:29,elderan
3,10004,00bf847c9a53922a2b36348456ee0f1afff0eec705f221...,1,kovter,9,locker,1,0d9e3696c8516a89567ef712c612edacc3c3386b,d1510b299e8570afd352d20d516f6f48,exe,28/57,2015:08:09,elderan
4,10005,00d786974e609adf93b29e6e86f7439074149e6cceabf3...,1,matsnu,11,locker,1,b1e88afcd0ea38655005eb4a6247ce9355b518ba,bc5734bcc7e2d8e2e208ea483a09b158,zip,42/55,,elderan


In [7]:
df_labels = df_label[['sample_id', 'sample_type', 'family_label', 'type_label']]
df_labels.head()

Unnamed: 0,sample_id,sample_type,family_label,type_label
0,10001,1,3,2
1,10002,1,12,2
2,10003,1,3,2
3,10004,1,9,1
4,10005,1,11,1


In [8]:
df_labels.shape

(4884, 4)

In [10]:
# Saving the data
df_labels.to_csv("5_mlran_dataset/MLRan_labels.csv", index=False)