# Load the dataset

In [1]:
import pandas as pd
import numpy as np
import os

# Constants

# Locations of the Drebin metadata, relative to the notebook's working directory.
DREBIN_BASE_PATH = os.path.join("..", "dataset", "drebin", "metadata")
FEATURE_VECTOR_PATH = os.path.join(
    DREBIN_BASE_PATH,
    "feature_vectors",
)
POSITIVE_SAMPLES_LOOKUPTABLE_PATH = os.path.join(
    DREBIN_BASE_PATH,
    "sha256_family.csv",
)

# Maps the prefix found before "::" on a feature-file line to the count
# column it contributes to. Several prefixes may share one column.
FEATURE_LOOKUP = {
    prefix: column
    for column, prefixes in (
        ("hw_features", ("feature",)),
        ("req_permissions", ("permission",)),
        ("app_components", ("activity", "service_receiver", "provider", "service")),
        ("intents", ("intent",)),
        ("api_calls", ("api_call",)),
        ("used_permissions", ("real_permission",)),
        ("sus_api_calls", ("call",)),
        ("urls", ("url",)),
    )
    for prefix in prefixes
}

# Count columns in display order: each distinct lookup target once, in
# first-seen order (dict.fromkeys de-duplicates while keeping order).
FEATURE_NAMES = list(dict.fromkeys(FEATURE_LOOKUP.values()))

In [2]:
def get_features(lines: list[str]) -> dict[str, int]:
    """Count how many entries of each feature category a sample contains.

    Each non-blank line is expected to look like ``<prefix>::<value>``. The
    prefix is translated through ``FEATURE_LOOKUP`` into one of the
    ``FEATURE_NAMES`` categories, and that category's counter is incremented.

    Args:
        lines: Lines of one feature file (trailing newlines are allowed).

    Returns:
        A dict with one key per entry of ``FEATURE_NAMES`` (in that order),
        mapping to the number of lines that fell into that category.

    Raises:
        KeyError: If a non-blank line starts with a prefix that is not a key
            of ``FEATURE_LOOKUP``.
    """
    feature_dict = {name: 0 for name in FEATURE_NAMES}

    for line in lines:
        # Strip so that whitespace-only lines and "\r\n" endings are treated
        # as blank instead of being looked up as a bogus prefix.
        entry = line.strip()
        if not entry:
            continue

        # Only the text before the first "::" matters; the value itself may
        # contain further "::" sequences.
        prefix = entry.split("::", 1)[0]
        try:
            category = FEATURE_LOOKUP[prefix]
        except KeyError:
            raise KeyError(
                f"Unknown feature prefix {prefix!r} in line {entry!r}"
            ) from None
        feature_dict[category] += 1

    return feature_dict

In [3]:
def load_drebin(
    load_data: bool = False,
    max_files: "int | None" = 10,
    cache_path: str = "test.csv",
):
    """Return the per-sample feature-count table, indexed by sha256.

    Args:
        load_data: When True, build the table from the raw feature files in
            ``FEATURE_VECTOR_PATH``. When False, read the table previously
            saved as CSV at ``cache_path``.
        max_files: Upper bound on the number of feature files parsed when
            ``load_data`` is True. Files are taken in sorted name order so the
            selection is reproducible. Pass ``None`` to parse every file.
            The default of 10 matches the previously hard-coded limit.
        cache_path: CSV file read when ``load_data`` is False.

    Returns:
        A DataFrame indexed by ``sha256`` with one count column per entry of
        ``FEATURE_NAMES`` plus ``malware`` (1 if the hash appears in the
        positive-samples table, else 0) and ``family`` (the table's value for
        that hash, or ``"Benign"``). Both branches return this same layout.
    """
    if not load_data:
        # index_col makes the cached frame match the freshly built one, where
        # sha256 is the index rather than an ordinary column.
        return pd.read_csv(cache_path, index_col="sha256")

    # Positive-samples table: sha256 -> value of the first remaining column.
    samples_lookup = pd.read_csv(POSITIVE_SAMPLES_LOOKUPTABLE_PATH, delimiter=",")
    samples_lookup = samples_lookup.set_index("sha256")
    family_by_hash = samples_lookup.iloc[:, 0].to_dict()

    # os.listdir returns names in arbitrary order; sort before truncating so
    # the same subset is selected on every run and every machine.
    feature_files = sorted(os.listdir(FEATURE_VECTOR_PATH))
    if max_files is not None:
        feature_files = feature_files[:max_files]

    columns = ["sha256"] + FEATURE_NAMES + ["malware", "family"]

    # Collect plain records and build the frame once; growing a DataFrame row
    # by row copies it on every insert and leaves the columns as object dtype.
    records = []
    for file in feature_files:
        # The file name is the sha256 hash; use the same key for both the
        # malware lookup and the row label.
        sha256 = file.split(".")[0]
        is_malware = sha256 in family_by_hash

        # Only the ASCII prefix of each line is used, so undecodable bytes in
        # the values can be replaced without affecting the counts.
        with open(
            os.path.join(FEATURE_VECTOR_PATH, file),
            "r",
            encoding="utf-8",
            errors="replace",
        ) as f:
            features = get_features(f.readlines())

        records.append(
            {
                "sha256": sha256,
                **features,
                "malware": int(is_malware),
                "family": family_by_hash[sha256] if is_malware else "Benign",
            }
        )

    return pd.DataFrame(records, columns=columns).set_index("sha256")

In [4]:
# Build the table from the raw feature files (load_data=True parses them
# instead of reading the previously saved CSV).
feature_vectors = load_drebin(load_data=True)
# Bare last expression so the notebook renders the frame as the cell output.
feature_vectors

Unnamed: 0_level_0,hw_features,req_permissions,app_components,intents,api_calls,used_permissions,sus_api_calls,urls,malware,family
sha256,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00002d74a9faa53f5199c910b652ef09d3a7f6bd42b693755a233635c3ffb0f4,2,11,5,3,7,6,11,26,1,GinMaster
000068216bdb459df847bfdd67dd11069c3c50166db1ea8772cdc9250d948bcf,1,2,9,5,2,1,2,0,0,Benign
0000764713b286cfe7e8e76c7038c92312977712d9c5a86d504be54f3c1d025a,5,11,4,4,6,5,6,3,1,Opfake
0000962c2c34de1ca0c329b18be7847459da2d9d14b6b23a21cbc6427522403c,2,1,2,2,1,1,1,0,0,Benign
000167f1ff061ea91440c40659c11c2af160342fd2e493d609e4996b8820e78f,2,4,4,2,4,3,1,4,0,Benign
00017ed2c044caf7b1047184673ec3e11ab10ac0e64fb7e7bccaca0deb13198a,1,0,1,2,0,0,1,0,0,Benign
000189f3a91b1c19f15e2838995f80d6bb40d74aa8135f6d3e4fdbb80a0bdee7,6,9,24,2,6,5,6,2,0,Benign
0003043c7e2af5e07a2638fbf2391802b0c9ff1926e5d04d06df06992147a325,1,0,11,2,0,0,0,0,0,Benign
00032ac5f91c29399f9727a082b6e1aa0a761f479a16ebd039b5916b76326701,6,10,32,5,8,7,7,55,0,Benign
00039901d26fa9121f32762d8b0e67df7cfbcbb53e0fe3da7fb1298dab84c816,1,1,1,2,3,2,1,9,0,Benign


In [5]:
# Save the table as CSV; load_drebin(load_data=False) reads this same
# "test.csv" back instead of re-parsing the raw feature files.
feature_vectors.to_csv("test.csv")