## Utility Functions

In [42]:
from math import floor, log10
from collections import Counter
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Reference table: expected relative frequency of each leading digit 1-9
# (three-decimal values used as the Benford's-law baseline by benford_score).
_BENFORD_DIGITS = list(range(1, 10))
_BENFORD_FREQS = [0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046]

benford = pd.DataFrame({'digit': _BENFORD_DIGITS, 'freq': _BENFORD_FREQS})

def get_first_digit(i: int):
    """Return the leading (most significant) decimal digit of ``i``.

    Args:
        i: The number to inspect. Integers are handled exactly; the sign is
            ignored, so ``-42`` yields ``4``.

    Returns:
        The leading digit as an int in 1-9, or 0 when ``i`` is zero.
    """
    if i == 0:
        return 0

    magnitude = abs(i)

    if isinstance(magnitude, int):
        # Exact path. The floating-point formula below misfires on large ints
        # (e.g. log10(999999999999999) rounds up to 15.0, giving a digit of 0).
        return int(str(magnitude)[0])

    # Non-int input (e.g. floats): fall back to the logarithm-based formula.
    return floor(magnitude / (10 ** floor(log10(magnitude))))

def data_to_df(data: list[int]) -> pd.DataFrame:
    """Build the observed first-digit distribution of ``data``.

    Args:
        data: Iterable of numbers; it is traversed exactly once.

    Returns:
        A DataFrame with one row per digit 1-9 and two columns: ``digit`` and
        ``freq``, where ``freq`` is that digit's share of all values whose
        leading digit is non-zero. Values with a leading digit of 0 (i.e. zeros)
        are left out of both the rows and the denominator. If no value has a
        non-zero leading digit, every ``freq`` is NaN (0 / 0).
    """
    # Count once; the digit lookup below must not rebuild the Counter per digit.
    digit_counts = Counter(get_first_digit(d) for d in data)

    digits = list(range(1, 10))
    df = pd.DataFrame({
        'digit': digits,
        # Counter returns 0 for digits that never occurred.
        'freq': [float(digit_counts[digit]) for digit in digits],
    })

    total = df['freq'].sum()
    df['freq'] /= total
    return df

## Lengths of constant strings

In [43]:
def score_string_length(analysis):
    """Score how closely the lengths of the constant strings follow Benford's law.

    Args:
        analysis: Object exposing ``get_strings()``, whose items expose
            ``get_value()`` returning the string text.

    Returns:
        The ``benford_score`` of the whitespace-stripped string lengths.
    """
    raw_values = [entry.get_value() for entry in analysis.get_strings()]

    # Surrounding whitespace is not counted towards a string's length.
    trimmed = [str.strip(value) for value in raw_values]
    lengths = [len(value) for value in trimmed]

    return benford_score(lengths)

## Sum of Unicode code points of the characters in constant strings

In [44]:
def sum_of_chars(s):
    """Return the sum of the Unicode code points of every character in ``s``."""
    return sum(map(ord, s))

def score_string_value(analysis):
    """Score how closely the character-code sums of the constant strings follow Benford's law.

    Args:
        analysis: Object exposing ``get_strings()``, whose items expose
            ``get_value()`` returning the string text.

    Returns:
        The ``benford_score`` of ``sum_of_chars`` applied to each
        whitespace-stripped string.
    """
    raw_values = [entry.get_value() for entry in analysis.get_strings()]

    # Surrounding whitespace is dropped before the characters are summed.
    trimmed = [str.strip(value) for value in raw_values]
    char_sums = [sum_of_chars(value) for value in trimmed]

    return benford_score(char_sums)

## Lengths of methods

In [45]:
def score_method_lengths(analysis):
    """Score how closely the lengths of non-external methods follow Benford's law.

    Args:
        analysis: Object exposing ``get_methods()``.

    Returns:
        The ``benford_score`` of the collected method lengths.
    """
    lengths = []
    for wrapper in analysis.get_methods():
        # External methods have no length available, so they are skipped.
        if wrapper.is_external():
            continue
        # get_methods() yields a deprecated type; get_method() unwraps it into
        # the object that actually exposes get_length().
        lengths.append(wrapper.get_method().get_length())

    return benford_score(lengths)

## Number of fields in a class

In [46]:
def score_class_fields(analysis):
    """Score how closely the per-class field counts follow Benford's law.

    Args:
        analysis: Object exposing ``get_internal_classes()``, whose items
            expose ``get_fields()`` returning a sized collection.

    Returns:
        The ``benford_score`` of the number of fields in each internal class.
    """
    # Build a real list: benford_score is annotated as taking list[int], and a
    # lazy map object would be exhausted after a single traversal.
    fields_count = [len(c.get_fields()) for c in analysis.get_internal_classes()]

    return benford_score(fields_count)

## Number of methods in a class

In [47]:
def score_class_methods(analysis):
    """Score how closely the per-class method counts follow Benford's law.

    Args:
        analysis: Object exposing ``get_internal_classes()``, whose items
            expose ``get_methods()`` returning a sized collection.

    Returns:
        The ``benford_score`` of the number of methods in each internal class.
    """
    # Build a real list: benford_score is annotated as taking list[int], and a
    # lazy map object would be exhausted after a single traversal.
    methods_count = [len(c.get_methods()) for c in analysis.get_internal_classes()]

    return benford_score(methods_count)

## Bytes -> Hex -> Decimal of classes.dex file

In [48]:
def score_classes_dex(apk):
    """Score how closely the raw byte values of the APK's DEX data follow Benford's law.

    Args:
        apk: Object exposing ``get_all_dex()``, which yields the DEX contents
            as bytes-like objects (assumed ``bytes`` — TODO confirm against
            the androguard API).

    Returns:
        The ``benford_score`` of every byte value (0-255) across all DEX blobs.
    """
    # Iterating a bytes object already yields each byte as an int, so the
    # bytes -> hex text -> int round trip is unnecessary. Direct iteration also
    # copes with an empty blob, where ''.split(' ') would produce [''] and
    # int('', 16) would raise ValueError.
    byte_values = [value for dex_bytes in apk.get_all_dex() for value in dex_bytes]

    return benford_score(byte_values)

## Scoring Utility

In [49]:
import progressbar
import os
from androguard.misc import AnalyzeAPK

def benford_score(data: list[int]):
    """Measure how far the first-digit distribution of ``data`` is from the Benford baseline.

    Args:
        data: Numbers whose leading digits are to be compared.

    Returns:
        The mean absolute error between the expected frequencies in the
        module-level ``benford`` table and the observed frequencies from
        ``data_to_df``, rounded to four decimal places. Lower is closer.
    """
    observed = data_to_df(data)

    # After the merge, 'freq1' is the expected column and 'freq2' the observed one.
    merged = benford.merge(observed, on='digit', suffixes=['1', '2'])
    expected_freq = merged['freq1']
    observed_freq = merged['freq2']

    return round(mean_absolute_error(expected_freq, observed_freq), 4)

def score_all(analysis, apk):
    """Compute every Benford score for one APK.

    Args:
        analysis: Analysis object handed to the string, method and class scorers.
        apk: APK object handed to ``score_classes_dex``.

    Returns:
        A single-row DataFrame with one column per score: ``str_len``,
        ``str_val``, ``method_len``, ``class_fields``, ``class_methods`` and
        ``classes_dex``.
    """
    # Each value is a one-element list so the resulting frame has exactly one row.
    row = {
        'str_len': [score_string_length(analysis)],
        'str_val': [score_string_value(analysis)],
        'method_len': [score_method_lengths(analysis)],
        'class_fields': [score_class_fields(analysis)],
        'class_methods': [score_class_methods(analysis)],
        'classes_dex': [score_classes_dex(apk)],
    }

    return pd.DataFrame.from_dict(row)

def score_directory(dir_name):
    """Score every APK found directly inside ``dir_name``.

    Files are processed in sorted name order so that the row order of the
    result is reproducible between runs. A file that cannot be analysed or
    scored is reported on stdout and left out of the result; it does not
    abort the run.

    Args:
        dir_name: Path of the directory whose entries are passed to ``AnalyzeAPK``.

    Returns:
        A DataFrame with one row of scores (see ``score_all``) per successfully
        processed file and a fresh 0..n-1 index; an empty DataFrame when
        nothing could be scored.
    """
    # List once: the same listing sizes the progress bar and drives the loop.
    files = sorted(os.listdir(dir_name))
    progress = progressbar.ProgressBar(max_value=len(files))

    # Collect the per-file frames and concatenate once at the end rather than
    # re-copying a growing DataFrame on every iteration.
    frames = []

    for i, file in enumerate(files):
        file_path = os.path.join(dir_name, file)

        try:
            # Parsing lives inside the try so one unreadable file is skipped too.
            apk, _dex, analysis = AnalyzeAPK(file_path)
            frames.append(score_all(analysis, apk))
        except Exception as exc:
            # `Exception`, not a bare except: Ctrl-C (KeyboardInterrupt) must
            # still be able to stop a long-running scan.
            print("Excluding APK:", file, f"({type(exc).__name__}: {exc})")

        progress.update(i + 1)

    progress.finish()

    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

## Score Benign APKs

In [50]:
import configparser

config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation(), allow_no_value=True)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here with a clear message instead of a
# later KeyError on 'PATHS'.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read; it must define [PATHS] benign_dir")

# Directory holding the benign APK samples.
BENIGN_DIR = config['PATHS']['benign_dir']

benign_df = score_directory(BENIGN_DIR)

  0% (0 of 10) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
 20% (2 of 10) |#####                    | Elapsed Time: 0:00:12 ETA:   0:00:49
Requested API level 30 is larger than maximum we have, returning API level 28 instead.
 30% (3 of 10) |#######                  | Elapsed Time: 0:03:01 ETA:   0:19:40
 40% (4 of 10) |##########               | Elapsed Time: 0:03:16 ETA:   0:01:33
 50% (5 of 10) |############             | Elapsed Time: 0:04:27 ETA:   0:05:56
Requested API level 29 is larger than maximum we have, returning API level 28 instead.
 60% (6 of 10) |###############          | Elapsed Time: 0:04:49 ETA:   0:01:24
 70% (7 of 10) |#################        | Elapsed Time: 0:04:58 ETA:   0:00:29
 80% (8 of 10) |####################     | Elapsed Time: 0:05:11 ETA:   0:00:24
Requested API level 31 is larger than maximum we have, returning API level 28 instead.
 90% (9 of 10) |######################   | Elapsed Time: 0:05:59 ETA:   0:00:48
Requested API level

## Score Malign APKs

In [51]:
import configparser

config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation(), allow_no_value=True)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here with a clear message instead of a
# later KeyError on 'PATHS'.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read; it must define [PATHS] malign_dir")

# Directory holding the malign APK samples.
MALIGN_DIR = config['PATHS']['malign_dir']
malign_df = score_directory(MALIGN_DIR)

  0% (0 of 10) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
 20% (2 of 10) |#####                    | Elapsed Time: 0:00:00 ETA:   0:00:00
 30% (3 of 10) |#######                  | Elapsed Time: 0:00:06 ETA:   0:00:46
 40% (4 of 10) |##########               | Elapsed Time: 0:00:06 ETA:   0:00:20


Excluding APK: 000027D1DA96332EFCB54AF76906A7298121EBCCCDAB3D7DCE999F8043E74EE7.apk


 50% (5 of 10) |############             | Elapsed Time: 0:00:08 ETA:   0:00:13
 60% (6 of 10) |###############          | Elapsed Time: 0:00:11 ETA:   0:00:13
 70% (7 of 10) |#################        | Elapsed Time: 0:00:22 ETA:   0:00:33
 80% (8 of 10) |####################     | Elapsed Time: 0:00:24 ETA:   0:00:13
 90% (9 of 10) |######################   | Elapsed Time: 0:00:29 ETA:   0:00:04
100% (10 of 10) |########################| Elapsed Time: 0:00:29 ETA:  00:00:00
100% (10 of 10) |########################| Elapsed Time: 0:00:29 Time:  0:00:29


## Combine benign and malign DFs

In [56]:
import pandas as pd

# Label each sample set in place: 0 = benign, 1 = malign.
benign_df['malign'] = 0
malign_df['malign'] = 1

# Stack both sets (benign rows first) under a fresh 0..n-1 index.
df = pd.concat([benign_df, malign_df], ignore_index=True)

# Write the combined table next to the APKs, without the index column.
output_csv = os.path.join(config['PATHS']['apk_dir'], 'data.csv')
df.to_csv(output_csv, index=False)