## Utility Functions

In [1]:
from math import floor, log10
from collections import Counter
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Reference first-digit distribution (Benford's law): one row per leading
# digit 1-9 together with its expected relative frequency, to three decimals.
benford = pd.DataFrame({
    'digit': list(range(1, 10)),
    'freq': [0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046],
})

def get_first_digit(i: int):
    """Return the leading (most significant) decimal digit of ``i``.

    Args:
        i: The number to inspect. The sign is ignored, so ``-345`` yields ``3``.

    Returns:
        The first digit as an int in 1-9, or ``0`` when ``i`` is zero.
    """
    if i == 0:
        return 0

    magnitude = abs(i)

    if isinstance(magnitude, int):
        # Exact integer arithmetic: the float-based log10/division formula
        # loses precision for large ints (e.g. 999999999999999999 -> 1).
        while magnitude >= 10:
            magnitude //= 10
        return magnitude

    # Non-int inputs (e.g. floats) keep the logarithm-based formula.
    return floor(magnitude / (10 ** floor(log10(magnitude))))

def data_to_df(data: list[int]) -> pd.DataFrame:
    """Build a table of first-digit relative frequencies for ``data``.

    Args:
        data: Iterable of numbers; each is reduced to its first digit with
            ``get_first_digit``.

    Returns:
        A DataFrame with columns ``digit`` (1-9, always all nine rows) and
        ``freq`` (float share of the data points whose first digit is that
        digit). Data points whose first digit is 0 are not counted. If no
        data point has a first digit in 1-9 the total is zero and every
        ``freq`` is NaN.
    """
    digits = range(1, 10)

    # Tally once; a Counter returns 0 for digits that never occur.
    tally = Counter(get_first_digit(d) for d in data)

    df = pd.DataFrame({
        'digit': list(digits),
        'freq': [float(tally[digit]) for digit in digits],
    })
    # Normalise counts into fractions of the counted (1-9) data points.
    df['freq'] /= df['freq'].sum()
    return df

## Lengths of constant strings

In [2]:
def score_string_length(analysis):
    """Score the lengths of the analysis' constant strings with ``benford_score``.

    Each entry returned by ``analysis.get_strings()`` is converted to its
    value, stripped of surrounding whitespace, and measured with ``len``.
    """
    values = [entry.get_value() for entry in analysis.get_strings()]
    lengths = [len(str.strip(value)) for value in values]
    return benford_score(lengths)

## Sum of the decimal character codes of constant strings

In [3]:
def sum_of_chars(s):
    """Return the sum of ``ord(c)`` over every character ``c`` in ``s``."""
    return sum(map(ord, s))

def score_string_value(analysis):
    """Score the character-code sums of the analysis' constant strings.

    Each entry returned by ``analysis.get_strings()`` is converted to its
    value, stripped of surrounding whitespace, and reduced with
    ``sum_of_chars``; the resulting numbers are passed to ``benford_score``.
    """
    stripped = [str.strip(entry.get_value()) for entry in analysis.get_strings()]
    char_sums = [sum_of_chars(text) for text in stripped]
    return benford_score(char_sums)

## Lengths of methods

In [4]:
def score_method_lengths(analysis):
    """Score the lengths of the analysis' non-external methods with ``benford_score``."""
    lengths = []
    for method in analysis.get_methods():
        # External methods don't have available lengths, so they are skipped.
        if method.is_external():
            continue
        # get_methods() returns a deprecated type; get_method() yields the
        # usable object that exposes get_length().
        lengths.append(method.get_method().get_length())

    return benford_score(lengths)

## Number of fields in a class

In [5]:
def score_class_fields(analysis):
    """Score the number of fields per internal class with ``benford_score``."""
    field_counts = [len(cls.get_fields()) for cls in analysis.get_internal_classes()]
    return benford_score(field_counts)

## Number of methods in a class

In [6]:
def score_class_methods(analysis):
    """Score the number of methods per internal class with ``benford_score``."""
    method_counts = [len(cls.get_methods()) for cls in analysis.get_internal_classes()]
    return benford_score(method_counts)

## Bytes -> Hex -> Decimal of classes.dex file

In [7]:
def score_classes_dex(apk):
    """Score the raw byte values of every dex blob in ``apk`` with ``benford_score``.

    Args:
        apk: Object whose ``get_all_dex()`` yields the dex contents as
            bytes-like objects.

    Returns:
        The ``benford_score`` of all byte values (0-255) across the dex blobs.
    """
    # Iterating a bytes object yields each byte as an int, which is the same
    # value the hex-string round trip (hex -> split -> int(c, 16)) produced,
    # without the large intermediate string and without failing on an empty blob.
    byte_values = [value for dex in apk.get_all_dex() for value in dex]

    return benford_score(byte_values)

## Scoring Utility

In [8]:
import progressbar
import os
from androguard.misc import AnalyzeAPK

def benford_score(data: list[int]):
    """Return how far the first-digit distribution of ``data`` is from ``benford``.

    The observed frequencies from ``data_to_df`` are joined to the reference
    table on ``digit`` and compared with the mean absolute error, rounded to
    four decimal places.
    """
    observed = data_to_df(data)
    # After the merge, 'freq1' holds the reference values and 'freq2' the observed ones.
    paired = benford.merge(observed, on='digit', suffixes=['1', '2'])
    error = mean_absolute_error(paired['freq1'], paired['freq2'])
    return round(error, 4)

def score_all(analysis, apk):
    """Compute every score for one APK and return them as a one-row DataFrame.

    The columns are ``str_len``, ``str_val``, ``method_len``,
    ``class_fields``, ``class_methods`` and ``classes_dex``.
    """
    # Each value is a single-element list so the frame has exactly one row.
    row = {
        'str_len': [score_string_length(analysis)],
        'str_val': [score_string_value(analysis)],
        'method_len': [score_method_lengths(analysis)],
        'class_fields': [score_class_fields(analysis)],
        'class_methods': [score_class_methods(analysis)],
        'classes_dex': [score_classes_dex(apk)],
    }
    return pd.DataFrame(row)

def score_directory(dir_name):
    """Score every file in ``dir_name`` as an APK and collect the results.

    Args:
        dir_name: Directory whose entries are each passed to ``AnalyzeAPK``.

    Returns:
        A DataFrame with one row of scores (see ``score_all``) per file that
        was analysed and scored successfully, indexed 0..n-1. Files that fail
        are reported on stdout and left out; the frame is empty if none succeed.
    """
    # List the directory once so the progress bar and the loop agree.
    files = os.listdir(dir_name)
    progress = progressbar.ProgressBar(max_value=len(files))

    # Collect the per-APK frames and concatenate once at the end instead of
    # re-copying a growing DataFrame on every iteration.
    frames = []

    for i, file in enumerate(files):
        file_path = os.path.join(dir_name, file)

        try:
            # Analysis is inside the try so one unparsable file is excluded
            # rather than aborting the whole run.
            apk, _dex, analysis = AnalyzeAPK(file_path)
            frames.append(score_all(analysis, apk))
        except Exception as exc:
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate so a long run can still be stopped.
            print("Excluding APK:", file, f"({type(exc).__name__}: {exc})")

        progress.update(i + 1)

    progress.finish()

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

## Score Benign APKs

In [9]:
import configparser

# Load the notebook configuration and score every APK in the benign directory.
config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation(), allow_no_value=True)
if not config.read('config.ini'):
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it parsed, so an empty result means config.ini was not loaded.
    raise FileNotFoundError("config.ini could not be read (missing or unreadable)")

BENIGN_DIR = config['PATHS']['benign_dir']

benign_df = score_directory(BENIGN_DIR)

  0% (0 of 100) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
  2% (2 of 100) |                        | Elapsed Time: 0:00:20 ETA:   0:16:50
  3% (3 of 100) |                        | Elapsed Time: 0:01:51 ETA:   2:27:26
  4% (4 of 100) |                        | Elapsed Time: 0:02:07 ETA:   0:24:59
  5% (5 of 100) |#                       | Elapsed Time: 0:02:55 ETA:   1:15:47
  6% (6 of 100) |#                       | Elapsed Time: 0:03:11 ETA:   0:24:48
Requested API level 29 is larger than maximum we have, returning API level 28 instead.
  7% (7 of 100) |#                       | Elapsed Time: 0:04:06 ETA:   1:25:55
  8% (8 of 100) |#                       | Elapsed Time: 0:04:30 ETA:   0:36:50
  9% (9 of 100) |##                      | Elapsed Time: 0:04:32 ETA:   0:03:06
Requested API level 31 is larger than maximum we have, returning API level 28 instead.
 10% (10 of 100) |##                     | Elapsed Time: 0:05:07 ETA:   0:52:27
Requested API level 30 is 

## Score Malign APKs

In [10]:
import configparser

# Load the notebook configuration and score every APK in the malign directory.
config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation(), allow_no_value=True)
if not config.read('config.ini'):
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it parsed, so an empty result means config.ini was not loaded.
    raise FileNotFoundError("config.ini could not be read (missing or unreadable)")

MALIGN_DIR = config['PATHS']['malign_dir']
malign_df = score_directory(MALIGN_DIR)

  0% (0 of 100) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
  2% (2 of 100) |                        | Elapsed Time: 0:00:01 ETA:   0:01:16
Requested API Level could not be found, using 19 instead
  3% (3 of 100) |                        | Elapsed Time: 0:00:06 ETA:   0:08:17
  4% (4 of 100) |                        | Elapsed Time: 0:00:18 ETA:   0:19:06
  5% (5 of 100) |#                       | Elapsed Time: 0:00:25 ETA:   0:10:16
  6% (6 of 100) |#                       | Elapsed Time: 0:00:39 ETA:   0:23:06
  7% (7 of 100) |#                       | Elapsed Time: 0:00:45 ETA:   0:09:08
  8% (8 of 100) |#                       | Elapsed Time: 0:00:51 ETA:   0:08:42
  9% (9 of 100) |##                      | Elapsed Time: 0:01:37 ETA:   1:10:00
 10% (10 of 100) |##                     | Elapsed Time: 0:02:14 ETA:   0:54:39
 11% (11 of 100) |##                     | Elapsed Time: 0:02:19 ETA:   0:07:37
 12% (12 of 100) |##                     | Elapsed Time: 0:03:0

Excluding APK: 32E81BAE8FF88BF7F9BD14DFC5352BA79A6914EA2427AC31114F603B1BF6698A.apk


 14% (14 of 100) |###                    | Elapsed Time: 0:03:08 ETA:   0:03:29
 15% (15 of 100) |###                    | Elapsed Time: 0:03:57 ETA:   1:08:22
 16% (16 of 100) |###                    | Elapsed Time: 0:04:01 ETA:   0:06:33
 17% (17 of 100) |###                    | Elapsed Time: 0:04:06 ETA:   0:07:12
 18% (18 of 100) |####                   | Elapsed Time: 0:05:53 ETA:   2:25:22
 19% (19 of 100) |####                   | Elapsed Time: 0:06:01 ETA:   0:11:31
 20% (20 of 100) |####                   | Elapsed Time: 0:06:02 ETA:   0:06:00
 21% (21 of 100) |####                   | Elapsed Time: 0:06:05 ETA:   0:04:06
 22% (22 of 100) |#####                  | Elapsed Time: 0:06:20 ETA:   0:19:12
 23% (23 of 100) |#####                  | Elapsed Time: 0:06:30 ETA:   0:13:36
 24% (24 of 100) |#####                  | Elapsed Time: 0:07:52 ETA:   1:43:01
 25% (25 of 100) |#####                  | Elapsed Time: 0:07:52 ETA:   0:51:17
 26% (26 of 100) |#####                 

Excluding APK: 6DE15403E827DC4A4ED5841073521C67B7EC497F2261D73BAF2A92DE4ED32873.apk


 35% (35 of 100) |########               | Elapsed Time: 0:10:25 ETA:   0:04:50
 36% (36 of 100) |########               | Elapsed Time: 0:10:26 ETA:   0:03:47
 37% (37 of 100) |########               | Elapsed Time: 0:10:26 ETA:   0:03:00
 38% (38 of 100) |########               | Elapsed Time: 0:10:41 ETA:   0:15:56
 40% (40 of 100) |#########              | Elapsed Time: 0:10:41 ETA:   0:05:11


Excluding APK: 784AEDDB2877415D6080E167F52A88505D4280882E4479E985F9FD2058D86FB3.apk


 41% (41 of 100) |#########              | Elapsed Time: 0:10:43 ETA:   0:00:41
 42% (42 of 100) |#########              | Elapsed Time: 0:10:47 ETA:   0:04:06
 43% (43 of 100) |#########              | Elapsed Time: 0:10:48 ETA:   0:02:28
 44% (44 of 100) |##########             | Elapsed Time: 0:11:01 ETA:   0:12:01
 45% (45 of 100) |##########             | Elapsed Time: 0:11:02 ETA:   0:06:24
 46% (46 of 100) |##########             | Elapsed Time: 0:11:07 ETA:   0:04:23
 47% (47 of 100) |##########             | Elapsed Time: 0:11:07 ETA:   0:02:12
 48% (48 of 100) |###########            | Elapsed Time: 0:11:21 ETA:   0:11:22
 49% (49 of 100) |###########            | Elapsed Time: 0:11:22 ETA:   0:06:21
 50% (50 of 100) |###########            | Elapsed Time: 0:11:23 ETA:   0:01:04
 51% (51 of 100) |###########            | Elapsed Time: 0:11:24 ETA:   0:01:01
 52% (52 of 100) |###########            | Elapsed Time: 0:11:27 ETA:   0:02:11
 53% (53 of 100) |############          

## Combine benign and malign DFs

In [11]:
import pandas as pd

# Label each set of scores: 0 for benign samples, 1 for malign samples.
benign_df['malign'] = 0
malign_df['malign'] = 1

# Stack both sets into one frame with a fresh 0..n-1 index and write it to
# data.csv inside the configured APK directory.
df = pd.concat([benign_df, malign_df], ignore_index=True)
df.to_csv(
    os.path.join(config['PATHS']['apk_dir'], 'data.csv'),
    index=False,
)