## Utility Functions

In [12]:
from math import floor, log10
from collections import Counter
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Expected leading-digit frequencies under Benford's law, keyed by digit 1-9
# (rounded to three decimal places).
_BENFORD_FREQUENCIES = (.301, .176, .125, .097, .079, .067, .058, .051, .046)
benford = dict(zip(range(1, 10), _BENFORD_FREQUENCIES))

def get_first_digit(i: int) -> int:
    """Return the leading (most significant) decimal digit of ``i``.

    Args:
        i: The number to inspect. The sign is ignored.

    Returns:
        The first digit of ``abs(i)`` (1-9), or 0 when ``i`` is 0.
    """
    magnitude = abs(i)  # log10 is undefined for negatives; only the digits matter
    if magnitude == 0:
        return 0

    if isinstance(magnitude, int):
        # Exact integer arithmetic: the float log10/division route rounds large
        # values such as 999999999999999999 up to 1.0 and reports the wrong digit.
        while magnitude >= 10:
            magnitude //= 10
        return magnitude

    # Non-int input (e.g. float): scale by the power of ten and truncate.
    return floor(magnitude / (10 ** floor(log10(magnitude))))

def data_to_df(data: list[int]) -> pd.DataFrame:
    """Summarise the first-digit distribution of ``data`` as a one-row DataFrame.

    Args:
        data: Data points to analyse. Any iterable is accepted (including
            one-shot iterators such as ``map`` objects); it is consumed once.

    Returns:
        A DataFrame with a single row and columns 1-9, where each cell is the
        fraction of all data points whose first digit equals the column label.
        Data points whose first digit is 0 count towards the total but have no
        column of their own.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    digits = range(1, 10)
    # Materialise the digits first and measure *that* list: calling len() on the
    # input fails for iterators, and an iterator would already be exhausted here.
    first_digits = [get_first_digit(d) for d in data]
    total = len(first_digits)
    tally = Counter(first_digits)  # counted once instead of once per digit
    frequencies = {digit: tally[digit] / total for digit in digits}
    return pd.DataFrame([frequencies], columns=list(digits))

## Lengths of constant strings

In [13]:
def fd_string_length(analysis):
    """Return the first-digit frequencies of the constant-string lengths.

    Each entry from ``analysis.get_strings()`` is reduced to its value, stripped
    of surrounding whitespace and measured with ``len``; the lengths are then
    handed to ``data_to_df``.
    """
    raw_values = [entry.get_value() for entry in analysis.get_strings()]  # dex[0].get_strings()
    stripped_values = [str.strip(value) for value in raw_values]
    lengths = [len(value) for value in stripped_values]
    return data_to_df(lengths)

## Sum of decimal-encoded characters of constant strings

In [14]:
def sum_of_chars(s):
    """Return the sum of the ``ord`` values of every character in ``s``."""
    total = 0
    for character in s:
        total += ord(character)
    return total

def fd_string_value(analysis):
    """Return the first-digit frequencies of the constant-string character sums.

    Each entry from ``analysis.get_strings()`` is reduced to its value, stripped
    of surrounding whitespace and summed with ``sum_of_chars``; the sums are
    then handed to ``data_to_df``.
    """
    raw_values = [entry.get_value() for entry in analysis.get_strings()]  # dex[0].get_strings()
    stripped_values = [str.strip(value) for value in raw_values]
    char_sums = [sum_of_chars(value) for value in stripped_values]
    return data_to_df(char_sums)

## Lengths of methods

In [15]:
def fd_method_lengths(analysis):
    """Return the first-digit frequencies of the lengths of non-external methods."""
    lengths = []
    for wrapper in analysis.get_methods():
        # External methods don't have available lengths, so we skip past them.
        if wrapper.is_external():
            continue
        # get_methods() returns a deprecated type, so get_method() is called
        # again on each item to obtain a usable type.
        lengths.append(wrapper.get_method().get_length())

    return data_to_df(lengths)

## Number of fields in a class

In [16]:
def fd_class_fields(analysis):
    """Return the first-digit frequencies of the per-class field counts.

    For every class from ``analysis.get_internal_classes()`` the number of
    items returned by ``get_fields()`` is counted, and the counts are handed to
    ``data_to_df``.
    """
    classes = analysis.get_internal_classes()

    # Build a real list rather than a lazy `map`: data_to_df calls len() on its
    # argument, which raises TypeError for a map object.
    fields_count = [len(c.get_fields()) for c in classes]

    return data_to_df(fields_count)

## Number of methods in a class

In [17]:
def fd_class_methods(analysis):
    """Return the first-digit frequencies of the per-class method counts.

    For every class from ``analysis.get_internal_classes()`` the number of
    items returned by ``get_methods()`` is counted, and the counts are handed
    to ``data_to_df``.
    """
    classes = analysis.get_internal_classes()
    # Build a real list rather than a lazy `map`: data_to_df calls len() on its
    # argument, which raises TypeError for a map object.
    methods_count = [len(c.get_methods()) for c in classes]
    return data_to_df(methods_count)

## Bytes -> Hex -> Decimal of classes.dex file

In [18]:
def fd_classes_dex(apk):
    """Return the first-digit frequencies of the byte values of every dex file.

    Every byte of every item yielded by ``apk.get_all_dex()`` becomes one data
    point (an int in 0-255) passed to ``data_to_df``.
    """
    # Assumes each item from get_all_dex() is bytes-like (the previous version
    # called .hex() on it). Iterating bytes yields the same integers as a
    # hex-string round-trip, without building a string three times the size of
    # the dex, and it also copes with an empty dex (int('', 16) raises).
    byte_values = [value for dex in apk.get_all_dex() for value in dex]

    return data_to_df(byte_values)

## Evaluation Utility

In [19]:
import progressbar
import os
from androguard.misc import AnalyzeAPK

def evaluate_directory(dir_name, feature_fn=None):
    """Score every file in ``dir_name`` and stack the results into one DataFrame.

    Each directory entry is analysed with ``AnalyzeAPK`` and the resulting
    analysis object is passed to ``feature_fn``. Entries that fail at either
    step are reported on stdout and excluded from the result.

    Args:
        dir_name: Directory whose entries are analysed.
        feature_fn: Callable taking the analysis object and returning a
            DataFrame. Defaults to ``fd_string_length``.

    Returns:
        The per-file DataFrames concatenated in sorted file-name order with a
        fresh 0..n-1 index; an empty DataFrame if no file could be scored.
    """
    if feature_fn is None:
        feature_fn = fd_string_length

    # List the directory once, sorted, so the progress-bar maximum matches the
    # loop and the row order is reproducible across runs and platforms.
    file_names = sorted(os.listdir(dir_name))
    progress = progressbar.ProgressBar(max_value=len(file_names))

    # Collect frames and concatenate once; concatenating inside the loop
    # re-copies all previous rows on every iteration.
    frames = []

    for i, file in enumerate(file_names):
        file_path = os.path.join(dir_name, file)

        try:
            # Parsing is inside the try so that one unreadable or non-APK entry
            # is excluded instead of aborting the whole directory run.
            apk, dex, analysis = AnalyzeAPK(file_path)
            frames.append(feature_fn(analysis))
        except Exception as e:
            print(e)
            print("Excluding APK:", file)

        progress.update(i + 1)

    progress.finish()

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(frames).reset_index(drop=True)

## Score Benign APKs

In [20]:
import configparser

config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation(), allow_no_value=True)
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed; fail here with a clear message instead of a later
# KeyError on 'PATHS'.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the current working directory")

# Directory of benign APKs, taken from the [PATHS] section of config.ini.
BENIGN_DIR = config['PATHS']['benign_dir']

# One row of first-digit frequencies per benign APK that could be scored.
benign_df = evaluate_directory(BENIGN_DIR)

  0% (0 of 100) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
  2% (2 of 100) |                        | Elapsed Time: 0:01:07 ETA:   0:55:29
  3% (3 of 100) |                        | Elapsed Time: 0:02:54 ETA:   2:52:17
  4% (4 of 100) |                        | Elapsed Time: 0:03:24 ETA:   0:48:12
  5% (5 of 100) |#                       | Elapsed Time: 0:06:10 ETA:   4:22:33
  6% (6 of 100) |#                       | Elapsed Time: 0:06:53 ETA:   1:07:58
Requested API level 29 is larger than maximum we have, returning API level 28 instead.
  7% (7 of 100) |#                       | Elapsed Time: 0:08:54 ETA:   3:06:17
  8% (8 of 100) |#                       | Elapsed Time: 0:09:23 ETA:   0:44:54
  9% (9 of 100) |##                      | Elapsed Time: 0:09:25 ETA:   0:03:48
Requested API level 31 is larger than maximum we have, returning API level 28 instead.
 10% (10 of 100) |##                     | Elapsed Time: 0:10:27 ETA:   1:31:43
Requested API level 30 is 

In [22]:
# Save the benign scores as 'data_benign.csv' in the configured apk_dir, without the index column.
benign_df.to_csv(os.path.join(config['PATHS']['apk_dir'], 'data_benign.csv'), index=False)

## Score Malign APKs

In [23]:
import configparser

config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation(), allow_no_value=True)
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed; fail here with a clear message instead of a later
# KeyError on 'PATHS'.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the current working directory")

# Directory of malign APKs, taken from the [PATHS] section of config.ini.
MALIGN_DIR = config['PATHS']['malign_dir']
# One row of first-digit frequencies per malign APK that could be scored.
malign_df = evaluate_directory(MALIGN_DIR)

  0% (0 of 100) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
  2% (2 of 100) |                        | Elapsed Time: 0:00:01 ETA:   0:01:12
Requested API Level could not be found, using 19 instead
  3% (3 of 100) |                        | Elapsed Time: 0:00:05 ETA:   0:06:00
  4% (4 of 100) |                        | Elapsed Time: 0:00:05 ETA:   0:03:08
  5% (5 of 100) |#                       | Elapsed Time: 0:00:08 ETA:   0:05:24
  6% (6 of 100) |#                       | Elapsed Time: 0:00:18 ETA:   0:15:39
  7% (7 of 100) |#                       | Elapsed Time: 0:00:22 ETA:   0:05:44
  8% (8 of 100) |#                       | Elapsed Time: 0:00:26 ETA:   0:06:36
  9% (9 of 100) |##                      | Elapsed Time: 0:01:05 ETA:   0:58:01
 10% (10 of 100) |##                     | Elapsed Time: 0:01:36 ETA:   0:47:05
 11% (11 of 100) |##                     | Elapsed Time: 0:01:36 ETA:   0:23:25
 12% (12 of 100) |##                     | Elapsed Time: 0:02:1

In [24]:
# Save the malign scores as 'data_malign.csv' in the configured apk_dir, without the index column.
malign_df.to_csv(os.path.join(config['PATHS']['apk_dir'], 'data_malign.csv'), index=False)

## Combine benign and malign DFs

In [25]:
import pandas as pd

# Label each row with its class: 0 = benign, 1 = malign.
# NOTE: this adds the 'malign' column to benign_df / malign_df in place, so
# re-running the per-class to_csv cells afterwards would also write that column.
benign_df['malign'] = 0
malign_df['malign'] = 1

# Stack both labelled frames (benign rows first), renumber the index from 0,
# and save the combined dataset as 'data.csv' in the configured apk_dir.
df = pd.concat([benign_df, malign_df]).reset_index(drop=True)
df.to_csv(os.path.join(config['PATHS']['apk_dir'], 'data.csv'), index=False)