In [15]:
from androguard.misc import AnalyzeAPK

# APK under analysis (taken from the "malign" sample directory). To analyse the
# benign sample instead, point file_path at:
#   "APKs/benign/2F8901939F60A55DAA7D8009A8D55A8CE3AA50C2CBB9FDCF00A35DE223C8DF70.apk"
file_path = "APKs/malign/7C54C79F01EEEA7256A1423C864DEC150760DFCB1662BC36CEB18739C4D1B507.apk"
# The three results are reused by every cell below: `apk` for manifest/file queries,
# `dex` as an indexable/iterable collection of DEX objects, `analysis` for
# strings, methods and classes.
apk, dex, analysis = AnalyzeAPK(file_path)

## Utility Functions

In [2]:
from math import floor, log10
from collections import Counter
import pandas as pd

# Standard Benford's Law probability distribution: expected share of each leading
# digit d (1..9). The values are log10(1 + 1/d) rounded to three decimals.
benford = {1: .301, 
           2: .176, 
           3: .125, 
           4: .097, 
           5: .079, 
           6: .067, 
           7: .058, 
           8: .051, 
           9: .046}

def get_first_digit(i: int):
    """Return the leading (most significant) decimal digit of ``i``.

    Args:
        i: The number to inspect. Integers are the intended input; other real
            numbers are still accepted for backward compatibility.

    Returns:
        0 when ``i`` is zero, otherwise the leading digit of ``abs(i)`` (1-9).
    """
    if i == 0:
        return 0
    if isinstance(i, int):
        # Exact for integers of any size. The float-based formula below loses
        # precision above 2**53 (e.g. 999999999999999999 came out as 1) and
        # log10 rejects negative input.
        return int(str(abs(i))[0])
    # Non-int numbers keep the original logarithm-based computation, applied to
    # the magnitude so negative values no longer raise a math domain error.
    magnitude = abs(i)
    return floor(magnitude / (10 ** floor(log10(magnitude))))

def data_to_df(data: list[int]) -> pd.DataFrame:
    """Summarise the first-digit distribution of ``data`` as a one-row DataFrame.

    Args:
        data: Data points whose leading digits are tallied.

    Returns:
        An empty DataFrame when ``data`` is empty. Otherwise a single-row frame
        with columns 1..9 holding the fraction of data points whose first digit
        is that column. The denominator is ``len(data)``, so data points whose
        first digit is 0 (i.e. zeros) count towards the total but have no
        column; the row can therefore sum to less than 1.
    """
    total = len(data)
    if total == 0:
        return pd.DataFrame()
    # Tally once; the tally used to be rebuilt for each of the nine digits.
    digit_counts = Counter(get_first_digit(d) for d in data)
    fractions = {digit: digit_counts[digit] / total for digit in range(1, 10)}
    return pd.DataFrame([fractions], columns=list(fractions))

## Lengths of constant strings

In [3]:
# Constant strings known to the analysis object, with surrounding whitespace
# stripped (alternative source: dex[0].get_strings()).
strings = [s.get_value().strip() for s in analysis.get_strings()]

# One data point per string: its length after stripping.
data = [len(s) for s in strings]

data_to_df(data)

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.297287,0.195349,0.135659,0.104651,0.076357,0.059302,0.053876,0.037984,0.037597


## Sum of Decimal encoded characters of constant strings

In [4]:
# Constant strings known to the analysis object, with surrounding whitespace
# stripped (alternative source: dex[0].get_strings()).
strings = [s.get_value().strip() for s in analysis.get_strings()]

# One data point per string: the sum of the code points of its characters.
data = [sum(map(ord, s)) for s in strings]

data_to_df(data)

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.291085,0.202713,0.137597,0.099612,0.075969,0.055814,0.052713,0.042636,0.039922


## Lengths of methods

In [5]:
methods = analysis.get_methods()

# External methods don't have available lengths, so they are filtered out first.
internal_methods = (m for m in methods if not m.is_external())

# get_methods() yields wrapper objects; get_method() is called on each one to
# reach the object that exposes get_length().
method_lengths = [m.get_method().get_length() for m in internal_methods]

data_to_df(method_lengths)

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.241655,0.099428,0.101812,0.12506,0.070577,0.122079,0.082976,0.048879,0.038627


## Number of fields in a class

In [6]:
classes = analysis.get_internal_classes()

# Field collection of every internal class, then one data point per class:
# how many fields it has.
fields = list(map(lambda c: c.get_fields(), classes))
fields_count = [len(class_fields) for class_fields in fields]

data_to_df(fields_count)

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.201649,0.134933,0.06072,0.051724,0.024738,0.031484,0.023238,0.014993,0.014993


## Number of methods in a class

In [7]:
classes = analysis.get_internal_classes()

# Method collection of every internal class, then one data point per class:
# how many methods it has.
methods = list(map(lambda c: c.get_methods(), classes))
methods_count = [len(class_methods) for class_methods in methods]

data_to_df(methods_count)

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.28036,0.25937,0.096702,0.073463,0.068966,0.033733,0.035232,0.026987,0.028486


## Bytes -> Hex -> Decimal of classes.dex file

In [8]:
all_classes = apk.get_all_dex()

# Iterating a bytes object yields each byte as an int in 0..255, which is exactly
# what the previous hex-string -> split -> int(..., 16) round trip produced. The
# direct form avoids building a huge intermediate string and no longer raises
# ValueError on an empty payload (''.split(' ') gives [''], and int('', 16) fails).
byte = [value for classes in all_classes for value in classes]

data_to_df(byte)

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.371352,0.114944,0.053924,0.055969,0.041281,0.037343,0.036714,0.034784,0.043339


## Numbers in Strings
This doesn't seem to follow Benford's Law

In [9]:
import re

strings = [s.get_value() for s in analysis.get_strings()]

# Every maximal run of digits inside every constant string, converted to int
# (so leading zeros do not count as a first digit).
nums = [int(digits) for text in strings for digits in re.findall(r'\d+', text)]

df = data_to_df(nums)
df

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.120623,0.145914,0.085603,0.142023,0.040856,0.050584,0.029183,0.029183,0.143969


## Timeouts/Durations
Interesting to look at, but unsure how this can be used with Benford's Law

In [10]:
# Report every method whose source mentions Thread.sleep.
for d in dex:
    for method in d.get_methods():
        # Fetch the source once per method (it was requested twice before).
        # `or ""` guards the membership test in case no source is available
        # (presumably get_source() can come back empty -- verify against androguard).
        source = method.get_source() or ""
        if "Thread.sleep" in source:
            print("Potential timeout: ", source)

strings = analysis.get_strings()

# Report constant strings that mention a timeout or a duration (case-insensitive).
for string_id in strings:
    value = string_id.get_value()
    lowered = value.lower()
    if "timeout" in lowered or "duration" in lowered:
        print("Potential timeout string:", value)

# Permissions requested by the APK and details of the permissions it declares.
attributes = apk.get_permissions()
declared = apk.get_declared_permissions_details()
print("Attributes:", attributes)
print("Declared:", declared)

Potential timeout:  
    public void run()
    {
        try {
            Thread.sleep(1000);
            this.this$0.getApp().runOnUiThread(new de.appplant.cordova.plugin.background.BackgroundModeExt$1$$Lambda$0(this));
        } catch (InterruptedException v0) {
        }
        return;
    }

Potential timeout:  
    public void run()
    {
        try {
            Thread.sleep(2000);
            org.apache.cordova.CordovaWebViewImpl.access$200(this.this$1.this$0).getActivity().runOnUiThread(new org.apache.cordova.CordovaWebViewImpl$EngineClient$1$1(this));
        } catch (InterruptedException v0) {
        }
        return;
    }

Potential timeout string: hintScreenTimeout
Potential timeout string: android.media.metadata.DURATION
Potential timeout string: timeout
Potential timeout string: CordovaWebView: TIMEOUT ERROR!
Potential timeout string: LoadUrlTimeoutValue
Potential timeout string: window.setTimeout(function(){cordova.require('cordova/plugin/android/polling').pollOnce(

The following procedure for finding intents was taken from [this GitHub repo](https://github.com/asimswati553/RGB-based-Andorid-Malware-detection/blob/master/APK2File.ipynb).

## API Calls
Maps suspicious external API calls to the internal methods in which they are called. 
This mapping is not currently analyzed against Benford's Law.

In [1]:
from collections import defaultdict
import json, csv
import os
import configparser

# First column of every row in the score sheet is the API name
# ("Lpackage/Class;->method"). newline='' is what the csv module asks for when it
# is handed a file object; blank rows are skipped instead of raising IndexError.
with open('./scores/api_scores.csv', 'r', newline='') as f:
    suspicious = [row[0] for row in csv.reader(f) if row]
print(suspicious)

def evaluate_api(dex, analysis, suspicious_apis=None) -> dict[str, list[str]]:
    """Map suspicious external APIs to the internal methods whose source mentions them.

    A method counts as a caller when the bare name of the API method (e.g.
    ``exec``) occurs anywhere in the method's source text. This is a substring
    heuristic, not a call-graph lookup.

    Args:
        dex: Indexable collection of DEX objects; only ``dex[0]`` is scanned.
            NOTE(review): methods in additional DEX files are not considered --
            confirm this is intended for multidex APKs.
        analysis: Analysis object providing ``get_external_classes()``.
        suspicious_apis: Optional collection of API names
            (``"Lpackage/Class;->method"``). Defaults to the notebook-level
            ``suspicious`` collection, which is what callers relied on before
            this parameter existed.

    Returns:
        A ``defaultdict(list)`` keyed by API name; each value lists the names of
        the methods that mention it, in scan order. Methods under ``Landroid``
        are skipped, as are methods whose source cannot be obtained.
    """
    if suspicious_apis is None:
        suspicious_apis = suspicious
    watch_list = set(suspicious_apis)  # O(1) membership; also works for a dict of scores

    # The set of relevant external APIs does not depend on the method being
    # inspected, so resolve it once instead of once per method.
    candidates = []
    for ext_class in analysis.get_external_classes():
        for ext_method in ext_class.get_methods():
            target = ext_method.get_method()
            api_name = str(target).split('(')[0]  # ignoring the parameters and return type.
            if api_name not in watch_list:
                continue
            try:
                short_name = target.get_name()
            except Exception:
                continue  # best effort: an unreadable entry is skipped, as before
            if isinstance(short_name, str):
                candidates.append((api_name, short_name))

    API_calls = defaultdict(list)
    if not candidates:
        return API_calls

    for method in dex[0].get_methods():
        method_name = str(method).split('(')[0]
        if method_name.split('/')[0] == 'Landroid':  # skip android libraries (not sure if these can be edited by developer)
            continue
        try:
            # Fetched once per method rather than once per (method, API) pair.
            source = method.get_source()
        except Exception:
            continue  # best effort: a method without obtainable source is skipped
        if not isinstance(source, str):
            continue
        for api_name, short_name in candidates:
            if short_name in source:
                API_calls[api_name].append(method_name)
    return API_calls

config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation(), allow_no_value=True)
config.read('config.ini')

BENIGN_DIR = config['PATHS']['benign_dir']
# Single definition of the output location (it used to be rebuilt at every open()).
output_path = os.path.join(config['PATHS']['apk_dir'], 'data_benign.json')

# The JSON object is written incrementally, one APK at a time, so a long run
# leaves its progress on disk as it goes.
with open(output_path, 'w') as f:
    f.write("{\n")

# The separator goes *before* every entry except the first. Writing it after
# each entry left a trailing comma in front of the closing brace, which made the
# finished file invalid JSON.
is_first_entry = True
for file in os.listdir(BENIGN_DIR):
    file_path = os.path.join(BENIGN_DIR, file)
    temp_apk,temp_dex,temp_analysis = AnalyzeAPK(file_path)

    print(file)
    with open(output_path, 'a') as f:
        if not is_first_entry:
            f.write(",\n")
        # json.dumps() quotes and escapes the file name so it is always a valid key.
        f.write(f"{json.dumps(file)}: ")
        f.write(json.dumps(evaluate_api(temp_dex, temp_analysis), indent=4))
    is_first_entry = False

with open(output_path, 'a') as f:
    f.write("\n}")

['Ljavax/sql/ConnectionEvent;-><init>', 'Ljava/nio/channels/WritableByteChannel;->close', 'Landroid/service/carrier/CarrierService;->stopSelf', 'Landroid/opengl/Matrix;->getClass', 'Landroid/view/ViewStructure;->setCheckable', 'Landroid/text/method/BaseKeyListener;->getInputType', 'Landroid/provider/MediaStore$Images$Media;->wait', 'Ljava/lang/Runtime;->exec', 'Ljava/lang/System;->loadLibrary', 'Landroid/widget/AdapterView;->refreshDrawableState', 'Landroid/widget/MultiAutoCompleteTextView;->saveHierarchyState', 'Ljava/io/BufferedOutputStream;-><init>', 'Ljava/io/FileOutputStream;-><init>', 'Landroid/app/PendingIntent;->send', 'Landroid/app/AlarmManager;->Set', 'Landroid/app/NativeActivity;->getVolumeControlStream', 'Landroid/app/AcitivityManager;->killBackgroudProcess', 'Landroid/content/pm/PacakageManager;->removePackageFromPrefe', 'Landroid/content/pm/PacakageManager;->getInastallerPackageName', 'Landroid/content/pm/PacakageManager;->getInstalledPackages', 'Landroid/content/pm/Pacak


KeyboardInterrupt



## Other APK properties
These, along with API calls above, were used in https://github.com/asimswati553/RGB-based-Andorid-Malware-detection

In [17]:
# Permissions (requested + declared), reduced to the last dotted component,
# e.g. "a.b.C" -> "C".
permissions = apk.get_permissions() + apk.get_declared_permissions()
collected_permissions = [permission.split('.')[-1] for permission in permissions]

# Activities (short class names, preceded by a section label)
activities = apk.get_activities()
collected_activities = ['Activities:'] + [activity.split('.')[-1] for activity in activities]

# Services
services = apk.get_services()
collected_services = ['Services:'] + [service.split('.')[-1] for service in services]

# Receivers
receivers = apk.get_receivers()
collected_receivers = ['Receivers:'] + [receiver.split('.')[-1] for receiver in receivers]

# Providers (names are kept as returned, not shortened)
providers = apk.get_providers()
collected_providers = ['Providers:'] + list(providers)

# Intents: every value of every intent-filter entry of every component.
collected_intents = ['Intents:']
manifest_list = {'permissions':permissions,'activity' : activities, 'service': services, 'receiver':receivers, 'provider':providers}
intents_itemtype = {'activity' : activities, 'service': services, 'receiver':receivers, 'provider':providers}
for itemtype, listt in intents_itemtype.items():
    for item in listt:
        try:
            for intnts in apk.get_intent_filters(itemtype, item).values():
                for intnt in intnts:
                    collected_intents.append(intnt)
        except Exception:
            # Best effort: a component whose filters cannot be read is skipped.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit propagate, so the cell can still be interrupted.
            pass
app_components = collected_activities + collected_services + collected_receivers + collected_providers + collected_intents
print(collected_activities)

['Activities:', 'MainActivity', 'TestActivity', 'HideActivity', 'HideScreenActivity', 'NewActivity']


Using the scoring mechanism from the same repository, we can score the APK

In [None]:
import csv

# API name -> integer score, read from the first two columns of the score sheet.
# newline='' is what the csv module asks for when it is handed a file object;
# blank rows are skipped instead of raising IndexError.
with open('./scores/api_scores.csv', 'r', newline='') as f:
    suspicious = {row[0]: int(row[1]) for row in csv.reader(f) if row}

# Each suspicious API found in the APK contributes its score once, regardless
# of how many methods mention it.
score = sum(suspicious.get(call, 0) for call in evaluate_api(dex, analysis))

print(score)

1490
