# Cluster APK

Building as an extension to: https://reports.exodus-privacy.eu.org

In [1]:
from axmlparserpy.apk import APK
import os

In [2]:
# Absolute path to one local APK file. It is not referenced by any later
# cell shown in this notebook. NOTE(review): the path is machine-specific.
apk_path = '/home/tiger/Projects/0xff/apks/summoner.apk'

## Scrape APKs

In [3]:
# Pickle file that caches downloaded APK contents, keyed by app id.
CACHED_APP_ID_TO_APK_BYTES_PATH = './apks/apk_dict.pickle'

# App ids whose APKs are wanted. Ids that are already present in the cache
# are removed from this set before anything is downloaded.
app_ids_for_download = {
    'com.teliasonera.cis.tps',
    'com.nianticlabs.pokemongo',
    'com.peacock.flashlight',
    'com.msales.flashlight',
    'com.viber.voip',
}

In [4]:
from requests_html import HTMLSession
import requests
import pickle
from sys import stderr
from typing import Dict

# Module-level session. get_category() uses it; download_apk() creates its
# own local session instead.
session = HTMLSession()

from pathlib import Path

class NoMatchingSearchResultError(Exception):
    """Signals that a search produced no result for the requested app id.

    Attributes:
        app_id: The app id that was searched for.
    """

    def __init__(self, app_id: str):
        message = f'No search result was found for app id {app_id}.'
        self.app_id = app_id
        super().__init__(message)


def download_apk(app_id: str) -> bytes:
    """Download the APK for ``app_id`` from apkpure.com and return its bytes.

    The app is looked up on the site's search page. The first result link
    whose ``href`` ends with ``app_id`` is followed to its download page, and
    the URL found on that page's ``#download_link`` element is fetched.

    Args:
        app_id: Application id to search for (e.g. ``'com.viber.voip'``).

    Returns:
        The content of the response to the download-link request.

    Raises:
        NoMatchingSearchResultError: If no search result link ends with
            ``app_id``.
    """
    session = HTMLSession()

    search_response = session.get(
        'https://www.apkpure.com/search',
        params=dict(q=app_id),
    )

    matching_hrefs = (
        element.attrs['href']
        for element in search_response.html.find('.search-dl .search-title > a')
        if element.attrs['href'].endswith(app_id)
    )
    try:
        matching_href = next(matching_hrefs)
    except StopIteration as e:
        # The original raised the undefined name NoMatchingResultError here,
        # which turned a failed search into a NameError.
        raise NoMatchingSearchResultError(app_id) from e

    # The first character of the href is dropped before joining; presumably
    # it is the leading '/' of a site-relative link.
    app_result_page_link = '/'.join([
        'https://www.apkpure.com',
        matching_href[1:],
        'download?from=details',
    ])

    # NOTE(review): if the download page has no '#download_link' element the
    # attribute access below fails; confirm how that case should be handled.
    download_link = session.get(app_result_page_link).html.find(
        '#download_link', first=True
    )
    return requests.get(url=download_link.attrs['href']).content
    

# Load APKs downloaded by an earlier run so they are not fetched again.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# cache file that was written by this notebook.
app_id_to_apk_bytes: Dict[str, bytes]
try:
    with open(CACHED_APP_ID_TO_APK_BYTES_PATH, 'rb') as fp:
        app_id_to_apk_bytes = pickle.load(fp)
except FileNotFoundError as e:
    # No cache file yet: start with an empty cache and report it on stderr.
    app_id_to_apk_bytes = {}
    print(e, file=stderr)

# Skip every app id that is already cached.
app_ids_for_download.difference_update(app_id_to_apk_bytes)

for app_id in app_ids_for_download:
    app_id_to_apk_bytes[app_id] = download_apk(app_id=app_id)

# Create the cache directory if needed; otherwise the write below would fail
# on a fresh checkout and all downloads made above would be lost.
Path(CACHED_APP_ID_TO_APK_BYTES_PATH).parent.mkdir(parents=True, exist_ok=True)
with open(CACHED_APP_ID_TO_APK_BYTES_PATH, 'wb') as fp:
    pickle.dump(app_id_to_apk_bytes, fp)

# Build one APK object per app id from the cached bytes. The second
# positional argument is presumably the "raw" flag telling APK that it is
# given file contents rather than a path -- confirm against axmlparserpy.
apks = {
    app_id: APK(apk_bytes, True)
    for app_id, apk_bytes in app_id_to_apk_bytes.items()
}

In [5]:
# Show the app ids for which an APK object was built.
apks.keys()

dict_keys(['com.nianticlabs.pokemongo', 'com.msales.flashlight', 'com.teliasonera.cis.tps', 'com.viber.voip', 'com.peacock.flashlight'])

## Build feature set

In [6]:
# Display the mapping from app id to APK object.
apks

{'com.nianticlabs.pokemongo': <axmlparserpy.apk.APK at 0x7fce0f85f5f8>,
 'com.msales.flashlight': <axmlparserpy.apk.APK at 0x7fce0f85f160>,
 'com.teliasonera.cis.tps': <axmlparserpy.apk.APK at 0x7fce0f6ae908>,
 'com.viber.voip': <axmlparserpy.apk.APK at 0x7fce0f6a9da0>,
 'com.peacock.flashlight': <axmlparserpy.apk.APK at 0x7fce0f68c5c0>}

### Interesting APK fields

* activities
* permissions
* services
* receivers
* libraries

In [7]:
def get_category(name):
    """Return the Play Store category of the app with id ``name``.

    Fetches the app's details page with the module-level ``session`` and
    takes the last path segment of the ``href`` of the second element
    matching ``a.hrTbp.R8zArc`` (e.g. ``'GAME_ADVENTURE'``).
    """
    details_url = 'https://play.google.com/store/apps/details?id=' + name
    details_page = session.get(details_url)
    # Index 1: the second matching link is the one used for the category.
    category_link = details_page.html.find("a.hrTbp.R8zArc")[1]
    return category_link.attrs['href'].split('/')[-1]
    
def get_features(apks):
    """Build one feature dict per APK.

    For every ``(app id, APK)`` pair in ``apks``, each entry of the APK's
    activities, permissions, services, receivers and libraries is reduced
    to the part after its last ``'.'`` and stored as a key with value 1.
    The key ``'category'`` holds the result of ``get_category(app id)``.

    Returns:
        A list of feature dicts, in the iteration order of ``apks``.
    """
    component_fields = (
        'activities',
        'permissions',
        'services',
        'receivers',
        'libraries',
    )

    rows = []
    for app_id, apk in apks.items():
        row = {}
        for field in component_fields:
            for qualified_name in getattr(apk, field):
                # Only the last dotted component is used as the feature name.
                row[qualified_name.split('.')[-1]] = 1

        row['category'] = get_category(app_id)
        rows.append(row)
    return rows

In [8]:
apk_features = get_features(apks)
# Distinct keys over all feature dicts; this also counts the 'category' key
# that get_features adds to every dict.
features_names = list({
    feature_name
    for features in apk_features
    for feature_name in features
})

In [9]:
import pandas as pd

# One row per app id, one column per feature key; a feature missing from an
# app's dict becomes NaN in the frame and is replaced by 0.
df = pd.DataFrame(apk_features, index=list(apks)).fillna(0)

In [10]:
# Print the number of distinct feature names, then show the category of the
# first rows of the frame.
print('Data summary')
print('Number of features: ', len(features_names))
df['category'].head()

Data summary
Number of features:  411


com.nianticlabs.pokemongo         GAME_ADVENTURE
com.msales.flashlight                  LIFESTYLE
com.teliasonera.cis.tps      MAPS_AND_NAVIGATION
com.viber.voip                     COMMUNICATION
com.peacock.flashlight                     TOOLS
Name: category, dtype: object

In [11]:
# Keep the category labels in their own variable; a later cell deletes the
# 'category' column from df.
categories = df['category']
print(categories)

com.nianticlabs.pokemongo         GAME_ADVENTURE
com.msales.flashlight                  LIFESTYLE
com.teliasonera.cis.tps      MAPS_AND_NAVIGATION
com.viber.voip                     COMMUNICATION
com.peacock.flashlight                     TOOLS
Name: category, dtype: object


In [12]:
# Remove the label column so that df holds only the feature columns.
del df['category']

# Clustering

In [13]:
from sklearn.cluster import KMeans

In [14]:
# Fit k-means with 4 clusters on the feature frame and show the cluster
# label assigned to each row.
# NOTE(review): no random_state is passed, so the labels presumably can
# differ between runs -- confirm whether reproducible output is wanted.
km = KMeans(n_clusters=4)
km.fit_predict(df)

array([3, 0, 2, 1, 0], dtype=int32)

3.6.7
