In [1]:
import re
import pandas as pd
import numpy as np
from glob import glob
import networkx as nx

import matplotlib.pyplot as plt
# from functools import reduce
import os
from itertools import combinations

# !conda install -c conda-forge tqdm -y
from tqdm import tqdm
# !pip install multiprocess
from multiprocess import Pool

In [2]:
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

In [3]:
from src.features.build_features import SmaliApp

In [4]:
class SmaliHIN():
    
    def __init__(self, apps_dir, nproc=4, n=8):
        self.app_dirs = glob(os.path.join(apps_dir, '*/'))[-8:]
        with Pool(nproc) as p:
            smali_apps = list(tqdm(p.imap_unordered(SmaliApp, self.app_dirs), total=len(self.app_dirs)))
        self.apps = {app.package: app for app in smali_apps}
        self.packages = list(self.apps.keys())
        
    def construct_graph_A(self):
        unique_APIs_app = [set(app.info.package + '->' + app.info.method_name) for app in self.apps.values()]
        unique_APIs_all = set.union(*unique_APIs_app)
        
        A_cols = []
        for unique in unique_APIs_all:
            bag_of_API = [1 if unique in app_set else 0 for app_set in unique_APIs_app]
            A_cols.append(bag_of_API)
            
        A_mat = np.array(A_cols).T
        # shape: (# of apps, # of unique APIs)
        self.A_mat = A_mat
        return self.A_mat
    
#     def construct_graph_B(self):
        

In [5]:
APPS_DIR = './../data/apps'
APPS_DIR = '/Volumes/exf/HinDroid/data/apps'

## EDA

In [6]:
hin = SmaliHIN(APPS_DIR)

100%|██████████| 8/8 [01:53<00:00, 14.25s/it]


In [7]:
apps = [a for a in hin.apps.values()]

In [8]:
for a in apps:
    a.info['package'] = a.package

In [9]:
df = pd.concat([a.info for a in apps], ignore_index=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176529 entries, 0 to 1176528
Data columns (total 7 columns):
call             1176529 non-null object
relpath          1176529 non-null object
code_block_id    1176529 non-null int64
invocation       1176529 non-null object
library          1176529 non-null object
method_name      1176529 non-null object
package          1176529 non-null object
dtypes: int64(1), object(6)
memory usage: 62.8+ MB


In [11]:
df.head(5)

Unnamed: 0,call,relpath,code_block_id,invocation,library,method_name,package
0,"invoke-direct {p0}, Ljava/lang/Object;-><init>()V",smali/a/a/a/a/a$1.smali,1,invoke-direct,Ljava/lang/Object;,<init>,my.name.kiss.name
1,"invoke-virtual {v0}, La/a/a/a/e;->m()Landroid/...",smali/a/a/a/a/a$1.smali,2,invoke-virtual,La/a/a/a/e;,m,my.name.kiss.name
2,"invoke-direct {p0}, Ljava/lang/Object;-><init>()V",smali/a/a/a/a/a$2.smali,3,invoke-direct,Ljava/lang/Object;,<init>,my.name.kiss.name
3,"invoke-virtual {v0}, Landroid/view/View;->getV...",smali/a/a/a/a/a$2.smali,4,invoke-virtual,Landroid/view/View;,getViewTreeObserver,my.name.kiss.name
4,"invoke-virtual {v0, p0}, Landroid/view/ViewTre...",smali/a/a/a/a/a$2.smali,4,invoke-virtual,Landroid/view/ViewTreeObserver;,removeOnPreDrawListener,my.name.kiss.name


In [12]:
df.groupby('package')['invocation'].value_counts().unstack(fill_value=0)

invocation,invoke-direct,invoke-interface,invoke-static,invoke-super,invoke-virtual
package,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
com.app.brboldiesradio,62471,33502,62349,2786,153361
com.ct.goatsimulator3d,17237,10416,21270,705,49977
com.gallusgolf.c625.android.hawkseyegolf,12420,6126,13347,472,33894
com.mobiledoorman.themercer,24904,11159,22083,1570,63660
com.vistekmedia.ForestSmashFrenzy,11391,5438,13684,524,28393
grossacapdany.ovilsolutions.com.lagrossadecapdany,40012,22582,49392,2240,116042
my.name.kiss.name,5409,1746,4711,231,13432
uk.co.prioritysms.bgcracing,50648,28786,41677,2734,133748


In [33]:
df.groupby('package').agg({
    'call': 'size',
    'library': 'nunique',
    'code_block_id': ['mean', 'std', 'median', 'max', 'nunique'],
})

Unnamed: 0_level_0,call,code_block_id,code_block_id,code_block_id,code_block_id,code_block_id,library
Unnamed: 0_level_1,size,mean,std,median,max,nunique,nunique
package,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
com.app.brboldiesradio,314469,44924.059815,26047.87129,43838,89872,67457,13184
com.ct.goatsimulator3d,99605,15819.729873,8635.689824,16084,30656,21115,4684
com.gallusgolf.c625.android.hawkseyegolf,66259,12261.962239,6181.687412,12927,21532,15922,3539
com.mobiledoorman.themercer,123376,18288.249481,10398.392019,18923,35745,27661,6490
com.vistekmedia.ForestSmashFrenzy,59430,11349.214471,5888.003081,12248,20166,14964,3721
grossacapdany.ovilsolutions.com.lagrossadecapdany,230268,36520.605651,18864.652112,38391,68122,50638,10622
my.name.kiss.name,25529,3000.147754,1885.667432,2858,6590,4852,1613
uk.co.prioritysms.bgcracing,257593,40520.654633,23689.291276,39385,79603,59276,10875


## graphs 

In [10]:
A_mat = hin.construct_graph_A()

In [11]:
A_mat.shape

(4, 58646)

In [27]:
pd.Series(A_mat.sum(axis=0)).value_counts().sort_index()

1    123087
2     27408
3      6173
4      6505
5      3079
6      1243
7      2942
8      2410
dtype: int64

In [18]:
A_mat.shape

(8, 172847)

In [9]:
def method_pairs(df_group, graph):
    calls = df_group.api_id.unique()
    pairs = combinations(calls, 2)
    for pair in pairs:
        graph.add_edge(*pair)

In [10]:
%%time
B = nx.Graph()
B.add_nodes_from(df.api_id.unique())

B_pairs = df.groupby('code_block_id').progress_apply(method_pairs, graph=B)

100%|██████████| 25112/25112 [00:04<00:00, 5340.10it/s]

CPU times: user 4.67 s, sys: 108 ms, total: 4.78 s
Wall time: 4.73 s





In [11]:
%%time
P = nx.Graph()
P.add_nodes_from(df.api_id.unique())

P_pairs = df.groupby('package').progress_apply(method_pairs, graph=P)

100%|██████████| 5250/5250 [00:01<00:00, 3793.50it/s]

CPU times: user 1.39 s, sys: 52 ms, total: 1.44 s
Wall time: 1.43 s





In [33]:
df[['invocation', 'api_id']]

Unnamed: 0,invocation,api_id
0,invoke-direct,22062
1,invoke-direct,22062
2,invoke-direct,22062
3,invoke-direct,22062
4,invoke-direct,22062
...,...,...
103884,invoke-virtual,24772
103885,invoke-virtual,1619
103886,invoke-interface,495
103887,invoke-super,1616


In [12]:
# nx.drawing.draw(B)

In [13]:
df.groupby('invocation').apply(lambda df:len(df)**2/2).sum()

1888812409.5

In [14]:
df.call.nunique()**2/2

1088624460.5

In [15]:
%%time
I = nx.Graph()
I.add_nodes_from(df.call.unique())

# I_pairs = df.groupby('invocation').progress_apply(method_pairs, graph=I)

CPU times: user 83.2 ms, sys: 8.52 ms, total: 91.7 ms
Wall time: 89.6 ms


In [16]:
df.invocation.value_counts()

invoke-virtual      53840
invoke-static       20273
invoke-direct       19725
invoke-interface     8787
invoke-super         1264
Name: invocation, dtype: int64

In [17]:
# df_group = df.loc[df.groupby('invocation').groups['invoke-virtual'], :]
# # print(df_group.shape)
# calls = df_group.api_id.unique()
# pairs = combinations(calls, 2)
# for pair in tqdm(pairs, total=len(calls)*(len(calls)-1)/2):
#     I.add_edge(pair[0], pair[1])

#     del pair
#     break

In [18]:
df.sort_values('api_id').head()

Unnamed: 0,call,filename,code_block_id,invocation,package,method_name,api_id
4234,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1407,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getCanRetrieveWindowContent,0
4240,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1413,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getCapabilities,1
4235,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1408,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getDescription,2
4236,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1409,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getId,3
4237,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1410,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getResolveInfo,4


In [19]:
n = df.api_id.nunique()
A = np.zeros((n,n))

from scipy.sparse import csr_matrix
# A = csr_matrix((n, n))

In [20]:
df_group = df.loc[df.groupby('invocation').groups['invoke-virtual'], :]
# print(df_group.shape)
calls = df_group.api_id.unique()
pairs = combinations(calls, 2)
for pair in tqdm(pairs, total=len(calls)*(len(calls)-1)/2):
    A[pair[0], pair[1]] = 1
    A[pair[1], pair[0]] = 1

100%|██████████| 79802661/79802661.0 [01:09<00:00, 1150828.63it/s]


array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [25]:
from scipy.sparse import csr_matrix

In [26]:
a = csr_matrix(A)

In [29]:
sys.getsizeof(a)

48

In [24]:
import sys
sys.getsizeof(A)/1e9

4.933025904