In [1]:
import re
import pandas as pd
import numpy as np
from glob import glob
import networkx as nx
import matplotlib.pyplot as plt
# from functools import reduce
import os
from itertools import combinations

# !conda install -c conda-forge tqdm -y
from tqdm import tqdm
# !pip install multiprocess
from multiprocess import Pool

In [2]:
tqdm.pandas()

  from pandas import Panel


In [3]:
%load_ext line_profiler

In [4]:
class SmaliHIN():
    
    def __init__(self, app_dir):
        self.app_dir = app_dir
        self.smali_fn_ls = sorted(glob(os.path.join(app_dir, 'smali*/**/*.smali'), recursive=True))
        if len(self.smali_fn_ls) == 0:
            raise Exception('Invalid app directory')

    def _extract_line_file(fn):
        with open(fn) as f:
            pattern = '(^\.method.*)|(^\.end method)|(invoke-.*)'
            data = re.findall(pattern, f.read())
            if len(data) == 0: return pd.DataFrame()
        
        data = np.array(data)
        assert data.shape[1] == 3
        
        df = pd.DataFrame(
            np.array(data),
            columns=['start', 'end', 'call']
        )
        df['filename'] = fn

        return df
    
    def _assign_code_block(df):
        df['code_block_id'] = (df.start.str.len() != 0).cumsum()
        return df
    
    def _assign_package_invoke_method(df):
        res = (
            df.call.str.extract(
                "(invoke-\w+)(?:\/range)? {.*}, "     # invoke
                + "(\[*[ZBSCFIJD]|\[?L[\w\/$-]+;)->"   # package
                + "([\w$]+|<init>).+"                 # method
            )
            .rename(columns={0: 'invocation', 1: 'package', 2: 'method_name'})
        )
        return pd.concat([df, res], axis=1)
        
    
    def extract_info(self):
        agg = [SmaliHIN._extract_line_file(f) for f in tqdm(self.smali_fn_ls)]
        df = pd.concat(agg, ignore_index=True)        
        
        df = SmaliHIN._assign_code_block(df)
        df = SmaliHIN._assign_package_invoke_method(df)
        df['api_id'] = df.groupby(['package', 'method_name']).ngroup()
        self.info = df

        # clean
        assert (df.start.str.len() > 0).sum() == (df.end.str.len() > 0).sum()
        df = df[df.call.str.len() > 0].drop(columns=['start', 'end']).reset_index(drop=True)
        
        # verify no nans
        extract_nans = df.isna().sum(axis=1)
        assert (extract_nans == 0).all(), f'nan in {extract_nans.values.nonzero()}'

        self.info = df
        return self.info


In [7]:
app = SmaliHIN('./../data/apps/com.osoperfume/oso-perfume-buy-fragrances/')

In [8]:
# %lprun -f SmaliHIN.extract_info df = app.extract_info()
df = app.extract_info()

100%|██████████| 658/658 [00:00<00:00, 785.63it/s]


In [9]:
df[['package', 'method_name']].sample(10)

Unnamed: 0,package,method_name
9221,Landroid/widget/SearchView;,setOnQueryTextListener
5031,Ljava/lang/Character;,getDirectionality
1668,Ljava/lang/StringBuilder;,toString
6995,Landroid/support/v4/view/ViewPager;,executeKeyEvent
2471,Ljava/io/PrintWriter;,println
9868,Landroid/support/v4/widget/SwipeRefreshLayout;,removeCallbacks
5207,Ljava/io/File;,delete
8007,Landroid/support/v4/view/accessibility/Accessi...,getInfo
6994,Landroid/view/ViewGroup;,dispatchKeyEvent
1631,Ljava/lang/StringBuilder;,append


In [10]:
df.sample(5)

Unnamed: 0,call,filename,code_block_id,invocation,package,method_name,api_id
3962,"invoke-virtual {v3}, Ljava/util/ArrayList;->si...",./../data/apps/com.osoperfume/oso-perfume-buy-...,0,invoke-virtual,Ljava/util/ArrayList;,size,3666
10041,"invoke-virtual {v1}, Landroid/view/ViewGroup;-...",./../data/apps/com.osoperfume/oso-perfume-buy-...,0,invoke-virtual,Landroid/view/ViewGroup;,getRight,3167
1181,"invoke-virtual {v8}, Landroid/support/v4/app/F...",./../data/apps/com.osoperfume/oso-perfume-buy-...,0,invoke-virtual,Landroid/support/v4/app/FragmentManagerImpl;,retainNonConfig,723
8928,"invoke-static {v0, p1}, Landroid/support/v4/wi...",./../data/apps/com.osoperfume/oso-perfume-buy-...,0,invoke-static,Landroid/support/v4/widget/ExploreByTouchHelper;,access$100,2569
1422,"invoke-virtual {p3, p1}, Ljava/io/PrintWriter;...",./../data/apps/com.osoperfume/oso-perfume-buy-...,0,invoke-virtual,Ljava/io/PrintWriter;,print,3558


In [9]:
def method_pairs(df_group, graph):
    calls = df_group.api_id.unique()
    pairs = combinations(calls, 2)
    for pair in pairs:
        graph.add_edge(*pair)

In [10]:
%%time
B = nx.Graph()
B.add_nodes_from(df.api_id.unique())

B_pairs = df.groupby('code_block_id').progress_apply(method_pairs, graph=B)

100%|██████████| 25112/25112 [00:04<00:00, 5340.10it/s]

CPU times: user 4.67 s, sys: 108 ms, total: 4.78 s
Wall time: 4.73 s





In [11]:
%%time
P = nx.Graph()
P.add_nodes_from(df.api_id.unique())

P_pairs = df.groupby('package').progress_apply(method_pairs, graph=P)

100%|██████████| 5250/5250 [00:01<00:00, 3793.50it/s]

CPU times: user 1.39 s, sys: 52 ms, total: 1.44 s
Wall time: 1.43 s





In [33]:
df[['invocation', 'api_id']]

Unnamed: 0,invocation,api_id
0,invoke-direct,22062
1,invoke-direct,22062
2,invoke-direct,22062
3,invoke-direct,22062
4,invoke-direct,22062
...,...,...
103884,invoke-virtual,24772
103885,invoke-virtual,1619
103886,invoke-interface,495
103887,invoke-super,1616


In [12]:
# nx.drawing.draw(B)

In [13]:
df.groupby('invocation').apply(lambda df:len(df)**2/2).sum()

1888812409.5

In [14]:
df.call.nunique()**2/2

1088624460.5

In [15]:
%%time
I = nx.Graph()
I.add_nodes_from(df.call.unique())

# I_pairs = df.groupby('invocation').progress_apply(method_pairs, graph=I)

CPU times: user 83.2 ms, sys: 8.52 ms, total: 91.7 ms
Wall time: 89.6 ms


In [16]:
df.invocation.value_counts()

invoke-virtual      53840
invoke-static       20273
invoke-direct       19725
invoke-interface     8787
invoke-super         1264
Name: invocation, dtype: int64

In [17]:
# df_group = df.loc[df.groupby('invocation').groups['invoke-virtual'], :]
# # print(df_group.shape)
# calls = df_group.api_id.unique()
# pairs = combinations(calls, 2)
# for pair in tqdm(pairs, total=len(calls)*(len(calls)-1)/2):
#     I.add_edge(pair[0], pair[1])

#     del pair
#     break

In [18]:
df.sort_values('api_id').head()

Unnamed: 0,call,filename,code_block_id,invocation,package,method_name,api_id
4234,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1407,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getCanRetrieveWindowContent,0
4240,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1413,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getCapabilities,1
4235,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1408,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getDescription,2
4236,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1409,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getId,3
4237,"invoke-virtual {p0}, Landroid/accessibilityser...",./../data/apps/com.paro.xmastreehdwallpaper/xm...,1410,invoke-virtual,Landroid/accessibilityservice/AccessibilitySer...,getResolveInfo,4


In [19]:
n = df.api_id.nunique()
A = np.zeros((n,n))

from scipy.sparse import csr_matrix
# A = csr_matrix((n, n))

In [20]:
df_group = df.loc[df.groupby('invocation').groups['invoke-virtual'], :]
# print(df_group.shape)
calls = df_group.api_id.unique()
pairs = combinations(calls, 2)
for pair in tqdm(pairs, total=len(calls)*(len(calls)-1)/2):
    A[pair[0], pair[1]] = 1
    A[pair[1], pair[0]] = 1

100%|██████████| 79802661/79802661.0 [01:09<00:00, 1150828.63it/s]


array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [25]:
from scipy.sparse import csr_matrix

In [26]:
a = csr_matrix(A)

In [29]:
sys.getsizeof(a)

48

In [24]:
import sys
sys.getsizeof(A)/1e9

4.933025904