In [1]:
import sys
sys.path.insert(0, '/datasets/home/home-02/51/451/yuz530/HinDroid/')

In [2]:
%autosave 60

Autosaving every 60 seconds


In [3]:
import pandas as pd
from glob import glob
import os
from tqdm import tqdm
from multiprocess import Pool

In [49]:
import numpy as np

In [4]:
from src.data.get_data import prep_dir
from src.features.smali_features import SmaliApp
from src.features.build_features import build_features, clean_features

In [5]:
import json

In [6]:
config = json.load(open('/datasets/home/51/451/yuz530/HinDroid/config/data-params.json'))

In [7]:
clean_features(**config)

In [8]:
%%time
df, labels = build_features(**config)

Extracting features for class0


100%|██████████| 50/50 [02:07<00:00,  2.55s/it]
  0%|          | 0/50 [00:00<?, ?it/s]

Saving raw features for class0


100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Extracting features for class1


100%|██████████| 20/20 [00:10<00:00,  1.97it/s]
 35%|███▌      | 7/20 [00:00<00:00, 41.88it/s]

Saving raw features for class1


100%|██████████| 20/20 [00:01<00:00, 10.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Reading csv files for class0


100%|██████████| 50/50 [00:08<00:00,  5.56it/s]
 30%|███       | 6/20 [00:00<00:00, 55.53it/s]

Reading csv files for class1


100%|██████████| 20/20 [00:00<00:00, 26.82it/s]


CPU times: user 55.2 s, sys: 7.74 s, total: 1min 2s
Wall time: 3min 1s


In [9]:
raw_dir, proc_dir, raw_classes_dirs, interim_classes_dirs = \
    prep_dir(config['data_dir'], config['data_classes'])

In [10]:
class FeatureBuilder():
    
    def __init__(self, agg_df, labels):
        self.df = agg_df
        self.labels = pd.Series(labels).sort_index()
        self.out = pd.DataFrame()
    
    def _flatten_col_names(df):
        df.columns = ['.'.join(col).strip() for col in df.columns.values]
        return df
    
    def _simple_aggregations(df):
        out = df.groupby('package').agg({
            'call': 'size',
            'library': 'nunique',
            'code_block_id': ['mean', 'std', 'median', 'max', 'nunique'],
        })
        return FeatureBuilder._flatten_col_names(out)
    
    def _invoke_counts_by_type(df):
        out = df.groupby('package')['invocation'].value_counts().unstack(fill_value=0)
        out.columns = [col + '.count' for col in out.columns.values]
        return out
        
    def numerical_features(self):
        features = [
            FeatureBuilder._simple_aggregations(self.df),
            FeatureBuilder._invoke_counts_by_type(self.df)
        ]
        out = pd.concat(features, axis=1)
        return out
    
    def _top5_library(df):
        out = df.groupby('package')['library'].apply(
            lambda s: s.value_counts().iloc[:5]
        ).unstack(fill_value=0).clip(upper=1)
        out.columns = ['top5.' + col for col in out.columns.values]
        return out
    
    def categorical_features(self):
        features = [
            FeatureBuilder._top5_library(self.df)
        ]
        out = pd.concat(features, axis=1)
        return out
        
        
    def build(self):
        self.out = pd.concat([
            self.numerical_features(),
            self.categorical_features()
        ], axis=1)

In [11]:
fb = FeatureBuilder(df, labels)

In [33]:
df.groupby(['package', 'method_name']).ngroups

318373

In [12]:
fb.build()

In [13]:
fb.out

Unnamed: 0_level_0,call.size,library.nunique,code_block_id.mean,code_block_id.std,code_block_id.median,code_block_id.max,code_block_id.nunique,invoke-direct.count,invoke-interface.count,invoke-static.count,...,top5.Ljava/util/List;,top5.Lkotlin/jvm/internal/Intrinsics;,top5.Lmono/android/Runtime;,top5.Lmono/android/TypeManager;,top5.Lorg/apache/cordova/PluginResult;,top5.Lorg/apache/cordova/inappbrowser/InAppBrowser;,top5.Lorg/aspectj/runtime/reflect/Factory;,top5.Lorg/json/JSONArray;,top5.Lorg/json/JSONException;,top5.Lorg/json/JSONObject;
package,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
09f502a90a61229d76c71907b4f3f74f,551,79,64.201452,24.529264,60.0,100,84,117,18,68,...,0,0,0,0,0,0,0,0,1,1
115e0149ccfeaec1fa1fbd4abbecafeb,780,118,91.141026,38.839396,108.5,142,119,164,39,143,...,0,0,0,0,0,0,0,0,0,0
177af9700bcc8b7c8c131b662e8cdda8,28604,1738,6408.225388,3994.201302,6817.0,12145,8793,5328,2124,5270,...,0,0,0,0,0,0,0,0,0,0
17d083988dd5e6d9c2517899ae30bb02,20053,1379,4393.452750,2926.900600,3930.0,8899,6463,3770,1559,4020,...,0,0,0,0,0,0,0,0,0,0
23ee5fe81d5017edcc97f3c007748438,61457,2444,7512.191614,4182.714417,7783.0,13737,9459,9741,6011,13654,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
net.n2works.getsureill,96242,4899,14770.938800,8343.441718,15079.0,29118,20415,15629,10570,23029,...,0,0,0,0,0,0,0,0,0,0
nl.mxmx.texttotifinagh,16208,982,3407.646471,2141.520545,3728.0,6621,4660,2809,1195,2024,...,0,0,0,0,0,0,0,0,0,0
org.adventistas.advcountdown,91609,4122,15546.552566,8647.964873,15913.0,30125,23559,16873,9377,21008,...,0,1,0,0,0,0,0,0,0,0
photography.blackgallery.android,213059,10809,32274.402635,18753.661904,32624.0,63189,48903,42696,21177,42011,...,1,0,0,0,0,0,0,0,0,0


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [35]:
from sklearn.metrics import confusion_matrix, f1_score

In [56]:
lr_f1 = []
rf_f1 = []
gb_f1 = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(
        fb.out, fb.labels == 'class1',
        test_size=0.2
    )
    lr = LogisticRegression(solver='liblinear')
    lr.fit(X_train, y_train)
    lr_f1.append(f1_score(y_test, lr.predict(X_test)))
    
    rf = RandomForestClassifier(n_estimators=10)
    rf.fit(X_train, y_train)
    rf_f1.append(f1_score(y_test, rf.predict(X_test)))

    gb = GradientBoostingClassifier()
    gb.fit(X_train, y_train)
    gb_f1.append(f1_score(y_test, gb.predict(X_test)))

  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', av

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [57]:
np.mean(lr_f1), np.mean(rf_f1), np.mean(gb_f1)

(0.43049323176823173, 0.5942526667776669, 0.6003109170241523)

In [42]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_test, y_test))
print(f1_score(y_test, lr.predict(X_test)))
confusion_matrix(y_test, lr.predict(X_test))

0.5714285714285714
0.0




array([[8, 2],
       [4, 0]])

In [38]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
print(f1_score(y_test, rf.predict(X_test)))
confusion_matrix(y_test, rf.predict(X_test))

0.9285714285714286
0.888888888888889




array([[9, 1],
       [0, 4]])

In [39]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
print(gb.score(X_test, y_test))
print(f1_score(y_test, gb.predict(X_test)))
confusion_matrix(y_test, gb.predict(X_test))

0.8571428571428571
0.8


array([[8, 2],
       [0, 4]])