In [1]:
# only need this line in jupyter
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# The datasets carry 215 feature columns, so raise pandas' display limits far
# enough that wide frames and long listings are not truncated.
# NOTE: 'display.height' is deliberately left unset.
display_overrides = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_overrides.items():
    pd.set_option(option_name, option_value)

# Drebin dataset analysis  


In [3]:
# Load the raw Drebin-215 CSV and print a structural summary of the frame.
drebin_csv_path = './datasets/Drebin-215/drebin-215-dataset-5560malware-9476-benign.csv'
drebin_df = pd.read_csv(drebin_csv_path)

drebin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15036 entries, 0 to 15035
Columns: 216 entries, transact to class
dtypes: int64(214), object(2)
memory usage: 24.8+ MB


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Column 92 ('READ_EXTERNAL_STORAGE') has a mixed-dtype issue; fix it by converting the whole column to numeric

In [4]:
# Convert the column values to numeric.
# NOTE(review): the info() summaries before and after this cell report the same
# dtypes (int64(214), object(2)), so this column appears to be numeric already;
# confirm which column actually has the mixed-dtype problem.
drebin_df['READ_EXTERNAL_STORAGE'] = pd.to_numeric(drebin_df['READ_EXTERNAL_STORAGE'])

In [5]:
drebin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15036 entries, 0 to 15035
Columns: 216 entries, transact to class
dtypes: int64(214), object(2)
memory usage: 24.8+ MB


In [6]:
drebin_df.dtypes

transact                                            int64
onServiceConnected                                  int64
bindService                                         int64
attachInterface                                     int64
ServiceConnection                                   int64
android.os.Binder                                   int64
SEND_SMS                                            int64
Ljava.lang.Class.getCanonicalName                   int64
Ljava.lang.Class.getMethods                         int64
Ljava.lang.Class.cast                               int64
Ljava.net.URLDecoder                                int64
android.content.pm.Signature                        int64
android.telephony.SmsManager                        int64
READ_PHONE_STATE                                    int64
getBinder                                           int64
ClassLoader                                         int64
Landroid.content.Context.registerReceiver           int64
Ljava.lang.Cla

In [7]:
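# Column 'TelephonyManager.getSimCountryIso' has object dtype.
# Converting it directly to numeric raises an error (presumably because of the
# non-numeric '?' entries the column still contains at this point),
# so the conversion below is left commented out: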
## drebin_df['TelephonyManager.getSimCountryIso'] = pd.to_numeric(drebin_df['TelephonyManager.getSimCountryIso'])

In [8]:
def check_missing_data(df):
    """Print every column of ``df`` that holds values other than 0 or 1.

    For each column, counts the rows whose value compares equal to 0 or 1.
    When that count is smaller than the number of rows, prints the column
    name together with the number of remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.

    Returns
    -------
    None
        The findings are only printed.
    """
    total_instances = df.shape[0]

    for column in df.columns:
        values = df[column]
        # Count on the boolean mask directly: filtering ``df`` itself would
        # build a copy of the whole frame on every iteration just to read
        # its row count.
        one_zero_count = int(((values == 0) | (values == 1)).sum())
        if one_zero_count != total_instances:
            print(column, 'has', (total_instances-one_zero_count), 'rows with non 0-1 values')

In [9]:
check_missing_data(drebin_df)

TelephonyManager.getSimCountryIso has 8192 rows with non 0-1 values
class has 15036 rows with non 0-1 values


In [10]:
drebin_df['TelephonyManager.getSimCountryIso'].unique()

array(['0', '1', '?', 1, 0], dtype=object)

In [11]:
drebin_df[(drebin_df['TelephonyManager.getSimCountryIso']=='?')].shape[0]

5

### Column `TelephonyManager.getSimCountryIso` holds the values ['0', '1', '?', 1, 0]. We need to remove the rows containing '?' (only 5 rows) and convert '0' to 0 and '1' to 1.

In [12]:
# Drop the rows whose 'TelephonyManager.getSimCountryIso' value is '?'.
# .copy() makes the filtered result an independent frame, so assigning a column
# to it afterwards does not trigger pandas' SettingWithCopyWarning.
drebin_df = drebin_df[drebin_df['TelephonyManager.getSimCountryIso']!='?'].copy()

In [13]:
drebin_df['TelephonyManager.getSimCountryIso'].unique()

array(['0', '1', 1, 0], dtype=object)

In [14]:
# Convert the column values to numeric: after the '?' rows are removed the
# column still mixes the strings '0'/'1' with the integers 0/1.
drebin_df['TelephonyManager.getSimCountryIso'] = pd.to_numeric(drebin_df['TelephonyManager.getSimCountryIso'])

In [15]:
check_missing_data(drebin_df)

class has 15031 rows with non 0-1 values


In [16]:
drebin_df.shape

(15031, 216)

In [17]:
drebin_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15031 entries, 0 to 15035
Columns: 216 entries, transact to class
dtypes: int64(215), object(1)
memory usage: 24.9+ MB


In [18]:
# Count the samples per class label in the cleaned Drebin frame.
# NOTE(review): presumably 'B' = benign and 'S' = malware, judging by the
# counts in the dataset file name; confirm against the dataset description.
class_freq = drebin_df['class'].value_counts()
class_freq

B    9476
S    5555
Name: class, dtype: int64

In [19]:
# Save the pre-processed Drebin dataset to a new CSV file.
# index=False keeps the DataFrame's row index out of the written file.
drebin_df.to_csv('./datasets/Pre-processed_Dataset/Drebin.csv', index=False)

  
  # Malgenome Dataset Analysis

In [20]:
# Load the raw Malgenome-215 CSV into its own frame.
malgenome_df = pd.read_csv('./datasets/malgenome-215/malgenome-215-dataset-1260malware-2539-benign.csv')

In [21]:
malgenome_df.shape

(3799, 216)

In [22]:
malgenome_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3799 entries, 0 to 3798
Columns: 216 entries, transact to class
dtypes: int64(215), object(1)
memory usage: 6.3+ MB


In [23]:
check_missing_data(malgenome_df)

class has 3799 rows with non 0-1 values


### No problem with the Malgenome dataset

In [24]:
# Count the samples per class label in the Malgenome frame.
# Note: this rebinds `class_freq`, which previously held the Drebin counts.
class_freq = malgenome_df['class'].value_counts()
class_freq

B    2539
S    1260
Name: class, dtype: int64

In [25]:
# Save the Malgenome dataset to a CSV file; no pre-processing was needed.
# index=False keeps the DataFrame's row index out of the written file.
malgenome_df.to_csv('./datasets/Pre-processed_Dataset/Malgenome.csv', index=False)

  
  # Merge the two datasets

In [26]:
# Check whether both datasets list the same columns in the same order.
drebin_columns = drebin_df.columns
malgenome_columns = malgenome_df.columns

# Positional comparison: a column counts as a mismatch when the two frames hold
# different names at the same position. zip() stops at the shorter header, so a
# length difference is added separately; an index-based loop would instead raise
# IndexError or silently ignore the surplus columns.
diff_column_cnt = sum(
    drebin_name != malgenome_name
    for drebin_name, malgenome_name in zip(drebin_columns, malgenome_columns)
)
diff_column_cnt += abs(len(drebin_columns) - len(malgenome_columns))

print(diff_column_cnt, "columns don't match")

211 columns don't match


In [27]:
# Turn both column indexes into sets so they can be compared independently of
# column order.
drebin_columns_set = set(drebin_columns)
malgenome_columns_set = set(malgenome_columns)

In [28]:
# Every column name that occurs in at least one of the two datasets.
all_columns = drebin_columns_set | malgenome_columns_set
len(all_columns)

223

In [29]:
# Column names that occur in both datasets.
common_columns = drebin_columns_set & malgenome_columns_set
len(common_columns)

209

In [30]:
# Column names that occur in exactly one of the two datasets.
not_common_columns = drebin_columns_set ^ malgenome_columns_set
len(not_common_columns)

14

### The two datasets have 209 columns in common (`common_columns`, which includes the `class` label). We need to drop the remaining 14 columns (`not_common_columns`) from the datasets.


In [31]:
not_common_columns

{'.system.app',
 '.system.bin',
 '/system/app',
 '/system/bin',
 'BIND_TEXT_SERVICE',
 'BROADCAST_PACKAGE_REMOVED',
 'CONTROL_LOCATION_UPDATES',
 'DELETE_CACHE_FILES',
 'HARDWARE_TEST',
 'INJECT_EVENTS',
 'READ_INPUT_STATE',
 'Runtime.loadLibrary',
 'android.intent.action.CAMERA_BUTTON',
 'android.intent.action.REBOOT'}

In [32]:
# Split the non-shared columns by the dataset they belong to.
# Walking each frame's own header (rather than iterating the set) keeps the
# lists in column order, so the result is the same on every run; the iteration
# order of a set of strings can change between interpreter sessions.
drebin_drop_columns = [column for column in drebin_columns if column in not_common_columns]
malgenome_drop_columns = [column for column in malgenome_columns if column in not_common_columns]

In [33]:
drebin_drop_columns

['/system/app',
 'BIND_TEXT_SERVICE',
 'Runtime.loadLibrary',
 'CONTROL_LOCATION_UPDATES',
 'HARDWARE_TEST',
 '/system/bin',
 'DELETE_CACHE_FILES']

In [34]:
malgenome_drop_columns

['READ_INPUT_STATE',
 'android.intent.action.CAMERA_BUTTON',
 '.system.bin',
 'INJECT_EVENTS',
 'android.intent.action.REBOOT',
 '.system.app',
 'BROADCAST_PACKAGE_REMOVED']