- Authors: Ray Donner and Seth Johnson
- Date: May 25, 2023
- Content: This file is a conglomerate of all the machine learning algorithms that we run and collect data on. This will include the following algorithms:
    - Categorical Naive-Bayes
    - Support Vector Machines
    - Decision Trees
    - Neural Network
    - Convolutional Neural Network
- The goal is to analyze this with our new dataset COVID19_APK_Data_06-2023.csv and compare train/test performance, as well as provide statistical analysis to compare the COVIDMalware.pdf dataset to ours.

RUN ME FIRST

In [None]:
"""
### Package handling
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
import pylab as pl
import random
from pprint import pprint
import csv

"""
### Site for Tensorflow reference: https://www.tensorflow.org/guide/distributed_training
"""

# Per-model bookkeeping, keyed by a short model tag. Every entry starts out
# with the same placeholder values:
#   "Epoch_duration" -> list holding a single 20
#   "duration"       -> 1
#   "history"        -> 0  (slot for model.fit().history.items())
# The inner literal is evaluated once per tag, so no two entries share a list.
models = {
    tag: {"Epoch_duration": [20], "duration": 1, "history": 0}
    for tag in ("NB", "SVM", "dTree", "DNN", "CNN", "RNN")
}

# TODO:
# Extract data from csv to be used for later modules
#   - Pandas? Numpy?

"""
### Playing with Pandas
- DataFrame (DF) is basically a multidimensional array
"""

# Load the APK dataset; the CSV is expected in the current working directory.
apkData = pd.read_csv("COVID19_APK_Data_06-2023.csv")

# pprint([item for item in apkData]) # Prints Column labels in DF

# print(apkData.info()) # Prelim info on Dataframe
# print(apkData.to_string()) # Printing DF as formatted text for easy readability

# print(apkData.head(1)) # printing a specified number of first rows in DF. Defaults to 5
# # print(apkData.tail(1)) # printing a specified number of last rows in DF. Defaults to 5
# print(apkData.loc[0]) # Specifying a row or element within DF
# print(apkData["AV Rank"].sum()) # Getting the sum of the numbers of a specified column in a DF

# How do I assign the dataframe excluding the first 7 columns?
#   -> positional slicing, e.g. features_df = apkData.iloc[:, 7:]
# print(features_df.to_string())
label_df = apkData["AV Rank"] # Referring to a specific column in a DF
totalAPKs = len(label_df)
# Number of rows whose "AV Rank" is positive. The vectorised comparison
# replaces a Python-level loop; int() keeps the result a plain Python int.
totalBadAPKs = int((label_df > 0).sum())

# Column labels from position 7 onward (used below as the permission names).
# Reading them from `columns` avoids building the first row over and over and
# also works when the frame has no rows.
permSpread = list(apkData.columns[7:])

# Pandas can do plotting!
# fig, axs = plt.subplots()
# apkData["Total permission requests"].plot.hist()
# axs.set_xlabel("Number of total permissions requests per APK")
# axs.set_ylabel("Frequency of APKs")
# fig.savefig("PermRequests.png")
# plt.show()

# Column sums for (at most) the first 20 permission columns.
permSpreadSums = [apkData[perm].sum() for perm in permSpread[:20]]

# Print the matching permission names. Slicing instead of range(20) avoids an
# IndexError when the dataset has fewer than 20 permission columns.
for perm in permSpread[:20]:
    print(perm)

df = pd.DataFrame(permSpreadSums)
# Create the figure/axes explicitly and draw the bars on them, so the labels
# and the saved PNG refer to the same plot (previously `fig`/`axs` were never
# defined here, which raised NameError).
fig, axs = plt.subplots()
df.plot.bar(ax=axs)
axs.set_xlabel("Permissions requested by APK")
axs.set_ylabel("Frequency of APKs")
fig.savefig("PermSpread.png")
plt.show()

# OR
# apkData["Total permission requests"].plot(kind="hist")
# plt.show()

# How many APKs are malicious?
# print(f"We analyzed {totalAPKs} APKs")
# print(f"Out of that, {totalBadAPKs} were flagged as malicious. This is according to the COVIDMalware.pdf dataset.")
# print(f"Which means about {((totalBadAPKs / totalAPKs) * 100):.2f}% of all analyzed APKs are labeled as malicious.")

# print(features_df)
# print(label_df)

# Nested lookup of per-app APK records. Starts empty; the commented-out
# csv.reader loop further down shows how it was meant to be filled
# (apps[row[0]][row[2]] -> dict with "AV Rank", "pkg name", "permissions").
apps: dict = {}

# apps = {'Covid 19': {
#         '30fce6b41858aadce710ef2ad5f9b3afbd47c32bee70469b112cfa14f60085e9.apk': {
#             'avRank': 0,
#             'cloned': 0,
#             'clones': [],
#                     'permissions': ['android.permission.INTERNET'],
#                     'pkg name': 'com.urufu.covid19app'
#             },
#             '493e52c126be18efa077932250d82f764ab2da59d83b5f56d53fe95c1d6ba3bc.apk': {
#                 'avRank': 4,
#                     'cloned': 0,
#                     'clones': [],
#                     'permissions': [
#                 'android.permission.SET_WALLPAPER',
#                     'android.permission.KILL_BACKGROUND_PROCESSES',
#                     'com.anddoes.launcher.permission.UPDATE_COUNT',
#                     'android.permission.INTERNET',
#                     'android.permission.BROADCAST_PACKAGE_REPLACED',
#                 'com.oppo.launcher.permission.WRITE_SETTINGS',
#                     'android.permission.CALL_PHONE',
#                     'android.permission.PROCESS_OUTGOING_CALLS',
#                     'android.permission.WAKE_LOCK',
#                     'android.permission.READ_EXTERNAL_STORAGE',
#                 'com.huawei.android.launcher.permission.WRITE_SETTINGS',
#                     'android.permission.RECEIVE_SMS',
#                     'android.permission.SET_WALLPAPER_HINTS',
#                     'com.sonyericsson.home.permission.BROADCAST_BADGE',
#                 'com.sonymobile.home.permission.PROVIDER_INSERT_BADGE',
#                     'com.huawei.android.launcher.permission.CHANGE_BADGE',
#                     'com.sec.android.provider.badge.permission.WRITE',
#                 'com.android.browser.permission.READ_HISTORY_BOOKMARKS',
#                     'com.oppo.launcher.permission.READ_SETTINGS',
#                 'android.permission.READ_PHONE_STATE',
#                     'android.permission.ACCESS_COARSE_LOCATION',
#                     'android.permission.CAMERA',
#                     'android.permission.CHANGE_WIFI_STATE',
#                     'android.permission.READ_CONTACTS',
#                     'android.permission.WRITE_CONTACTS',
#                     'android.permission.READ_CALL_LOG',
#                     'android.permission.WRITE_CALL_LOG',
#                     'android.permission.FLASHLIGHT',
#                     'android.permission.SYSTEM_ALERT_WINDOW',
#                     'android.permission.WRITE_EXTERNAL_STORAGE',
#                     'me.everything.badger.permission.BADGE_COUNT_WRITE',
#                     'android.permission.RECORD_AUDIO',
#                     'android.permission.BROADCAST_PACKAGE_ADDED',
#                 'android.permission.BROADCAST_PACKAGE_CHANGED',
#                     'android.permission.READ_SMS',
#                     'com.htc.launcher.permission.READ_SETTINGS',
#                     'android.permission.VIBRATE',
#                     'android.permission.RECEIVE_BOOT_COMPLETED',
#                     'com.sec.android.provider.badge.permission.READ',
#                 'me.everything.badger.permission.BADGE_COUNT_READ',
#                 'android.permission.BROADCAST_PACKAGE_INSTALL',
#                 'android.permission.READ_APP_BADGE',
#                 'android.permission.BLUETOOTH',
#                 'android.permission.ACCESS_NETWORK_STATE',
#                 'android.permission.ACCESS_WIFI_STATE',
#                 'android.permission.ACCESS_FINE_LOCATION',
#                 'com.htc.launcher.permission.UPDATE_SHORTCUT',
#                 'com.huawei.android.launcher.permission.READ_SETTINGS',
#                 'android.permission.GET_TASKS',
#                 'android.permission.GET_ACCOUNTS',
#                 'com.majeur.launcher.permission.UPDATE_BADGE'
#             ],
#             'pkg name': 'cmf0.c3b5bm90zq.patch'
#         },
#         '86e93e44371566b39402b2e455f59b06ce0628d63c9f7a9b0bf7a5ebe8821b2b.apk': {
#             'avRank': 0,
#                     'cloned': 0,
#             'clones': [],
#             'permissions': ['android.permission.INTERNET'],
#             'pkg name': 'com.urufu.covid19app'
#         },
#         'c21da66789e5b45a69a2373a3569478eaaf3e8ed036329324fd5e4be939ac2a6.apk': {
#             'avRank': 0,
#             'cloned': 0,
#             'clones': [],
#             'permissions': [
#                 'android.permission.ACCESS_NETWORK_STATE',
#                 'android.permission.INTERNET'
#             ],
#             'pkg name': 'com.app.covid19'
#         }
# }}

# print(os.getcwd())
# with open("COVID19_APK_Data_06-2023.csv",'r') as inFile:
#     spam = csv.reader(inFile, delimiter=",")
#     # print(next(spam)[7:])
#     permSpread = next(spam)[7:]
#     # print(len(permSpread))
#     # print(sum([1 for _ in range(7, len(row))]))

#     for row in spam:
#         if row[0] not in apps:
#             apps.update({row[0]: {}})
#         if row[2] not in apps[row[0]]:
#             apps[row[0]].update({
#                 row[2]: {
#                     "AV Rank" : row[3],
#                     "pkg name": row[1],
#                     "permissions": [permSpread[i] if int(row[i]) > 0 else "fuck" for i in range(7, len(row) - 7)]
#                 }
#             })

# pprint(apps)

Statistics for our Dataset

In [None]:
# TODO:
"""
- Percentage of APKs that have been flagged as malware
- Whether or not apk has AVRank > 0
- Percentage of Apps that have malicious APKs
- Std error/dev given total APKs in dataset compared to analyzed APKs and APKs that we failed to analyze
- Graph the normalized quantity of APKs that request a given permission over the Permission Spread
- Graph the normalized quantity of APKs that have a given value of Total Permissions Requested field in COVID19_APK_DATA csv
"""

# pyplot.plt()

Scikit Learn prep

In [None]:
"""
### Package handling
"""
from sklearn.naive_bayes import GaussianNB  # WE WILL NOT BE USING GAUSSIAN N-B
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC # importing the Classifier module specifically
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

Neural Network Prep

In [None]:
"""
### Package handling
"""
import tensorflow as tf
import keras
from keras.layers import (
    Dense,
    Conv2D,
    MaxPool2D,
    Flatten,
    Dropout,
    BatchNormalization,
    Embedding,
    LSTM
)
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils
from keras.models import Sequential

"""
### Can we use the GPU?
"""

# Query the GPU device name once and reuse it for both the check and the
# message, instead of asking TensorFlow twice.
gpu_name = tf.test.gpu_device_name()
if gpu_name:
    print(f'GPU installed. Good Job!\nGPU Device: {gpu_name}')
else:
    print(" No GPU found that can run TF.")

"""
### Overridden callback class "timer" for catching epoch/total time
"""
class timer(keras.callbacks.Callback):
    """Keras callback that prints how long each epoch and the whole fit took.

    Pass an instance in the ``callbacks`` list of ``model.fit(...)``. Times are
    wall-clock seconds taken with ``time.time()`` (``time`` is imported at the
    top of the file).
    """

    def __init__(self): # initialize callback
        super().__init__() # let the Keras base class set up its own state
        # Start timestamps; filled in by the matching *_begin hooks.
        self.start_train = None
        self.start_epoch = None

    # training methods
    def on_train_begin(self, logs=None):
        """Remember when training started."""
        self.start_train = time.time()

    def on_train_end(self, logs=None):
        """Print the total training time as hours:minutes:seconds."""
        tr_duration = time.time() - self.start_train
        # Split the elapsed seconds into whole hours, whole minutes, and the
        # remaining seconds.
        tr_hours, remainder = divmod(tr_duration, 3600)
        tr_minutes, tr_seconds = divmod(remainder, 60)
        # Generates message of string
        msg = f"Elapsed time: {str(tr_hours)}:{str(tr_minutes)}:{str(tr_seconds)}"
        print(msg)

    # batch training methods <-- might not need this
    def on_train_batch_begin(self, batch, logs=None):
        """No per-batch work (placeholder)."""
        pass

    def on_train_batch_end(self, batch, logs=None):
        """No per-batch work (placeholder)."""
        pass

    # epoch methods
    def on_epoch_begin(self, epoch, logs=None):
        """Remember when the current epoch started."""
        self.start_epoch = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """Print how many seconds the epoch that just finished took."""
        epoch_duration = time.time() - self.start_epoch
        # `epoch` is shown 1-based in the message.
        msg = f"Epoch {epoch + 1} trained for {epoch_duration} seconds"
        print(msg)

    # prediction methods <-- this might be useful in the long run during CrossVal
    def on_predict_begin(self, logs=None):
        """No prediction-time work (placeholder)."""
        pass

    def on_predict_end(self, logs=None):
        """No prediction-time work (placeholder)."""
        pass

Visualizing performance

In [None]:
"""
### Visualizing model performance
"""
# One figure per entry of `history.history`: plot the recorded values, title
# the figure with the entry's key, show it, then print a blank line.
for metric_name, series in history.history.items():
    plt.plot(series)
    plt.title(metric_name)
    plt.show()
    print()