- Authors: Ray Donner and Seth Johnson
- Date: July 14, 2023
- Content: This file is a conglomerate of all the machine learning algorithms that we run and collect data on. This will include the following algorithms:
    - Categorical Naive-Bayes
    - Support Vector Machines
    - Decision Trees
    - Neural Network
    - Convolutional Neural Network
- This file is organized by which dataset is passed to each classifier, so that the classifiers can be compared in batches.
- The goal is to run these algorithms on our new dataset, COVID19_APK_Data_06-2023.csv, and compare train/test performance, as well as to provide a statistical analysis comparing the COVIDMalware.pdf dataset to ours.

For Google Drive

In [None]:
# Google Colab only: mount Google Drive under /content/drive.
import os
from google.colab import drive
drive.mount('/content/drive')

# IPython magic (not plain Python): change the notebook's working directory to the project workspace.
%cd /content/drive/MyDrive/2023_REU_Workspace

# Display the working directory that is now in effect.
print(os.getcwd())

RUN ME FIRST

In [17]:
### Generic pkgs
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import pylab as pl
import random
from pprint import pprint
# From Scikit-Learn
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import(
    accuracy_score,
    confusion_matrix,
    classification_report,
    make_scorer,
    precision_recall_fscore_support
)
from sklearn.model_selection import(
    train_test_split, 
    KFold,
    StratifiedKFold,
    cross_validate,
    ShuffleSplit,
    GridSearchCV
)
from imblearn.over_sampling import SMOTE
# From Tensorflow/Keras
import tensorflow as tf
import keras
from keras.layers import (
    Dense,
    Conv2D,
    MaxPool2D,
    Flatten,
    Dropout,
    BatchNormalization,
    Embedding,
    LSTM
)
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import utils
from keras.models import Sequential
from keras import backend as K

CSV_FILE = "COVID19_APK_Data_06-2023.csv" # CSV file consisting of analyzed APK data
OUTPUT_FILE = "ClassifierReport.txt" # Text file that the classifier reports are appended to
### Variable Declaration
# NOTE: a value of -1 is only a placeholder; the variable is rebound to its real object later on.
# CSV reference
apkData = -1 # DataFrame of all APK data harvested from APKScanner.py
apks = -1 # NumPy array holding the "APK File" column of the CSV (presumably the APK file hashes)
avRanks = -1 # NumPy array that lists the AV Rank for each file
# Reference arrays
keys = [] # Labels of the CSV columns that are NOT permissions requested by a given APK file
permKeys = [] # Key values for the permissions requested by a given APK file. This is for reference for our perms array
osPermKeys = [] # Key values for all permissions associated with the base AndroidOS devkit
# Separating permission spreads between benign and malicious apk types
permSpread = [] # Array of arrays representing the permissions that a given apk file requests
osPermSpread = [] # Array of arrays representing the base AndroidOS permissions that a given apk file requests
benignSpread = [] # Array of arrays that represent all permissions that a benign APK file requests
maliciousSpread = [] # Array of arrays that represent all permissions that a malicious APK file requests
benignSpread_OS = [] # Array of arrays representing all AndroidOS permissions that a benign APK file requests
maliciousSpread_OS = [] # Array of arrays representing all AndroidOS permissions that a malicious APK file requests
benignSpread_sums = [] # Array representing the total sum of all permissions a given benign apk file requests. NOTE(review): the statistics cell stores these totals in benignSums instead
maliciousSpread_sums = [] # Array representing the total sum of all permissions a given malicious apk file requests. NOTE(review): the statistics cell stores these totals in maliciousSums instead
benignPerms = [] # Array representing how many times benign APK files request a given permission. The index of each value matches the index of that permission in permSpread.
maliciousPerms = [] # Array representing how many times malicious APK files request a given permission. The index of each value matches the index of that permission in permSpread.
benignPerms_OS = [] # Array representing how many times benign APK files request a given AndroidOS permission
maliciousPerms_OS = [] # Array representing how many times malicious APK files request a given AndroidOS permission
# for our models
perms = -1 # "Features". One array per APK (same index as apks) holding every CSV column that is not listed in keys. NOTE(review): older notes say the first two elements are AV Rank and Total Permissions Requested, but preprocessing drops the first six CSV columns - confirm which leading elements remain.
labels = [] # "Labels". List of 0s (benign) or 1s (malicious). Matches index values with apks and avRanks to indicate if an apk file is malicious
permsSMOTE = [] # Array of the features treated with SMOTE
labelsSMOTE = [] # Array of the labels treated with SMOTE
xTrain = -1 # Array consisting of training features. This is passed into fit() methods
xTest = -1 # Array consisting of testing features. Predictions will use this variable.
yTrain = -1 # Array consisting of training labels. Pass into fit() methods for supervised learning.
yTest = -1 # Array consisting of testing labels. This assists with metrics, and backpropagation
confusion = -1 # 2D Array built to represent True/False Positives to calculate Recall and Precision metrics
clf = -1 # Classifier variable to assign to different models
prediction = -1 # Array consisting of the results of a classifier's prediction call
ssplit = -1 # Cross-Validation object created by sklearn's ShuffleSplit(). Passed as a CV parameter for cross_val_score()
scores = -1 # Array consisting of metrics from sklearn's cross_val_score()
models = {} # Dictionary to contain all performance metrics of our models.
# visual aids
x = [] # Array for representing the quantity of permissions requested across all analyzed APK files
yBenign = [] # Array representing how many APK files request a certain number of permissions. Each index correlates to the index in `x`, and the value at each index represents how many benign APK files request that many permissions
yMalicious = [] # Array representing how many APK files request a certain number of permissions. Each index correlates to the index in `x`, and the value at each index represents how many malicious APK files request that many permissions
t0 = 0 # float value for recording a model's performance duration

### Function Declaration
def verifyPreprocessing():
    """
    @Description: Dumps a human-readable summary of every parsed APK to "stupid.txt" so the
    preprocessing results can be checked by eye. Reads the module-level apks, apkData,
    avRanks, perms and permKeys.

    @returns 1 once the file has been written
    """
    with open("stupid.txt", "w") as outFile:
        for idx, apkFile in enumerate(apks):
            record = apkData.loc[idx]
            spread = perms[idx]

            outFile.write("Application: " + record.loc["Application Name"] + "\n")
            outFile.write("Package: " + record["Package Name"] + "\n")
            outFile.write(f"APK File: {apkFile}" + "\n")
            outFile.write(f"AV Rank: {avRanks[idx]}" + "\n")
            outFile.write(f"Total Permissions Requested: {sum(spread[2:])}" + "\n")
            outFile.write(f"Permission Spread: {spread}" + "\n")

            # Names of every column from position 2 onward that holds a positive value for this APK
            requested = [permKeys[col] for col in range(2, len(permKeys)) if spread[col] > 0]
            outFile.write(f"Permissions requested: {requested}")
            outFile.write("\n")
    return 1

def visualize(arrs, lbls=None, title=None, x_Axs=None, y_Axs=None, save=False, flip=False):
    """
    @Description: Utilizes Matplotlib.pyplot to draw several series as side-by-side bars on one figure

    @params
    arrs:= Sequence of equally sized arrays; every array is drawn as one bar series, labelled by its position in arrs
    lbls:= Optional tick labels for the category axis. Defaults to None (ticks are left untouched)
    title:= String used to label the graph. Defaults to None
    x_Axs:= String to label the X-axis of a plot
    y_Axs:= String to label the y-axis of a plot
    save:= Boolean that determines if the plot is to be saved as "<title>.png". title must be provided if set to True. Defaults to False
    flip:= Boolean to flip the graph to a horizontal bar graph. Defaults to False

    @returns 1 once the plot has been shown
    @raises ValueError if arrs is empty, if the arrays differ in length, or if save is True without a title
    """
    width = 0.5 # Designates the width of the bar

    # Validate everything up front so a bad call never leaves a half-drawn figure behind
    if len(arrs) == 0:
        raise ValueError("No arrays provided, nothing to plot.")
    expected = len(arrs[0])
    if any(len(arr) != expected for arr in arrs):
        raise ValueError(f"Array sizes do not match!\n Expected size: {expected}")
    if save and title is None:
        raise ValueError("No title provided, plot will not be saved.")

    # Horizontal and vertical bars only differ in which pyplot calls are used
    drawBars, setTicks = (plt.barh, plt.yticks) if flip else (plt.bar, plt.xticks)

    for i, arr in enumerate(arrs):
        # Each series is shifted by one bar width so the series sit next to each other
        drawBars(np.arange(len(arr)) + (width * i), arr, width, label=i)
    if lbls is not None and len(lbls) != 0:
        setTicks(np.arange(len(lbls)), lbls) # associating indices with the labels

    if x_Axs is not None:
        plt.xlabel(x_Axs)
    if y_Axs is not None:
        plt.ylabel(y_Axs)
    if title is not None:
        plt.title(title)

    plt.legend()

    if save:
        plt.savefig(f"{title}.png", dpi=300, bbox_inches="tight")

    plt.show()

    return 1
    
### UDFs to calculate metrics for our Neural Network
# This genuinely confuses me: the documentation states that Recall and Precision were deprecated from keras.metrics, but I can still use them.
# The Tensorflow docs say that I can call all of those, plus F1Score, but I cannot call F1Score at all, no matter what I try.
# Trying a custom set of functions to handle getting metrics better, guided by Stack Overflow, because I could not figure out another way.
def recallCalc(train, test):
    """
    @Description: Recall computed with Keras backend ops

    @params
    train:= Tensor whose positives form the denominator (presumably the ground-truth labels - confirm against caller)
    test:= Tensor compared against train (presumably the model's predictions - confirm against caller)

    @returns true positives / (positives in train + epsilon)
    """
    # An element counts as a hit when the clipped, rounded product of both tensors is 1
    hits = K.sum(K.round(K.clip(train * test, 0, 1)))
    actualPositives = K.sum(K.round(K.clip(train, 0, 1)))
    # epsilon keeps the division defined when train holds no positives
    return hits / (actualPositives + K.epsilon())

def precisionCalc(train, test):
    """
    @Description: Precision computed with Keras backend ops

    @params
    train:= Tensor multiplied with test to find the hits (presumably the ground-truth labels - confirm against caller)
    test:= Tensor whose positives form the denominator (presumably the model's predictions - confirm against caller)

    @returns true positives / (positives in test + epsilon)
    """
    # An element counts as a hit when the clipped, rounded product of both tensors is 1
    hits = K.sum(K.round(K.clip(train * test, 0, 1)))
    flaggedPositives = K.sum(K.round(K.clip(test, 0, 1)))
    # epsilon keeps the division defined when test holds no positives
    return hits / (flaggedPositives + K.epsilon())

def f1Calc(train, test):
    """
    @Description: F1 score (harmonic mean of precision and recall) built from precisionCalc() and recallCalc()

    @params
    train:= Tensor forwarded as the first argument of precisionCalc() and recallCalc()
    test:= Tensor forwarded as the second argument of precisionCalc() and recallCalc()

    @returns 2 * (precision * recall) / (precision + recall + epsilon)
    """
    # The helpers in this file are named precisionCalc / recallCalc (not precision_m / recall_m),
    # and the Keras backend is imported as K (not bK)
    precision = precisionCalc(train, test)
    recall = recallCalc(train, test)
    # epsilon keeps the division defined when precision and recall are both 0
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

"""
### NOTE
- The following is used to make classification_report compatible with cross_validate()
- I could very well just pass scoring=[accuracy, precision, recall] to cross_validate(), then take the results and calculate the f1 score
"""
def buildClassificationReport (yTest, yPred):
    """
    @Description: Scoring helper that records every batch of true/predicted labels in
    module-level collections and returns the accuracy of the batch.

    @params
    yTest:= True labels of the batch being scored
    yPred:= Predicted labels of the batch being scored

    @returns accuracy_score(yTest, yPred)
    """
    # NOTE(review): `test` and `prediction` are not parameters; they must exist at module level
    # as objects with an extend() method (e.g. lists) before this is called. In the visible part
    # of the file `test` is never defined and `prediction` is declared as -1 - confirm they are
    # rebound to lists before use.
    test.extend(yTest)
    prediction.extend(yPred)
    return accuracy_score(yTest, yPred)

"""
### NOTE
- classification_report is heckin gorgeous
- pass output_dict=True to function if I want to return the dictionary per instance!
- EXAMPLE: print(classification_report(np.argmax(yTest, axis=1), np.argmax(prediction, axis=1)))
"""
def recordClassifier(labels, prediction, clfName):
    """
    @Description: Appends the accuracy and sklearn's classification report of one classifier to OUTPUT_FILE

    @params
    labels:= True labels
    prediction:= Labels predicted by the classifier, same length as labels
    clfName:= Name written as the heading of this entry

    @returns 1 once the entry has been written
    """
    # Compute both metrics before touching the file, so a failing metric call cannot leave a
    # half-written entry behind. Arguments follow sklearn's (y_true, y_pred) order.
    accuracy = accuracy_score(labels, prediction) * 100
    report = classification_report(labels, prediction)

    # "a" creates the file when it is missing and appends otherwise
    with open(OUTPUT_FILE, "a") as outFile:
        outFile.write(
            f"### {clfName}\n"
            f"Accuracy: {accuracy:.2f}%\n"
            "Classification Report: \n"
            f"{report}\n\n"
        )

    return 1

# Builds and compiles the fully connected network used by the cross-validation routine
def buildDNN(NUM_CATS, xTrain):
    """
    @Description: Creates a three-layer Sequential model (two relu layers and a softmax output) and compiles it

    @params
    NUM_CATS:= Number of units in the softmax output layer
    xTrain:= Training features; shape[1] sets the input width and shape[0] sets the unit count of both relu layers

    @returns the compiled keras model
    """
    # Start from a clean Tensorflow/Keras state before a new model is created
    tf.keras.backend.clear_session()

    sampleCount, featureCount = xTrain.shape[0], xTrain.shape[1]

    model = keras.models.Sequential()
    layerStack = (
        Dense(units=sampleCount, activation="relu", input_shape=(featureCount,)), # input
        Dense(units=sampleCount, activation="relu"), # hidden
        Dense(units=NUM_CATS, activation="softmax"), # output
    )
    for layer in layerStack:
        model.add(layer)

    # Binary cross-entropy is used for the loss; no optimizer is passed, so Keras' default applies
    model.compile(loss='binary_crossentropy', metrics=["accuracy"])
    return model

# Calls buildDNN() to create a model, iterates through a Stratified K-Fold Cross-Validation loop, then appends the averaged results to OUTPUT_FILE
def dnnKFold(features, lables, imbalanced=False):
    """
    @Description: Trains and evaluates a fresh DNN on every fold of a StratifiedKFold split and
    appends the fold-averaged accuracy, precision, recall, f-score and support to OUTPUT_FILE.
    Relies on the module-level FOLDS, RANDOM_STATE, NUM_CATS, smote and timer.

    @params
    features:= Feature rows, one per sample (anything np.asarray can turn into a 2D float array)
    lables:= Class of every sample, 0 for benign and 1 for malicious (name kept as-is so keyword callers keep working)
    imbalanced:= When True, the training part of every fold is oversampled with the module-level smote object before training. Defaults to False

    @returns 1 once the report has been written
    """
    cats = ["benign", "malicious"] # position in this list == class id
    results = {cat: {"fscore": [], "precision": [], "recall": [], "support": []} for cat in cats}
    results["accuracy"] = []

    # Split the data that was passed in (not the global `labels`), converted once up front
    features = np.asarray(features).astype('float64')
    lables = np.asarray(lables).astype('float64')

    kf = StratifiedKFold(n_splits=FOLDS, random_state=RANDOM_STATE, shuffle=True) # initiating the kfold object
    for num, (trainIndx, testIndx) in enumerate(kf.split(features, lables)):
        print(f"Fold: {num + 1}")

        xTrain, xTest = features[trainIndx], features[testIndx]
        yTrain, yTest = lables[trainIndx], lables[testIndx]

        if imbalanced:
            # Oversample the training fold only, and train on the resampled data
            xTrain, yTrain = smote.fit_resample(xTrain, yTrain)

        # One-hot encode for the softmax output layer
        yTrain = keras.utils.to_categorical(yTrain, NUM_CATS)
        yTest = keras.utils.to_categorical(yTest, NUM_CATS)
        yTrue = np.argmax(yTest, axis=1) # class ids of the test fold, used by every metric below

        model = buildDNN(NUM_CATS, xTrain)

        # Displaying model summary
        # model.summary()

        model.fit(
            xTrain, yTrain,
            epochs=20,
            verbose=0,
            validation_data=(xTest, yTest),
            callbacks=[timer()]
        )

        prediction = np.argmax(model.predict(xTest), axis=1)
        results["accuracy"].append(accuracy_score(yTrue, prediction))
        confusion = confusion_matrix(yTrue, prediction)
        print(confusion)

        # labels= pins one entry per class even if a class is missing from this fold
        metrics = precision_recall_fscore_support(yTrue, prediction, labels=list(range(len(cats))))

        for i, key in enumerate(cats):
            results[key]["precision"].append(metrics[0][i])
            results[key]["recall"].append(metrics[1][i])
            results[key]["fscore"].append(metrics[2][i])
            results[key]["support"].append(metrics[3][i])

    with open(OUTPUT_FILE, "a+") as outFile:
        outFile.write(f"###### Connected Neural Network ######\n")
        outFile.write(f"Accuracy: {(np.mean(results['accuracy']) * 100):.2f}%\n")
        outFile.write("Classification Report: \n")
        outFile.write(f"{'precision':>23}{'recall':>10}{'f1-score':>10}{'support':>10}\n") # THIS IS FOR THE LABELS
        outFile.write("\n")
        for classId, key in enumerate(cats):
            stats = results[key]
            outFile.write(f"{classId:>12}{np.mean(stats['precision']):>11.2f}{np.mean(stats['recall']):>10.2f}{np.mean(stats['fscore']):>10.2f}{np.floor(np.mean(stats['support'])):>10.0f}\n")
        outFile.write("\n")
        # The support shown next to the accuracy is the size of the last test fold
        outFile.write(f"{'accuracy':>12}{np.mean(results['accuracy']):>31.2f}{xTest.shape[0]:>10.0f}\n")

    return 1

# ### PREPROCESSING
print(os.getcwd()) # Displaying script's CWD

print(f"### CONSOLE: Reading {CSV_FILE}...")

apkData = pd.read_csv(CSV_FILE) # Calling CSV and filling DataFrame (DF)

# ### Scraping our CSV's DF
# The first six CSV columns are treated as non-permission columns; everything after them is a feature
keys = list(apkData.columns[:6])

# Drop those columns once for the whole frame instead of once per row
permFrame = apkData.drop(columns=keys)

permKeys = permFrame.columns.values # Key values for the permissions requested by a given APK file. This is for reference for our features array
apks = apkData["APK File"].values # Pulling APK files to correlate labels
avRanks = apkData["AV Rank"].values # pulls AV Rank from csv DF
labels = [1 if rank > 0 else 0 for rank in avRanks] # builds an array of malware classification based off avRank

# One object-dtype row per APK, in CSV order, holding every column that is not in keys.
# NOTE(review): if "AV Rank" is not among the first six columns it stays in the features,
# which would leak the label into the models - confirm the CSV column order.
perms = list(permFrame.to_numpy(dtype=object))

# verifyPreprocessing()
tf.keras.backend.clear_session()

print("### CONSOLE: Preprocessing complete.")

/home/sdj81/PyWorkspace/2023_REU_Workspace
### CONSOLE: Reading COVID19_APK_Data_06-2023.csv...
### CONSOLE: Preprocessing complete.


Statistics for our Dataset

In [None]:
# TODO:
# - percentage of apps that are malicious
# - percentage of apps that failed analysis
# - Std Dev, Std Err to compare my processed data with the data provided by COVIDMalware.pdf
#      - might be useful to include metric of apks that failed analysis
#      - this can help confirm that what I did was right
#      - Check Stats 305 stuff to provide formulas and context for these values

### Prepping our arrays
# Every list below is rebuilt from scratch (not appended to), so re-running this cell
# cannot duplicate rows.

# Permission spread of every APK: its feature row without the first element
permSpread = [row[1:] for row in perms]

# Positions (in permKeys, and therefore in every perms row) of the base AndroidOS permissions.
# They are the same for every APK, so they are looked up once instead of once per APK.
osIndices = [ef for ef, key in enumerate(permKeys) if key.lower().startswith("android.permission")]
osPermKeys = [permKeys[ef] for ef in osIndices]
osPermSpread = [[row[ef] for ef in osIndices] for row in perms]

# Request counters, one slot per permission.
# NOTE: benignPerms / maliciousPerms keep len(permKeys) slots although a permSpread row is one
# element shorter, so their last slot always stays 0.
benignPerms = [0 for _ in range(len(permKeys))]
maliciousPerms = [0 for _ in range(len(permKeys))]
benignPerms_OS = [0 for _ in range(len(osPermKeys))]
maliciousPerms_OS = [0 for _ in range(len(osPermKeys))]

benignSpread, maliciousSpread = [], []
benignSpread_OS, maliciousSpread_OS = [], []

for label, spread, osSpread in zip(labels, permSpread, osPermSpread):
    # Pick the benign or the malicious set of containers for this APK
    if label > 0:
        spreads, osSpreads, counts, osCounts = maliciousSpread, maliciousSpread_OS, maliciousPerms, maliciousPerms_OS
    else:
        spreads, osSpreads, counts, osCounts = benignSpread, benignSpread_OS, benignPerms, benignPerms_OS

    spreads.append(spread) # sorting out the permission spreads...
    osSpreads.append(osSpread) # ...and the OS specific permission spreads
    for j, requested in enumerate(spread):
        counts[j] += requested # running total of requests FE permission
    for j, requested in enumerate(osSpread):
        osCounts[j] += requested # same, restricted to AndroidOS permissions

# Collecting the sum of each array in malicious/benignSpread
benignSums = [sum(i) for i in benignSpread]
maliciousSums = [sum(i) for i in maliciousSpread]

# Building the x-array: one slot per possible permission total, from 0 up to the largest total
# found in EITHER class. (Comparing the two lists with ">" is lexicographic and could pick the
# class with the smaller maximum, leaving x too short for the counting loops below.)
largestTotal = max(max(benignSums, default=0), max(maliciousSums, default=0))
x = list(range(largestTotal + 1))

# Building y-arrays: how many benign / malicious APKs request exactly x[k] permissions
yBenign = [0 for _ in range(len(x))]
yMalicious = [0 for _ in range(len(x))]

for total in benignSums:
    yBenign[total] += 1

for total in maliciousSums:
    yMalicious[total] += 1

### One bar per APK showing its permission total; benign and malicious APKs are drawn as two series

plt.bar(list(range(len(benignSums))), benignSums, label="AV Rank = 0")
plt.bar(list(range(len(maliciousSums))), maliciousSums, label="AV Rank > 0")

# Decorations
plt.title("Total number of permissions requested for a given APK file.")
plt.xlabel("APK File")
plt.ylabel("Quantity of requested permissions")
plt.legend()

# Save before show()
plt.savefig("TotalPermissionsGraphed.png", bbox_inches="tight", dpi=300)
plt.show()

### Plotting how many apps requested a given quantity of permissions
# Three views (raw, normalized by each class's peak, proportional to each class's size),
# each drawn for the full range and for the first 60 totals only.

xLabel = "Total permissions requested per APK file"

# Raw frequency
yLabel = "Frequency of APK files"
title = "Frequency of APK Files requesting some number of permissions"
visualize([yBenign, yMalicious], title=title, x_Axs=xLabel, y_Axs=yLabel, save=True)

title = "Frequency of APK Files requesting up to 60 permissions"
visualize([yBenign[:60], yMalicious[:60]], title=title, x_Axs=xLabel, y_Axs=yLabel, save=True)

# Normalized: every class is scaled by its own largest bar
yLabel = "Normalized frequency of APK files"
title = "Normalized Frequency of APK Files requesting some number of permissions"
benignPeak, maliciousPeak = max(yBenign), max(yMalicious)
normBenign = [count / benignPeak for count in yBenign]
normMalicious = [count / maliciousPeak for count in yMalicious]
visualize([normBenign, normMalicious], title=title, x_Axs=xLabel, y_Axs=yLabel, save=True)

title = "Normalized Frequency of APK files requesting up to 60 permissions"
visualize([normBenign[:60], normMalicious[:60]], title=title, x_Axs=xLabel, y_Axs=yLabel, save=True)

# Proportional: every class is scaled by its number of APKs
yLabel = "Proportional Frequency of APK files"
title = "Proportional Frequency of APK Files requesting some number of permissions"
benignTotal, maliciousTotal = len(benignSpread), len(maliciousSpread)
propBenign = [count / benignTotal for count in yBenign]
propMalicious = [count / maliciousTotal for count in yMalicious]
visualize([propBenign, propMalicious], title=title, x_Axs=xLabel, y_Axs=yLabel, save=True)

title = "Proportional Frequency of APK files requesting up to 60 permissions"
visualize([propBenign[:60], propMalicious[:60]], title=title, x_Axs=xLabel, y_Axs=yLabel, save=True)

### Plotting the frequency of requests FE permission found during analysis

print(sorted(benignPerms, reverse=True))
print(sorted(maliciousPerms, reverse=True))

# A permSpread row is a perms row without its first element, so counter j belongs to permKeys[j + 1]
spreadKeys = permKeys[1:]

# Rank the permissions by benign request count. Ranking positions instead of values keeps
# permissions with equal counts apart (list.index() would return the first match for every tie).
topTen = sorted(range(len(spreadKeys)), key=lambda j: benignPerms[j], reverse=True)[:10]
topLabels = [spreadKeys[j] for j in topTen]
topBenign = [benignPerms[j] for j in topTen]
# Malicious counts of the SAME permissions, so both bars next to a label describe that permission
topMalicious = [maliciousPerms[j] for j in topTen]

# Frequency of applications requesting a given permission
title = "Top 10 most popular permissions requested by APK files"
xLabel = "Frequency of APK File requests"
yLabel = "Permissions"
visualize([topBenign, topMalicious], topLabels, title=title, x_Axs=xLabel, y_Axs=yLabel, save=True, flip=True)

# proportion of applications requesting a given permission:
# request count divided by the number of APKs in that class (not by the number of permissions)
title = "Proportional frequency of a given permission requested by APK files"
xLabel = "Proportion of APK File requests"
benignTotal = len(benignSpread) or 1 # "or 1" avoids dividing by zero when a class is empty
maliciousTotal = len(maliciousSpread) or 1
visualize(
    [
        [count / benignTotal for count in topBenign],
        [count / maliciousTotal for count in topMalicious]
    ],
    topLabels,
    title=title,
    x_Axs=xLabel,
    y_Axs=yLabel,
    save=True,
    flip=True
)

### Re-doing the sums and yBenign / yMalicious for our OS specific permission spreads
# (done once; the original repeated this identical computation twice in a row)

# resetting summation arrays
benignSums = [sum(i) for i in benignSpread_OS]
maliciousSums = [sum(i) for i in maliciousSpread_OS]

# Resetting y arrays; they keep the length of x so the plots share the same scale
yBenign = [0 for _ in range(len(x))]
yMalicious = [0 for _ in range(len(x))]

for total in benignSums:
    yBenign[total] += 1

for total in maliciousSums:
    yMalicious[total] += 1

# Proportional frequency of APK files that requested only base AndroidOS permissions
title = "Proportional frequency of APK files that request only base AndroidOS permissions"
xLabel = "Quantity of AndroidOS permissions requested"
yLabel = "Proportional frequency of APK files"
visualize([[i / len(benignSpread_OS) for i in yBenign], [i / len(maliciousSpread_OS) for i in yMalicious]], title=title, x_Axs=xLabel, y_Axs=yLabel, save=True)

# Proportional frequency of APK files that requested only base AndroidOS permissions up to 60 times
title = "Proportional frequency of APK files that request only base AndroidOS permissions up to 60 times"
visualize([[i / len(benignSpread_OS) for i in yBenign[:60]], [i / len(maliciousSpread_OS) for i in yMalicious[:60]]], title=title, x_Axs=xLabel, y_Axs=yLabel, save=True)

# Rank the AndroidOS permissions by benign request count. Ranking positions instead of values
# keeps permissions with equal counts apart, and the malicious counts are taken for the SAME
# permissions so both bars next to a label describe that permission.
osRanking = sorted(range(len(osPermKeys)), key=lambda j: benignPerms_OS[j], reverse=True)
rankedKeys = [osPermKeys[j] for j in osRanking]
rankedBenign = [benignPerms_OS[j] for j in osRanking]
rankedMalicious = [maliciousPerms_OS[j] for j in osRanking]

# All plots below are horizontal (flip=True): the x-axis carries the values and the y-axis the permissions
xLabel = "Frequency of APK file requests"
yLabel = "AndroidOS Permissions"

# Frequency of applications requesting base AndroidOS permissions
title = "Frequency of requests for each base AndroidOS permission"
visualize([rankedBenign, rankedMalicious], rankedKeys, title=title, x_Axs=xLabel, y_Axs=yLabel, flip=True)

# Frequency of the first 10 base AndroidOS permissions in column order.
# It has its own title so its .png is not overwritten by the top-10 plot that follows.
title = "Frequency of the first 10 base AndroidOS Permissions"
visualize([benignPerms_OS[:10], maliciousPerms_OS[:10]], osPermKeys[:10], title=title, x_Axs=xLabel, y_Axs=yLabel, flip=True, save=True)

# Frequency of the 10 most requested base AndroidOS permissions
title = "Frequency of the top 10 most requested base AndroidOS Permissions"
visualize(
    [rankedBenign[:10], rankedMalicious[:10]],
    rankedKeys[:10],
    title=title,
    x_Axs=xLabel,
    y_Axs=yLabel,
    save=True,
    flip=True
)

# Proportion of applications requesting the 10 most requested base AndroidOS permissions:
# request count divided by the number of APKs in that class (not by the number of permissions)
title = "Proportional frequency of the top 10 most requested base AndroidOS Permissions"
xLabel = "Proportion of APK file requests"
benignTotal = len(benignSpread_OS) or 1 # "or 1" avoids dividing by zero when a class is empty
maliciousTotal = len(maliciousSpread_OS) or 1
visualize(
    [
        [count / benignTotal for count in rankedBenign[:10]],
        [count / maliciousTotal for count in rankedMalicious[:10]]
    ],
    rankedKeys[:10],
    title=title,
    x_Axs=xLabel,
    y_Axs=yLabel,
    save=True,
    flip=True
)

# Share of the analyzed APKs that carry a malicious label
totalAPKs = len(apks)
totalBadAPKs = len([item for item in labels if item > 0])
for line in (
    f"We analyzed {totalAPKs} APKs",
    f"Out of that, {totalBadAPKs} were flagged as malicious. This is according to the dataset provided by Wang et al 2021.",
):
    print(line)
# The percentage is computed last, after the two counts have been reported
print(f"Which means about {((totalBadAPKs / totalAPKs) * 100):.2f}% of all analyzed APKs are labeled as malicious.")


Prep

In [18]:
"""
### OPTIMIZATION PARAMETER DECLARATION
- Modify this value if we wanna change how many iterations our models go through
    - this will change the size of masterTrain and masterTest arrays, hence changing how many times each model iterates through these folds
"""
FOLDS = 20 # number of splits handed to ShuffleSplit below and to StratifiedKFold in dnnKFold()
RANDOM_STATE = 0 # shared seed for ShuffleSplit, StratifiedKFold and SMOTE
TEST_SIZE = 0.1 # fraction of the samples that every ShuffleSplit split holds out for testing
INT_ARR = [1, 3, 5, 10, 20, 50, 100, 1000] # not used in the visible part of this file
NUM_CATS = 2 # The quantity of categories we are going to organize our APK files into. 0 for benign, 1 for malware: S({0, 1}) = 2

"""
### Sklearn ShuffleSplit module
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
- cross_validate(): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
- compare with the current manual split
"""
print("### CONSOLE: Prepping K-Fold C-V...")
# NOTE: ShuffleSplit draws FOLDS independent random train/test splits; it is not a K-Fold partition
ssplit = ShuffleSplit(n_splits=FOLDS, test_size=TEST_SIZE, random_state=RANDOM_STATE)

"""
### SMOTE
- https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
- This can be called prior to any sort of data separation
"""
# NOTE(review): resampling the whole dataset before it is split lets synthetic samples derived
# from test rows end up in the training data - confirm this is intended wherever
# permsSMOTE / labelsSMOTE are cross-validated.
print("### CONSOLE: prepping SMOTE...")
print(len(perms)) # 1959 in the recorded run
print(len(labels)) # 1959 in the recorded run

# sampling_strategy={1: 270} asks for 270 samples of class 1 (malicious) after resampling
smote = SMOTE(random_state=RANDOM_STATE,sampling_strategy={1: 270})
permsSMOTE, labelsSMOTE = smote.fit_resample(perms, labels)

# NOTE(review): the recorded run printed 3378 for both lengths; that figure may stem from a
# different sampling_strategy than the one above - confirm.
print(len(permsSMOTE)) # 3378 in the recorded run
print(len(labelsSMOTE)) # 3378 in the recorded run


"""
### Can we use the GPU?
"""
# Ask Tensorflow for the GPU device once and reuse the answer; an empty name means no usable GPU
gpuName = tf.test.gpu_device_name()
if gpuName:
    print(f'GPU installed. Good Job!\nGPU Device: {gpuName}\n')
else:
    print("No GPU found that can run TF.\n")

"""
### Overridden callback class "timer" for catching epoch/total time
"""
class timer(keras.callbacks.Callback):
    """Keras callback that prints how long each epoch and the whole fit took.

    Attach an instance with ``callbacks=[timer()]`` when calling ``model.fit``.

    Durations are measured with ``time.perf_counter()``, a monotonic clock
    meant for timing intervals. The module-level ``time`` import is used; an
    import inside the class body would only create a class attribute that
    the methods never look up.
    """

    def __init__(self): # initialize callback
        super().__init__() # let the Keras base class set itself up

    # training methods
    def on_train_begin(self, logs=None):
        """Record the moment training starts."""
        self.start_train = time.perf_counter()

    def on_train_end(self, logs=None):
        """Print the total training time as hours:minutes:seconds."""
        tr_duration = time.perf_counter() - self.start_train
        # Split the elapsed seconds into whole hours, whole minutes and the remaining seconds.
        tr_hours, remainder = divmod(tr_duration, 3600)
        tr_minutes, tr_seconds = divmod(remainder, 60)
        msg = f"\nElapsed time: {tr_hours:.0f}:{tr_minutes:.0f}:{tr_seconds:.3f}\n"
        print(msg)

    # batch training methods <-- placeholders, intentionally empty for now
    def on_train_batch_begin(self, batch, logs=None):
        pass

    def on_train_batch_end(self, batch, logs=None):
        pass

    # epoch methods
    def on_epoch_begin(self, epoch, logs=None):
        """Record the moment the current epoch starts."""
        self.start_epoch = time.perf_counter()

    def on_epoch_end(self, epoch, logs=None):
        """Print how long the epoch that just finished took (epochs are reported 1-based)."""
        epoch_duration = time.perf_counter() - self.start_epoch
        msg = f"Epoch {epoch + 1} trained for {epoch_duration} seconds"
        print(msg)

    # prediction methods <-- placeholders, could become useful during cross-validation
    def on_predict_begin(self, logs=None):
        pass

    def on_predict_end(self, logs=None):
        pass

# Last statement of the prep cell: only reached if every step above ran without raising.
print("### CONSOLE: Prep successful.")

### CONSOLE: Prepping K-Fold C-V...
### CONSOLE: prepping SMOTE...
1959
1959
3378
3378
GPU installed. Good Job!
GPU Device: /device:GPU:0

### CONSOLE: Prep successful.


2023-07-18 13:31:40.990279: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-18 13:31:40.990485: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-18 13:31:40.990610: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Control group

In [19]:
print("\n### CONSOLE: Starting control group...\n")

# Append a section banner to the shared report file so this group's results are easy to find.
with open(OUTPUT_FILE, "a+") as outFile:
    outFile.write("############################### CONTROL GROUP ###############################\n")

### Splitting data
# Single hold-out split of the original (non-resampled) data. Every piece is
# converted to a float32 NumPy array straight away because TensorFlow does not
# accept the int64 input.
xTrain, xTest, yTrain, yTest = (
    np.asarray(part).astype('float32')
    for part in train_test_split(perms, labels, random_state=RANDOM_STATE, test_size=TEST_SIZE)
)

# The scikit-learn classifiers all follow the same recipe (fit, predict, record),
# so they are described as (console message, report label, estimator) and run in a loop.
controlClassifiers = (
    ("### CONSOLE: Executing CatNB...", "Classical Naive-Bayes", CategoricalNB(min_categories=len(perms))),
    ("### CONSOLE: Executing Support Vector Machines...", "Support Vector Machines", SVC()),
    ("### CONSOLE: Executing Decision Trees...", "Decision Trees", DecisionTreeClassifier(splitter="random")),
)
for consoleMessage, reportLabel, clf in controlClassifiers:
    print(consoleMessage)
    clf.fit(xTrain, yTrain) # Train
    prediction = clf.predict(xTest) # Predicting
    recordClassifier(yTest, prediction, reportLabel)

print("### CONSOLE: Executing Deep Neural Network...")

# buildDNN is defined elsewhere in the notebook; it is handed the class count and the training features.
model = buildDNN(NUM_CATS, xTrain)

# The network is trained on one-hot targets. yTrain/yTest stay one-hot after this cell.
yTrain = keras.utils.to_categorical(yTrain, NUM_CATS)
yTest = keras.utils.to_categorical(yTest, NUM_CATS)

### fitting
model.fit(
    xTrain,
    yTrain,
    epochs=20,
    verbose=0,
    validation_data=(xTest, yTest),
    callbacks=[timer()],
)

prediction = model.predict(xTest)
# Collapse the one-hot truth and the per-class scores back to class indices before recording.
trueClasses = np.argmax(yTest, axis=1)
predictedClasses = np.argmax(prediction, axis=1)
recordClassifier(trueClasses, predictedClasses, "Connected Neural Network")

print("\n### CONSOLE: Finished control group.\n")


### CONSOLE: Starting control group...

### CONSOLE: Executing CatNB...
### CONSOLE: Executing Support Vector Machines...
### CONSOLE: Executing Decision Trees...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### CONSOLE: Executing Deep Neural Network...
Epoch 1 trained for 0.961216926574707 seconds
Epoch 2 trained for 0.15665698051452637 seconds
Epoch 3 trained for 0.17224574089050293 seconds
Epoch 4 trained for 0.16274023056030273 seconds
Epoch 5 trained for 0.1611945629119873 seconds
Epoch 6 trained for 0.15621137619018555 seconds
Epoch 7 trained for 0.16660833358764648 seconds
Epoch 8 trained for 0.16695380210876465 seconds
Epoch 9 trained for 0.15997552871704102 seconds
Epoch 10 trained for 0.15860605239868164 seconds
Epoch 11 trained for 0.15432214736938477 seconds
Epoch 12 trained for 0.15370678901672363 seconds
Epoch 13 trained for 0.1436154842376709 seconds
Epoch 14 trained for 0.1465744972229004 seconds
Epoch 15 trained for 0.14363527297973633 seconds
Epoch 16 trained for 0.15140366554260254 seconds
Epoch 17 trained for 0.1445930004119873 seconds
Epoch 18 trained for 0.16025328636169434 seconds
Epoch 19 trained for 0.1592268943786621 seconds
Epoch 20 trained for 0.1658608913421630

SMOTE

In [20]:
print("\n### CONSOLE: Starting SMOTE group...\n")

# Append a section banner to the shared report file so this group's results are easy to find.
with open(OUTPUT_FILE, "a+") as outFile:
    outFile.write("############################### TRAIN/TEST SPLIT WITH SMOTE ###############################\n")

# Single hold-out split of the SMOTE-resampled data, converted to float32
# NumPy arrays because TensorFlow does not accept the int64 input.
# NOTE(review): the split is taken from data that was resampled as a whole, so
# the test rows may include synthetic samples -- confirm this is the intended protocol.
xTrain, xTest, yTrain, yTest = (
    np.asarray(part).astype('float32')
    for part in train_test_split(permsSMOTE, labelsSMOTE, random_state=RANDOM_STATE, test_size=TEST_SIZE)
)

# The scikit-learn classifiers all follow the same recipe (fit, predict, record),
# so they are described as (console message, report label, estimator) and run in a loop.
smoteClassifiers = (
    ("### CONSOLE: Executing CatNB...", "Classical Naive-Bayes", CategoricalNB(min_categories=len(perms))),
    ("### CONSOLE: Executing Support Vector Machines...", "Support Vector Machines", SVC()),
    ("### CONSOLE: Executing Decision Trees...", "Decision Trees", DecisionTreeClassifier(splitter="random")),
)
for consoleMessage, reportLabel, clf in smoteClassifiers:
    print(consoleMessage)
    clf.fit(xTrain, yTrain) # Train
    prediction = clf.predict(xTest) # Predicting
    recordClassifier(yTest, prediction, reportLabel)

print("### CONSOLE: Executing Deep Neural Network...")

# buildDNN is defined elsewhere in the notebook; it is handed the class count and the training features.
model = buildDNN(NUM_CATS, xTrain)

# The network is trained on one-hot targets. yTrain/yTest stay one-hot after this cell.
yTrain = keras.utils.to_categorical(yTrain, NUM_CATS)
yTest = keras.utils.to_categorical(yTest, NUM_CATS)

### fitting
model.fit(
    xTrain,
    yTrain,
    epochs=20,
    verbose=0,
    validation_data=(xTest, yTest),
    callbacks=[timer()],
)

prediction = model.predict(xTest)
# Collapse the one-hot truth and the per-class scores back to class indices before recording.
recordClassifier(np.argmax(yTest, axis=1), np.argmax(prediction, axis=1), "Connected Neural Network")

# Closing marker, matching the other experiment cells, so the console log shows where this group ends.
print("\n### CONSOLE: Finished SMOTE group.\n")


### CONSOLE: Starting SMOTE group...

### CONSOLE: Executing CatNB...
### CONSOLE: Executing Support Vector Machines...
### CONSOLE: Executing Decision Trees...
### CONSOLE: Executing Deep Neural Network...
Epoch 1 trained for 0.9512343406677246 seconds
Epoch 2 trained for 0.20559072494506836 seconds
Epoch 3 trained for 0.22727584838867188 seconds
Epoch 4 trained for 0.22679495811462402 seconds
Epoch 5 trained for 0.2192976474761963 seconds
Epoch 6 trained for 0.21757173538208008 seconds
Epoch 7 trained for 0.23093867301940918 seconds
Epoch 8 trained for 0.22620272636413574 seconds
Epoch 9 trained for 0.22527050971984863 seconds
Epoch 10 trained for 0.22890186309814453 seconds
Epoch 11 trained for 0.23032903671264648 seconds
Epoch 12 trained for 0.22905421257019043 seconds
Epoch 13 trained for 0.217545747756958 seconds
Epoch 14 trained for 0.22505879402160645 seconds
Epoch 15 trained for 0.21758294105529785 seconds
Epoch 16 trained for 0.22082901000976562 seconds
Epoch 17 trained for 

1

Cross-Validation

In [21]:
print("\n### CONSOLE: Starting Cross-Validation group...\n")

# Append a section banner to the shared report file so this group's results are easy to find.
with open(OUTPUT_FILE, "a+") as outFile:
    outFile.write("############################### CROSS-VALIDATION GROUP ###############################\n")

# (console message, report label, estimator) for every scikit-learn model in this group.
# The SVM report label now reads "Support Vector Machines", the spelling used by
# every other group, so results for the same model line up across sections.
cvClassifiers = (
    ("### CONSOLE: Executing CNB...", "Classical Naive-Bayes", CategoricalNB(min_categories=len(perms))),
    ("### CONSOLE: Executing Support Vector Machines...", "Support Vector Machines", SVC()),
    ("### CONSOLE: Executing Decision Trees...", "Decision Trees", DecisionTreeClassifier(splitter="random")),
)
for consoleMessage, reportLabel, clf in cvClassifiers:
    print(consoleMessage)
    # `test` and `prediction` are reset at cell (module) level before every run.
    # NOTE(review): presumably buildClassificationReport fills these two lists as
    # a side effect while scoring each fold -- verify against its definition;
    # otherwise recordClassifier only ever receives empty lists.
    test = []
    prediction = []
    scores = cross_validate(clf, perms, labels, cv=ssplit, scoring=make_scorer(buildClassificationReport))
    recordClassifier(test, prediction, reportLabel)

print("### CONSOLE: Executing Deep Neural Network...")

# The neural network runs its own fold loop through dnnKFold (defined elsewhere in the notebook).
dnnKFold(perms, labels)

print("\n### CONSOLE: Cross-Validation complete.\n")


### CONSOLE: Starting Cross-Validation group...

### CONSOLE: Executing CNB...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### CONSOLE: Executing Support Vector Machines...
### CONSOLE: Executing Decision Trees...
### CONSOLE: Executing Deep Neural Network...
Fold: 1
Epoch 1 trained for 0.7849955558776855 seconds
Epoch 2 trained for 0.14162921905517578 seconds
Epoch 3 trained for 0.16354680061340332 seconds
Epoch 4 trained for 0.1635894775390625 seconds
Epoch 5 trained for 0.16002798080444336 seconds
Epoch 6 trained for 0.16206932067871094 seconds
Epoch 7 trained for 0.16793346405029297 seconds
Epoch 8 trained for 0.1589641571044922 seconds
Epoch 9 trained for 0.17259621620178223 seconds
Epoch 10 trained for 0.1477954387664795 seconds
Epoch 11 trained for 0.15356802940368652 seconds
Epoch 12 trained for 0.16576313972473145 seconds
Epoch 13 trained for 0.15218591690063477 seconds
Epoch 14 trained for 0.16481828689575195 seconds
Epoch 15 trained for 0.17361903190612793 seconds
Epoch 16 trained for 0.1609351634979248 seconds
Epoch 17 trained for 0.16597986221313477 seconds
Epoch 18 trained for 0.1590559482574

CV with SMOTE

In [22]:
print("\n### CONSOLE: Starting CV with SMOTE group...\n")

# Append a section banner to the shared report file so this group's results are easy to find.
with open(OUTPUT_FILE, "a+") as outFile:
    outFile.write("############################### CROSS-VALIDATION WITH SMOTE ###############################\n")

# (console message, report label, estimator) for every scikit-learn model in this group.
cvSmoteClassifiers = (
    ("### CONSOLE: Executing CNB...", "Classical Naive-Bayes", CategoricalNB(min_categories=len(perms))),
    ("### CONSOLE: Executing Support Vector Machines...", "Support Vector Machines", SVC()),
    ("### CONSOLE: Executing Decision Trees...", "Decision Trees", DecisionTreeClassifier(splitter="random")),
)
for consoleMessage, reportLabel, clf in cvSmoteClassifiers:
    print(consoleMessage)
    # `test` and `prediction` are reset at cell (module) level before every run.
    # NOTE(review): presumably buildClassificationReport fills these two lists as
    # a side effect while scoring each fold -- verify against its definition.
    test = []
    prediction = []
    scores = cross_validate(clf, permsSMOTE, labelsSMOTE, cv=ssplit, scoring=make_scorer(buildClassificationReport))
    recordClassifier(test, prediction, reportLabel)

print("### CONSOLE: Executing Deep Neural Network...")

# NOTE(review): unlike the classifiers above, the network is given the original
# perms/labels plus imbalanced=True; presumably dnnKFold resamples internally -- confirm.
dnnKFold(perms, labels, imbalanced=True)

print("\n### CONSOLE: CV with SMOTE complete.\n")


### CONSOLE: Starting CV with SMOTE group...

### CONSOLE: Executing CNB...
### CONSOLE: Executing Support Vector Machines...
### CONSOLE: Executing Decision Trees...
### CONSOLE: Executing Deep Neural Network...
Fold: 1
Epoch 1 trained for 0.827136754989624 seconds
Epoch 2 trained for 0.15868425369262695 seconds
Epoch 3 trained for 0.17217516899108887 seconds
Epoch 4 trained for 0.160292387008667 seconds
Epoch 5 trained for 0.1713721752166748 seconds
Epoch 6 trained for 0.17154788970947266 seconds
Epoch 7 trained for 0.1684260368347168 seconds
Epoch 8 trained for 0.1633918285369873 seconds
Epoch 9 trained for 0.16922473907470703 seconds
Epoch 10 trained for 0.16416001319885254 seconds
Epoch 11 trained for 0.1672358512878418 seconds
Epoch 12 trained for 0.1710495948791504 seconds
Epoch 13 trained for 0.17421603202819824 seconds
Epoch 14 trained for 0.16669178009033203 seconds
Epoch 15 trained for 0.1552906036376953 seconds
Epoch 16 trained for 0.1565690040588379 seconds
Epoch 17 train

Convolutional Neural Network

In [None]:
"""
### NOTE
- utilizing batch normalization and Dropout as well
    - might be good to use in DNN!
- As of 07-14-23
    - The most common error I get after a reshape is an OOM (out-of-memory) error; not quite sure how to mitigate that...
"""

# Shapes going in. xTrain/xTest/yTrain/yTest are whatever the most recently run
# experiment cell left behind (yTrain/yTest are expected to be one-hot already).
print(f"Traning data\n x: {xTrain.shape}, y: {yTrain.shape} ")
print(f"Testing data\n x: {xTest.shape}, y: {yTest.shape} ")


def _toSquareImages(features):
    """Turn each sample's feature vector into a square, single-channel image.

    The features of every sample are flattened, zero-padded on the right up to
    the next perfect square and reshaped to (samples, side, side, 1). Flattening
    first makes the call safe to repeat on data that was already reshaped.
    """
    flat = np.asarray(features, dtype="float32")
    flat = flat.reshape(flat.shape[0], -1)
    featureCount = flat.shape[1]
    side = int(np.ceil(np.sqrt(featureCount)))
    padded = np.pad(flat, ((0, 0), (0, side * side - featureCount)), mode="constant")
    return padded.reshape(-1, side, side, 1)


# Reshaping the features dataset for the convolution layers. A fixed
# reshape(-1, 28, 28, 1) only works when there are exactly 784 features per
# sample, so the image side is derived from the data instead.
xTrain = _toSquareImages(xTrain)
xTest = _toSquareImages(xTest)
imageShape = xTrain.shape[1:] # (side, side, 1)

### Building layers
model = keras.models.Sequential()

# Convolutional layer (only the first layer needs to be told the input shape)
model.add(Conv2D(75, (3, 3), strides=1, padding="same", activation="relu", input_shape=imageShape))
# Batch Normalization layer
model.add(BatchNormalization())
# Pooling layer
model.add(MaxPool2D((2, 2), strides=2, padding="same"))

# Convolutional layer
model.add(Conv2D(50, (3, 3), strides=1, padding="same", activation="relu"))
# Enabling a Dropout
model.add(Dropout(0.2))
# Batch Normalization layer
model.add(BatchNormalization())
# Pooling layer
model.add(MaxPool2D((2, 2), strides=2, padding="same"))

# Convolutional layer
model.add(Conv2D(25, (3, 3), strides=1, padding="same", activation="relu"))
model.add(BatchNormalization())
model.add(MaxPool2D((2, 2), strides=1, padding="same"))
model.add(Flatten())

# Fully connected layers.
# NOTE(review): the layer width is tied to the number of training samples
# (xTrain.shape[0]); that looks unintentional and may contribute to the
# out-of-memory errors mentioned in the note at the top of this cell -- confirm.
model.add(Dense(units=xTrain.shape[0], activation="relu"))
# Hidden Layers
model.add(Dense(units=xTrain.shape[0], activation="relu"))
# Output
model.add(Dense(units=NUM_CATS, activation="softmax"))

# Displaying model summary
model.summary()

### Compiling, fitting
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])
# Store the training history under "CNN" next to the other entries in `models`
# (`models` is expected to exist already; it is not created in this cell).
models.update(
    { "CNN": model.fit(
    xTrain, yTrain,
    epochs=20,
    verbose=0,
    validation_data=(xTest, yTest),
    callbacks=[timer()]
)})

### Displaying results
# One panel per metric; every stored history is drawn on all four panels.
fig, axis = plt.subplots(2, 2, figsize=(20,15))
panels = (
    (axis[0,0], "loss"),
    (axis[0,1], "accuracy"),
    (axis[1,0], "val_loss"),
    (axis[1,1], "val_accuracy"),
)
for key, value in models.items():
    for panel, metric in panels:
        panel.plot(value.history[metric], label=key)
# Titles, legends and the final render happen once, after all curves are drawn,
# so every panel gets a legend and the figure is shown complete.
for panel, metric in panels:
    panel.set_title(metric)
    panel.legend()
plt.show()

print("### CONSOLE: Analysis of Convolutional Neural Network completed.")

Visualizing performance

In [None]:
"""
### Visualizing model performance
- I like the idea of visualizing our confusion matrix
  - use that to also visualize performance per K-fold iteration
- compare average values across all models
  - Train/Test: F1, accuracy, Recall, precision, time
- I can't remember how else Dr. Perez wants to compare the data
"""
### Displaying results
# pprint(model["DNN"].history)

# for key, value in models["DNN"]:
#     print(key, np.mean(value))

# This will need updating once the additional metrics are added
# fig, axis = plt.subplots(2, 2, figsize=(20,15))
# for key, value in models.items():
#     axis[0,0].plot(value.history["loss"], label=key)
#     axis[0,0].set_title("loss")
#     axis[0,1].plot(value.history["accuracy"], label=key)
#     axis[0,1].set_title("accuracy")
#     axis[1,0].plot(value.history["val_loss"], label=key)
#     axis[1,0].set_title("val_loss")
#     axis[1,1].plot(value.history["val_accuracy"], label=key)
#     axis[1,1].set_title("val_accuracy")
#     plt.legend()
#     plt.show()


"""
### GridSearchCV
"""
# # Parameters to pass to GridSearchCV()
# params = {
#     "C": INT_ARR,
#     "kernel": ["poly", "rbf", "sigmoid"], # linear kernel overfits
#     "gamma": ["scale", "auto"],
# }

# # Determining optimal parameters for execution
# clf = SVC()
# clf = GridSearchCV(estimator=clf, param_grid=params) # What about HalvingGridSearchCV? Should we include Cross-Validation as well?
# clf.fit(perms, labels)
# newParams = clf.best_params_
# pprint(newParams) # lists which params yielded the best results