In [1]:
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.nn.modules.loss import L1Loss

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn import metrics


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read Dataset
df = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/4105av/fulloutput.csv')

In [4]:
#split truth off and form dfX
dfT = df['malware'].values
dfX = df.drop(['malware', 'length', 'id'], axis=1)

In [5]:
# Classical

#Train Test
np.random.seed(0)
Xtrain, Xval, Ttrain, Tval = train_test_split(dfX, dfT, train_size = 0.8, test_size=0.2)

model = LogisticRegression(C=0.01, max_iter = 100000) #With reg.
model.fit(Xtrain, Ttrain)


# Make predictions on the validation set
predictions = model.predict(Xval)

# Compute confusion matrix
conf_matrix = confusion_matrix(Tval, predictions)

# Print confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Calculate metrics
accuracy = accuracy_score(Tval, predictions)
precision = precision_score(Tval, predictions)
recall = recall_score(Tval, predictions)
f1 = f1_score(Tval, predictions)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Confusion Matrix:
[[14479  2807]
 [ 2491 20533]]
Accuracy: 0.8686
Precision: 0.8797
Recall: 0.8918
F1 Score: 0.8857


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Xtrain, Xval, Ttrain, Tval = train_test_split(dfX, dfT, train_size = 0.8, test_size=0.2)

class AV_NN(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, weight_decay = 0.0001, dropout_prob=0.1):
        super(AV_NN, self).__init__()
        self.hidden_layers = nn.ModuleList()
        input_dim = input_size

        for hidden_size in hidden_sizes:
            self.hidden_layers.append(nn.Linear(input_dim, hidden_size))
            self.hidden_layers.append(nn.ReLU())
            self.hidden_layers.append(nn.Dropout(p=dropout_prob))
            input_dim = hidden_size

        self.output_layer = nn.Linear(hidden_sizes[-1], output_size)

        # Add L2 regularization to all linear layers
        self.l2_regularizer = nn.Linear(hidden_sizes[-1], 1)
        self.weight_decay = weight_decay

    def forward(self, x):
        for layer in self.hidden_layers:
            x = layer(x)

        l2_reg = 0.0
        for param in self.parameters():
            l2_reg += torch.sum(param ** 2)

        x = self.output_layer(x)
        return x

XtrainT = torch.tensor(Xtrain.values, dtype=torch.float)
TtrainT = torch.tensor(Ttrain, dtype=torch.float)
XvalT = torch.tensor(Xval.values, dtype=torch.float)
TvalT = torch.tensor(Tval, dtype=torch.float)

num_epochs = 1000

model = AV_NN(Xtrain.shape[1], [160, 320, 16], 1, dropout_prob=0.1)
optimizer = optim.Adam(model.parameters(), lr=0.0004)
lossFn = nn.MSELoss()

for epoch in range(num_epochs):
    vpredictions = model(XvalT)
    vloss = lossFn(vpredictions, TvalT.view(-1, 1))

    predictions = model(XtrainT)
    loss = lossFn(predictions, TtrainT.view(-1, 1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 1 == 0:
      print('Epoch %d, Loss %f, Validation Loss %f' % (epoch, float(loss), float(vloss)))

nn_complexity = sum(p.numel() for p in model.parameters())
print(f'Neural Network Parameter Count: {nn_complexity}')

cuda
Epoch 0, Loss 0.793873, Validation Loss 0.802645
Epoch 1, Loss 0.754302, Validation Loss 0.762868
Epoch 2, Loss 0.722944, Validation Loss 0.730986
Epoch 3, Loss 0.697540, Validation Loss 0.705247
Epoch 4, Loss 0.673393, Validation Loss 0.680770
Epoch 5, Loss 0.648281, Validation Loss 0.655080
Epoch 6, Loss 0.621885, Validation Loss 0.628281
Epoch 7, Loss 0.594517, Validation Loss 0.600979
Epoch 8, Loss 0.565983, Validation Loss 0.572018
Epoch 9, Loss 0.537070, Validation Loss 0.542471
Epoch 10, Loss 0.507491, Validation Loss 0.513110
Epoch 11, Loss 0.477911, Validation Loss 0.482800
Epoch 12, Loss 0.448908, Validation Loss 0.452616
Epoch 13, Loss 0.421386, Validation Loss 0.424519
Epoch 14, Loss 0.395599, Validation Loss 0.399141
Epoch 15, Loss 0.373410, Validation Loss 0.375357
Epoch 16, Loss 0.355818, Validation Loss 0.356291
Epoch 17, Loss 0.342282, Validation Loss 0.343722
Epoch 18, Loss 0.336044, Validation Loss 0.336684
Epoch 19, Loss 0.335922, Validation Loss 0.334033
Epoch

In [9]:
udf = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/4105av/unseen.csv')

# threshold
threshold = 0.5

# Evaluate the model on the validation set
model.eval()
print(XvalT)
with torch.no_grad():
    vpredictions = model(XvalT)

# Convert the predictions to binary values (0 or 1)
v_predictions_binary = np.where(vpredictions > threshold, 1, 0)

# Calculate metrics
accuracy = accuracy_score(TvalT.cpu().numpy(), v_predictions_binary)
precision = precision_score(TvalT.cpu().numpy(), v_predictions_binary)
recall = recall_score(TvalT.cpu().numpy(), v_predictions_binary)
f1 = f1_score(TvalT.cpu().numpy(), v_predictions_binary)
confusion = confusion_matrix(TvalT.cpu().numpy(), v_predictions_binary)
print(confusion)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Neural network complexity
nn_complexity = sum(p.numel() for p in model.parameters())
print(f'Neural Network Parameter Count: {nn_complexity}')

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 2.5000e-01, 6.9442e-01,
         4.8917e-02],
        [0.0000e+00, 0.0000e+00, 1.0000e+00,  ..., 2.5000e-01, 9.8895e-01,
         2.0398e-02],
        [0.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.3333e-01, 7.4150e-01,
         4.8005e-04],
        ...,
        [1.0000e+00, 0.0000e+00, 1.0000e+00,  ..., 2.0000e-01, 8.7915e-01,
         4.7422e-03],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.6667e-01, 5.7299e-01,
         1.0482e-02],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 4.0000e-01, 5.2141e-01,
         2.4120e-03]])
[[16096  1057]
 [  937 22220]]
Accuracy: 0.9505
Precision: 0.9546
Recall: 0.9595
F1 Score: 0.9571
Neural Network Parameter Count: 69650


In [8]:
# Read Dataset
testSet = torch.tensor(udf.values, dtype=torch.float)

with torch.no_grad():
  upredictions = model(testSet)

print(upredictions)

tensor([[0.0402],
        [0.0452],
        [0.7297],
        [0.0327],
        [0.0067],
        [0.0016],
        [0.9665],
        [0.9244],
        [0.8043],
        [0.0790],
        [0.9641],
        [0.8187],
        [0.0342]])
