In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

## Setting the seeds for reproducibility

In [2]:
# Modules whose random state is pinned in this cell.
import os
import random

import numpy as np

# One seed shared by every generator configured below.
seed_value = 0

# 1. Export the hash seed through the environment.
# NOTE(review): Python is expected to read PYTHONHASHSEED at interpreter
# start-up, so this assignment presumably only affects processes launched
# from this kernel — confirm that is sufficient for this notebook.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator.
random.seed(seed_value)

# 3. NumPy's global pseudo-random generator.
np.random.seed(seed_value)

## Creating the datasets

### Creating the training set

In [4]:
# Load the training split. The CSV lives two directory levels above the
# current working directory, under preprocessing/data/14_12000/.
# The path is assembled with os.path instead of splitting and joining on '/'
# so that it is also built correctly where the path separator is not '/'.
cwd = os.getcwd()
project_root = os.path.dirname(os.path.dirname(cwd))
train_file_path = os.path.join(
    project_root, 'preprocessing', 'data', '14_12000', 'train_14_12000.csv'
)
train_df = pd.read_csv(train_file_path)

In [5]:
# Preview the first five rows of the training frame (label plus byte columns).
train_df.head(n=5)

Unnamed: 0,label,byte1,byte2,byte3,byte4,byte5,byte6,byte7,byte8,byte9,...,byte1471,byte1472,byte1473,byte1474,byte1475,byte1476,byte1477,byte1478,byte1479,byte1480
0,TeamViewer,235,70,23,50,224,231,75,73,18,...,,,,,,,,,,
1,TeamViewer,231,210,23,50,13,151,138,5,247,...,,,,,,,,,,
2,BitTorrent,200,213,54,184,0,102,153,229,100,...,,,,,,,,,,
3,WhatsApp,216,52,1,187,140,130,117,234,140,...,,,,,,,,,,
4,GoogleServices,135,88,1,187,184,14,7,147,164,...,,,,,,,,,,


In [6]:
## Replace the textual labels with integer category codes.
# NOTE(review): the codes are derived from the labels present in this frame
# only. They line up with the codes assigned to the validation and test frames
# only if every split contains exactly the same set of labels — confirm.
train_labels_as_category = train_df['label'].astype('category')
train_df['label'] = train_labels_as_category.cat.codes

### Creating the validation set

In [7]:
# Load the validation split from preprocessing/data/14_12000/, two directory
# levels above the working directory stored in `cwd`.
# os.path is used instead of splitting and joining on '/' so the path is also
# built correctly where the path separator is not '/'.
validation_file_path = os.path.join(
    os.path.dirname(os.path.dirname(cwd)),
    'preprocessing', 'data', '14_12000', 'val_14_12000.csv',
)
validation_df = pd.read_csv(validation_file_path)

In [8]:
## Replace the textual labels with integer category codes.
# NOTE(review): the codes are derived from the labels present in this frame
# only. They match the codes used for the training labels only if both frames
# contain exactly the same set of labels — confirm.
validation_labels_as_category = validation_df['label'].astype('category')
validation_df['label'] = validation_labels_as_category.cat.codes

### Creating the testing set

In [10]:
# Load the test split from preprocessing/data/14_12000/, two directory levels
# above the working directory stored in `cwd`.
# os.path is used instead of splitting and joining on '/' so the path is also
# built correctly where the path separator is not '/'.
test_file_path = os.path.join(
    os.path.dirname(os.path.dirname(cwd)),
    'preprocessing', 'data', '14_12000', 'test_14_12000.csv',
)
test_df = pd.read_csv(test_file_path)

In [11]:
## Replace the textual labels with integer category codes.
# NOTE(review): the codes are derived from the labels present in this frame
# only. They match the codes used for the training labels only if both frames
# contain exactly the same set of labels — confirm.
test_labels_as_category = test_df['label'].astype('category')
test_df['label'] = test_labels_as_category.cat.codes

## Training the SVC using one-vs-rest

In [12]:
# Linear support-vector classifier: dual=False and an iteration cap of 5000;
# every other setting is left at the library default.
lin_clf = svm.LinearSVC(dual=False, max_iter=5000)

In [13]:
# Training features: every column of train_df except the label.
# The preview of train_df shows missing values in the trailing byte columns,
# and LinearSVC refuses input containing NaN, so missing bytes are filled with
# 0 before scaling. This is a no-op when nothing is missing.
train_features = train_df.drop(columns='label').fillna(0)
# Divide by 255 — presumably to map byte values from 0..255 onto 0..1.
X_train = train_features / 255.0

In [14]:
# Training targets: the label column of the training frame.
y_train = train_df.loc[:, 'label']

In [15]:
# Fit the classifier on the training features and labels; the fitted
# estimator is the cell's displayed value.
lin_clf.fit(X=X_train, y=y_train)

LinearSVC(dual=False, max_iter=5000)

## Evaluating the model's performance on the validation set

In [16]:
# Validation features: every column of validation_df except the label,
# prepared the same way as the training features.
# Missing byte values (the train_df preview shows them in the trailing byte
# columns) are filled with 0, because LinearSVC refuses input containing NaN.
# This is a no-op when nothing is missing.
validation_features = validation_df.drop(columns='label').fillna(0)
# Divide by 255 — presumably to map byte values from 0..255 onto 0..1.
X_validation = validation_features / 255.0

In [17]:
# Validation targets: the label column of the validation frame.
y_validation = validation_df.loc[:, 'label']

In [18]:
# Predicted label codes for every validation row.
y_pred_validation = lin_clf.predict(X=X_validation)

In [19]:
# Fraction of validation rows whose predicted code equals the true code.
validation_accuracy = accuracy_score(y_validation, y_pred_validation)
print("Validation Accuracy:", validation_accuracy)

Validation Accuracy: 0.07284226190476191


## Evaluating the model's performance on the test set

In [20]:
# Test features: every column of test_df except the label, prepared the same
# way as the training features.
# Missing byte values (the train_df preview shows them in the trailing byte
# columns) are filled with 0, because LinearSVC refuses input containing NaN.
# This is a no-op when nothing is missing.
test_features = test_df.drop(columns='label').fillna(0)
# Divide by 255 — presumably to map byte values from 0..255 onto 0..1.
X_test = test_features / 255.0

In [21]:
# Test targets: the label column of the test frame.
y_test = test_df.loc[:, 'label']

In [22]:
# Predicted label codes for every test row.
y_pred_test = lin_clf.predict(X=X_test)

In [23]:
# Fraction of test rows whose predicted code equals the true code.
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.0730952380952381


# Evaluating resource consumption
We evaluate the computational efficiency of the model by measuring the amount of memory and the time needed to make a single prediction.

## Creating a sample packet

In [24]:
# One synthetic packet: 1480 floats drawn from random.random(), arranged as a
# single row so it can be passed to predict() as a one-sample batch.
packet_length = 1480
packet_values = [random.random() for _ in range(packet_length)]
sample_packet = np.array(packet_values).reshape(1, -1)

## Memory

In [25]:
import tracemalloc

In [26]:
# Trace memory allocations while predicting the single sample packet.
tracemalloc.start()
try:
    # Stored under its own name so the test-set predictions held in
    # y_pred_test are not overwritten by this one-row prediction.
    sample_prediction = lin_clf.predict(sample_packet)
    current, peak = tracemalloc.get_traced_memory()
finally:
    # Stop tracing even if the prediction raises, so tracing does not stay
    # enabled for the cells that follow.
    tracemalloc.stop()

# Scale the traced sizes by 10**6 so they are reported in MB.
current = (current / 10**6)
peak = (peak / 10**6)
print("The amount of memory needed to make a single preiction")
print(f"Current memory usage is {current}MB; Peak was {peak}MB")

The amount of memory needed to make a single preiction
Current memory usage is 0.001221MB; Peak was 0.011661MB


## Time


In [27]:
import time

# Time one prediction on the sample packet.
# perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short intervals, whereas the
# wall clock can be coarse or be adjusted while the measurement is running.
start = time.perf_counter()
# Stored under its own name so the test-set predictions held in y_pred_test
# are not overwritten by this one-row prediction.
sample_prediction = lin_clf.predict(sample_packet)
end = time.perf_counter()
print("The time taken to make a prediction: {}".format((end - start)))

The time taken to make a prediction: 0.0004711151123046875
