# **Pre-processing phase**


Import the modules needed throughout the whole experiment.

In [None]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os 
import seaborn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
#np.random.seed(42)  

Create folders in which we will upload the needed files for the experiments

In [None]:
# uncomment if you want to remove all csv files in "out/" directory
#!rm  out/*.csv
# uncomment if you want to remove all csv files in "out_div/" directory
#!rm  out_div/*.csv
# uncomment if you want to remove all csv files in "kmeans/" directory
#!rm  kmeans/*kmeans
!mkdir out
!mkdir out_div
!mkdir kmeans
!pwd

Define a function which parses and loads the files found at the given path and creates the training, validation and test sets.

In [None]:
#### 
# upload files directly from tab on 
#   the left
####
def load_dataset(path):
    '''
    Parse every "out*" CSV file found in `path`, stack the files into one
    3D dataset and split it into training, validation and test sets.

    Each file becomes one sample: every line of the file is one row of
    comma-separated numbers (ints are kept as ints, anything else is
    parsed as float). Files which are empty, ragged, or whose shape
    differs from the first valid sample are bypassed (and reported).

    The dataset is shuffled with NumPy's global random generator, so
    calling np.random.seed(...) beforehand makes the split reproducible.
    Files are visited in sorted order so that the label encoding does not
    depend on the order in which the file system lists them.

    @arg: path contains the path in which there are the
        files to be parsed, loaded and split

    @return: num_classes    n° of different labels in the dataset
             num_samples    n° of samples loaded (train + valid + test)
             num_pkt        size of the third dimension of the dataset
             num_features   size of the second dimension of the dataset
             x_train        samples left after removing valid and test sets
             x_valid        first 30% of the shuffled dataset
             x_test         first 20% of what remains after validation
             y_train
             y_test
             y_valid
             d_labels       dictionary (name_video:integer) used to encode labels
             dataset        whole shuffled dataset (before the split)
             label_dataset  labels of the whole shuffled dataset

    @raise: ValueError if no usable file is found in `path`, or if a
        value in a file is not a number
    '''
    samples = []         # one 2D array per accepted file
    y_all = []           # integer label of each accepted file
    d_labels = {}        # dictionary which contains encoding (name_video:integer) for labels
    sample_shape = None  # shape shared by all samples, taken from the first valid file

    for full_filename in sorted(os.listdir(path)):
        # bypass files which are not meant to be part of the project
        if not full_filename.startswith("out"):
            continue

        with open(os.path.join(path, full_filename), "r", newline='') as file:
            rows = [[_parse_number(value) for value in line.split(",")]
                    for line in file.read().splitlines()]

        # an empty or ragged file cannot be turned into a 2D array
        is_rectangular = len({len(row) for row in rows}) == 1
        sample = np.asarray(rows) if is_rectangular else None

        if sample is not None and sample_shape is None:
            sample_shape = sample.shape

        if sample is None or sample.shape != sample_shape:
            print("[E] Bypassing file "+full_filename+" due to its length")
            continue

        # the label is the video name embedded in the file name; a new
        # name receives the next free integer
        real_filename = full_filename.split(":")[2][2:-4]
        y_all.append(d_labels.setdefault(real_filename, len(d_labels)))
        samples.append(sample)

    if not samples:
        raise ValueError("no usable 'out*' file found in " + str(path))

    #NOTE: first dimension of the dataset stands for number of samples
    #      second dim stands for number of features
    #      third dim stands for number of packets captured in traffic flow of that sample
    x_all = np.stack(samples)
    print("[i] shape initial dataset:",x_all.shape)

    y_all = np.array(y_all)
    num_classes = len(set(y_all))   #n° of different traffic flows (i.e., different videos in dataset)

    #shuffle samples and labels with the same permutation
    order = np.random.permutation(len(x_all))
    dataset = x_all[order]
    label_dataset = y_all[order]

    ########## create validation set ##########
    #validation set contains the first 30% of the shuffled dataset
    n_valid = int(len(dataset)*.3)
    x_valid, x_rest = dataset[:n_valid], dataset[n_valid:]
    y_valid, y_rest = label_dataset[:n_valid], label_dataset[n_valid:]

    ########## create test set ##########
    #test set contains 20% of the remaining elements
    n_test = int(len(x_rest)*.20)
    x_test, x_train = x_rest[:n_test], x_rest[n_test:]
    y_test, y_train = y_rest[:n_test], y_rest[n_test:]

    print("[i] labels y_train:",y_train)
    print("[i] labels y_valid:",y_valid)
    print("[i] labels y_test:",y_test)
    print("[i] encoding:",d_labels)

    #set useful values for creating the model
    num_samples  = x_train.shape[0]+x_valid.shape[0]+x_test.shape[0]   #n° of different video samples taken
    num_pkt      = x_train.shape[2]     #n° pkt intercepted in traffic flow
    num_features = x_train.shape[1]     #n° features (e.g., global_size, pkt_byte, inter-times, ...)

    print("[D] num_classes:",num_classes)
    print("[D] num_samples:",num_samples)
    print("[D] num_pkt:",num_pkt)
    print("[D] num_features:",num_features)
    print("[D] training data:",x_train.shape[0])

    return num_classes, num_samples, num_pkt, num_features, x_train, x_valid, x_test, y_train, y_test, y_valid, d_labels, dataset, label_dataset


def _parse_number(text):
    '''Convert `text` to int when possible, otherwise to float (ValueError if neither works).'''
    try:
        return int(text)
    except ValueError:
        return float(text)

## K-Means

In [None]:
# Load the files stored in /content/kmeans/; only `dataset`, `label_dataset`
# and `d_labels` are used by the K-Means cells below.
num_classes, num_samples, num_pkt, num_features, x_train, x_valid, x_test, y_train, y_test, y_valid, d_labels, dataset, label_dataset = load_dataset("/content/kmeans/")

Reshape dataset from 3D to 2D, standardize features and apply PCA to reduce dimensionality from 20 to 2, in order to be able to visualize data

In [None]:
print("[D] initial dataset.shape:",dataset.shape)
print("*"*80)

# Flatten every sample into a single row (one row per sample). A single
# reshape gives the same row-major layout as ravel()-ing each sample and
# appending it, without re-copying the whole matrix at every iteration.
x_train_2d = np.reshape(dataset, (len(dataset), -1))

print("[D] final x_train.shape:",x_train_2d.shape)

### scale features ###
# fit_transform returns a new array, so `dataset` itself is left untouched
x_train_2d = StandardScaler().fit_transform(x_train_2d)

### apply PCA ###
# keep only the first two principal components so the data can be plotted
pca_2 = PCA(n_components=2)
pca_2_result = pca_2.fit_transform(x_train_2d)
print('Explained variation per principal component: {}'.format(pca_2.explained_variance_ratio_))

print('Cumulative variance explained by 2 principal components: {:.2%}'.format(np.sum(pca_2.explained_variance_ratio_)))

print("[D] shape PCA dim2:",pca_2_result.shape)

In [None]:
print("label_dataset:",label_dataset)
print("*"*50)
print("Encoding:")
for k, v in d_labels.items():
    print("label",v,": "+k)
print("*"*50)

#n_clusters = 4

# inverse of d_labels (integer label -> video name), built once instead of
# scanning the dictionary for every clustered sample
label_to_name = {label: name for name, label in d_labels.items()}

for n_clusters in range(2,6):
    print("+++++++++++ n_clusters:{} +++++++++++".format(n_clusters))
    # `algorithm` is left to the library default: the former "auto" value is
    # no longer accepted by recent scikit-learn releases
    kmeans = KMeans(n_clusters=n_clusters, n_init=20)
    # Train K-Means on the 2D PCA projection
    y_pred_kmeans = kmeans.fit_predict(pca_2_result) #kmeans.fit_predict(x_train_2d)

    #print("y_pred:",y_pred_kmeans)

    '''
    ##### plot training set and predicted clusters #####
    fig, ax = plt.subplots()
    sc = ax.scatter(pca_2_result[:, 0], pca_2_result[:, 1], c=label_dataset, edgecolors='k', cmap=plt.cm.Paired);
    ax.legend(*sc.legend_elements(), title='clusters')
    plt.xlabel("pca 1")
    plt.ylabel("pca 2")
    plt.title('Training set')

    fig, ax = plt.subplots()
    sc = ax.scatter(pca_2_result[:, 0], pca_2_result[:, 1], c=y_pred_kmeans, edgecolors='k');
    ax.legend(*sc.legend_elements(), title='clusters')
    #get centroids and plot them
    centers = np.array(kmeans.cluster_centers_)
    plt.scatter(centers[:,0], centers[:,1], marker="x", color='r')
    plt.xlabel("pca 1")
    plt.ylabel("pca 2")
    plt.title('Predicted clusters')
    '''
    ## print elements in same cluster
    for cluster_id in range(n_clusters):
        print("------ Cluster {} ------".format(cluster_id))
        # positions (in label_dataset) of the samples assigned to this cluster
        for index in np.flatnonzero(y_pred_kmeans == cluster_id):
            # print the video name without its last two characters
            print(label_to_name[int(label_dataset[index])][:-2])
        print()



# **Attacker's DL Model**

TODO: brief description what the model is used for and how it works

## Upload - Parse and Load dataset with traffic without defence
The following cell loads the files containing the feature values (uploaded to Colab beforehand), parses the data into the required format and finally splits the dataset into training, validation and test sets.

In [None]:
# Load the traffic captured without defence from /content/out/ and unpack
# the sizes, the train/validation/test splits and the label encoding.
num_classes, num_samples, num_pkt, num_features, x_train, x_valid, x_test, y_train, y_test, y_valid, d_labels, dataset, label_dataset = load_dataset("/content/out/")

## Define Model
TODO: describe how the model is composed + activation functions + loss

In [None]:
#Best model found
def build_model_2d(num_classes, rows, cols, nb_filters=64, pool_size=(2, 2), kernel_size=(1, 3)):
    '''
    Build and compile the 2D convolutional classifier.

    @arg: num_classes  n° of units of the final softmax layer
          rows, cols   the network input has shape (rows, cols, 1)
          nb_filters   n° of filters of every Conv2D layer
          pool_size    window of the max-pooling layer
          kernel_size  kernel of every Conv2D layer

    @return: a Sequential model compiled with the adam optimizer, the
             sparse categorical cross-entropy loss and the accuracy metric
    '''
    # defaults are tuples (not lists) so no mutable object is shared between calls
    layers = keras.layers
    model = keras.models.Sequential([
        layers.Conv2D(nb_filters, kernel_size=kernel_size, input_shape=(rows, cols, 1)),
        layers.Activation('tanh'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Activation('relu'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.4),
        layers.Activation('relu'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.7),
        layers.Activation('relu'),
        layers.MaxPooling2D(pool_size=pool_size),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.2),
        layers.Flatten(),
        layers.Dense(64),
        layers.Activation('relu'),
        layers.Dropout(rate=0.4),
        layers.Dense(num_classes),
        layers.Activation('softmax'),
    ])
    # labels are plain integers, hence the "sparse" cross-entropy
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy']) #optimizer='sgd'
    return model


# Instantiate the classifier: one input row per feature, one column per packet.
model = build_model_2d(
    num_classes=num_classes,
    rows=num_features,
    cols=num_pkt,
)

#print(get_model_summary(model))

#from keras.utils.vis_utils import plot_model
#plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True, rankdir="LR")

## Train model
TODO: describe the main characteristics of the training session (e.g., #epochs, hyperparameters, ...)

In [None]:
def plot_loss(history):
    '''Plot the training and validation loss stored in `history` (as returned by model.fit) against the epochs.'''
    epochs = history.epoch
    curves = history.history
    plt.figure(figsize=(10, 6))
    for curve_name in ('loss', 'val_loss'):
        plt.plot(epochs, curves[curve_name], label=curve_name)
    plt.legend()
    plt.title('loss')

def plot_accuracy(history):
    '''Plot the training and validation accuracy stored in `history` (as returned by model.fit) against the epochs.'''
    epochs = history.epoch
    curves = history.history
    plt.figure(figsize=(10, 6))
    for curve_name in ('accuracy', 'val_accuracy'):
        plt.plot(epochs, curves[curve_name], label=curve_name)
    plt.legend()
    plt.title('accuracy')


# Train for 20 epochs in mini-batches of 10 samples, monitoring the
# validation set after every epoch, then plot the learning curves.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    batch_size=10,
    epochs=20,
)
plot_loss(history)
plot_accuracy(history)

## Test model

In [None]:
# Evaluate the trained model on the held-out test set and report the
# second value returned by evaluate() (the metric that follows the loss).
scores = model.evaluate(x=x_test, y=y_test, verbose=2)
separator = "*" * 80
print(separator)
metric_name = model.metrics_names[1]
print(" %s test_set: %.2f%%" % (metric_name, scores[1] * 100))

## Plot confusion matrix without defences

In [None]:
# Predict the whole test set in one call: each row of `probabilities` holds
# the probability assigned by the model to every label for one sample.
probabilities = model.predict(x_test)
y_pred = list(np.argmax(probabilities, axis=1))  # first label with the highest probability

# inverse of d_labels: integer label -> video name
label_to_video = {label: video for video, label in d_labels.items()}

t = 0  # n° of correctly classified samples
w = 0  # n° of misclassified samples

for i, (true_label, pred_label) in enumerate(zip(y_test, y_pred)):
    if int(true_label) == int(pred_label):
        t += 1
        #print("OK "+label_to_video[int(true_label)])
        #print("label:"+str(true_label))
        continue

    w += 1
    prediction = probabilities[i:i+1]  # kept 2D so it is printed as before
    true_video = label_to_video[int(true_label)]
    pred_video = label_to_video[int(pred_label)]
    print("[-] NOT OK")
    print("prediction:",np.round(prediction,decimals=3))
    print("true label:"+str(true_label))
    print("pred label:"+str(pred_label))
    print("\'"+true_video + "\'" + " predicted as \'" + pred_video + "\'")
    print("-"*80)


#print("acc:"+str(float(t/len(x_test))))


# `labels` forces a num_classes x num_classes matrix even when a class is
# missing from both y_test and y_pred; without it the DataFrame below
# raises because the matrix is smaller than its index.
class_ids = list(range(num_classes))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
df_cm = pd.DataFrame(cm, index=class_ids, columns=class_ids)
plt.figure(figsize = (10,7))
seaborn.heatmap(df_cm, annot=True, cmap="rainbow", linewidths=.5).set(xlabel='Predicted label', ylabel='True label', title="Confusion matrix w/out defence")

for k, v in d_labels.items():
    print("label",v,": "+k)
print()

---
---



# **Divergent**

## Upload - Parse and Load dataset with Divergent applied

In [None]:
!rm -rd out_div/
!mkdir out_div/

In [None]:
# Load the traffic stored in /content/out_div/ (the directory is recreated
# empty by the previous cell, so its files must be uploaded before running this).
num_classes, num_samples, num_pkt, num_features, x_train, x_valid, x_test, y_train, y_test, y_valid, d_labels, dataset, label_dataset = load_dataset("/content/out_div/")

## Train model on traffic with Divergent

In [None]:
def build_model_2d(num_classes, rows, cols, nb_filters=64, pool_size=(2, 2), kernel_size=(1, 3)):
    '''
    Build and compile the 2D convolutional classifier used on the
    traffic with Divergent applied.

    @arg: num_classes  n° of units of the final softmax layer
          rows, cols   the network input has shape (rows, cols, 1)
          nb_filters   n° of filters of every Conv2D layer
          pool_size    window of the max-pooling layer
          kernel_size  kernel of every Conv2D layer

    @return: a Sequential model compiled with the adam optimizer, the
             sparse categorical cross-entropy loss and the accuracy metric
    '''
    # defaults are tuples (not lists) so no mutable object is shared between calls
    layers = keras.layers
    model = keras.models.Sequential([
        layers.Conv2D(nb_filters, kernel_size=kernel_size, input_shape=(rows, cols, 1)),
        layers.Activation('tanh'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Activation('relu'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.4),
        layers.Activation('relu'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.7),
        layers.Activation('relu'),
        layers.MaxPooling2D(pool_size=pool_size),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.2),
        layers.Flatten(),
        layers.Dense(64),
        layers.Activation('relu'),
        layers.Dropout(rate=0.7),
        layers.Dense(num_classes),
        layers.Activation('softmax'),
    ])
    # labels are plain integers, hence the "sparse" cross-entropy
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy']) #optimizer='sgd'
    return model


# Instantiate the classifier: one input row per feature, one column per packet.
model = build_model_2d(
    num_classes=num_classes,
    rows=num_features,
    cols=num_pkt,
)



######################################################################################

def plot_loss(history):
    '''Plot the training and validation loss stored in `history` (as returned by model.fit) against the epochs.'''
    epochs = history.epoch
    curves = history.history
    plt.figure(figsize=(10, 6))
    for curve_name in ('loss', 'val_loss'):
        plt.plot(epochs, curves[curve_name], label=curve_name)
    plt.legend()
    plt.title('loss')

def plot_accuracy(history):
    '''Plot the training and validation accuracy stored in `history` (as returned by model.fit) against the epochs.'''
    epochs = history.epoch
    curves = history.history
    plt.figure(figsize=(10, 6))
    for curve_name in ('accuracy', 'val_accuracy'):
        plt.plot(epochs, curves[curve_name], label=curve_name)
    plt.legend()
    plt.title('accuracy')


# Train for 30 epochs in mini-batches of 10 samples, monitoring the
# validation set after every epoch, then plot the learning curves.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    batch_size=10,
    epochs=30,
)
plot_loss(history)
plot_accuracy(history)


######################################################################################


# Evaluate on the held-out test set and report the second value returned
# by evaluate() (the metric that follows the loss).
scores = model.evaluate(x=x_test, y=y_test, verbose=2)
separator = "\n" + "-" * 90 + "\n"
print(separator)
metric_name = model.metrics_names[1]
print("[+] %s test_set: %.2f%%" % (metric_name, scores[1] * 100))

## Plot confusion matrix with Divergent applied

In [None]:
# Predict the whole test set in one call: each row of `probabilities` holds
# the probability assigned by the model to every label for one sample.
probabilities = model.predict(x_test)
y_pred = list(np.argmax(probabilities, axis=1))  # first label with the highest probability

# inverse of d_labels: integer label -> video name
label_to_video = {label: video for video, label in d_labels.items()}

t = 0  # n° of correctly classified samples
w = 0  # n° of misclassified samples

for i, (true_label, pred_label) in enumerate(zip(y_test, y_pred)):
    prediction = probabilities[i:i+1]  # kept 2D so it is printed as before
    true_video = label_to_video[int(true_label)]
    if int(true_label) == int(pred_label):
        t += 1
        print("[+] OK "+true_video)
        print("prediction:",np.round(prediction,decimals=3))
        print("label:"+str(true_label))
        print("-"*80)
    else:
        w += 1
        pred_video = label_to_video[int(pred_label)]
        print("[-] NOT OK")
        print("prediction:",np.round(prediction,decimals=3))
        print("true label:"+str(true_label))
        print("pred label:"+str(pred_label))
        print("\'"+true_video + "\'" + " predicted as \'" + pred_video + "\'")
        print("-"*80)


#print("acc:"+str(float(t/len(x_test))))


# `labels` forces a num_classes x num_classes matrix even when a class is
# missing from both y_test and y_pred; without it the DataFrame below
# raises because the matrix is smaller than its index.
class_ids = list(range(num_classes))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

df_cm = pd.DataFrame(cm, index=class_ids, columns=class_ids)
plt.figure(figsize = (10,7))
seaborn.heatmap(df_cm, annot=True, cmap="rainbow", linewidths=.5).set(xlabel='Predicted label', ylabel='True label',title="Confusion matrix w/ Divergent")

for k, v in d_labels.items():
    print("label",v,": "+k)
print()

# Experiment with **TOR** traffic


In [None]:
#create directory to put dataset
!mkdir tor_traffic

In [None]:
#### 
# upload files directly from tab on 
#   the left
####
def load_TOR_dataset(path, max_size_cell=25000):
    '''
    Parse every file found in `path`, stack the files into one 3D dataset
    and split it into training, validation and test sets.

    Each file becomes one sample made of two rows: the first column of the
    file (parsed as float, the timestamps) and the second column (parsed as
    int, the direction of the packets). Columns are tab-separated and blank
    lines are skipped. Both rows are padded with -20 up to `max_size_cell`
    values so that all samples share the same shape.

    The label of a sample is the first character of its file name
    converted to int.
    NOTE(review): this only distinguishes single-digit labels - confirm
    the naming scheme of the files before using more than 10 classes.

    The dataset is shuffled with NumPy's global random generator, so
    calling np.random.seed(...) beforehand makes the split reproducible.

    @arg: path contains the path in which there are the
        TOR files to be parsed, loaded and split
    @arg: max_size_cell length every sample is padded to

    @return: num_classes    n° of different labels in the dataset
             num_samples    n° of samples loaded (train + valid + test)
             num_pkt        size of the third dimension (max_size_cell)
             num_features   size of the second dimension (2)
             x_train        samples left after removing valid and test sets
             x_valid        first 30% of the shuffled dataset
             x_test         first 20% of what remains after validation
             y_train
             y_test
             y_valid
             dataset        whole shuffled dataset (before the split)
             label_dataset  labels of the whole shuffled dataset

    @raise: ValueError if `path` contains no file, if a file holds more
        than `max_size_cell` lines, or if a value cannot be parsed
    '''
    PAD_VALUE = -20  # dummy value appended so that all cells have same dimensions

    samples = []  # one (2, max_size_cell) array per file
    labels = []   # integer label of each file

    for cell_name in sorted(os.listdir(path)):
        timestamps = []
        directions = []
        with open(os.path.join(path, cell_name), "r", newline='') as file:
            for line in file.read().splitlines():
                if not line.strip():
                    continue
                fields = line.split("\t")
                timestamps.append(float(fields[0]))
                directions.append(int(fields[1]))

        # a longer cell cannot be stacked with the others: fail with an
        # explicit message instead of an obscure shape error later on
        if len(timestamps) > max_size_cell:
            raise ValueError("file " + cell_name + " contains " + str(len(timestamps))
                             + " lines, more than max_size_cell=" + str(max_size_cell))

        padding = [PAD_VALUE] * (max_size_cell - len(timestamps))
        samples.append(np.asarray([timestamps + padding, directions + padding]))
        labels.append(int(cell_name[0]))

    if not samples:
        raise ValueError("no file found in " + str(path))

    x_all = np.stack(samples)
    y_all = np.array(labels)

    num_classes = len(set(y_all))

    #shuffle samples and labels with the same permutation
    order = np.random.permutation(len(x_all))
    dataset = x_all[order]
    label_dataset = y_all[order]

    ########## create validation set ##########
    #validation set contains the first 30% of the shuffled dataset
    n_valid = int(len(dataset)*.3)
    x_valid, x_rest = dataset[:n_valid], dataset[n_valid:]
    y_valid, y_rest = label_dataset[:n_valid], label_dataset[n_valid:]

    ########## create test set ##########
    #test set contains 20% of the remaining elements
    n_test = int(len(x_rest)*.20)
    x_test, x_train = x_rest[:n_test], x_rest[n_test:]
    y_test, y_train = y_rest[:n_test], y_rest[n_test:]

    print("[i] labels y_train:",y_train)
    print("[i] labels y_valid:",y_valid)
    print("[i] labels y_test:",y_test)

    #set useful values for creating the model
    num_samples  = x_train.shape[0]+x_valid.shape[0]+x_test.shape[0]   #n° of samples taken
    num_pkt      = x_train.shape[2]     #n° values per row (padded to max_size_cell)
    num_features = x_train.shape[1]     #n° features (timestamps and directions)

    print("[D] num_classes:",num_classes)
    print("[D] num_samples:",num_samples)
    print("[D] max num_pkt:",num_pkt)
    print("[D] num_features:",num_features)
    print("[D] training data:",x_train.shape[0])

    return num_classes, num_samples, num_pkt, num_features, x_train, x_valid, x_test, y_train, y_test, y_valid, dataset, label_dataset

## Testing on cells without defence

In [None]:
# Load the Tor cells stored in tor_traffic/ (relative to the current working
# directory) and unpack the sizes, the splits and the whole shuffled dataset.
num_classes, num_samples, num_pkt, num_features, x_train, x_valid, x_test, y_train, y_test, y_valid, dataset, label_dataset = load_TOR_dataset("tor_traffic/")

In [None]:
#Best model found for TOR traffic, with 1111 samples in dataset, 10 classes and 623 sampled in training set
def build_model_2d(num_classes, rows, cols, nb_filters=32, pool_size=(2, 2), kernel_size=(1, 3)):
    '''
    Build and compile the 2D convolutional classifier used on TOR traffic.

    @arg: num_classes  n° of units of the final softmax layer
          rows, cols   the network input has shape (rows, cols, 1)
          nb_filters   n° of filters of every Conv2D layer
          pool_size    window of the average-pooling layer
          kernel_size  kernel of every Conv2D layer

    @return: a Sequential model compiled with the adam optimizer, the
             sparse categorical cross-entropy loss and the accuracy metric
    '''
    # defaults are tuples (not lists) so no mutable object is shared between calls
    layers = keras.layers
    model = keras.models.Sequential([
        layers.Conv2D(nb_filters, kernel_size=kernel_size, input_shape=(rows, cols, 1)),
        layers.Activation('relu'),
        #layers.Conv2D(nb_filters, kernel_size=kernel_size),
        #layers.Activation('tanh'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.4),
        layers.Activation('tanh'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.7),
        layers.Activation('relu'),
        layers.AveragePooling2D(pool_size=pool_size), #layers.MaxPooling2D(pool_size=pool_size)
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.2),
        layers.Flatten(),
        layers.Dense(128),
        layers.Activation('relu'),
        layers.Dense(64),
        layers.Activation('relu'),
        layers.Dense(32),
        layers.Activation('relu'),
        layers.Dropout(rate=0.4),
        layers.Dense(num_classes),
        layers.Activation('softmax'),
    ])
    # labels are plain integers, hence the "sparse" cross-entropy
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy']) #optimizer='sgd'
    return model


# Instantiate the classifier: one input row per feature, one column per packet.
model = build_model_2d(num_classes=num_classes,rows=num_features,cols=num_pkt)

#print(get_model_summary(model))

# Take plot_model from the `tensorflow.keras` package already imported at
# the top of the notebook: the `keras.utils.vis_utils` module is missing
# from recent Keras releases, and importing the standalone `keras` package
# next to `tensorflow.keras` can pull in a different Keras installation.
plot_model = keras.utils.plot_model
#plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True, rankdir="LR")



def plot_loss(history):
    '''Plot the training and validation loss stored in `history` (as returned by model.fit) against the epochs.'''
    epochs = history.epoch
    curves = history.history
    plt.figure(figsize=(10, 6))
    for curve_name in ('loss', 'val_loss'):
        plt.plot(epochs, curves[curve_name], label=curve_name)
    plt.legend()
    plt.title('loss')

def plot_accuracy(history):
    '''Plot the training and validation accuracy stored in `history` (as returned by model.fit) against the epochs.'''
    epochs = history.epoch
    curves = history.history
    plt.figure(figsize=(10, 6))
    for curve_name in ('accuracy', 'val_accuracy'):
        plt.plot(epochs, curves[curve_name], label=curve_name)
    plt.legend()
    plt.title('accuracy')


# Train for 45 epochs in mini-batches of 25 samples, monitoring the
# validation set after every epoch, then plot the learning curves.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    batch_size=25,
    epochs=45,
)
plot_loss(history)
plot_accuracy(history)



# Evaluate on the held-out test set and report the second value returned
# by evaluate() (the metric that follows the loss).
scores = model.evaluate(x=x_test, y=y_test, verbose=2)
separator = "*" * 80
print(separator)
metric_name = model.metrics_names[1]
print(" %s test_set: %.2f%%" % (metric_name, scores[1] * 100))

## Testing on cells with Divergent

In [None]:
# Load the Tor cells again and unpack the sizes, the splits and the dataset.
# NOTE(review): this reads the same "tor_traffic/" directory as the
# "without defence" cell above - confirm the directory has been repopulated
# with the cells protected by Divergent before running this cell.
num_classes, num_samples, num_pkt, num_features, x_train, x_valid, x_test, y_train, y_test, y_valid, dataset, label_dataset = load_TOR_dataset("tor_traffic/")

In [None]:
def build_model_2d(num_classes, rows, cols, nb_filters=32, pool_size=(2, 2), kernel_size=(1, 3)):
    '''
    Build and compile the 2D convolutional classifier used on TOR cells
    with Divergent applied.

    @arg: num_classes  n° of units of the final softmax layer
          rows, cols   the network input has shape (rows, cols, 1)
          nb_filters   n° of filters of every Conv2D layer
          pool_size    window of the average-pooling layer
          kernel_size  kernel of every Conv2D layer

    @return: a Sequential model compiled with the adam optimizer, the
             sparse categorical cross-entropy loss and the accuracy metric
    '''
    # defaults are tuples (not lists) so no mutable object is shared between calls
    layers = keras.layers
    model = keras.models.Sequential([
        layers.Conv2D(nb_filters, kernel_size=kernel_size, input_shape=(rows, cols, 1)),
        layers.Activation('relu'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.4),
        layers.Activation('tanh'),
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.7),
        layers.Activation('relu'),
        layers.AveragePooling2D(pool_size=pool_size), #layers.MaxPooling2D(pool_size=pool_size)
        layers.Conv2D(nb_filters, kernel_size=kernel_size),
        layers.Dropout(rate=0.2),
        layers.Flatten(),
        layers.Dense(128),
        layers.Activation('relu'),
        layers.Dense(64),
        layers.Activation('relu'),
        layers.Dense(32),
        layers.Activation('relu'),
        layers.Dropout(rate=0.4),
        layers.Dense(num_classes),
        layers.Activation('softmax'),
    ])
    # labels are plain integers, hence the "sparse" cross-entropy
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy']) #optimizer='sgd'
    return model


# Instantiate the classifier: one input row per feature, one column per packet.
model = build_model_2d(
    num_classes=num_classes,
    rows=num_features,
    cols=num_pkt,
)


def plot_loss(history):
    '''Plot the training and validation loss stored in `history` (as returned by model.fit) against the epochs.'''
    epochs = history.epoch
    curves = history.history
    plt.figure(figsize=(10, 6))
    for curve_name in ('loss', 'val_loss'):
        plt.plot(epochs, curves[curve_name], label=curve_name)
    plt.legend()
    plt.title('loss')

def plot_accuracy(history):
    '''Plot the training and validation accuracy stored in `history` (as returned by model.fit) against the epochs.'''
    epochs = history.epoch
    curves = history.history
    plt.figure(figsize=(10, 6))
    for curve_name in ('accuracy', 'val_accuracy'):
        plt.plot(epochs, curves[curve_name], label=curve_name)
    plt.legend()
    plt.title('accuracy')


# Train for 35 epochs in mini-batches of 25 samples, monitoring the
# validation set after every epoch, then plot the learning curves.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    batch_size=25,
    epochs=35,
)
plot_loss(history)
plot_accuracy(history)



# Evaluate on the held-out test set and report the second value returned
# by evaluate() (the metric that follows the loss).
scores = model.evaluate(x=x_test, y=y_test, verbose=2)
separator = "*" * 80
print(separator)
metric_name = model.metrics_names[1]
print(" %s test_set: %.2f%%" % (metric_name, scores[1] * 100))

In [None]:
# Predict the whole test set in one call: each row of `probabilities` holds
# the probability assigned by the model to every label for one sample.
probabilities = model.predict(x_test)
y_pred = list(np.argmax(probabilities, axis=1))  # first label with the highest probability

# n° of correctly classified (t) and misclassified (w) samples
t = sum(1 for true_label, pred_label in zip(y_test, y_pred) if int(true_label) == int(pred_label))
w = len(y_pred) - t

#print("acc:"+str(float(t/len(x_test))))


# `labels` forces a num_classes x num_classes matrix even when a class is
# missing from both y_test and y_pred; without it the DataFrame below
# raises because the matrix is smaller than its index.
class_ids = list(range(num_classes))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

df_cm = pd.DataFrame(cm, index=class_ids, columns=class_ids)
plt.figure(figsize = (10,7))
seaborn.heatmap(df_cm, annot=True, cmap="rainbow", linewidths=.5).set(xlabel='Predicted label', ylabel='True label',title="Confusion matrix w/ Divergent in Tor traffic")

print()