In [1]:
import vaex
import numpy as np
import pandas as pd
from pyclustering.cluster import cure


In [2]:
def normalize(data, mins=None, maxs=None):
    """Min-max scale a chunk of data using per-feature bounds.

    Each feature column ``x`` is mapped to ``(x - min) / (max - min)`` using
    the supplied bounds rather than the chunk's own extrema, so every chunk is
    scaled the same way.

    Parameters
    ----------
    data : pandas.DataFrame
        Chunk to scale. Only its leading ``len(mins)`` columns are used; any
        further columns are ignored.
    mins, maxs : array-like, optional
        Per-feature lower and upper bounds. When omitted, the notebook-level
        ``min_holder`` / ``max_holder`` arrays are used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, n_features)`` with the scaled values.

    Raises
    ------
    ValueError
        If the bounds differ in length or ``data`` has fewer columns than
        there are bounds.
    """
    lows = np.asarray(min_holder if mins is None else mins, dtype=float).reshape(-1)
    highs = np.asarray(max_holder if maxs is None else maxs, dtype=float).reshape(-1)
    if lows.shape != highs.shape:
        raise ValueError("mins and maxs must have the same length")
    n_features = lows.shape[0]

    values = data.values
    if values.ndim != 2 or values.shape[1] < n_features:
        raise ValueError(
            "data must have at least %d columns to be normalized" % n_features
        )

    # NOTE(review): a feature whose min equals its max gives a zero
    # denominator, so that column comes out as NaN/inf. This is left as-is on
    # purpose so the scaling stays identical to the element-wise formula.
    return (values[:, :n_features] - lows) / (highs - lows)

In [3]:
def dist(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Used to find the cluster representative closest to a data point.
    """
    delta = vecA - vecB
    squared = np.power(delta, 2)
    return np.sqrt(squared.sum())

In [4]:
def get_closest_cluster(data_point, rep):
    """Return the index of the cluster owning the representative nearest to a point.

    The Euclidean distance from ``data_point`` to every representative is
    computed one cluster at a time with NumPy, so there is no Python-level
    loop over individual representatives.

    Parameters
    ----------
    data_point : array-like
        Feature vector of a single data point.
    rep : sequence of sequences
        ``rep[i]`` holds the representative vectors of cluster ``i``.

    Returns
    -------
    int
        Index ``i`` of the cluster with the closest representative. On ties
        the lowest cluster index wins.

    Raises
    ------
    ValueError
        If no representative lies at a finite distance (``rep`` is empty, all
        clusters are empty, or the distances are NaN/inf). No cluster can be
        chosen in that case.
    """
    point = np.asarray(data_point, dtype=float).reshape(-1)
    best_cluster = None
    best_dist = float('inf')
    for cluster_idx, cluster_reps in enumerate(rep):
        if len(cluster_reps) == 0:
            continue
        reps = np.asarray(cluster_reps, dtype=float).reshape(len(cluster_reps), -1)
        dists = np.sqrt(np.power(reps - point, 2).sum(axis=-1))
        # NaN never compares smaller than anything, so drop it before taking
        # the minimum; otherwise one NaN would hide the finite distances.
        dists = dists[~np.isnan(dists)]
        if dists.size == 0:
            continue
        nearest = dists.min()
        # Strict '<' keeps the first cluster on ties.
        if nearest < best_dist:
            best_dist = nearest
            best_cluster = cluster_idx
    if best_cluster is None:
        raise ValueError("no cluster representative at a finite distance from the data point")
    return best_cluster

In [5]:
def get_output_data_full_streaming(chunk, mal_indicator, rep, cluster_count_benign, cluster_count_mal, ind_count):
    """Tally, per cluster, how many points of a chunk are benign or malicious.

    Each row of ``chunk`` is assigned to its closest cluster and the matching
    counter is incremented in place. ``ind_count`` is the running position in
    ``mal_indicator`` (which is held fully in memory, not streamed) and is
    advanced by one for every row processed.

    Returns the two count containers and the updated ``ind_count`` so the
    caller can carry them over to the next chunk.
    """
    for row_idx in range(len(chunk)):
        # Column 1 of the label row is the indicator: 0 means benign,
        # anything else is counted as malicious.
        label = mal_indicator[ind_count][1]
        ind_count += 1
        nearest = get_closest_cluster(chunk[row_idx], rep)
        tally = cluster_count_benign if label == 0 else cluster_count_mal
        tally[nearest] += 1
    return cluster_count_benign, cluster_count_mal, ind_count

In [6]:
# Read the pre-computed data sample, which fits in memory.
# header=None: no line of the file is treated as column names, so every line
# is loaded as data.
df = pd.read_csv("sample_3k/sample_3k_total.csv", header=None)

In [7]:
# Min-max normalize the first 115 feature columns of the sample data
mat_data = df.values
mat_data_tp = mat_data.T
# The per-feature minimum and maximum of the sample are saved because they are
# reused to normalize the streaming data later
min_holder = np.zeros(115)
max_holder = np.zeros(115)
normed_data = []
for i in range(115):
    feature = mat_data_tp[i]
    low = feature.min()
    high = feature.max()
    min_holder[i] = low
    max_holder[i] = high
    normed_data.append(np.asarray((feature - low) / (high - low)).reshape(-1))
# Back to one row per data point, one column per feature
normed_data = np.transpose(normed_data)

In [8]:
# Define the parameters of the CURE algorithm.
# 9 clusters are requested; their sizes are inspected afterwards to identify
# the high-density clusters.
cure_algo = cure.cure(
    data=normed_data,
    number_cluster=9,
    number_represent_points=10,
    compression=0.2,
    ccore=True,
)

In [9]:
# Run the CURE clustering on the normalized sample.
# The call is deliberately the cell's last expression, so the notebook echoes
# the repr of the object it returns.
cure_algo.process()

<pyclustering.cluster.cure.cure at 0x269b4f1cb80>

In [10]:
# Retrieve the clusters found by CURE. The length of each entry is printed
# below to see how many sample points each cluster holds.
clusters = cure_algo.get_clusters()

In [11]:
# Print the number of data points in each cluster, one per line, to identify
# the high-density clusters
for members in clusters:
    print(len(members))

2938
1
15051
1
3401
2
1
15
2590


In [12]:
# Get the representative points of each cluster. Streamed data points are
# later assigned to the cluster that owns their nearest representative.
representors = cure_algo.get_representors()

In [13]:
# Number of dataset rows read per streamed chunk
chunksize = 5000

# One entry per attack type: (dataset CSV, labels CSV, results CSV)
attack_runs = [
    ("SSL Renegotiation_dataset-002.csv", "SSL Renegotiation_labels.csv", "results/SSL_final_results.csv"),
    ("Active Wiretap_dataset-002.csv", "Active Wiretap_labels.csv", "results/wiretap_final_results.csv"),
    ("ARP MitM_dataset-002.csv", "ARP MitM_labels.csv", "results/ARP_final_results.csv"),
    ("Fuzzing_dataset-002.csv", "Fuzzing_labels.csv", "results/fuzzing_final_results.csv"),
    ("OS Scan_dataset-002.csv", "OS Scan_labels.csv", "results/scan_final_results.csv"),
    ("SSDP Flood_dataset-002.csv", "SSDP Flood_labels.csv", "results/SSDP_final_results.csv"),
    ("SYN DoS_dataset-002.csv", "SYN DoS_labels.csv", "results/SYN_results.csv"),
    ("Video Injection_dataset-002.csv", "Video Injection_labels.csv", "results/video_results.csv"),
]


def count_attack_clusters(dataset_path, labels_path, rep, rows_per_chunk):
    """Stream one attack dataset and count benign/malicious points per cluster.

    The labels file is loaded fully into memory. The dataset is read in chunks
    of ``rows_per_chunk`` rows; each chunk is normalized with ``normalize`` and
    every row is assigned to a cluster by ``get_output_data_full_streaming``.

    Parameters
    ----------
    dataset_path : str
        CSV file holding the feature rows of one attack type.
    labels_path : str
        CSV file holding the labels (malicious or not) for the same rows.
    rep : sequence
        Cluster representatives; one counter is kept per entry.
    rows_per_chunk : int
        Number of dataset rows to read at a time.

    Returns
    -------
    tuple of numpy.ndarray
        ``(benign_counts, malicious_counts)``, each of length ``len(rep)``.
    """
    indicator_data = pd.read_csv(labels_path).values
    benign_counts = np.zeros(len(rep))
    malicious_counts = np.zeros(len(rep))
    ind_count = 0
    # NOTE(review): the sample CSV is read with header=None, but the dataset
    # CSVs are read with pandas' default header handling, which takes the
    # first line as column names. If the dataset files have no header row,
    # their first packet is dropped and every label is shifted by one.
    # Confirm against the files before trusting the counts.
    for chunk in pd.read_csv(dataset_path, chunksize=rows_per_chunk):
        normed_chunk = normalize(chunk)
        benign_counts, malicious_counts, ind_count = get_output_data_full_streaming(
            normed_chunk, indicator_data, rep, benign_counts, malicious_counts, ind_count
        )
    return benign_counts, malicious_counts


# Process every attack type in turn and save its per-cluster counts: one row
# per cluster, columns = (benign count, malicious count)
for dataset_path, labels_path, results_path in attack_runs:
    cluster_count_benign_start, cluster_count_mal_start = count_attack_clusters(
        dataset_path, labels_path, representors, chunksize
    )
    final_results = [cluster_count_benign_start, cluster_count_mal_start]
    np.savetxt(results_path, np.transpose(final_results), delimiter = ",")