In [1]:
import vaex
import numpy as np
import pandas as pd
from pyclustering.cluster import cure


In [2]:
# Getting the indices of the clusters with sufficient density
def get_rep_values(cluster, thresh):
    """Return the positions of the clusters that hold at least ``thresh`` members.

    Args:
        cluster: Sequence of clusters; each entry must support ``len()``.
        thresh: Minimum number of members a cluster needs in order to be kept.

    Returns:
        List of indices into ``cluster`` (ascending) whose cluster size is
        greater than or equal to ``thresh``.
    """
    return [idx for idx, members in enumerate(cluster) if len(members) >= thresh]

In [3]:
# Normalizing data before comparing it
def normalize(data, mins=None, maxs=None):
    """Min-max scale the columns of ``data`` using previously computed bounds.

    Each column ``c`` is mapped to ``(value - mins[c]) / (maxs[c] - mins[c])``.
    Values outside the reference bounds therefore fall outside ``[0, 1]``.

    Args:
        data: pandas object exposing ``.values`` (one row per datapoint).
        mins: Per-column minima. Defaults to the module-level ``min_holder``.
        maxs: Per-column maxima. Defaults to the module-level ``max_holder``.

    Returns:
        2-D float array with one row per datapoint and ``len(mins)`` columns.
        Columns of ``data`` beyond ``len(mins)`` are ignored.

    Raises:
        ValueError: If ``data`` has fewer columns than ``mins`` has entries.
    """
    # Fall back to the bounds saved from the sample when none are passed in.
    if mins is None:
        mins = min_holder
    if maxs is None:
        maxs = max_holder
    mins = np.asarray(mins, dtype=float)
    maxs = np.asarray(maxs, dtype=float)
    n_features = len(mins)

    # atleast_2d treats 1-D input as a single row.
    values = np.atleast_2d(np.asarray(data.values, dtype=float))
    if values.shape[1] < n_features:
        raise ValueError(
            "data has %d columns but %d are required" % (values.shape[1], n_features)
        )
    values = values[:, :n_features]

    # A column whose max equals its min would divide by zero and yield NaN/inf;
    # dividing by 1 instead leaves such a column as (value - min).
    span = maxs - mins
    span = np.where(span == 0, 1.0, span)
    return (values - mins) / span

In [4]:
# Calculating the distance between two vectors: used to find the closest cluster representative to a data point
def dist(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    The element-wise difference is squared, summed and square-rooted, so
    ``vecA`` must support ``vecA - vecB`` (e.g. a NumPy array).
    """
    delta = vecA - vecB
    squared_sum = np.power(delta, 2).sum()
    return np.sqrt(squared_sum)

In [5]:
# Compares a datapoint to each cluster representative, and returns the cluster value that the closest representative belongs to
# This is currently done in serial, but should really be implemented in parallel
def get_closest_cluster(data_point, rep):
    """Return the index of the cluster owning the representative nearest to ``data_point``.

    Args:
        data_point: Vector accepted by ``dist`` as its first argument.
        rep: Sequence of clusters, each a sequence of representative points.

    Returns:
        Position in ``rep`` of the cluster with the closest representative.
        On ties the first one encountered wins (strict ``<`` comparison).

    Raises:
        ValueError: If no representative yields a distance below infinity,
            i.e. ``rep`` holds no points or every distance is NaN/inf.
            Previously this surfaced as an UnboundLocalError.
    """
    min_dist = float('inf')
    cluster_val = None
    for cluster_idx, cluster_reps in enumerate(rep):
        for rep_point in cluster_reps:
            temp_dist = dist(data_point, rep_point)
            if temp_dist < min_dist:
                min_dist = temp_dist
                cluster_val = cluster_idx
    if cluster_val is None:
        # Returning None would be dangerous: indexing a NumPy array with None
        # and adding to it updates every element, silently corrupting counts.
        raise ValueError(
            "no cluster representative is at a finite distance from the data point "
            "(rep is empty or the distances are NaN/inf)"
        )
    return cluster_val

In [6]:
# Counting how many datapoints of a chunk fall into each cluster
def get_output_data_full_streaming(chunk, rep, cluster_count):
    """Add one count per row of ``chunk`` to the cluster with the nearest representative.

    Args:
        chunk: Indexable collection of datapoints; ``chunk[k]`` is the k-th point.
        rep: Cluster representatives, passed through to ``get_closest_cluster``.
        cluster_count: Mutable per-cluster tally indexed by cluster position.
            It is updated in place.

    Returns:
        The same ``cluster_count`` object, after the updates.
    """
    for row_idx in range(len(chunk)):
        nearest = get_closest_cluster(chunk[row_idx], rep)
        cluster_count[nearest] += 1
    return cluster_count

In [7]:
# Load the pre-computed data sample, which fits in memory.
# header=None: no row of the file is used as column names.
df = pd.read_csv(
    "sample_3k_mal/sample_3k_total_mal.csv",
    header=None,
)

In [8]:
# Min-max scale each of the first 115 columns of the sample
mat_data = df.values
mat_data_tp = np.transpose(mat_data)
# The per-column minimum and maximum of the sample are saved because normalize()
# uses them later to scale the streaming data
min_holder = np.zeros(115)
max_holder = np.zeros(115)
normed_data = []
for i in range(115):
    column = mat_data_tp[i]
    min_holder[i] = column.min()
    max_holder[i] = column.max()
    # (value - column minimum) / (column maximum - column minimum)
    normed_data.append(
        np.asarray((column - column.min()) / (column.max() - column.min())).reshape(-1)
    )
# Back to one row per datapoint, one column per feature
normed_data = np.transpose(normed_data)

In [9]:
# Configure the CURE algorithm on the normalized sample.
# 100 clusters are requested so that the high-density clusters can be identified afterwards.
cure_algo = cure.cure(
    data=normed_data,
    number_cluster=100,
    number_represent_points=10,
    compression=0.2,
    ccore=True,
)

In [10]:
# Running the algorithm
# process() is the last expression of the cell, so its return value (the cure object
# itself, per the repr shown in the output) is echoed as the cell output.
cure_algo.process()

<pyclustering.cluster.cure.cure at 0x28db499dd60>

In [11]:
# Retrieving the clusters
# The following cells treat the result as a sequence of clusters in which
# len(clusters[k]) is the number of datapoints assigned to cluster k.
clusters = cure_algo.get_clusters()

In [12]:
# Print the size of every cluster, one per line, to spot the high-density clusters
for cluster_members in clusters:
    print(len(cluster_members))

3
3021
23
2887
6
1
5054
1
8
5
16
1
1
1
1
7
13
10
3
1
1
8
1
11179
1
1
1
32
3
1
1
4
79
1
2
1
1
1
1
3
1
2
1
1
1
3
4
1
1
1
726
30
1
2
2
1
6
1
1
1
1
3
3
1
1
4
1
5
1
1
2
1
1
2
5
1
1
1
2
1
1
1
1
505
1
103
1
1
1
1
1
1
1
107
42
1
1
6
13
1


In [13]:
# Defining the density threshold for what clusters to keep based on the results from above
# Generally, top 9 density clusters as long as they include > 95% of the datapoints
# NOTE(review): in the sizes printed above only six clusters reach 500 members
# (3021, 2887, 5054, 11179, 726, 505), not nine — confirm the intended cut-off.
threshold = 500

In [14]:
# Getting the indices of the clusters with sufficient density
# rep_values holds positions into `clusters` (not the clusters themselves) whose
# size is at least `threshold`.
rep_values = get_rep_values(clusters, threshold)

In [15]:
# Getting the representors of each cluster
# presumably ordered like `clusters`, i.e. representors[k] belongs to clusters[k] —
# TODO confirm against the pyclustering documentation
representors = cure_algo.get_representors()

In [16]:
# Getting the cluster representatives of the dense clusters.
# rep_values holds the *indices* of the dense clusters, so each index has to be looked
# up in representors. Indexing with a plain 0..len(rep_values)-1 counter would pick the
# first few clusters regardless of their density.
# Assumes representors[k] corresponds to clusters[k].
new_rep = [representors[cluster_idx] for cluster_idx in rep_values]

In [17]:
# Setting the parameters for streaming chunk size (rows read per chunk)
chunksize = 5000
# Initializing the tally of how many datapoints are assigned to each cluster:
# one zero-initialised float slot per entry of new_rep
cluster_count_start = np.zeros(len(new_rep))


In [18]:
# Stream the SSL capture chunk by chunk: scale each chunk with the sample's min/max,
# then add every row to the tally of its nearest cluster.
# NOTE(review): the sample file is read with header=None, whereas this reader uses
# pandas' default header handling (first row becomes column names) — confirm that this
# CSV really starts with a header row.
for chunk in pd.read_csv("mal_data/SSL_Mal.csv", chunksize=chunksize):
    normed_chunk = normalize(chunk)
    cluster_count_start = get_output_data_full_streaming(
        normed_chunk,
        new_rep,
        cluster_count_start,
    )

In [19]:
# Write the tally to file as a single column, one line per cluster
final_results = [cluster_count_start]
np.savetxt(
    "results/SSL_final_results_mal.csv",
    np.transpose(final_results),
    delimiter=",",
)

In [20]:
# Rows read per chunk while streaming the capture
chunksize = 5000
# Fresh per-cluster tally: one zero-initialised slot per entry of new_rep
cluster_count_start = np.zeros(len(new_rep))
# Stream the Active Wiretap capture chunk by chunk: scale each chunk with the sample's
# min/max, then add every row to the tally of its nearest cluster.
# NOTE(review): the sample file is read with header=None, whereas this reader uses
# pandas' default header handling — confirm that this CSV starts with a header row.
for chunk in pd.read_csv("mal_data/Active_Wiretap_Mal.csv", chunksize=chunksize):
    normed_chunk = normalize(chunk)
    cluster_count_start = get_output_data_full_streaming(
        normed_chunk,
        new_rep,
        cluster_count_start,
    )
# Write the tally to file as a single column, one line per cluster
final_results = [cluster_count_start]
np.savetxt(
    "results/wiretap_final_results_mal.csv",
    np.transpose(final_results),
    delimiter=",",
)

In [21]:
# Rows read per chunk while streaming the capture
chunksize = 5000
# Fresh per-cluster tally: one zero-initialised slot per entry of new_rep
cluster_count_start = np.zeros(len(new_rep))
# Stream the ARP MITM capture chunk by chunk: scale each chunk with the sample's
# min/max, then add every row to the tally of its nearest cluster.
# NOTE(review): the sample file is read with header=None, whereas this reader uses
# pandas' default header handling — confirm that this CSV starts with a header row.
for chunk in pd.read_csv("mal_data/ARP_MITM_Mal.csv", chunksize=chunksize):
    normed_chunk = normalize(chunk)
    cluster_count_start = get_output_data_full_streaming(
        normed_chunk,
        new_rep,
        cluster_count_start,
    )
# Write the tally to file as a single column, one line per cluster
final_results = [cluster_count_start]
np.savetxt(
    "results/ARP_final_results_mal.csv",
    np.transpose(final_results),
    delimiter=",",
)

In [22]:
# Rows read per chunk while streaming the capture
chunksize = 5000
# Fresh per-cluster tally: one zero-initialised slot per entry of new_rep
cluster_count_start = np.zeros(len(new_rep))
# Stream the fuzzing capture chunk by chunk: scale each chunk with the sample's
# min/max, then add every row to the tally of its nearest cluster.
# NOTE(review): the sample file is read with header=None, whereas this reader uses
# pandas' default header handling — confirm that this CSV starts with a header row.
for chunk in pd.read_csv("mal_data/fuzzing_Mal.csv", chunksize=chunksize):
    normed_chunk = normalize(chunk)
    cluster_count_start = get_output_data_full_streaming(
        normed_chunk,
        new_rep,
        cluster_count_start,
    )
# Write the tally to file as a single column, one line per cluster
final_results = [cluster_count_start]
np.savetxt(
    "results/fuzzing_final_results_mal.csv",
    np.transpose(final_results),
    delimiter=",",
)

In [23]:
# Rows read per chunk while streaming the capture
chunksize = 5000
# Fresh per-cluster tally: one zero-initialised slot per entry of new_rep
cluster_count_start = np.zeros(len(new_rep))
# Stream the scan capture chunk by chunk: scale each chunk with the sample's
# min/max, then add every row to the tally of its nearest cluster.
# NOTE(review): the sample file is read with header=None, whereas this reader uses
# pandas' default header handling — confirm that this CSV starts with a header row.
for chunk in pd.read_csv("mal_data/scan_Mal.csv", chunksize=chunksize):
    normed_chunk = normalize(chunk)
    cluster_count_start = get_output_data_full_streaming(
        normed_chunk,
        new_rep,
        cluster_count_start,
    )
# Write the tally to file as a single column, one line per cluster
final_results = [cluster_count_start]
np.savetxt(
    "results/scan_final_results_mal.csv",
    np.transpose(final_results),
    delimiter=",",
)

In [24]:
# Rows read per chunk while streaming the capture
chunksize = 5000
# Fresh per-cluster tally: one zero-initialised slot per entry of new_rep
cluster_count_start = np.zeros(len(new_rep))
# Stream the SSDP capture chunk by chunk: scale each chunk with the sample's
# min/max, then add every row to the tally of its nearest cluster.
# NOTE(review): the sample file is read with header=None, whereas this reader uses
# pandas' default header handling — confirm that this CSV starts with a header row.
for chunk in pd.read_csv("mal_data/SSDP_Mal.csv", chunksize=chunksize):
    normed_chunk = normalize(chunk)
    cluster_count_start = get_output_data_full_streaming(
        normed_chunk,
        new_rep,
        cluster_count_start,
    )
# Write the tally to file as a single column, one line per cluster
final_results = [cluster_count_start]
np.savetxt(
    "results/SSDP_final_results_mal.csv",
    np.transpose(final_results),
    delimiter=",",
)

In [25]:
# Rows read per chunk while streaming the capture
chunksize = 5000
# Fresh per-cluster tally: one zero-initialised slot per entry of new_rep
cluster_count_start = np.zeros(len(new_rep))
# Stream the SYN capture chunk by chunk: scale each chunk with the sample's
# min/max, then add every row to the tally of its nearest cluster.
# NOTE(review): the sample file is read with header=None, whereas this reader uses
# pandas' default header handling — confirm that this CSV starts with a header row.
for chunk in pd.read_csv("mal_data/SYN_Mal.csv", chunksize=chunksize):
    normed_chunk = normalize(chunk)
    cluster_count_start = get_output_data_full_streaming(
        normed_chunk,
        new_rep,
        cluster_count_start,
    )
# Write the tally to file as a single column, one line per cluster
final_results = [cluster_count_start]
np.savetxt(
    "results/SYN_final_results_mal.csv",
    np.transpose(final_results),
    delimiter=",",
)

In [26]:
# Rows read per chunk while streaming the capture
chunksize = 5000
# Fresh per-cluster tally: one zero-initialised slot per entry of new_rep
cluster_count_start = np.zeros(len(new_rep))
# Stream the video capture chunk by chunk: scale each chunk with the sample's
# min/max, then add every row to the tally of its nearest cluster.
# NOTE(review): the sample file is read with header=None, whereas this reader uses
# pandas' default header handling — confirm that this CSV starts with a header row.
for chunk in pd.read_csv("mal_data/video_Mal.csv", chunksize=chunksize):
    normed_chunk = normalize(chunk)
    cluster_count_start = get_output_data_full_streaming(
        normed_chunk,
        new_rep,
        cluster_count_start,
    )
# Write the tally to file as a single column, one line per cluster
final_results = [cluster_count_start]
np.savetxt(
    "results/video_final_results_mal.csv",
    np.transpose(final_results),
    delimiter=",",
)