# Assignment 2

In case scapy is needed:

In [None]:
# Optional scapy setup, kept commented out: uncomment these lines to install
# scapy and read the raw packet capture directly.
#!pip install --pre scapy[complete]

# import scapy
# from scapy.all import *

# packets = rdpcap('/home/jovyan/work/A2/test.pcap')

In [4]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


import numpy as np
import sklearn
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans, DBSCAN


import seaborn as sns
# Apply the seaborn "ticks" theme to every figure in the notebook.
sns.set(
    color_codes=True,
    style="ticks",
)

# Plotting defaults
%matplotlib inline
import matplotlib.pyplot as plt
# Notebook-wide figure defaults: base font size and (width, height) in inches.
plt.rcParams.update({
    'font.size': 12.0,
    'figure.figsize': (14.0, 7.0),
})

Load the data exported from Splunk.

In [5]:
# Locations of the two flow-level feature exports.
NORMAL_FLOWS_CSV = "/home/jovyan/work/A2/a2_feat_flow_level_normal.csv"
TEST_FLOWS_CSV = "/home/jovyan/work/A2/a2_feat_flow_level_test.csv"

# One dataframe per export; the "test" export is treated as the attack set below.
df_normal = pd.read_csv(NORMAL_FLOWS_CSV)
df_attack = pd.read_csv(TEST_FLOWS_CSV)

We can merge both datasets to explore the general relationships between the variables.

In [6]:
# Tag every row with the file it came from (0 = normal export, 1 = attack/test
# export) so the two sources stay distinguishable once they are stacked.
for frame, source_flag in ((df_normal, 0), (df_attack, 1)):
    frame['attack'] = source_flag

# Stack both frames row-wise and list the resulting columns.
frames = [df_normal, df_attack]
df = pd.concat(frames)
print(df.columns)

Index(['src', 'dst', 'start', 'finish', 'total_duration', 'protocol',
       'total_bytes', 'total_packets', 'src_ports', 'dst_ports', 'pps', 'bps',
       'bpp', 'attack'],
      dtype='object')


Let's perform some basic exploration to look for correlations in the data.  
We are interested in a subset of these features, which we will use for a cluster analysis.

In [7]:
# Columns handed to the scaler and the clustering models in the cells below.
# NOTE(review): 'attack' is the 0/1 source flag added when the two frames were
# labelled, not a measured flow property - confirm it belongs in this list.
selected_features = [
    'total_duration',
    'total_bytes',
    'total_packets',
    'src_ports',
    'dst_ports',
    'pps',
    'bps',
    'bpp',
    'attack',
]

Standardize the selected features (transform categorical values to numerical ones first, if necessary)

In [8]:
# Standardize the selected features (zero mean, unit variance per column).
#
# The scaler is fitted on the normal traffic only and then reused for the
# attack/test traffic. The clustering models below are trained on X_Normal, so
# any samples they later score have to live in the same scaled feature space.
# Fitting a second, independent scaler on df_attack would shift and stretch
# every feature differently and make distances to the learned clusters
# incomparable.
normal_scaler = StandardScaler().fit(df_normal[selected_features])
X_Normal = normal_scaler.transform(df_normal[selected_features])
X_Attack = normal_scaler.transform(df_attack[selected_features])

# The merged frame keeps its own scaling: it is standardized as one dataset.
X_All = StandardScaler().fit_transform(df[selected_features])

# KMEANS

In [None]:
from sklearn.metrics import silhouette_score

# Fit KMeans on the normal traffic for each candidate number of clusters and
# record the silhouette score of the resulting partition (higher is better).
scores = []
clusters = range(2, 5)  # candidate K values: 2, 3 and 4
for K in clusters:
    # Fixed seed so the scores are reproducible across kernel restarts.
    clusterer = KMeans(n_clusters=K, random_state=42)
    cluster_labels = clusterer.fit_predict(X_Normal)
    score = silhouette_score(X_Normal, cluster_labels)
    scores.append(score)
    # Progress marker; the loop variable is the upper-case `K`.
    print(K, " done")

# Plot it out
pd.DataFrame({'Num Clusters': clusters, 'score': scores}).plot(x='Num Clusters', y='score')

In [None]:
# Fit the final KMeans model on the normal traffic with 2 clusters.
# NOTE(review): the silhouette search only tries K = 2..4, so the earlier claim
# that the best score is "at 10 clusters" cannot come from it - confirm
# n_clusters against the silhouette plot.
# random_state is fixed so the cluster ids stay stable between runs.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_Normal)



In [None]:
# Assign every attack/test flow to one of the clusters learned by `kmeans`.
labels_attack = kmeans.predict(X=X_Attack)

In [None]:
# Reduce the scaled attack/test flows to two principal components so they can
# be drawn on a 2-D scatter plot, and report how much variance those two keep.
pca = PCA(n_components=2, random_state=42)
pca.fit(X_Attack)
print("Variability explained by the PC:", sum(pca.explained_variance_ratio_))
reduced_data = pca.transform(X_Attack)

# Attach the cluster ids and the 2-D coordinates to the attack dataframe.
df_attack['cluster'] = labels_attack
df_attack['x'], df_attack['y'] = reduced_data[:, 0], reduced_data[:, 1]

In [None]:
# Now use dataframe group by cluster
cluster_groups = df_attack.groupby('cluster')

# Plot the Machine Learning results
colors = {-1:'black', 0:'green', 1:'blue', 2:'red', 3:'orange', 4:'purple', 5:'brown', 6:'pink', 7:'lightblue', 8:'grey', 9:'yellow'}
fig, ax = plt.subplots()
for key, group in cluster_groups:
    group.plot(ax=ax, kind='scatter', x='x', y='y', alpha=0.5, s=250,
               label='Cluster: {:d}'.format(key), color=colors[key])

In [None]:
# TSNE is a great projection algorithm. In this case we're going from
# len(selected_features) dimensions down to 2.
# random_state is fixed so the embedding is reproducible between runs.
projection = TSNE(random_state=42).fit_transform(X_Normal)

# Now we can put our ML results back onto our dataframe!
# Both the labels and the coordinates must come from the normal traffic so that
# they line up row-for-row with df_normal: the cluster ids are the KMeans
# assignments of X_Normal and the coordinates are the TSNE projection computed
# just above (not the labels / PCA coordinates of the attack data).
df_normal['cluster'] = kmeans.predict(X_Normal)
df_normal['x'] = projection[:, 0]  # Projection X Column
df_normal['y'] = projection[:, 1]  # Projection Y Column

In [None]:
# Now use dataframe group by cluster
cluster_groups = df_normal.groupby('cluster')

# Plot the Machine Learning results
colors = {-1:'black', 0:'green', 1:'blue', 2:'red', 3:'orange', 4:'purple', 5:'brown', 6:'pink', 7:'lightblue', 8:'grey', 9:'yellow'}
fig, ax = plt.subplots()
for key, group in cluster_groups:
    group.plot(ax=ax, kind='scatter', x='x', y='y', alpha=0.5, s=250,
               label='Cluster: {:d}'.format(key), color=colors[key])

In [None]:
# Now print out the details for each cluster
pd.set_option('display.width', 1000)
for key, group in cluster_groups:
    print('\nCluster {:d}: {:d} observations'.format(key, len(group)))
    print(group[features].head(3))

# DBSCAN

In [None]:
# Now try DBScan
eps=100
min_samples=20
dbscan=DBSCAN(eps=eps, min_samples=min_samples,algorithm='ball_tree', metric='euclidean').fit(X_Normal)


In [None]:
# scikit-learn's DBSCAN has no predict() method: the only labels it provides
# are those of the samples it was fitted on (X_Normal), exposed as `labels_`.
# They are stored on df_normal, which is also the frame that the plotting cell
# groups by 'cluster_db'.
df_normal['cluster_db'] = dbscan.labels_
# nunique() also counts the noise label (-1) when noise points are present.
print('Number of Clusters: {:d}'.format(df_normal['cluster_db'].nunique()))

In [None]:
# Now use dataframe group by cluster
cluster_groups = df_normal.groupby('cluster_db')

# Plot the Machine Learning results
fig, ax = plt.subplots()
for key, group in cluster_groups:
    group.plot(ax=ax, kind='scatter', x='x', y='y', alpha=0.5, s=250,
               label='Cluster: {:d}'.format(key), color=colors[key])

# OPTICS

In [9]:
from sklearn.cluster import OPTICS, cluster_optics_dbscan
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np

In [10]:
# Configure the OPTICS estimator; it is fitted in the next cell.
clust = OPTICS(
    min_samples=50,
    xi=0.05,
    min_cluster_size=0.05,
)

In [None]:
# Fit OPTICS on the scaled normal traffic; as the last expression of the cell,
# the fitted estimator is also displayed.
clust.fit(X=X_Normal)