In [1]:
# Third Party Imports
import pandas as pd
import sklearn
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

# Local imports
import bat
from bat import log_to_dataframe
from bat import dataframe_to_matrix

In [10]:
# Load the Bro conn.log into a Pandas dataframe via bat's LogToDataFrame.
# The log location is named here so it is easy to spot and change.
conn_log_path = '/home/dane/giant/conn.log'
bro_df = log_to_dataframe.LogToDataFrame(conn_log_path)

Successfully monitoring /home/dane/giant/conn.log...


In [13]:
# Columns of the conn.log dataframe that we want to use as features
# (note that some of these are categorical).
features = [
    'conn_state', 'duration', 'history', 'id.orig_h',
    'id.orig_p', 'id.resp_h', 'id.resp_p', 'local_orig',
    'local_resp', 'missed_bytes', 'orig_bytes', 'orig_ip_bytes',
    'orig_pkts', 'proto', 'resp_bytes', 'resp_ip_bytes',
    'resp_pkts', 'service', 'tunnel_parents', 'uid',
]

# Restrict the dataframe to the selected feature columns; this is the frame
# that gets converted to a matrix for scikit-learn.
feature_df = bro_df[features]

# Show the first few rows of the loaded log as a sanity check.
bro_df.head()

Unnamed: 0_level_0,conn_state,duration,history,id.orig_h,id.orig_p,id.resp_h,id.resp_p,local_orig,local_resp,missed_bytes,orig_bytes,orig_ip_bytes,orig_pkts,proto,resp_bytes,resp_ip_bytes,resp_pkts,service,tunnel_parents,uid
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2018-12-07 08:59:17.609383,S0,0 days,D,fe80::fe66:9c07:d4c0:e690,5353,ff02::fb,5353,False,False,0,0,120,1,udp,0,0,0,dns,-,C7bTbStISQw1Saj4a
2018-12-07 08:59:17.608987,S0,0 days,D,192.168.230.136,5353,224.0.0.251,5353,True,False,0,0,100,1,udp,0,0,0,dns,-,CSDfJs4ckMrct8pX06
2018-12-07 08:59:17.611906,S0,0 days,D,fe80::8a51:fbff:fee9:e904,5353,ff02::fb,5353,False,False,0,0,230,1,udp,0,0,0,dns,-,C41q8j2UI6JZV5dPu
2018-12-07 09:00:01.745444,OTH,0 days,C,192.168.230.115,47042,192.168.35.1,9182,True,False,0,0,0,0,tcp,0,0,0,-,-,CRWkkt4heQBFYTZeAa
2018-12-07 09:00:01.747875,OTH,0 days,C,192.168.230.115,37396,192.168.35.2,9182,True,False,0,0,0,0,tcp,0,0,0,-,-,Cutzv42T0IfIstJQe


In [14]:
# Convert the feature dataframe into a matrix for scikit-learn using the bat
# DataFrameToMatrix class (handles categorical data).
# As the log output below shows, it uses a heuristic to detect category
# columns and normalizes the numeric columns. When doing this for real we
# should explicitly convert column types before sending the dataframe to the
# transformer.
# The fitted `to_matrix` object is kept because it tracks the category columns
# and the normalization maps that were applied.
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
bro_matrix = to_matrix.fit_transform(feature_df)

Changing column conn_state to category...
Changing column proto to category...
Changing column service to category...
Changing column tunnel_parents to category...
Normalizing column id.orig_p...
Normalizing column id.resp_p...
Normalizing column missed_bytes...
Normalizing column orig_bytes...
Normalizing column orig_ip_bytes...
Normalizing column orig_pkts...
Normalizing column resp_bytes...
Normalizing column resp_ip_bytes...
Normalizing column resp_pkts...


In [15]:
# Inspect what the transformer recorded while fitting (the categorical columns
# and the per-column normalization map), followed by the resulting matrix.
for inspected in (to_matrix.cat_columns, to_matrix.norm_map, bro_matrix):
    print(inspected)

['conn_state', 'proto', 'service', 'tunnel_parents']
{'id.orig_p': (0, 65535), 'id.resp_p': (0, 65417), 'missed_bytes': (0, 205051389), 'orig_bytes': (0, 3163654475), 'orig_ip_bytes': (0, 2909096593), 'orig_pkts': (0, 904735), 'resp_bytes': (0, 3183290615), 'resp_ip_bytes': (0, 3165564339), 'resp_pkts': (0, 3820187)}
[[0.08168154421301595 0.08182888240059923 False ... 0 0 1]
 [0.08168154421301595 0.08182888240059923 True ... 0 0 1]
 [0.08168154421301595 0.08182888240059923 False ... 0 0 1]
 ...
 [4.5777065690089265e-05 0.00015286546310592047 True ... 0 0 1]
 [0.002105745021744106 0.0021095433908617026 True ... 0 0 1]
 [0.8047913328755627 0.7301007383401869 True ... 0 0 1]]


In [16]:
# Now we're ready for scikit-learn!
# Just some simple stuff for this example: KMeans clustering here, followed by
# a TSNE projection in the next cell.
#
# `kmeans` holds one integer cluster label per row of `bro_matrix`.
# KMeans picks its initial centroids randomly, so a fixed `random_state` is
# passed to make the cluster labels reproducible across kernel restarts.
# NOTE: n_clusters=3 matches the three entries of the `colors` dict used when
# plotting; change both together.
kmeans = KMeans(n_clusters=3, random_state=42).fit_predict(bro_matrix)

In [None]:
# Project the feature matrix down to 2-D with t-SNE for plotting.
# t-SNE is stochastic, so a fixed `random_state` keeps the embedding (and
# therefore the scatter plot) reproducible between runs.
# NOTE(review): t-SNE can be very slow on large inputs, and this cell has no
# recorded execution count -- consider verifying the run time on the full log.
projection = TSNE(random_state=42).fit_transform(bro_matrix)

In [None]:
# Attach the ML results to the original dataframe: the two columns of the 2-D
# projection become 'x' and 'y', and the KMeans labels become 'cluster'.
bro_df['x'], bro_df['y'] = projection[:, 0], projection[:, 1]
bro_df['cluster'] = kmeans

# Show the scikit-learn results alongside the original log fields.
bro_df.head()

In [None]:
# Plotting defaults
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 15.0, 7.0

In [None]:
# Group the rows by their KMeans cluster label
cluster_groups = bro_df.groupby('cluster')

# Scatter-plot the 2-D projection, one colour per cluster, on a shared axis
fig, ax = plt.subplots()
colors = {0: 'green', 1: 'blue', 2: 'red'}
for cluster_id, members in cluster_groups:
    cluster_color = colors[cluster_id]
    cluster_label = 'Cluster: {:d}'.format(cluster_id)
    members.plot(ax=ax, kind='scatter', x='x', y='y', alpha=0.5, s=250,
                 label=cluster_label, color=cluster_color)

In [None]:
# Now print out the details for each cluster
pd.set_option('display.width', 1000)

# Columns to display for each cluster. These must exist in the conn.log
# dataframe; the previous list ('query', 'Z', 'qtype_name') came from a DNS
# log and would raise a KeyError against conn.log data.
show_fields = ['id.orig_h', 'id.resp_h', 'id.resp_p', 'proto', 'service', 'cluster']
for key, group in cluster_groups:
    print('\nCluster {:d}: {:d} observations'.format(key, len(group)))
    print(group[show_fields].head())

<div style="float: right; margin: 50px 0px 0px 0px"><img src="https://www.kitware.com/img/small_logo_over.png"></div>
## Wrap Up
Well, that's it for this notebook. We'll have an upcoming notebook that addresses some of the issues we overlooked in this simple example. It will use Isolation Forest for anomaly detection, which works well for high-dimensional data, and will cover both training and streaming evaluation against the model.

If you liked this notebook please visit the [bat](https://github.com/SuperCowPowers/bat) project for more notebooks and examples.