In [1]:
import pandas as pd
import numpy as np
from kmodes import kmodes
from kmodes import kprototypes
from fp_growth import find_frequent_itemsets
import matplotlib.pyplot as plt
import mpld3
from datetime import datetime, timedelta, date
import time

%matplotlib inline

# Enable mpld3 for notebook
mpld3.enable_notebook()

# Timestamp goodness
def timestamp(dataframe):
    """Add a human-readable ``TIMESTAMP`` column holding each flow's start time.

    The start time is computed per row as
    ``unixSeconds - (sysUpTime - FIRST_SWITCHED) / 1000`` and rendered with
    ``time.ctime`` (local time). The uptime difference is divided by 1000
    before being subtracted from ``unixSeconds``.

    Columns are looked up by name rather than by position, so the result does
    not depend on the column order of the capture file.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the ``unixSeconds``, ``sysUpTime`` and ``FIRST_SWITCHED``
        columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place with the new
        ``TIMESTAMP`` column (callers rely on the in-place update).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Vectorized epoch computation; replaces the per-row iterrows() loop.
    start_epoch = dataframe['unixSeconds'] - (
        dataframe['sysUpTime'] - dataframe['FIRST_SWITCHED']) / 1000
    dataframe['TIMESTAMP'] = [time.ctime(seconds) for seconds in start_epoch]
    return dataframe

In [3]:
# Fix NumPy's global RNG so stochastic steps are repeatable between runs
RANDOM_SEED = 1234
np.random.seed(RANDOM_SEED)

In [None]:
# Worked example of the flow start-time formula used in timestamp():
#   unixSeconds - (sysUpTime - FIRST_SWITCHED) / 1000
# The uptime difference is divided by 1000 before it is subtracted.
time.ctime(1454085029 - (587231704 - 587213704) / 1000)

In [22]:
# Import netflow capture file(s)

# List of CSVs to read in
brocade_cap_files = ["/home/ehenry/code/ml/classifier_examples/1454084206.csv"]

# Read every ';'-separated capture, then concatenate once. Growing a
# DataFrame with .append() inside a loop re-copies the data on every
# iteration, and DataFrame.append is no longer available in newer pandas.
capture_frames = [pd.read_csv(f, sep=';') for f in brocade_cap_files]

# pd.concat rejects an empty list, so fall back to an empty frame
# (which is what the append-based loop produced for no input files).
if capture_frames:
    brocade_flowdata = pd.concat(capture_frames, ignore_index=True)
else:
    brocade_flowdata = pd.DataFrame()

In [23]:
# Show the dtype pandas assigned to each column of the loaded capture
brocade_flowdata.dtypes

timeReceived                           float64
nfHost                                  object
nfSourceID                               int64
sysUpTime                                int64
unixSeconds                              int64
sequenceNumber                           int64
flowSetID                                int64
IN_BYTES                                 int64
IN_PKTS                                  int64
PROTOCOL                                 int64
TOS                                      int64
TCP_FLAGS                                int64
L4_SRC_PORT                              int64
IPV4_SRC_ADDR                           object
INPUT_SNMP                               int64
L4_DST_PORT                              int64
IPV4_DST_ADDR                           object
OUTPUT_SNMP                              int64
LAST_SWITCHED                            int64
FIRST_SWITCHED                           int64
IPV6_SRC_ADDR                           object
IPV6_DST_ADDR

In [24]:
# Convert and append timestamps
# timestamp() adds the TIMESTAMP column to the frame it is given and returns
# that same object, so brocade_flowdata itself gains the column here and
# brocade_flowdata_ts is another name for the same DataFrame.
brocade_flowdata_ts = timestamp(brocade_flowdata)

In [25]:
# Convert variables to respective type
# cont = continuous
# cat = categorical

# Strip whitespace from the header names FIRST, so every by-name lookup
# below sees the cleaned labels (previously the casts ran before the strip
# and would raise KeyError on a padded header). Assigning to .columns keeps
# the change on the same DataFrame object without using inplace=True.
brocade_flowdata.columns = [name.strip() for name in brocade_flowdata.columns]

cat_cols = ['nfHost','nfSourceID','sequenceNumber','flowSetID','PROTOCOL',
            'TOS', 'TCP_FLAGS','L4_SRC_PORT','IPV4_SRC_ADDR','INPUT_SNMP',
            'L4_DST_PORT','IPV4_DST_ADDR','OUTPUT_SNMP','IPV6_SRC_ADDR',
            'IPV6_DST_ADDR','ICMP_TYPE',
            'DIRECTION','flowId','postNATSourceIPv4Address',
            'postNATDestinationIPv4Address','postNAPTSourceTransportAddress',
            'postNAPTDestinationTransportAddress','firewallEvent',
            'postNATSourceIPv6Address','postNATDestinationIPv6Address',
            'privateEnterpriseNumber','App-ID','User-ID']

cont_cols = ['timeReceived','IN_BYTES',
             'sysUpTime','unixSeconds','FIRST_SWITCHED',
             'LAST_SWITCHED']

# Identifier-like fields become pandas categoricals
for c in cat_cols:
    brocade_flowdata[c] = brocade_flowdata[c].astype('category')

# Counters and time fields become floats
for c in cont_cols:
    brocade_flowdata[c] = brocade_flowdata[c].astype('float64')

In [26]:
# Columns retained for the IPv4 flow analysis, in display order
flow_view_cols = ['TIMESTAMP', 'flowId', 'PROTOCOL',
                  'IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR',
                  'L4_DST_PORT', 'DIRECTION', 'IN_BYTES',
                  'IN_PKTS', 'FIRST_SWITCHED', 'LAST_SWITCHED']
ipv4_brcd_flwdt = brocade_flowdata.loc[:, flow_view_cols]

# Fields of the subset that are handled as categoricals
cats_10 = ['flowId', 'PROTOCOL', 'IPV4_SRC_ADDR',
           'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
           'DIRECTION']

for cat_name in cats_10:
    ipv4_brcd_flwdt[cat_name] = ipv4_brcd_flwdt[cat_name].astype('category')

In [27]:
# Preview the first ten rows of the IPv4 flow subset
ipv4_brcd_flwdt.head(n=10)

Unnamed: 0,TIMESTAMP,flowId,PROTOCOL,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,DIRECTION,IN_BYTES,IN_PKTS,FIRST_SWITCHED,LAST_SWITCHED
0,Fri Jan 29 15:55:10 2016,311091,6,10.252.132.20,50246,216.58.192.46,443,0,70,1,585112704,585112704
1,Fri Jan 29 15:55:11 2016,33828306,6,10.252.132.6,59032,192.168.1.23,52323,0,66,1,585113704,585113704
2,Fri Jan 29 15:55:11 2016,34890688,17,10.70.20.23,58877,61.172.201.254,53,0,88,1,585113704,585113704
3,Fri Jan 29 15:55:10 2016,67336502,6,10.130.53.5,62406,23.5.251.27,80,0,70,1,585112704,585112704
4,Fri Jan 29 15:55:10 2016,68080560,17,10.252.131.68,55105,10.70.20.23,53,0,91,1,585112704,585112704
5,Fri Jan 29 15:55:11 2016,34501117,17,10.70.20.43,52284,205.251.195.166,53,0,134,1,585113704,585113704
6,Fri Jan 29 15:55:07 2016,473184,6,192.168.1.23,52323,10.252.132.6,59043,0,0,0,585109704,585112704
7,Fri Jan 29 15:55:10 2016,68080560,17,10.70.20.23,53,10.252.131.68,55105,0,0,0,585112704,585112704
8,Fri Jan 29 15:55:10 2016,67326420,6,10.102.137.218,52069,54.246.126.156,80,0,70,1,585112704,585112704
9,Fri Jan 29 15:55:11 2016,33839017,6,10.252.132.7,64835,10.70.20.124,389,0,78,1,585113704,585113704


# Brocade Flowdata

### Notes

* Palo Alto Firewalls don't provide what I'll call wall-clock timestamps, only RFC3954 FIRST_SWITCHED and LAST_SWITCHED fields - https://live.paloaltonetworks.com/t5/Documentation-Articles/PAN-OS-Netflow-Templates-and-Field-Types-PAN-OS-5-0/ta-p/54223?attachment-id=1816

* flowId field is IPFIX (RFC5102) compliant in that it provides a unique flowId to all different Flows ("different" is ambiguous in the RFC as well) - https://tools.ietf.org/html/rfc5102#page-22

We should be able to sort flows by the FIRST_SWITCHED column to maintain received order of the flows, with respect to the Palo Alto.

In [28]:
# Drop flows that have a missing value in any retained column and report
# the row count before and after.
rows_before = len(ipv4_brcd_flwdt)
ipv4_brcd_flwdt = ipv4_brcd_flwdt.dropna()
rows_after = len(ipv4_brcd_flwdt)
print(rows_before)
print(rows_after)

3704276
3686524


## Kmodes

Testing kmodes using just categorical data

In [None]:
# Limit this research to categorical vars for now.
# The capture's headers are upper-case (see the dtypes listing above), so the
# columns must be selected with those exact labels; the former lower-case
# names do not exist in brocade_flowdata.

brocade_flowdata_cats = brocade_flowdata.loc[:, ['PROTOCOL', 'IPV4_SRC_ADDR', 'L4_SRC_PORT',
                                                 'IPV4_DST_ADDR', 'L4_DST_PORT']]

# Cast each row (flow) as a tuple and store in a list for use with
# the kmodes algorithm
modes_tuples = [tuple(x) for x in brocade_flowdata_cats.values]

In [None]:
#kmodes test

# Cluster the first 50,000 flow tuples into 20 modes.
# The tuple list built for kmodes is named modes_tuples; the previous
# reference to `tuples` was an undefined name.
km = kmodes.KModes(n_clusters=20, init='Huang', n_init=5, verbose=5)
clusters = km.fit_predict(modes_tuples[0:50000])

In [None]:
# Pair the first 200 flow tuples with their cluster labels. zip() stops at the
# shorter input, so only 200 pairs are produced; list() materializes them so
# the result can be inspected more than once.
cluster_assignments = list(zip(modes_tuples[0:200], clusters))

## KPrototypes

Testing Kprototypes using both continuous and categorical data

In [None]:
# Use the categorical flow fields together with the two continuous counters.
# Column labels match the capture's upper-case headers; .copy() gives an
# independent frame so the normalization below does not write into a slice
# of brocade_flowdata.

brocade_flowdata_mix = brocade_flowdata.loc[:, ['PROTOCOL', 'IPV4_SRC_ADDR',
                                                'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
                                                'IN_PKTS', 'IN_BYTES']].copy()

# Normalize the continuous vars: centre on the mean, scale by the value range

norm_columns = ['IN_PKTS', 'IN_BYTES']

brocade_flowdata_mix[norm_columns] = brocade_flowdata_mix[norm_columns].apply(
    lambda x: (x - x.mean()) / (x.max() - x.min()))

In [None]:
# Convert dataframe to np.array() for use with
# kprototypes algorithm.
# DataFrame.as_matrix() is not available in newer pandas; .values returns the
# same array.

brocade_flowdata_mix_mat = brocade_flowdata_mix.values

In [None]:
#kprototypes test

n_clusters = 160

km = kprototypes.KPrototypes(n_clusters, init='Cao', max_iter=100)
# Matrix columns 0-4 are the categorical flow fields; columns 5 and 6 are the
# normalized packet/byte counters and must be left as numeric, so index 5 is
# no longer listed as categorical.
proto_clusters = km.fit_predict(brocade_flowdata_mix_mat[0:10000], categorical=[0, 1, 2, 3, 4])

In [None]:
# combine dataframe entries with resultant clusterId
# list() materializes the pairs so that cells consuming them can be re-run
# without finding an exhausted iterator (zip is lazy on Python 3).

proto_cluster_assignments = list(zip(brocade_flowdata_mix_mat[0:10000], proto_clusters))

In [None]:
# Build the dataframe that houses the clustered flows.
# Each assignment is a (flow_row, cluster_id) pair; the first seven entries of
# the row map onto the flow columns, in order.

cluster_columns = ['protocol', 'ipv4_src_addr', 'l4_src_port',
                   'ipv4_dst_addr', 'l4_dst_port', 'in_pkts',
                   'in_bytes', 'cluster_id']

# Collect all records first and construct the frame once; appending one row
# at a time copies the whole frame on every iteration, and DataFrame.append
# is no longer available in newer pandas.
cluster_records = [list(flow[:7]) + [cluster_id]
                   for flow, cluster_id in proto_cluster_assignments]

cluster_df = pd.DataFrame(cluster_records, columns=cluster_columns)

In [None]:
# Save the clustered flows as a comma-separated file in the working directory
cluster_df.to_csv("kproto_160_centroid_cao.csv",sep=',')

In [None]:
# Store cluster ids as strings; the per-cluster split in a later cell uses
# string operations on this column.
cluster_df['cluster_id'] = cluster_df['cluster_id'].astype('str')

In [None]:
# Create unique dataframe for each cluster created
# by the kprototypes algorithm run above.
# Rows are selected by exact equality on cluster_id. The previous prefix
# regex ('^<id>') also matched longer ids that merely start with the same
# digits (e.g. '1' matched '10' and '100'), mixing clusters together.

cluster_dfs = [cluster_df[cluster_df['cluster_id'] == i]
               for i in np.unique(cluster_df['cluster_id'])]

In [None]:
# Subset single cluster, will ultimately make
# function that runs this entire process over
# all cluster_dfs

cluster_n = cluster_dfs[100]

# Distinct source addresses seen in the selected cluster
cluster_n_src_ip = cluster_n['ipv4_src_addr'].unique()

cluster_n_df = cluster_n.loc[:,['protocol','ipv4_src_addr','l4_src_port',
                                'ipv4_dst_addr','l4_dst_port','in_pkts',
                                'in_bytes']]

# DataFrame.as_matrix() is not available in newer pandas; .values returns the
# same array.
cluster_n_df_mat = cluster_n_df.values

In [None]:
# Report the number of distinct source IPs and the number of flows in the
# selected cluster, in that order.
n_src_ips = len(cluster_n_src_ip)
n_flows = len(cluster_n_df_mat)
print(n_src_ips)
print(n_flows)

In [None]:
#cluster2 kprototypes test

# One sub-cluster per distinct source IP of the selected cluster. The array
# of source IPs is named cluster_n_src_ip; `cluster_2_src_ip` was undefined.
n_clusters = len(cluster_n_src_ip)

cluster_2_km = kprototypes.KPrototypes(n_clusters, init='Cao', max_iter=100)
# Fit the model created on the line above (previously the earlier `km` model
# was fitted, so the new n_clusters was never used).
cluster_2_proto_clusters = cluster_2_km.fit_predict(cluster_n_df_mat, categorical=[0,1,2,3,4])

In [None]:
# Pair each flow of the selected cluster with its sub-cluster label. The flow
# matrix is named cluster_n_df_mat (`cluster_2_df_mat` was undefined); list()
# keeps the pairs reusable across re-runs.
cluster_2_proto_cluster_assignments = list(zip(cluster_n_df_mat, cluster_2_proto_clusters))

In [None]:
# Build the dataframe that houses the sub-clustered flows.
# Each assignment is a (flow_row, cluster_id) pair; the first seven entries of
# the row map onto the flow columns, in order.

subcluster_columns = ['protocol', 'ipv4_src_addr', 'l4_src_port',
                      'ipv4_dst_addr', 'l4_dst_port', 'in_pkts',
                      'in_bytes', 'cluster_id']

# Collect all records first and construct the frame once; appending one row
# at a time copies the whole frame on every iteration, and DataFrame.append
# is no longer available in newer pandas.
subcluster_records = [list(flow[:7]) + [cluster_id]
                      for flow, cluster_id in cluster_2_proto_cluster_assignments]

clust_2_subcluster_df = pd.DataFrame(subcluster_records, columns=subcluster_columns)

In [None]:
# Save the sub-clustered flows as a comma-separated file in the working directory
clust_2_subcluster_df.to_csv("subclust_kprototypes.csv",sep=',')

In [None]:
# Flatten cluster_df into a list of row lists and pull out, by position, the
# two values plotted for every flow.
cluster_list = cluster_df.values.tolist()

plot_x = []
plot_y = []

# NOTE(review): where cluster_df is built in this notebook it is given eight
# columns (positions 0-7), so position 8 looks out of range unless the frame
# carries an extra leading column (e.g. after being re-read from the saved
# CSV) -- confirm which column is meant for the y axis.
for flow in cluster_list:
    plot_x.append(flow[4])
    plot_y.append(flow[8])

In [None]:
# Interactive scatter plot of the clustered flows with per-point tooltips.
fig, ax = plt.subplots(figsize=(10, 10))

# Set the light-grey background on the axes patch. The 'axisbg' keyword that
# used to be passed through subplot_kw is no longer accepted by newer
# Matplotlib, whereas the patch API works across versions.
ax.patch.set_facecolor('#EEEEEE')

# Colour each point by the value at position 7 of its row
colors = [row[7] for row in cluster_list]
mpld3_scatter = ax.scatter(plot_x, plot_y, c=colors)
ax.grid(color='white', linestyle='solid')

# Tooltip text: the values at positions 1-4 of each row
labels = [row[1:5] for row in cluster_list]
tooltip = mpld3.plugins.PointLabelTooltip(mpld3_scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)

In [None]:
# Mine frequent itemsets from the first 100 clustered flows (minimum support
# of 10) and print each itemset together with its support.
# print() is used as a function so the cell runs on Python 3 as well, matching
# the print(...) calls used elsewhere in this notebook.
for itemset in find_frequent_itemsets(cluster_list[0:100], 10, include_support=True):
    print(itemset)