In [18]:
import pandas as pd
import numpy as np
import pyhash
import gensim
import multiprocessing as mp
from joblib import Parallel, delayed
import concurrent.futures
from pprint import pprint
import random
import mpld3
import re
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn import manifold
from sklearn.decomposition import PCA, TruncatedSVD

%matplotlib inline

# Enable mpld3 for notebook
mpld3.enable_notebook()

# Instantiate hasher object
hasher = pyhash.city_64()

# Method to strip white test
def strip(text):
    return text.strip()

# Method to set dataframe entries to integers
def make_int(text):
    return int(text.strip(''))    

# Method to match IP against flow srcIP
def sort_ip_flow(ip):
    # List to house flows when matches
    flows_list = []
    # Iterate over tcp_flows list
    for flow in tcp_flows:   
        # Comparison logic - flow[1][3] corresponds to SrcIP in flow tuple
        if ip == flow[1][3]:        
            # Append match to flows_list
            flows_list.append(flow)
    # Return dictionary of IPs and flows
    return {ip: flows_list}

def process_flow(flow):    
    # Create hash of protocol
    proto_hash = hasher(flow[1][2])        
    # Create hash of SrcIP
    srcip_hash = hasher(flow[1][3])        
    # Create hash of Sport
    srcprt_hash = hasher(flow[1][4]) 
    # Create hash of DstIP
    dstip_hash = hasher(flow[1][6])    
    # Create hash of Dport
    dstprt_hash = hasher(flow[1][7]) 
    # Cast flow entry as list for manipulation
    flow_list = list(flow)       
    # Insert hashes as entry in tuple for each flow
    flow_list.insert(4, (str(proto_hash), str(srcip_hash), str(srcprt_hash), 
                         str(dstip_hash), str(dstprt_hash)))    
    # Re-cast flow entry as tuple w/ added hash tuple
    flow = tuple(flow_list)
    return(flow)

def single_hash(flow):
    flow_hash = hasher(flow)
    flow_list = list(flow)
    flow_list.insert(4, str(flow_hash))
    flow = tuple(flow_list) 
    return(flow)
    

In [3]:
# Import netflow capture file

flowdata = pd.DataFrame()

cap_files = ["capture20110810.binetflow","capture20110811.binetflow"]

for f in cap_files:
    frame = pd.read_csv(f, sep=',', header=0)
    flowdata = flowdata.append(frame, ignore_index=True)

#flowdata = pd.read_csv("capture20110810.binetflow", sep=',', header=0)

# Strip whitespace
flowdata.rename(columns=lambda x: x.strip(), inplace = True)

In [4]:
subsample_cats = flowdata.loc[:,['SrcAddr', 'DstAddr', 'Dport','Proto', 'Label']]
#subsample_labels = flowdata.loc[:,['Label']]

## Word2vec (co-occurence idea for flow data)

Attempting to find some co-occurence patterns in the flow data according to how an algorithm like word2vec, in its skip-gram implementation specifically for this work, works. The idea is that flows, $V_{f}$ for vector representation, that occur within a window $W_{f}$, which can be modeled as "time" using timestamps from the capture or just raw capture data. For brevity we will be testing raw capture for now and will transition to "time" in the future. I'll transition to writing a time based function futher down in the notebook.

* Note here that using time-stamp windowing will allow for variable sentence length

Considering the conditional probabilities $P(w|f)$, with a given set of flow captures _Captures_, the goal is to set the parameters $\theta$ of $P(w|f;\theta)$ so as to maximize the capture probability :

$$\underset{\theta}{\operatorname{argmax}} \underset{f \in Captures}{\operatorname{\prod}} \left[\underset{w \in W_{f}}{\operatorname{\prod}} P(w \vert f;\theta)\right] $$

in this equation $W_{f}$ is a set of surrounding flows of flow $f$. Alternatively :

$$ \underset{\theta}{\operatorname{argmax}} \underset{(f, w) \in D}{\operatorname{\prod}} P(w \vert f;\theta) $$

Here $D$ is the set of all flow and window pairs we extract from the text.

The word2vec algorithm seems to capture an underlying phenomenon of written language that clusters words together according to their linguistic similarity, this can be seen in something like simple synonym analysis. The goal is to exploit this underlying "similarity" phenomenon in the "conversations" that machines are having within a given data network. This can also be thought of as written text being an express conversation between the writer and the reader. This is the idea behind using word2vec on flow capture data.

Each "time step", right now just being a subset of a given flow data set, is as a 'sentence' in the word2vec model. We should then be able to find flow "similarities" that exist within the context of flows. The idea is this "symilarity" will really just yield an occurence pattern over the flow data, much like word2vec does for written text.

Another part of the idea is much like in written text there are word / context, $(w,c)$, patterns that are discovered and exploited when running the algorithm over a given set of written language. There are common occurences and patterns that can be yielded from flow data, much like the occurences and patterns that are mined from written text. At the end of the embedding excersize we can use k-means to attempt to cluster the flows. This should yield some sort of clustering of commonly occuring flows that have the same occurence measure in a given set of netflow captures. We can then use this data to measure against other, unseen, flows for future classification of "anamoly". I use that word loosely as this is strictly expirimental.

### Assumptions :

#### Maximizing the objective will result in good embeddings $v_{f}  \forall w \in V$

#####_It is important to note with the above statment, with respect to time, is the assumption that the data I am operating from has already been ordered according to the tooling I used to acquire it_

## Skip-gram Negative Sampling

One of the other portions of the word2vec algorithm that I will be testing in this experiment will be negative sampling.

The objective of Skipgram with Negative Sampling is to maximize the the probability that $(f,w)$ came from the data $D$. This can be modeled as a distribution such that $P(D=1|f,w)$ be the probability that $(f,w)$ came from the data and $P(D=0|f,w) = 1 - P(D=1|f,w)$ the probability that $(f,w)$ did not. 

The distribution is modeled as :

$$P(D=1|f,w) = \sigma(\vec{f} \cdot \vec{w}) = \frac{1}{1+e^{-\vec{f} \cdot \vec{w}}}$$

where $\vec{f}$ and $\vec{w}$ (each a d-dimensional vector) are the model parameters to be learned.

The negative sampling tries to maximize $P(D=1|f,w)$ for observed $(f,w)$ pairs while maximizing $P(D=0|f,w)$ for stochastically sampled "negative" examples, under the assumption that selecting a context for a given word is likely to result in an unobserved $(f,w)$ pair.

SGNS's objective for a single $(f,w)$ observation is then:

$$ \log \sigma(\vec{f} \cdot \vec{w}) + k \cdot \mathbb{E}_{w_{N} \sim P_{D}} [\log \sigma(\vec{-f} \cdot \vec{w}_N)] $$

where $k$ is the number of "negative" samples and $c_{N}$ is the sampled window, drawn according to the empirical unigram distribution $P_{D}(w) = \frac{\#w}{|D|}$

This object is then trained in an online fashion using stochastic gradient updated over the observed pairs in the corpus $D$. The goal objective then sums over the observed $(f,w)$ pairs in the corpus :

$$ \ell = \Sigma_{f \in V_{f}} \Sigma_{w \in V_{w}} \#(f,w)(\log \sigma(\vec{f} \cdot \vec{w}) + k \cdot \mathbb{E}_{w_{N} \sim P_{D}} [\log \sigma(\vec{-f} \cdot \vec{w}_N)]$$

Optimizing this objective groups flows that have similar embeddings, while scattering unobserved pairs.

In [5]:
# Method to slide window over dataframe of 
# flowdata and create "sentences"

def create_corpora(dataframe, window, corpus_count):
    corpus = []
    corpora = []
    begin = 0
    end = 0
    for i in range(corpus_count):
        while end <= window:
            end += 1
        else:
            corpus.append(dataframe[begin:(end-1)])
        begin = begin + window
        end = end + window
    corpora.append(corpus)
    return(corpora)

In [6]:
corpora = create_corpora(subsample_cats, 30, 153333)
#labels = create_corpora(subsample_labels, 150, 18500)

In [7]:
# Convert all tuples created by previous create_corpora function
# to strings for use with tokenization which is then used in the
# word2vec algorithm below 

str_corpora = []

for corpus in corpora[0]:
    str_corpus = []
    for sentence in corpus.values.tolist():
        str_corpus.append(str(sentence).encode('utf-8'))
    str_corpora.append(str_corpus)

In [320]:
# Here we train a model without using the negative sampling hyperparameter 
# We will be using this for testing of accuracy of model vs. using the 
# negative sampling function

flow_model = gensim.models.Word2Vec(str_corpora, workers=23, size=200, window=20, min_count=1)

In [10]:
# Here we train a model using the negative sampling which we will then compare
# to the model above for the impact that the negative sampling has on the 
# clustering of flows

flow_model_1 = gensim.models.Word2Vec(str_corpora, workers=23, size=100, window=30, negative=10, sample=5)

## Preliminary results (very rough, no real hyperparameter tunings, etc.)

We can see below the results may prove to be useful with respect to certain labels present in the dataset, but not others. This may have to do with the raw occurence rates of certain flow and window #$(f,w)$ combinations vs. others. I use labels lightly as well as this will ultimately become an exercise of semi-supervised learning as it can sometimes be impossible for humans to interpret the results of an unsupervised learning task without any type of contextual insight, as labels can provide

We can tune for this using subsampling above in the SGNS model. Which will we do next.

In [16]:
# Test for flow similarity, preferrably a flow that has the botnet label

flow_model_1.most_similar("['147.32.84.165', '192.33.4.12', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']")

[("['147.32.84.165', '192.5.5.241', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.9761667847633362),
 ("['147.32.84.165', '202.12.27.33', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.9741541743278503),
 ("['147.32.84.165', '128.8.10.90', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.973616898059845),
 ("['147.32.84.165', '78.47.76.4', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.9714504480361938),
 ("['147.32.84.165', '193.0.14.129', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.9692395925521851),
 ("['147.32.84.165', '199.7.83.42', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.9687032699584961),
 ("['147.32.84.165', '192.228.79.201', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.9674479961395264),
 ("['147.32.84.165', '192.58.128.30', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.9664252400398254),
 ("['147.32.84.165', '92.53.98.100', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  0.9656703472137451),
 ("['147.32.84.165', '192.112.36.4', '53

In [23]:
flow_model_1.most_similar("['147.32.84.165', '60.190.223.75', '888', 'tcp', 'flow=From-Botnet-V42-TCP-CC6-Plain-HTTP-Encrypted-Data']")

[("['217.66.146.105', '147.32.84.229', '443', 'tcp', 'flow=Background-TCP-Established']",
  0.970333993434906),
 ("['188.26.176.163', '147.32.84.229', '13363', 'udp', 'flow=Background-UDP-Established']",
  0.963600218296051),
 ("['114.75.11.242', '147.32.84.229', '80', 'tcp', 'flow=Background-TCP-Established']",
  0.9627201557159424),
 ("['147.32.86.96', '147.32.87.29', '0xb612', 'icmp', 'flow=Background']",
  0.9622609615325928),
 ("['195.234.241.9', '147.32.84.229', '13363', 'udp', 'flow=Background-UDP-Established']",
  0.9621870517730713),
 ("['41.130.66.62', '147.32.84.229', '13363', 'udp', 'flow=Background-UDP-Established']",
  0.9606925249099731),
 ("['131.104.149.212', '147.32.84.229', '13363', 'udp', 'flow=Background-UDP-Established']",
  0.9604771733283997),
 ("['147.32.84.59', '90.146.27.130', '46356', 'udp', 'flow=Background-Attempt-cmpgw-CVUT']",
  0.9597481489181519),
 ("['147.32.84.229', '78.141.179.11', '34046', 'udp', 'flow=Background-UDP-Established']",
  0.95972657203

### Aggregated flows, equivalent to "phrases"

word2vec has the option of being able to model phrases of words

## Clustering

Now that we have some vector representations of occurences of flows within the captures that we have, we can run a clustering algorithm over them to see if we can humanly identify some of the groupings that have taken place. For this, we'll use kmeans within the scikit-learn package.

Kmeans has an objective function that intends to partition $n$ objects into $k$ clusters in which each object, $n$, belongs to the cluster with the nearest mean. This can be seen as :

$$ J = \sum_{j=1}^{k}\sum_{i=1}^{n} \| x_{i}^{(j)} - c_{j}\|^2 $$

In [19]:
# Set k (number of clusters) to be 1/5 of the "vocabulary" size
# or an average of flows per cluster, this is a hyperparameter
# in kmeans that we can tweak later on

flow_vectors = flow_model_1.syn0[0:20000]
num_clusters = flow_vectors.shape[0] / 5

# Initialize k-means object and use it to extract centroids

kmeans_clustering = cluster.KMeans(n_clusters = num_clusters, init="k-means++", n_jobs=-1)
idx = kmeans_clustering.fit_predict(flow_vectors)

# Create a flow / Index dictionary, mapping "vocabulary words" to
# a cluster number

flow_centroid_map = dict(zip(flow_model_1.index2word, idx))

In [20]:
import operator
sorted_clusters = sorted(flow_centroid_map.items(), key=operator.itemgetter(1))

botnets = []

for i in sorted_clusters:
    if re.search(r"Botnet", i[0]):
        botnets.append(i)
        
botnets[0:10]

[("['147.32.84.165', '209.86.93.226', '25', 'tcp', 'flow=From-Botnet-V43-TCP-Attempt-SPAM']",
  3),
 ("['147.32.84.165', '192.33.4.12', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  14),
 ("['147.32.84.165', '85.214.220.206', '25', 'tcp', 'flow=From-Botnet-V42-TCP-Attempt-SPAM']",
  40),
 ("['147.32.84.165', '77.88.210.88', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  48),
 ("['147.32.84.165', '75.180.132.243', '25', 'tcp', 'flow=From-Botnet-V42-TCP-Attempt-SPAM']",
  49),
 ("['147.32.84.165', '67.23.231.68', '53', 'udp', 'flow=From-Botnet-V42-UDP-Attempt-DNS']",
  73),
 ("['147.32.84.165', '60.190.223.75', '888', 'tcp', 'flow=From-Botnet-V42-TCP-CC6-Plain-HTTP-Encrypted-Data']",
  74),
 ("['147.32.84.165', '80.93.50.53', '53', 'udp', 'flow=From-Botnet-V42-UDP-DNS']",
  89),
 ("['147.32.84.165', '94.100.176.20', '25', 'tcp', 'flow=From-Botnet-V43-TCP-Attempt-SPAM']",
  91),
 ("['147.32.84.165', '74.125.159.27', '25', 'tcp', 'flow=From-Botnet-V42-TCP-Attempt-SPAM']",
  99)]

In [21]:
cluster_members = []
for i in sorted_clusters:
    if i[1] == 73:
        cluster_members.append(i)
    
cluster_members[0:10]

[("['147.32.84.59', '72.21.210.129', '80', 'tcp', 'flow=Background-Established-cmpgw-CVUT']",
  73),
 ("['62.162.92.225', '147.32.84.229', '13363', 'udp', 'flow=Background-UDP-Established']",
  73),
 ("['147.32.84.59', '208.88.186.10', '34021', 'udp', 'flow=Background-Established-cmpgw-CVUT']",
  73),
 ("['147.32.84.165', '67.23.231.68', '53', 'udp', 'flow=From-Botnet-V42-UDP-Attempt-DNS']",
  73),
 ("['200.148.213.27', '147.32.84.229', '13363', 'udp', 'flow=Background-UDP-Established']",
  73),
 ("['187.75.138.219', '147.32.84.229', '13363', 'udp', 'flow=Background-UDP-Established']",
  73)]

## Cluster visualization

Raw flow vectors $V_{f}$, created by word2vec, are embedded in dimensionality equivalent to the input layer of the shallow neural network that is used within the model. In our example we're using 

### t-SNE Visualization

Use t-SNE and matplotlib to visualize the clusters created using Word2Vec.


In [10]:
def perform_tsne(word_vector):
    tsne = manifold.TSNE(n_components=2, random_state=42)
    return tsne.fit_transform(word_vector)

In [485]:
#flow_model_reduced = TruncatedSVD(n_components=100, random_state=42).fit_transform(flow_model_1.syn0)
test_tsne = manifold.TSNE(n_components=2, learning_rate=50).fit_transform(flow_model_1.syn0[0:4000])

In [None]:
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'), figsize=(10, 10))

x = test_tsne[:,0]
y = test_tsne[:,1]

mpld3_scatter = ax.scatter(x, y, cmap='Blues', c = y)
ax.grid(color='white', linestyle='solid')

labels = [v[0] for k,v in enumerate(flow_model_1.vocab.items()[0-4000:])]
tooltip = mpld3.plugins.PointLabelTooltip(mpld3_scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)

In [None]:
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'), figsize=(10, 10))


mpld3_scatter = ax.scatter(tsne_objs[0][:, 0], tsne_objs[0][:, 1])
ax.grid(color='white', linestyle='solid')

#ax.set_title("Scatter Plot (with tooltips!)", size=20)

#labels = [v[0][0] for k,v in enumerate(sample)]
tooltip = mpld3.plugins.PointLabelTooltip(mpld3_scatter)
mpld3.plugins.connect(fig, tooltip)

In [None]:
fig = plt.figure(figsize=(70, 70))
ax = plt.axes(frameon=False)
plt.setp(ax,xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
plt.scatter(flow_model_embedded_1[:, 0], flow_model_embedded_1[:, 1], marker="x")

#for k,v in enumerate(flow_model.vocab.items()):
#    plt.annotate(v[0], flow_model_embedded_1[k])

plt.savefig('test2.eps', format='eps', dpi=600)

## Things left to test

* Running true tuples of SRCIP, DSTIP, DSTPORT, and PROTO (label included for now, need to figure out how to persist through pipeline without skewing results)
* Tune hyperparameters of models for all algorithms (word2vec, kmeans, tSNE)
* Find fixes for limitations of larger datasets for tooling that has dependencies on numpy (kmeans, tSNE)


In [None]:
# Generate hash for all flows within the dataset

flowdata_dict = {}

# Parallelize the hashing of flows

with concurrent.futures.ProcessPoolExecutor() as executor:
    for flow in executor.map(process_flow, flowdata_sample.iterrows()):
        flowdata_dict[flow[0]] = (flow[1], flow[2])

In [None]:
## ONLY USE THIS BLOCK IF YOU WANT PER TO SORT FLOWS PER IP

# Lists for tcp and udp flows

tcp_flows = []
udp_flows = []

# Iterate over dataframe

for d in flowdata_sample.iterrows():
    if d[1][2] == 'tcp':
        
        #Append flow
        
        tcp_flows.append(d)
        
    elif d[1][2] == 'udp':
        udp_flows.append(d)

# Set for identifying unique IPs from flows

unique_per_proto = set()

for flow in tcp_flows:
    
    # Add unique SrcAddr to set for TCP flows
    
    unique_per_proto.add(flow[1][3])

for flow in udp_flows:
    
    # Add unique SrcAddr to set for UDP flows
    
    unique_per_proto.add(flow[1][3])

In [None]:
## ONLY USE THIS BLOCK IF YOU WANT PER TO SORT FLOWS PER IP

# Set for unique IPs for overall flowset 
# Maintaining ordering of existing data
# Use this if we wanted a corpus per srcIP

unique_per_flow = set()

for d in flowdata_sample.iterrows():
    unique_per_flow.add(d[1][3])

In [None]:
# Sort flows according to srcIP

ip_dicts = []

# Parallelization framework

with concurrent.futures.ProcessPoolExecutor() as executor:
    
    # pass in unique set to executor
    # Return dict from each process
    
    for d in executor.map(sort_ip_flow, unique_per_proto):
    
        # Roll all dicts up into list
        
        ip_dicts.append(d)