In [7]:
#
#       mnist_softmax_regression.py
#
#       Softmax regression using TensorFlow. Uses the MNIST
#       database of handwritten digits (http://yann.lecun.com/exdb/mnist/) 
#
#       See http://www.tensorflow.org/tutorials/mnist/beginners
#
#	David Meyer
#	dmm@1-4-5.net
#	Mon Nov 30 07:44:52 2015
#
#	$Header: $
#
#
#
#       Need these
#
import input_data             # google code to import MNIST data sets
import tensorflow as tf
import numpy as np
import pandas as pd
import os
#
#       Run settings (read by the training cell further down)
#
DEBUG = 1                 # truthy => print the per-epoch optimization log
learning_rate = 0.01      # step size handed to the gradient-descent optimizer
training_epochs = 25      # number of passes of the outer training loop
batch_size = 100          # examples requested per minibatch
display_step = 1          # log every display_step-th epoch when DEBUG is set

In [9]:
#
#       Load the CTU-13 flow capture (capture20110810.binetflow) with pandas
#       and draw a 10,000-row random sample from it.
#
#       The file location defaults to the original path but can be overridden
#       with the CTU13_FLOW_CSV environment variable, so the notebook is not
#       tied to one machine. The sample is seeded so that re-running the
#       notebook selects the same rows.
#
FLOW_CSV_PATH = os.environ.get(
    'CTU13_FLOW_CSV',
    '/home/ehenry/CTU-13-Dataset/1/capture20110810.binetflow')
SAMPLE_SEED = 42

flowdata = pd.read_csv(FLOW_CSV_PATH)

flowdata_sample = flowdata.sample(n=10000, random_state=SAMPLE_SEED)

In [78]:
# Keep only the numeric flow statistics plus the label column. reindex()
# is used for the selection, so a column name absent from the sample would
# come back as an all-NaN column rather than raising.
subset_columns = ['Dur', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']
flowdata_subset = flowdata_sample.reindex(columns=subset_columns)
flowdata_subset.head(10)

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes,Label
2092415,0.000921,2,136,76,flow=Background-UDP-Established
2726205,9.016457,3,222,222,flow=Background-TCP-Attempt
1642260,0.202772,2,380,75,flow=To-Background-UDP-CVUT-DNS-Server
1688601,3495.481689,16,2318,1286,flow=Background-UDP-Established
1210541,0.000475,2,473,140,flow=Background-UDP-Established
880743,0.000194,2,214,81,flow=To-Background-UDP-CVUT-DNS-Server
2361892,3102.302246,4,274,154,flow=Background-UDP-Established
2311378,17.925226,6,372,246,flow=Background-Established-cmpgw-CVUT
2432686,0.110233,2,129,69,flow=Background-UDP-Established
268055,0.589232,3,192,132,flow=Background-TCP-Attempt


Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
2092415,2011/08/10 14:15:22.578252,0.000921,udp,41.99.110.99,59356,<->,147.32.84.229,13363,CON,0,0.0,2,136,76,flow=Background-UDP-Established
2726205,2011/08/10 15:41:28.399526,9.016457,tcp,93.136.73.51,52949,->,147.32.86.165,443,S_,0,,3,222,222,flow=Background-TCP-Attempt
1642260,2011/08/10 13:20:14.751880,0.202772,udp,147.32.84.59,50656,<->,147.32.80.9,53,CON,0,0.0,2,380,75,flow=To-Background-UDP-CVUT-DNS-Server
1688601,2011/08/10 13:26:34.390799,3495.481689,udp,147.32.84.229,13363,<->,89.246.168.138,52187,CON,0,0.0,16,2318,1286,flow=Background-UDP-Established
1210541,2011/08/10 12:21:12.630016,0.000475,udp,94.195.168.69,55591,<->,147.32.86.194,35155,CON,0,0.0,2,473,140,flow=Background-UDP-Established
880743,2011/08/10 11:32:33.292579,0.000194,udp,147.32.84.138,59733,<->,147.32.80.9,53,CON,0,0.0,2,214,81,flow=To-Background-UDP-CVUT-DNS-Server
2361892,2011/08/10 14:53:37.316938,3102.302246,udp,81.137.240.148,13268,<->,147.32.84.229,13363,CON,0,0.0,4,274,154,flow=Background-UDP-Established
2311378,2011/08/10 14:46:32.110398,17.925226,tcp,147.32.84.59,56785,->,195.113.232.83,80,FSA_FSA,0,0.0,6,372,246,flow=Background-Established-cmpgw-CVUT
2432686,2011/08/10 15:02:56.180333,0.110233,udp,147.32.84.229,13363,<->,94.8.68.54,41681,CON,0,0.0,2,129,69,flow=Background-UDP-Established
268055,2011/08/10 10:18:58.424820,0.589232,tcp,70.37.98.60,61744,->,147.32.85.100,1656,S_RA,0,0.0,3,192,132,flow=Background-TCP-Attempt


In [None]:
#
#       Softmax regression on MNIST: build the graph, train it with
#       minibatch gradient descent, then report test-set accuracy.
#
#       Training and evaluation share one tf.Session block on purpose: the
#       trained values of W and b live in the session, so accuracy has to
#       be evaluated before the session is closed.
#
#       Reads DEBUG, learning_rate, training_epochs, batch_size and
#       display_step from the parameters cell.
#

#
#       Load MNIST with one-hot labels, downloading to /tmp/data if needed.
#       NOTE(review): assumes input_data is the TensorFlow tutorial helper
#       that provides read_data_sets() -- confirm.
#
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

#
#       build tf computation graph
#
#       MNIST data image has shape 28*28=784
#
x = tf.placeholder("float", [None, 784])

#
#       0-9 digits recognition => 10 classes (one hot)
#
y = tf.placeholder("float", [None, 10])

#
#       Model parameters
#
W = tf.Variable(tf.zeros([784, 10]), name="weight_matrix")
b = tf.Variable(tf.zeros([10]),      name="bias_vector")

#
#       model (softmax); the logits are kept separately so the loss can be
#       computed from them directly
#
logits = tf.matmul(x, W) + b
y_hat = tf.nn.softmax(logits)

#
#       use the (convex) cross entropy error cost, summed over the batch
#
#       L(W) = - \sum\limits_{n = 1}^{N} \sum\limits_{k = 1}^{K}
#                   y_{nk} \log \hat{y}_{nk}
#       where
#       \hat{y}_{n} \equiv softmax(x_{n} W + b) and y_{n} is the one-hot label
#
#       Computed from the logits rather than as -sum(y * log(y_hat)): the
#       value is the same, but log(y_hat) becomes -inf (and the cost NaN)
#       as soon as a predicted probability underflows to 0.
#
cross_entropy = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))

#
#       Train with GD/minibatch
#
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)

#
#       Test model
#
#       tf.argmax gives the index of the highest entry in a tensor along
#       some axis: tf.argmax(y_hat, 1) is the label the model thinks is
#       most likely for each input, while tf.argmax(y, 1) is the correct
#       label. tf.equal checks whether the prediction matches the truth.
#
correct_prediction = tf.equal(tf.argmax(y_hat, 1), tf.argmax(y, 1))

#
#       correct_prediction is a vector of booleans. To determine what
#       fraction are correct, we cast to floating point numbers and then
#       take the mean. For example, [True, False, True, True] would become
#       [1,0,1,1] which would become 0.75.
#
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

#
#       Initialize everything
#
init = tf.initialize_all_variables()

#
#       Run it all
#
with tf.Session() as sess:
    sess.run(init)

    #
    #       Training cycle
    #
    for epoch in range(training_epochs):
        avg_cost    = 0.
        total_batch = mnist.train.num_examples // batch_size

        #
        #       Loop over all batches
        #
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            batch_feed = {x: batch_xs, y: batch_ys}

            #
            #       Fit training using batch data
            #
            sess.run(optimizer, feed_dict=batch_feed)

            #
            #       Accumulate the average loss (measured after the update)
            #
            avg_cost += sess.run(cross_entropy, feed_dict=batch_feed) / total_batch

        #
        #       Display logs per epoch step
        #
        if DEBUG and (epoch % display_step) == 0:
            print("Epoch:", '%04d' % (epoch+1),
                  "cost=", "{:.9f}".format(avg_cost))

    print("Done")

    #
    #       Evaluate on the held-out test set while the session (and with
    #       it the trained W and b) is still alive.
    #
    print("Accuracy:",
          sess.run(accuracy,
                   feed_dict={x: mnist.test.images, y: mnist.test.labels}))
