In [14]:
# Import libraries
library(caret)
library(pander)
library(doMC)
library(plyr)
library(dplyr)
library(Matrix)
library(data.table)
library(stringr)
library(FeatureHashing)
library(ggplot2)
library(d3heatmap)

# Install required packages
#list.of.packages <- c("caret", "pander", "doMC", "plyr",
#                     "dplyr", "Matrix", "data.table",
#                     "stringr")
#new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
#if(length(new.packages)) install.packages(new.packages)


# Register CPU core count
# Use up to 23 workers, but never more than the machine reports, so smaller
# hosts are not oversubscribed. detectCores() may return NA; with
# na.rm = TRUE the call then falls back to the original fixed count of 23.
registerDoMC(cores = min(23L, parallel::detectCores(), na.rm = TRUE))

# Percentage frequency table helper.
#   x       : vector (typically a factor) whose values are tabulated.
#   caption : accepted so existing calls keep working; the body never reads it.
# Returns table(x) rescaled to percentages and rounded to 3 decimal places.
frqtab <- function(x, caption) {
    counts <- table(x)
    percentages <- 100 * prop.table(counts)
    round(percentages, digits = 3)
}

# Utility function to round values in a list, but only if they are numeric.
#   lst      : list (or vector) whose elements are inspected one by one.
#   decimals : number of decimal places passed to round(); defaults to 2.
# Returns a list of the same length and names as `lst`; numeric elements are
# rounded, every other element is passed through unchanged.

round_numeric <- function(lst, decimals=2) {
    # lapply (not the misspelled `lappy`) keeps names and always returns a list.
    lapply(lst, function(x) {
        if (is.numeric(x)) {
            round(x, decimals)
        } else {
            x
        }
    })
}

# Utility function for model comparison.
#   cm  : confusion-matrix object exposing $table, $overall and $byClass
#         (as produced by caret::confusionMatrix for a two-class problem).
#   fit : fitted caret model exposing $finalModel$k, $metric and $results.
# Returns a list with the selected k, the tuning metric and its resampled
# value at that k, the four cells of the 2x2 table, and the headline
# statistics; numeric entries are rounded by round_numeric().
# NOTE(review): the TN/TP/FN/FP mapping treats the second class as the
# positive one - confirm this matches the `positive` level used to build `cm`.

summod <- function(cm, fit) {
    best_k <- fit$finalModel$k
    summ <- list(k = best_k,
                metric = fit$metric,
                # Row of the tuning results that corresponds to the chosen k.
                value = fit$results[fit$results$k == best_k, fit$metric],
                TN = cm$table[1,1], # True negatives
                TP = cm$table[2,2], # True positives
                FN = cm$table[1,2], # False negatives
                FP = cm$table[2,1], # False positives
                acc = cm$overall["Accuracy"],
                sens = cm$byClass["Sensitivity"],
                spec = cm$byClass["Specificity"],
                # byClass uses the abbreviated names below; the long
                # spellings do not exist and would silently yield NA.
                PPV = cm$byClass["Pos Pred Value"],
                NPV = cm$byClass["Neg Pred Value"])
    round_numeric(summ)
}

# Min-max rescaling helper.
#   x : numeric vector.
# Returns (x - min(x)) / (max(x) - min(x)), i.e. the smallest value maps to 0
# and the largest to 1. A constant vector yields NaN (0 / 0).

normalize <- function(x){
    lowest <- min(x)
    span <- max(x) - lowest
    (x - lowest) / span
}

# Keep only the rows of `df` that fall inside a time window measured from the
# first row's StartTime.
#   df       : data frame with a StartTime column that supports `+` with a
#              number of seconds (assumed POSIXct - confirm against caller).
#   slice    : unit of `interval`; one of 'secs', 'mins', 'hours', 'days'.
#   interval : window length, expressed in `slice` units.
# Returns the filtered rows. For any other `slice` value the message below is
# printed and returned as a character string instead of a data frame.

timeslice <- function(df, slice, interval) {
    # Translate the requested unit into a number of seconds.
    if (slice == 'secs') {
        unit_seconds <- 1
    } else if (slice == 'mins') {
        unit_seconds <- 60
    } else if (slice == 'hours') {
        unit_seconds <- 3600
    } else if (slice == 'days') {
        unit_seconds <- 86400
    } else {
        error <- print("Please enter a valid time interval.")
        return(error)
    }
    # The window is inclusive of its upper bound.
    subset(df, df$StartTime <= df$StartTime[1] + (interval * unit_seconds))
}

In [2]:
# Read .binetflow file into dataframe

#flowdata_csv <- read.csv("capture20110810.binetflow", colClasses = c("myPosixCt", "numeric", "character", 
                                                                    #"character","character","character",
                                                                    #"character","character","character",
                                                                    #"character","character","numeric", 
                                                                    #"numeric", "numeric", "character"), 
                                                                    #strip.white = TRUE, sep = ',')

flowdata_csv <- fread(
    "capture20110810.binetflow",
    # One class per column, 15 in total: the first column is read as text
    # (it is converted to POSIXct further down), the second as numeric, the
    # next nine as text, the following three as numeric, and the last as text.
    colClasses = c("character", "numeric",
                   rep("character", 9),
                   rep("numeric", 3),
                   "character"),
    sep = 'auto'
)

# Set POSIX formatting for StartTime
#
# `digits.secs` is the option R consults for the number of fractional-second
# digits shown when date-times are printed; `set.seconds` is not a documented
# R option, so setting it had no effect on output.
options(digits.secs = 6)

# %OS parses seconds including their fractional part.
flowdata_csv$StartTime <- as.POSIXct(flowdata_csv$StartTime, format = "%Y/%m/%d %H:%M:%OS")
    
# Trim leading and trailing whitespace
##TODO

Read 2.8% of 2824636 rowsRead 13.8% of 2824636 rowsRead 25.5% of 2824636 rowsRead 38.6% of 2824636 rowsRead 54.9% of 2824636 rowsRead 59.1% of 2824636 rowsRead 75.1% of 2824636 rowsRead 83.9% of 2824636 rowsRead 98.8% of 2824636 rowsRead 2824636 rows and 15 (of 15) columns from 0.360 GB file in 00:00:15


In [3]:
# Subset data
#
# Draw a fixed-size random sample of the flows and keep only the columns
# named in cat_vars.

#flowdata_slice <- timeslice(flowdata_csv, 'mins', 9)
# Seed the RNG immediately before sampling so the same rows are drawn on
# every run.
set.seed(12345)

# 75000 rows sampled from the full capture.
flowdata_slice <- sample_n(flowdata_csv, 75000)

# Keep only cat vars
cat_vars <- c("Proto", "SrcAddr", "Sport", "DstAddr", "Dport", "Label")

flowdata_slice <- subset(flowdata_slice, select = cat_vars)

# Show the structure of the reduced sample.
str(flowdata_slice)

Classes ‘data.table’ and 'data.frame':	75000 obs. of  6 variables:
 $ Proto  : chr  "udp" "udp" "icmp" "udp" ...
 $ SrcAddr: chr  "147.32.84.138" "77.85.159.196" "147.32.85.75" "83.53.49.72" ...
 $ Sport  : chr  "59847" "2953" "0x0a03" "10105" ...
 $ DstAddr: chr  "147.32.80.9" "147.32.84.229" "113.105.171.59" "147.32.86.165" ...
 $ Dport  : chr  "53" "13363" "" "12114" ...
 $ Label  : chr  "flow=To-Background-UDP-CVUT-DNS-Server" "flow=Background-UDP-Established" "flow=Background" "flow=Background-UDP-Attempt" ...
 - attr(*, ".internal.selfref")=<externalptr> 


In [4]:
# Factorize columns in dataframe
#
# Each of the six retained columns is converted from character to factor.

flowdata_slice$Proto <- as.factor(flowdata_slice$Proto)
flowdata_slice$SrcAddr <- as.factor(flowdata_slice$SrcAddr)
flowdata_slice$Sport <- as.factor(flowdata_slice$Sport)
flowdata_slice$DstAddr <- as.factor(flowdata_slice$DstAddr)
flowdata_slice$Dport <- as.factor(flowdata_slice$Dport)
flowdata_slice$Label <- as.factor(flowdata_slice$Label)

# Drop rows that contain NA values.
# NOTE(review): empty strings are not NA, so rows with a blank port survive
# this step; blanks are only turned into NA later, after the train/test split.

flowdata_slice <- na.omit(flowdata_slice)

In [5]:
# Fix the RNG state so the partition below is reproducible.

set.seed(1234)

# Partition the sample into training and test sets.
## Every row independently receives group 1 (probability 0.67) or group 2
## (probability 0.33), so the split is approximately 67/33.

ind <- sample.int(
    2,
    size = nrow(flowdata_slice),
    replace = TRUE,
    prob = c(0.67, 0.33)
)

flowdata_training <- flowdata_slice[which(ind == 1), ]
flowdata_test <- flowdata_slice[which(ind == 2), ]

In [6]:
# Show how the labels are distributed, in percent.

# Distribution over the whole sample, rendered as a markdown table.
ft_orig <- frqtab(flowdata_slice$Label)
label_freq <- pander(ft_orig,
                     style = "rmarkdown",
                     caption = "Original Label Frequency (%)")

# Same distribution for each partition, placed side by side with the original
# so the three can be compared per label.
ft_train <- frqtab(flowdata_training$Label)
ft_test <- frqtab(flowdata_test$Label)
ftcmp_df <- as.data.frame(cbind("Original" = ft_orig,
                                "Training Set" = ft_train,
                                "Test Set" = ft_test))
pander(ftcmp_df,
       style = "rmarkdown",
       caption = "Comparison of Label frequencies ( in %)")



|  flow=Background  |  flow=Background-ajax.google  |
|:-----------------:|:-----------------------------:|
|       1.533       |             0.023             |

Table: Original Label Frequency (%) (continued below)

 

|  flow=Background-Attempt-cmpgw-CVUT  |
|:------------------------------------:|
|                1.103                 |

Table: Table continues below

 

|  flow=Background-Established-cmpgw-CVUT  |
|:----------------------------------------:|
|                  4.913                   |

Table: Table continues below

 

|  flow=Background-google-analytics1  |  flow=Background-google-analytics10  |
|:-----------------------------------:|:------------------------------------:|
|                0.015                |                0.021                 |

Table: Table continues below

 

|  flow=Background-google-analytics11  |  flow=Background-google-analytics12  |
|:------------------------------------:|:------------------------------------:|
|                0.0

In [7]:
# Treat empty strings as missing values in both partitions.
flowdata_training[flowdata_training==''] <- NA
flowdata_test[flowdata_test==""] <- NA

# Continue with plain data frames from here on.
flowdata_training <- as.data.frame(flowdata_training)
flowdata_test <- as.data.frame(flowdata_test)

# Predictor columns and the outcome column.
cat_vars <- c("Proto", "SrcAddr", "Sport", "DstAddr", "Dport")
labels <- c("Label")

# Pull the outcome out as a plain vector, one per partition.
flowdata_training_labels <- flowdata_training[[labels]]
flowdata_test_labels <- flowdata_test[[labels]]

# Reduce each partition to its predictor columns.
flowdata_training <- flowdata_training[cat_vars]
flowdata_test <- flowdata_test[cat_vars]

#dummy_flowdata_training <- dummyVars(~ ., data = flowdata_training, fullRank = TRUE)

#dummy_flowdata_test <- dummyVars(~ ., data = flowdata_test, fullRank = TRUE)

#trsf1 <- data.frame(predict(dummy_flowdata_training, newdata = flowdata_training))
#trfs2 <- data.frame(predict(dummy_flowdata_test, newdata = flowdata_test))

In [27]:
# Print the structure of the predictors and labels, training set first and
# test set second.
invisible(lapply(
    list(flowdata_training,
         flowdata_training_labels,
         flowdata_test,
         flowdata_test_labels),
    str
))

'data.frame':	50095 obs. of  5 variables:
 $ Proto  : Factor w/ 11 levels "arp","esp","icmp",..: 11 11 3 11 11 11 11 11 11 10 ...
 $ SrcAddr: Factor w/ 28668 levels "00:0c:29:97:a3:eb",..: 4085 16828 4206 20298 9921 4191 23980 15146 4185 4237 ...
 $ Sport  : Factor w/ 37710 levels "","0x0003","0x0008",..: 32668 9019 8 83 15561 35060 33894 13811 13514 36490 ...
 $ DstAddr: Factor w/ 8812 levels "10.0.0.2","101.102.51.96",..: 1385 1479 347 1722 1479 1385 1479 1479 1385 6691 ...
 $ Dport  : Factor w/ 7733 levels "","0x0001","0x0002",..: 5777 1001 NA 831 1001 5777 1001 1001 5777 7509 ...
 Factor w/ 63 levels "flow=Background",..: 62 26 1 25 26 62 26 26 62 24 ...
'data.frame':	24905 obs. of  5 variables:
 $ Proto  : Factor w/ 11 levels "arp","esp","icmp",..: 11 11 10 11 11 11 11 11 11 11 ...
 $ SrcAddr: Factor w/ 28668 levels "00:0c:29:97:a3:eb",..: 4238 28406 4137 4272 4085 25805 6976 24395 4427 4085 ...
 $ Sport  : Factor w/ 37710 levels "","0x0003","0x0008",..: 30381 18527 1679 32273 186

In [32]:
# Train on the dataset

# Resampling scheme: repeated cross-validation with 10 repeats (the number of
# folds is left at caret's default).
ctrl <- trainControl(method = "repeatedcv", repeats = 10)

# Fit a naive Bayes classifier on the predictors against the labels.
flow_model_1 <- train(
    x = flowdata_training,
    y = flowdata_training_labels,
    method = 'nb',
    trControl = ctrl,
    tuneLength = 10
)

# Display the fitted model summary.
flow_model_1

Naive Bayes 

50095 samples
    5 predictors
   63 classes: 'flow=Background', 'flow=Background-ajax.google', 'flow=Background-Attempt-cmpgw-CVUT', 'flow=Background-Established-cmpgw-CVUT', 'flow=Background-google-analytics1', 'flow=Background-google-analytics10', 'flow=Background-google-analytics11', 'flow=Background-google-analytics12', 'flow=Background-google-analytics13', 'flow=Background-google-analytics14', 'flow=Background-google-analytics15', 'flow=Background-google-analytics16', 'flow=Background-google-analytics2', 'flow=Background-google-analytics3', 'flow=Background-google-analytics4', 'flow=Background-google-analytics5', 'flow=Background-google-analytics6', 'flow=Background-google-analytics7', 'flow=Background-google-analytics8', 'flow=Background-google-analytics9', 'flow=Background-google-pop', 'flow=Background-google-webmail', 'flow=Background-TCP-Attempt', 'flow=Background-TCP-Established', 'flow=Background-UDP-Attempt', 'flow=Background-UDP-Established', 'flow=Backgroun

In [10]:
# Predict a label for every row of the held-out test predictors.

flow_model_prediction <- predict(flow_model_1, newdata = flowdata_test)

In [11]:
# Compare the predicted labels with the true test labels.

cmat1 <- confusionMatrix(
    data = flow_model_prediction,
    reference = flowdata_test_labels
)
cmat1

Confusion Matrix and Statistics

                                                                         Reference
Prediction                                                                flow=Background
  flow=Background                                                                     319
  flow=Background-ajax.google                                                           0
  flow=Background-Attempt-cmpgw-CVUT                                                    1
  flow=Background-Established-cmpgw-CVUT                                                5
  flow=Background-google-analytics1                                                     0
  flow=Background-google-analytics10                                                    0
  flow=Background-google-analytics11                                                    0
  flow=Background-google-analytics12                                                    0
  flow=Background-google-analytics13                                      

In [12]:
# Inspect the per-class statistics stored in the confusion matrix object.
str(cmat1[["byClass"]])

 num [1:63, 1:8] 0.837 1 0.708 0.897 0.333 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:63] "Class: flow=Background" "Class: flow=Background-ajax.google" "Class: flow=Background-Attempt-cmpgw-CVUT" "Class: flow=Background-Established-cmpgw-CVUT" ...
  ..$ : chr [1:8] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...


In [None]:
# Overall statistics of the confusion matrix.
cmat1$overall

# Long-format copy of the confusion table: one row per (Prediction, Reference)
# pair with its count in Freq. cmat1$table is an atomic table, so `$` cannot
# be used on it; as.data.frame() is the supported conversion.
cmat1_df <- as.data.frame(cmat1$table)


In [17]:
# Render the confusion matrix as an interactive heatmap, scaled per column and
# without dendrograms.
# NOTE(review): `color` presumably reaches d3heatmap's `colors` argument via
# partial matching - confirm against the d3heatmap signature.
# NOTE(review): the variable name `heatmap` is also the name of
# stats::heatmap().
heatmap <- d3heatmap(cmat1, scale = "column", dendrogram = "none", color="Blues")

In [26]:
# Inspect the structure of the object returned by d3heatmap().
str(heatmap)

List of 7
 $ x            :List of 6
  ..$ rows   : NULL
  ..$ cols   : NULL
  ..$ matrix :List of 4
  .. ..$ data: chr [1:3969] "319" "0" "0" "0" ...
  .. ..$ dim : int [1:2] 63 63
  .. ..$ rows: chr [1:63] "flow=Background" "flow=Background-ajax.google" "flow=Background-Attempt-cmpgw-CVUT" "flow=Background-Established-cmpgw-CVUT" ...
  .. ..$ cols: chr [1:63] "flow=Background" "flow=Background-ajax.google" "flow=Background-Attempt-cmpgw-CVUT" "flow=Background-Established-cmpgw-CVUT" ...
  ..$ image  : chr "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAD8AAAA/CAYAAABXXxDfAAAEIUlEQVRoge2bT2gcVRyAv5ndNQvWVEoSD0qhBzFrESqtafEorRhaCbTEQ"| __truncated__
  ..$ theme  : NULL
  ..$ options:List of 7
  .. ..$ xaxis_height   : num 80
  .. ..$ yaxis_width    : num 120
  .. ..$ xaxis_font_size: NULL
  .. ..$ yaxis_font_size: NULL
  .. ..$ brush_color    : chr "#0000FF"
  .. ..$ show_grid      : logi TRUE
  .. ..$ anim_duration  : num 500
 $ width        : NULL
 $ height       : NULL
 $ sizingPoli