In [1]:
# Import libraries
library(caret)
library(pander)
library(doMC)
library(plyr)
library(dplyr)
library(Matrix)
library(data.table)
library(stringr)

# Register the doMC parallel backend, requesting 23 worker cores
registerDoMC(cores = 23)

# Percentage frequency table helper.
#   x       - vector or factor whose distinct values are tabulated
#   caption - accepted so existing calls keep working; the body does not use it
# Returns a table of relative frequencies expressed in percent,
# rounded to three decimal places.
frqtab <- function(x, caption) {
    counts <- table(x)
    shares <- prop.table(counts)
    round(shares * 100, 3)
}

# Round every numeric element of a list, leaving all other elements as-is.
#   lst      - list (or vector) whose elements are inspected one by one
#   decimals - number of decimal places passed to round(); defaults to 2
# Returns a list of the same length and names as `lst`.

round_numeric <- function(lst, decimals=2) {
    # lapply (not the misspelled `lappy`) walks the elements and keeps names
    lapply(lst, function(x) {
        if (is.numeric(x)) round(x, decimals) else x
    })
}

# Summarise a fitted model and its confusion matrix for model comparison.
#   cm  - object exposing `$table`, `$overall` and `$byClass`
#         (presumably the result of caret::confusionMatrix() - confirm)
#   fit - object exposing `$finalModel$k`, `$metric` and `$results`
#         (presumably the result of caret::train() for a model tuned
#         over `k` - confirm)
# Returns a named list with numeric entries rounded by round_numeric().
#
# NOTE(review): indexing `byClass` by a single name assumes a two-class
# confusion matrix where `byClass` is a named vector - verify for
# multi-class labels.
# NOTE(review): the TN/TP/FN/FP cell assignments below are kept from the
# original; they depend on which class is treated as positive and on the
# row/column orientation of `cm$table` - confirm against the caller.

summod <- function(cm, fit) {
    best_k <- fit$finalModel$k
    summ <- list(k = best_k,
                metric = fit$metric,
                # Row of the tuning results that matches the selected k
                # (the original read `fit$resultes`, which is always NULL)
                value = fit$results[fit$results$k == best_k, fit$metric],
                TN = cm$table[1,1], # True negatives
                TP = cm$table[2,2], # True positives
                FN = cm$table[1,2], # False negatives
                FP = cm$table[2,1], # False positives
                acc = cm$overall["Accuracy"],
                sens = cm$byClass["Sensitivity"],
                spec = cm$byClass["Specificity"],
                # caret names these entries "Pos Pred Value" / "Neg Pred Value"
                PPV = cm$byClass["Pos Pred Value"],
                NPV = cm$byClass["Neg Pred Value"])
    round_numeric(summ)
}

# Min-max rescaling helper.
#   x - numeric vector
# Returns `x` shifted by its minimum and divided by its range
# (max minus min), so the smallest value maps to 0 and the largest to 1.

normalize <- function(x){
    lo <- min(x)
    span <- max(x) - lo
    (x - lo) / span
}

# Keep only the rows of `df` whose StartTime lies within a given time span
# measured from the StartTime of the first row.
#   df       - data frame (or data.table) with a `StartTime` column that
#              supports adding a number of seconds
#   slice    - unit of the span: 'secs', 'mins', 'hours' or 'days'
#   interval - length of the span, expressed in `slice` units
# Returns the subset of `df` with StartTime <= first StartTime + span.
# Raises an error when `slice` is not one of the supported units; the
# original printed a message and returned a character string instead of
# a data frame, which only failed later in the caller.

timeslice <- function(df, slice, interval) {
    if (!is.character(slice) || length(slice) != 1L) {
        stop("Please enter a valid time interval.", call. = FALSE)
    }
    # Seconds per unit; the unnamed last argument is the switch() fallback
    secs_per_unit <- switch(slice,
                            secs = 1,
                            mins = 60,
                            hours = 3600,
                            days = 86400,
                            stop("Please enter a valid time interval.",
                                 call. = FALSE))
    subset(df, df$StartTime <= df$StartTime[1] + (interval * secs_per_unit))
}

Loading required package: lattice
Loading required package: ggplot2
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel

Attaching package: ‘dplyr’

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, last



In [2]:
# Read .binetflow file into dataframe

# Earlier read.csv() variant, kept for reference:
#flowdata_csv <- read.csv("capture20110810.binetflow", colClasses = c("myPosixCt", "numeric", "character", 
                                                                    #"character","character","character",
                                                                    #"character","character","character",
                                                                    #"character","character","numeric", 
                                                                    #"numeric", "numeric", "character"), 
                                                                    #strip.white = TRUE, sep = ',')

# StartTime is read as plain text and converted to POSIXct below; the
# original requested class "myPosixCt", which is not defined anywhere in
# the visible code.
flowdata_csv <- fread("capture20110810.binetflow", colClasses = c("character", "numeric", "character",
                                                                  "character","character","character",
                                                                  "character","character","character",
                                                                  "character","character","numeric",
                                                                  "numeric","numeric", "character"),
                                                                  sep = 'auto')

# Set POSIX formatting for StartTime

# `digits.secs` is the R option that controls how many fractional-second
# digits are printed for date-times (the original set a non-existent
# `set.seconds` option, which had no effect).
options(digits.secs = 6)
flowdata_csv$StartTime <- as.POSIXct(flowdata_csv$StartTime, format = "%Y/%m/%d %H:%M:%OS")

# Trim leading and trailing whitespace
##TODO

Read 2824636 rows and 15 (of 15) columns from 0.360 GB file in 00:00:13
Classes ‘data.table’ and 'data.frame':	2824636 obs. of  15 variables:
 $ StartTime: POSIXct, format: "2011-08-10 09:46:59" "2011-08-10 09:47:00" ...
 $ Dur      : num  1.03 1.01 3.06 3.11 3.08 ...
 $ Proto    : chr  "tcp" "tcp" "tcp" "tcp" ...
 $ SrcAddr  : chr  "94.44.127.113" "94.44.127.113" "147.32.86.89" "147.32.86.89" ...
 $ Sport    : chr  "1577" "1577" "4768" "4788" ...
 $ Dir      : chr  "   ->" "   ->" "   ->" "   ->" ...
 $ DstAddr  : chr  "147.32.84.59" "147.32.84.59" "77.75.73.33" "77.75.73.33" ...
 $ Dport    : chr  "6881" "6881" "80" "80" ...
 $ State    : chr  "S_RA" "S_RA" "SR_A" "SR_A" ...
 $ sTos     : chr  "0" "0" "0" "0" ...
 $ dTos     : chr  "0" "0" "0" "0" ...
 $ TotPkts  : num  4 4 3 3 3 3 4 4 4 5 ...
 $ TotBytes : num  276 276 182 182 182 182 244 252 252 352 ...
 $ SrcBytes : num  156 156 122 122 122 122 124 132 132 208 ...
 $ Label    : chr  "flow=Background-Established-cmpgw-CVUT" "flow=B

In [11]:
# Subset the data

# Keep the flows whose StartTime is within 10 minutes of the first row's
# StartTime, then show the structure of the result
flowdata_slice <- timeslice(flowdata_csv, slice = "mins", interval = 10)

str(flowdata_slice)

Classes ‘data.table’ and 'data.frame':	90783 obs. of  15 variables:
 $ StartTime: POSIXct, format: "2011-08-10 09:46:59" "2011-08-10 09:47:00" ...
 $ Dur      : num  1.03 1.01 3.06 3.11 3.08 ...
 $ Proto    : chr  "tcp" "tcp" "tcp" "tcp" ...
 $ SrcAddr  : chr  "94.44.127.113" "94.44.127.113" "147.32.86.89" "147.32.86.89" ...
 $ Sport    : chr  "1577" "1577" "4768" "4788" ...
 $ Dir      : chr  "   ->" "   ->" "   ->" "   ->" ...
 $ DstAddr  : chr  "147.32.84.59" "147.32.84.59" "77.75.73.33" "77.75.73.33" ...
 $ Dport    : chr  "6881" "6881" "80" "80" ...
 $ State    : chr  "S_RA" "S_RA" "SR_A" "SR_A" ...
 $ sTos     : chr  "0" "0" "0" "0" ...
 $ dTos     : chr  "0" "0" "0" "0" ...
 $ TotPkts  : num  4 4 3 3 3 3 4 4 7 12 ...
 $ TotBytes : num  276 276 182 182 182 ...
 $ SrcBytes : num  156 156 122 122 122 122 124 132 520 638 ...
 $ Label    : chr  "flow=Background-Established-cmpgw-CVUT" "flow=Background-Established-cmpgw-CVUT" "flow=Background-TCP-Attempt" "flow=Background-TCP-Attempt"

In [20]:
# Factorize columns in dataframe
#
# Each listed column is replaced by its as.factor() conversion.
# The other columns of flowdata_slice are not touched in this cell.

flowdata_slice$Proto <- as.factor(flowdata_slice$Proto)
flowdata_slice$SrcAddr <- as.factor(flowdata_slice$SrcAddr)
flowdata_slice$Sport <- as.factor(flowdata_slice$Sport)
flowdata_slice$DstAddr <- as.factor(flowdata_slice$DstAddr)
flowdata_slice$Dport <- as.factor(flowdata_slice$Dport)
flowdata_slice$Label <- as.factor(flowdata_slice$Label)

# Clean-up: drop every row that contains an NA value

flowdata_slice <- na.omit(flowdata_slice)

In [21]:
# Fix the RNG seed so the random split below is reproducible

set.seed(1234)

# Split the dataset into training and test sets
## each row is independently assigned to group 1 (training) or group 2
## (test) with probabilities 0.67 and 0.33

ind <- sample(2, size = nrow(flowdata_slice), replace = TRUE,
              prob = c(0.67, 0.33))

flowdata_test <- flowdata_slice[ind == 2, ]
flowdata_training <- flowdata_slice[ind == 1, ]

#flowdata_training_classes <- flowdata_conts[ind==1,5]
#flowdata_test_classes <- flowdata_conts[ind==2,5]

In [22]:
# Show how the Label values are distributed (in percent) across the
# full slice, the training set and the test set

ft_orig <- frqtab(flowdata_slice$Label)
label_freq <- pander(ft_orig, style = "rmarkdown",
                     caption = "Original Label Frequency (%)")

ft_train <- frqtab(flowdata_training$Label)
ft_test <- frqtab(flowdata_test$Label)

# Side-by-side comparison: one column per dataset
ftcmp_df <- setNames(
    as.data.frame(cbind(ft_orig, ft_train, ft_test)),
    c("Original", "Training Set", "Test Set")
)
pander(ftcmp_df, style = "rmarkdown",
       caption = "Comparison of Label frequencies ( in %)")



|  flow=Background  |  flow=Background-ajax.google  |
|:-----------------:|:-----------------------------:|
|       2.831       |             0.029             |

Table: Original Label Frequency (%) (continued below)

 

|  flow=Background-Attempt-cmpgw-CVUT  |
|:------------------------------------:|
|                1.301                 |

Table: Table continues below

 

|  flow=Background-Established-cmpgw-CVUT  |
|:----------------------------------------:|
|                  3.417                   |

Table: Table continues below

 

|  flow=Background-google-analytics1  |  flow=Background-google-analytics10  |
|:-----------------------------------:|:------------------------------------:|
|                0.022                |                0.126                 |

Table: Table continues below

 

|  flow=Background-google-analytics11  |  flow=Background-google-analytics12  |
|:------------------------------------:|:------------------------------------:|
|                0.0

In [23]:
# Drop rows containing NA values from both the training and the test set

flowdata_training <- na.omit(flowdata_training)
flowdata_test <- na.omit(flowdata_test)

In [None]:
# Train on the dataset
# Cross-validation resampling control; the positional 10 presumably binds
# to trainControl's fold-count argument - confirm against the caret docs.
# NOTE(review): `ctrl` is not passed to train() in the visible code.
ctrl <- trainControl(method="cv", 10)
# Seed the RNG before training
set.seed(12345)

# NOTE(review): this call looks unfinished - neither `Label` (as a
# standalone object) nor `flowdata_` is defined in the visible code, and
# no model method is given. Confirm the intended formula, data and method.
flow_model_1 <- train(Label , flowdata_)