In [41]:
library(plyr)
library(snow)
library(dplyr)
library(ggplot2)
library(class)
library(Matrix)
library(gmodels)

In [2]:
# Start the snow worker pool used by the parallel apply calls in this notebook.
# NOTE(review): the worker count is hard-coded to 23 -- confirm it suits the host.
cluster <- makeCluster(spec = 23)

In [3]:
# Register a placeholder S4 class plus a character -> myPosixCt coercion so
# that read.csv() can parse the first column as POSIXct via colClasses.
setClass("myPosixCt")
setAs(
  "character", "myPosixCt",
  function(from) as.POSIXct(from, format = "%Y/%m/%d %H:%M:%OS")
)

# Show fractional seconds when timestamps are printed. "digits.secs" is the
# option R consults for this; the previous "set.seconds" entry is not an
# option R reads, so it had no effect.
options(digits.secs = 6)

# Load the capture: column 1 is the timestamp, columns 2 and 12-14 are
# numeric, and every other column (including column 15) is read as a factor.
flowdata_csv <- read.csv(
  "capture20110810.binetflow",
  colClasses = c(
    "myPosixCt", "numeric",
    "factor", "factor", "factor", "factor", "factor",
    "factor", "factor", "factor", "factor",
    "numeric", "numeric", "numeric",
    "factor"
  ),
  strip.white = TRUE,
  sep = ','
)

In [4]:
# Drop every row of `df` that contains at least one NA.
# Returns the filtered data frame as produced by na.omit().
clean_na <- function(df) {
  na.omit(df)
}

# NA-free copy of the raw flow table
clean_flowdata_csv <- clean_na(df = flowdata_csv)

In [35]:
# Time helpers and a short time-window subset of the cleaned flows

# Subset dataframe to the first 1 minute (relative to the first row's StartTime) for brevity
# Convert a number of hours `h` to seconds.
hours <- function(h) {
  h * 3600
}

# Convert a number of minutes `m` to seconds.
mins <- function(m) {
  m * 60
}

# Keep only the flows whose StartTime is no later than one minute after the
# StartTime of the first row of the cleaned table.
subset1 <- subset(
  clean_flowdata_csv,
  StartTime <= StartTime[1] + mins(1)
)

In [5]:
# Draw a reproducible split id (1 or 2) for every row of the raw flow table,
# with probability 0.67 for id 1 and 0.33 for id 2.
set.seed(1234)
ind <- sample.int(
  2,
  size = nrow(flowdata_csv),
  replace = TRUE,
  prob = c(0.67, 0.33)
)

In [36]:
# Draw a reproducible split id (1 or 2) for every row of subset1,
# with probability 0.67 for id 1 and 0.33 for id 2.
set.seed(1234)
ind <- sample.int(
  2,
  size = nrow(subset1),
  replace = TRUE,
  prob = c(0.67, 0.33)
)

# Min-max rescale a numeric vector to the range [0, 1].
#
# x: numeric vector; NA entries are ignored when finding the range and stay NA
#    in the result.
# Returns a vector the same length as x. An all-NA (or empty) input is
# returned unchanged, and a constant input maps to 0 instead of dividing by
# zero and producing NaN.
normalize <- function(x) {
  if (all(is.na(x))) {
    return(x)
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (isTRUE(span == 0)) {
    # Constant column: every non-missing value sits at the minimum
    return(x - lo)
  }
  (x - lo) / span
}

# Continuous flow features used as kNN predictors
cont_vars <- c("Dur", "TotPkts", "TotBytes", "SrcBytes")

# `ind` must carry exactly one split id per row of subset1; a vector sampled
# for a different data frame would silently assign rows to the wrong set.
if (length(ind) != nrow(subset1)) {
  stop(
    "`ind` has ", length(ind), " entries but subset1 has ", nrow(subset1),
    " rows; re-run the sampling step for subset1",
    call. = FALSE
  )
}

# Normalize the data: rescale each continuous column with normalize().
# Base R column assignment is used instead of the deprecated dplyr
# mutate_each_()/funs() pair.
subset1_norm <- subset1
subset1_norm[cont_vars] <- lapply(subset1[cont_vars], normalize)

# Drop incomplete rows, and drop the same positions from the split vector so
# that split ids stay aligned with the rows that remain.
complete_rows <- complete.cases(subset1_norm)
split_id <- ind[complete_rows]
subset1_norm <- subset1_norm[complete_rows, ]

# Divide Original dataframe into training and test sets

subset1_training <- subset1_norm[split_id == 1, cont_vars]
subset1_test <- subset1_norm[split_id == 2, cont_vars]

# Add labels as factors to dataframe for later cross validation
# (column 15 is the last column read as a factor by the loader)
subset1_trainLabels <- subset1_norm[split_id == 1, 15]
subset1_testLabels <- subset1_norm[split_id == 2, 15]

#impute.mean <- function(x) replace(x, is.na(x) | is.nan(x) | is.infinite(x), mean(x[!is.na(x) & !is.nan(x) & !is.infinite(x)]))

#subset1_training <- parApply(cluster, subset1_training, 2, impute.mean)
#subset1_test <- parApply(cluster, subset1_test, 2, impute.mean)

str(subset1_training)

'data.frame':	7885 obs. of  4 variables:
 $ Dur     : num  0.000285 0.00028 0.000849 0.000864 0.00086 ...
 $ TotPkts : num  1.40e-05 1.40e-05 9.31e-06 9.31e-06 9.31e-06 ...
 $ TotBytes: num  8.70e-07 8.70e-07 4.91e-07 4.91e-07 4.91e-07 ...
 $ SrcBytes: num  6.28e-07 6.28e-07 4.91e-07 4.91e-07 4.91e-07 ...


In [37]:
# Run kNN
# Time a kNN classification of the test flows against the training flows.
# NOTE(review): k = 89 is presumably ~sqrt(number of training rows) -- confirm.
system.time({
  flow_pred <- knn(
    train = subset1_training,
    test = subset1_test,
    cl = subset1_trainLabels,
    k = 89
  )
})

   user  system elapsed 
  0.489   0.004   0.492 


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  3897 

 
                                         | flow_pred 
                      subset1_testLabels |                        flow=Background |     flow=Background-Attempt-cmpgw-CVUT | flow=Background-Established-cmpgw-CVUT |        flow=Background-TCP-Established |            flow=Background-UDP-Attempt |        flow=Background-UDP-Established | flow=To-Background-UDP-CVUT-DNS-Server |                              Row Total | 
-----------------------------------------|----------------------------------------|----------------------------------------|----------------------------------------|----------------------------------------|----------------------------------------|----------------------------------------|----------------------------------------|---------------

In [6]:
# Min-max rescale a numeric vector to the range [0, 1].
#
# x: numeric vector; NA entries are ignored when finding the range and stay NA
#    in the result.
# Returns a vector the same length as x. An all-NA (or empty) input is
# returned unchanged, and a constant input maps to 0 instead of dividing by
# zero and producing NaN.
normalize <- function(x) {
  if (all(is.na(x))) {
    return(x)
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (isTRUE(span == 0)) {
    # Constant column: every non-missing value sits at the minimum
    return(x - lo)
  }
  (x - lo) / span
}

# Continuous flow features that get rescaled
cont_vars <- c("Dur", "TotPkts", "TotBytes", "SrcBytes")

# `ind` must carry exactly one split id per row of flowdata_csv; a vector
# sampled for a different data frame would silently assign rows to the wrong
# set.
if (length(ind) != nrow(flowdata_csv)) {
  stop(
    "`ind` has ", length(ind), " entries but flowdata_csv has ",
    nrow(flowdata_csv), " rows; re-run the sampling step for flowdata_csv",
    call. = FALSE
  )
}

# Normalize the data: rescale each continuous column with normalize().
# Base R column assignment is used instead of the deprecated dplyr
# mutate_each_()/funs() pair.
flowdata_csv_norm <- flowdata_csv
flowdata_csv_norm[cont_vars] <- lapply(flowdata_csv[cont_vars], normalize)

# Drop incomplete rows, and drop the same positions from the split vector so
# that split ids stay aligned with the rows that remain.
complete_rows <- complete.cases(flowdata_csv_norm)
split_id <- ind[complete_rows]
flowdata_csv_norm <- flowdata_csv_norm[complete_rows, ]

# Divide Original dataframe into training and test sets

flowdata_csv_training <- flowdata_csv_norm[split_id == 1, ]
flowdata_csv_test <- flowdata_csv_norm[split_id == 2, ]

# Add labels as factors to dataframe for later cross validation
# (column 15 is the last column read as a factor by the loader)
flowdata_trainLabels <- flowdata_csv_norm[split_id == 1, 15]
flowdata_testLabels <- flowdata_csv_norm[split_id == 2, 15]

# The splits are taken from an already NA-free table, so no further na.omit()
# pass over them is needed; labels and rows stay the same length.

In [None]:
# Time a kNN classification of the full-capture test flows.
# knn() computes distances on numeric features, so only the continuous
# columns listed in cont_vars are passed; the complete data frames also hold
# factor and timestamp columns that cannot be used as distances.
# NOTE(review): k = 1375 is presumably ~sqrt(number of training rows) -- confirm.
system.time(
  flow_pred <- knn(
    train = flowdata_csv_training[, cont_vars],
    test = flowdata_csv_test[, cont_vars],
    cl = flowdata_trainLabels,
    k = 1375
  )
)

In [None]:
# Scatter plot of total packets against total bytes, coloured by flow label.
# Drawn with ggplot2, which this notebook loads; the earlier ggvis call relied
# on a package that is never attached here.
ggplot(subset1, aes(x = TotPkts, y = TotBytes, colour = Label)) +
  geom_point()

In [None]:
# Bar chart of the share of each Label value, as a percentage rounded to one
# decimal place. local() keeps the intermediate tables out of the workspace.
local({
  label_share <- prop.table(table(flowdata_csv$Label))
  label_pct <- round(label_share * 100, digits = 1)
  barplot(label_pct, las = 1, cex.names = 0.2)
})

In [None]:
# Preview the first ten rows of the raw flow table
head(x = flowdata_csv, n = 10)

In [None]:
# Define a normalization function for continuous vars:
# min-max rescale a numeric vector to the range [0, 1].
#
# x: numeric vector; NA entries are ignored when finding the range and stay NA
#    in the result.
# Returns a vector the same length as x. An all-NA (or empty) input is
# returned unchanged, and a constant input maps to 0 instead of dividing by
# zero and producing NaN.
normalize <- function(x) {
  if (all(is.na(x))) {
    return(x)
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (isTRUE(span == 0)) {
    # Constant column: every non-missing value sits at the minimum
    return(x - lo)
  }
  (x - lo) / span
}

# One hot encode the categorical vars
# Step 1: build one intercept-free, single-term formula per column name.
# NOTE(review): formulas are built for every column of flowdata_csv, not only
# the factor columns -- confirm that is intended.
fList <- parLapply(cluster, names(flowdata_csv), reformulate, intercept = FALSE)

# Step 2: expand each formula into a sparse model matrix on the workers.
# parLapply() expects the cluster as its first argument; it was missing here,
# which made the formula list stand in for the cluster.
mList <- parLapply(cluster, fList, sparse.model.matrix, data = flowdata_csv)