# Traffic Classification using k Nearest Neighbor

# Background

## Motivation

I am currently working on feature extraction and traffic identification using a simple k-nearest-neighbor (kNN) classification algorithm over a data set.

### Preliminary Information

We're using a dataset that has been pre-captured, cleaned and labeled by an ongoing effort found here : https://stratosphereips.org/category/dataset.html

They're currently working on a machine learning effort for malware classification, much like the example we're showing here.

The dataset currently contains typical NetFlow capture information, including time stamps, flow durations, source/destination IP addresses, source/destination ports, and other flow-related information. The key property of the provided dataset is that it is labeled. We can use these labels to train a model that identifies and classifies unlabeled flows as any of the labels provided in the dataset.

In [1]:
# Import libraries
library(caret)
library(pander)
library(doMC)
library(plyr)
library(dplyr)
library(Matrix)
library(data.table)
library(stringr)

Loading required package: lattice
Loading required package: ggplot2
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel

Attaching package: ‘dplyr’

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, last



In [2]:
# Read .binetflow file into dataframe

#flowdata_csv <- read.csv("capture20110810.binetflow", colClasses = c("myPosixCt", "numeric", "character", 
                                                                    #"character","character","character",
                                                                    #"character","character","character",
                                                                    #"character","character","numeric", 
                                                                    #"numeric", "numeric", "character"), 
                                                                    #strip.white = TRUE, sep = ',')

# One colClasses entry per column of the capture (15 columns).
# StartTime is read as plain text and converted to POSIXct right below:
# "myPosixCt" is not a class defined anywhere in this notebook, so it cannot
# be relied on to parse the timestamps.
flowdata_csv <- fread("capture20110810.binetflow",
                      colClasses = c("character", "numeric", "character",
                                     "character", "character", "character",
                                     "character", "character", "character",
                                     "character", "character", "numeric",
                                     "numeric", "numeric", "character"),
                      sep = 'auto')

# Set POSIX formatting for StartTime

# digits.secs is the R option that controls how many fractional-second digits
# are shown for date-times (the previous "set.seconds" option does not exist,
# so setting it had no effect).
options(digits.secs = 6)
flowdata_csv$StartTime <- as.POSIXct(flowdata_csv$StartTime, format = "%Y/%m/%d %H:%M:%OS")

# TODO: trim leading and trailing whitespace (Dir still carries padding such
# as "   ->"); for now only inspect the parsed structure.

str(flowdata_csv)


Read 2824636 rows and 15 (of 15) columns from 0.360 GB file in 00:00:13
Classes ‘data.table’ and 'data.frame':	2824636 obs. of  15 variables:
 $ StartTime: POSIXct, format: "2011-08-10 09:46:59" "2011-08-10 09:47:00" ...
 $ Dur      : num  1.03 1.01 3.06 3.11 3.08 ...
 $ Proto    : chr  "tcp" "tcp" "tcp" "tcp" ...
 $ SrcAddr  : chr  "94.44.127.113" "94.44.127.113" "147.32.86.89" "147.32.86.89" ...
 $ Sport    : chr  "1577" "1577" "4768" "4788" ...
 $ Dir      : chr  "   ->" "   ->" "   ->" "   ->" ...
 $ DstAddr  : chr  "147.32.84.59" "147.32.84.59" "77.75.73.33" "77.75.73.33" ...
 $ Dport    : chr  "6881" "6881" "80" "80" ...
 $ State    : chr  "S_RA" "S_RA" "SR_A" "SR_A" ...
 $ sTos     : chr  "0" "0" "0" "0" ...
 $ dTos     : chr  "0" "0" "0" "0" ...
 $ TotPkts  : num  4 4 3 3 3 3 4 4 4 5 ...
 $ TotBytes : num  276 276 182 182 182 182 244 252 252 352 ...
 $ SrcBytes : num  156 156 122 122 122 122 124 132 132 208 ...
 $ Label    : chr  "flow=Background-Established-cmpgw-CVUT" "flow=B

In [None]:
# Register the parallel backend used by caret/foreach.
# Ask for up to 23 workers (the original setting) but never more than the
# machine offers, leaving one core free for the main R session.
available_cores <- parallel::detectCores()
if (is.na(available_cores)) {
    # detectCores() can return NA when the count is unknown
    available_cores <- 1L
}
registerDoMC(cores = max(1L, min(23L, available_cores - 1L)))

# Percentage frequency table helper.
#   x       : vector (typically a factor) whose values are tabulated
#   caption : accepted for call compatibility; not used by the computation
# Returns a table holding each value's share of x in percent, rounded to
# three decimals.
frqtab <- function(x, caption) {
    counts <- table(x)
    shares <- prop.table(counts)
    round(shares * 100, digits = 3)
}

# Utility function to round values in a list
# but only if they are numeric
#   lst      : list (or vector) of arbitrary elements
#   decimals : number of decimal places to keep (default 2)
# Returns a list of the same length and names as lst; numeric elements are
# rounded, everything else is passed through untouched.

round_numeric <- function(lst, decimals=2) {
    # lapply (the previous "lappy" was a typo and made every call fail)
    lapply(lst, function(x) {
        if (is.numeric(x)) {
            round(x, decimals)
        } else {
            x
        }
    })
}

# Utility function for model comparison
#   cm  : result of caret::confusionMatrix()
#   fit : result of caret::train() for a kNN model
# Returns a list with the chosen k, the tuning metric and its resampled
# value at that k, the four confusion-table cells and the headline
# statistics, with numeric entries rounded by round_numeric().
#
# NOTE(review): the cell positions and the byClass look-ups are written for
# a two-class confusion matrix whose second level is the "positive" class --
# confirm before using it with a multi-class Label.

summod <- function(cm, fit) {
    best_k <- fit$finalModel$k
    summ <- list(k = best_k,
                metric = fit$metric,
                # row of the resampling results that belongs to the final k
                # (was fit$resultes, a typo that always selected nothing)
                value = fit$results[fit$results$k == best_k, fit$metric],
                TN = cm$table[1,1], # True negatives
                TP = cm$table[2,2], # True positives
                FN = cm$table[1,2], # False negatives
                FP = cm$table[2,1], # False positives
                acc = cm$overall["Accuracy"], 
                sens = cm$byClass["Sensitivity"],
                spec = cm$byClass["Specificity"],
                # caret names these entries "Pos Pred Value" / "Neg Pred Value"
                PPV = cm$byClass["Pos Pred Value"],
                NPV = cm$byClass["Neg Pred Value"])
    round_numeric(summ)
}

# Min-max scaling helper.
#   x : numeric vector
# Returns x shifted and scaled so that its smallest value maps to 0 and its
# largest to 1. A constant vector yields NaN (0 / 0).

normalize <- function(x){
    lowest <- min(x)
    spread <- max(x) - lowest
    (x - lowest) / spread
}

#Function to timeslice the data however user would like
#   df       : data frame / data.table with a POSIXct StartTime column
#   slice    : unit of the window, one of 'secs', 'mins', 'hours', 'days'
#   interval : number of units to keep
# Returns the rows of df whose StartTime is no later than the StartTime of
# the first row plus the requested window. An unknown unit is an error
# (previously a message string was printed and returned in place of the
# data, which only failed later in the caller).

timeslice <- function(df, slice, interval) {
    if (!is.character(slice) || length(slice) != 1L || is.na(slice)) {
        stop("Please enter a valid time interval.", call. = FALSE)
    }

    # Seconds per unit; NULL marks an unsupported unit
    seconds_per_unit <- switch(slice,
                               secs = 1,
                               mins = 60,
                               hours = 3600,
                               days = 86400,
                               NULL)
    if (is.null(seconds_per_unit)) {
        stop("Please enter a valid time interval.", call. = FALSE)
    }

    # The window is anchored on the first row, not on the minimum StartTime
    cutoff <- df$StartTime[1] + (interval * seconds_per_unit)
    subset(df, df$StartTime <= cutoff)
}


In [None]:
# Subset the capture by time

# Keep only the flows that start within the first ten minutes, counted from
# the StartTime of the first row
flowdata_slice <- timeslice(df = flowdata_csv, slice = "mins", interval = 10)

# Show the structure of the reduced table
str(flowdata_slice)


In [None]:
# Factorize columns in dataframe
# Protocol, addresses and ports are read as character columns; each one is
# converted to a factor so that it has one level per distinct value.

flowdata_slice$Proto <- as.factor(flowdata_slice$Proto)
flowdata_slice$SrcAddr <- as.factor(flowdata_slice$SrcAddr)
flowdata_slice$Sport <- as.factor(flowdata_slice$Sport)
flowdata_slice$DstAddr <- as.factor(flowdata_slice$DstAddr)
flowdata_slice$Dport <- as.factor(flowdata_slice$Dport)

# Confirm the new column types
str(flowdata_slice)

In [None]:
# One hot encode categorical vars
# Only the destination-port encoding is written out here; the other
# encodings are kept below as commented-out experiments.
#one_hot_proto <- model.matrix(~flowdata_slice$Proto)
#one_hot_src_addr <- model.matrix(~flowdata_slice$SrcAddr)
#one_hot_dst_addr <- model.matrix(~flowdata_slice$DstAddr)
#one_hot_d_port <- model.matrix(~flowdata_slice$Dport)
#one_hot_s_port <- model.matrix(~flowdata_slice$Sport)

#write.csv(model.matrix(~flowdata_slice$Proto), file="one_hot_proto.csv")
#write.csv(model.matrix(~flowdata_slice$SrcAddr), file="one_hot_src_addr.csv")
#write.csv(model.matrix(~flowdata_slice$Sport), file="one_hot_s_port.csv")
#write.csv(model.matrix(~flowdata_slice$DstAddr), file="one_hot_dst_addr.csv")
# NOTE(review): with the default contrasts model.matrix() emits an intercept
# column plus one indicator per level except the first -- confirm that this
# layout (rather than a full one-hot matrix) is what downstream code expects.
write.csv(model.matrix(~flowdata_slice$Dport), file="one_hot_d_port.csv")

#flowdata_one_hot <- cbind(flowdata_slice$Dur, one_hot_proto, one_hot_src_addr, one_hot_s_port, one_hot_dst_addr,
                          #one_hot_dst_port, flowdata_slice$TotPkts, flowdata_slice$TotBytes, flowdata_slice$SrcBytes,
                         #flowdata_slice$Label)

#str(flowdata_one_hot)
# Rows x columns of the sliced table
dim(flowdata_slice)
# Number of distinct source addresses, i.e. how wide its encoding would be.
# (dim() on a single column is always NULL, so it reported nothing.)
nlevels(flowdata_slice$SrcAddr)
            

In [None]:
# Load the destination-address encoding back from disk.
# NOTE(review): the write.csv() call that would create this file is
# commented out in the encoding cell -- the file must already exist.
one_hot_dst_addr <- fread(input = "one_hot_dst_addr.csv", sep = "auto")

# Show the structure of the loaded table
str(one_hot_dst_addr)

In [None]:
# Assemble the model table: duration, the encoded categorical columns, the
# packet/byte counters and the label, bound column-wise.
# NOTE(review): one_hot_proto, one_hot_src_addr, one_hot_s_port and
# one_hot_d_port are only assigned in commented-out lines of the encoding
# cell, so this cell fails unless they were created some other way.
flowdata_one_hot <- cbind(flowdata_slice$Dur, one_hot_proto, one_hot_src_addr, one_hot_s_port, one_hot_dst_addr,
                          one_hot_d_port, flowdata_slice$TotPkts, flowdata_slice$TotBytes, flowdata_slice$SrcBytes,
                         flowdata_slice$Label)

# Show the structure of the combined table
str(flowdata_one_hot)

In [None]:
#Define continuous vars, subset flowdata and save as CSV

# contvars is TRUE for the columns to DROP (timestamp and categorical
# fields); what remains are the continuous measurements plus Label.
contvars <- names(flowdata_csv) %in% c("StartTime", "Proto", "SrcAddr", "Sport", "Dir", "DstAddr", "Dport", "State", "sTos", "dTos")

# flowdata_csv comes from fread() and is therefore a data.table, where a
# bare logical index selects rows rather than columns. Convert to a plain
# data.frame and index the columns explicitly so the selection is
# unambiguous.
flowdata_conts <- as.data.frame(flowdata_csv)[, !contvars, drop = FALSE]

str(flowdata_conts)

# Write out to csv file for persistence

write.csv(flowdata_conts, file='flowdata_conts.csv')

In [None]:
# Normalize the data

cont_vars <- c("Dur", "TotPkts", "TotBytes", "SrcBytes")

# Rescale every continuous column with normalize(). Done with plain column
# assignment instead of dplyr's mutate_each_()/funs(), which are deprecated,
# so the cell does not depend on the installed dplyr version.
for (cont_var in cont_vars) {
    flowdata_conts[[cont_var]] <- normalize(flowdata_conts[[cont_var]])
}

# Clean flowdata_conts, totally hacky but dataframe transforms are crazy fast and scale well
# After min-max scaling a value of 0 marks the column minimum, so each
# filter drops the rows holding the smallest observed value of that column.
# NOTE(review): confirm that removing the column minima (rather than raw
# zeros) is the intended cleaning step.

flowdata_conts <- flowdata_conts[!(flowdata_conts$Dur == 0),]
flowdata_conts <- flowdata_conts[!(flowdata_conts$TotPkts == 0),]
flowdata_conts <- flowdata_conts[!(flowdata_conts$TotBytes == 0),]
flowdata_conts <- flowdata_conts[!(flowdata_conts$SrcBytes == 0),]

In [None]:
# Turn the label column into a factor so it can serve as the class variable

flowdata_conts$Label <- as.factor(flowdata_conts$Label)

# Fix the random seed so the split below is reproducible

set.seed(1234)

# Randomly assign every row to partition 1 (training, probability 0.67)
# or partition 2 (test, probability 0.33)

ind <- sample(
    2,
    size = nrow(flowdata_conts),
    replace = TRUE,
    prob = c(0.67, 0.33)
)

flowdata_training <- flowdata_conts[ind == 1, ]
flowdata_test <- flowdata_conts[ind == 2, ]

#flowdata_training_classes <- flowdata_conts[ind==1,5]
#flowdata_test_classes <- flowdata_conts[ind==2,5]

In [None]:
# Display label distribution in datasets

# Percentage share of every label in the full, training and test data
ft_orig <- frqtab(flowdata_conts$Label)
ft_train <- frqtab(flowdata_training$Label)
ft_test <- frqtab(flowdata_test$Label)

# Table for the full data set on its own
label_freq <- pander(
    ft_orig,
    style = "rmarkdown",
    caption = "Original Label Frequency (%)"
)

# Side-by-side comparison of the three distributions
ftcmp_df <- as.data.frame(cbind(ft_orig, ft_train, ft_test))
colnames(ftcmp_df) <- c("Original", "Training Set", "Test Set")
pander(
    ftcmp_df,
    style = "rmarkdown",
    caption = "Comparison of Label frequencies ( in %)"
)

In [None]:
# Drop rows containing NA values from both partitions
flowdata_training <- na.omit(flowdata_training)
flowdata_test <- na.omit(flowdata_test)

set.seed(123)

# Seeds handed to caret so resampling is reproducible when run in parallel:
# 50 vectors of 22 integers for the resampling iterations ...

seeds <- vector(mode = "list", length = 51)
for (i in seq_len(50)) {
    seeds[[i]] <- sample.int(1000, 22)
}

# ... plus a single integer for the final model fit

seeds[[51]] <- sample.int(1000, 1)

# Repeated cross-validation, three repeats, using the seeds above

ctrl <- trainControl(
    method = "repeatedcv",
    repeats = 3,
    seeds = seeds
)

# Fit kNN on all predictors (scaled, centered, PCA-transformed), trying ten
# candidate values of k, and report how long the fit takes

system.time({
    knnFit1 <- train(
        Label ~ .,
        data = flowdata_training,
        method = "knn",
        trControl = ctrl,
        tuneLength = 10,
        preProcess = c("scale", "center", "pca")
    )
})

# Output stats TODO : convert to markdown for web / slide view

knnFit1

In [None]:
# Plot the fitted train object (see ?plot.train; presumably the resampled
# performance across the candidate k values)
plot(knnFit1)

In [None]:
# Predict a label for every test row with the fitted model, timing the call
system.time({
    knnPredict1 <- predict(knnFit1, newdata = flowdata_test)
})

In [None]:
# Plot the predictions (presumably a bar chart of how often each label was
# predicted, since predict() on a classifier returns a factor -- confirm)
plot(knnPredict1)

In [None]:
# Calculate confusion matrix for prediction accuracy:
# predicted labels first, true test-set labels second
cmat1 <- confusionMatrix(knnPredict1, flowdata_test$Label)

In [None]:
# Print the confusion matrix and its summary statistics
cmat1

In [None]:
# render plot
# ggplot needs a data frame, but cmat1 is a confusionMatrix object, so first
# flatten its cross-tabulation into one row per (Predicted, Actual) pair.
# caret stores predictions in the first dimension and the reference (actual)
# labels in the second.
cm_df <- as.data.frame(cmat1$table)
names(cm_df) <- c("Predicted", "Actual", "Freq")
# Share of all test cases that falls into each cell, in percent
cm_df$Percent <- 100 * cm_df$Freq / sum(cm_df$Freq)

# we use three different layers
# first we draw tiles and fill color based on percentage of test cases
tile <- ggplot() +
    geom_tile(aes(x = Actual, y = Predicted, fill = Percent),
              data = cm_df, color = "black", size = 0.1) +
    labs(x = "Actual", y = "Predicted")

# next we print the percentage inside every tile and set the fill gradient
tile <- tile +
    geom_text(aes(x = Actual, y = Predicted, label = sprintf("%.1f", Percent)),
              data = cm_df, size = 3, colour = "black") +
    scale_fill_gradient(low = "grey", high = "red")

# lastly we draw diagonal tiles. We use alpha = 0 so as not to hide previous layers but use size=0.3 to highlight border
diagonal_cells <- cm_df[as.character(cm_df$Actual) == as.character(cm_df$Predicted), ]
tile <- tile +
    geom_tile(aes(x = Actual, y = Predicted),
              data = diagonal_cells, color = "black", size = 0.3,
              fill = "black", alpha = 0)

#render
tile