model_nb.Rmd



## Building the model

```{r}
library(randomForest)
library(caret)
library(e1071)

```

        
## Create one more temporary table, `kddcopy2`, whose result column will be renamed to `label`

The `nearZeroVar()` function returns the list of attributes that have almost zero variance.

```{r}
# New approach: detect the near-zero-variance features on a copy of kddcopy.
kddcopy2 <- kddcopy

# Column positions flagged by nearZeroVar().
nzvcol <- nearZeroVar(x = kddcopy2)
nzvcol
# Output recorded from an earlier run:
# [1]  1  6  7  8  9 10 11 13 14 15 16 17 18 19 20 21 22 29 32

# Names of the flagged columns.
colnames(kddcopy2)[nzvcol]
```
## These columns have near-zero variance
colnames(kddcopy2[nzvcol])
 [1] "duration"           "dst_bytes"          "land"               "wrong_fragment"    
 [5] "urgent"             "hot"                "num_failed_logins"  "num_compromised"   
 [9] "root_shell"         "su_attempted"       "num_root"           "num_file_creations"
[13] "num_shells"         "num_access_files"   "num_outbound_cmds"  "is_hot_login"      
[17] "is_guest_login"     "same_srv_rate"      "dst_host_count" 


`train_raw` holds all the features that do not have near-zero variance.
```{r}
# Keep only the columns that nearZeroVar() did not flag.
# Select the positions to keep instead of using `-nzvcol`: when `nzvcol` is
# empty, `kddcopy2[, -nzvcol]` returns zero columns rather than all of them.
# `drop = FALSE` keeps a data frame even if only one column survives.
keep_cols <- setdiff(seq_along(kddcopy2), nzvcol)
train_raw <- kddcopy2[, keep_cols, drop = FALSE]
#train_raw

# Rename the last remaining column to "label".
names(train_raw)[ncol(train_raw)] <- "label"
colnames(train_raw)

```
The remaining, important columns:


 [1] "protocol_type"               "service"                     "flag"                       
 [4] "src_bytes"                   "logged_in"                   "count"                      
 [7] "srv_count"                   "serror_rate"                 "srv_serror_rate"            
[10] "rerror_rate"                 "srv_rerror_rate"             "diff_srv_rate"              
[13] "srv_diff_host_rate"          "dst_host_srv_count"          "dst_host_same_srv_rate"     
[16] "dst_host_diff_srv_rate"      "dst_host_same_src_port_rate" "dst_host_srv_diff_host_rate"
[19] "dst_host_serror_rate"        "dst_host_srv_serror_rate"    "dst_host_rerror_rate"       
[22] "dst_host_srv_rerror_rate"    "result" 

# Create the temporary variable `training2`

```{r}
# Copy train_raw into training2 and store its label column as a factor.
training2 <- train_raw
training2[["label"]] <- factor(training2[["label"]])

# Keep the dimensions of training2 in `d` and print them.
(d <- dim(training2))

```


Read in the test data and store it in `test_raw`.

```{r}
# Show the test data (`test_raw` must already exist in the session).
test_raw

# Assign the 42 column names to the test data, in column order; the final
# column is "result".
colnames(test_raw) <- c(
  "duration",
  "protocol_type",
  "service",
  "flag",
  "src_bytes",
  "dst_bytes",
  "land",
  "wrong_fragment",
  "urgent",
  "hot",
  "num_failed_logins",
  "logged_in",
  "num_compromised",
  "root_shell",
  "su_attempted",
  "num_root",
  "num_file_creations",
  "num_shells",
  "num_access_files",
  "num_outbound_cmds",
  "is_hot_login",
  "is_guest_login",
  "count",
  "srv_count",
  "serror_rate",
  "srv_serror_rate",
  "rerror_rate",
  "srv_rerror_rate",
  "same_srv_rate",
  "diff_srv_rate",
  "srv_diff_host_rate",
  "dst_host_count",
  "dst_host_srv_count",
  "dst_host_same_srv_rate",
  "dst_host_diff_srv_rate",
  "dst_host_same_src_port_rate",
  "dst_host_srv_diff_host_rate",
  "dst_host_serror_rate",
  "dst_host_srv_serror_rate",
  "dst_host_rerror_rate",
  "dst_host_srv_rerror_rate",
  "result"
)
                   
                   
```


# Process the data
```{r}
# Column names of the test data before processing.
names(test_raw)

# Simply rename the last column of both tables to "label" so that the class
# column has the same name in the test and training data.
names(test_raw)[ncol(test_raw)] <- "label"
names(training2)[ncol(training2)] <- "label"

# Restrict the test data to the columns present in training2 (same order)
# and keep the result in the temporary variable `testing`.
test_raw <- test_raw[, names(training2)]
testing <- test_raw

# View() opens a data viewer and is only usable in an interactive session,
# so it is skipped when the document is knitted non-interactively.
if (interactive()) {
  View(testing)
}

# Store the class column of the test data as a factor.
testing$label <- as.factor(testing$label)

```


# Building the model using Naive Bayes
# First model: built on only 5 columns


```{r}
# Naive Bayes model on the first 5 columns of training2: fit on the first
# `n_train` rows and evaluate on the remaining rows.
n_train <- 100000
stopifnot(nrow(training2) > n_train)
train_idx <- seq_len(n_train)
# The hold-out starts at n_train + 1 so that no row is used both for fitting
# and for evaluation, and ends at the actual last row of training2.
holdout_idx <- (n_train + 1):nrow(training2)

# The class column of training2 is named "label" (its last column is renamed
# from "result" earlier in this notebook), so "result" cannot be indexed here.
label_result <- training2[train_idx, "label"]
training_data <- training2[train_idx, 1:5]

#View(training_data)

navie_bayes_tree_model <- naiveBayes(
  x = training_data,
  y = as.factor(label_result)
)

# Predict the held-out rows.
testing_data <- training2[holdout_idx, ]
navie_bayes_pred <- predict(navie_bayes_tree_model, testing_data)

# True classes of the held-out rows; the predictions are re-levelled to the
# same levels so both factors can be compared and cross-tabulated.
golden_answer <- training2[holdout_idx, "label"]
navie_bayes_pred <- factor(navie_bayes_pred, levels = levels(golden_answer))

# Confusion table (rows: predicted, columns: actual), then row percentages.
right <- navie_bayes_pred == golden_answer
answer <- table(navie_bayes_pred, golden_answer)
answer
round(prop.table(answer, 1) * 100, 2)

# Accuracy: share of matching predictions; NA comparisons are dropped.
NB_accuracy <- mean(right, na.rm = TRUE)
NB_accuracy
```
round(prop.table(answer,1)*100,2)
                golden_answer
navie_bayes_pred   dos normal probe   r2l   u2r
          dos    76.20  12.32 11.27  0.21  0.01
          normal  1.57  96.55  0.52  1.34  0.02
          probe   0.00  10.67 89.33  0.00  0.00
          r2l                                  
          u2r     0.00  90.35  1.32  6.14  2.19
          
> NB_accuracy
[1] 0.8588974          

```{r}
# Naive Bayes model on all predictor columns of training2: fit on the first
# `n_train` rows and evaluate on the remaining rows.
n_train <- 100000
stopifnot(nrow(training2) > n_train)
train_idx <- seq_len(n_train)
# The hold-out starts at n_train + 1 so that no row is used both for fitting
# and for evaluation, and ends at the actual last row of training2.
holdout_idx <- (n_train + 1):nrow(training2)

# The class column of training2 is named "label" (its last column is renamed
# from "result" earlier in this notebook), so "result" cannot be indexed here.
label_result <- training2[train_idx, "label"]

# Exclude the class column from the predictors: fitting on every column of
# training2 would feed the answer itself to the model (target leakage).
predictor_cols <- names(training2) != "label"
training_data <- training2[train_idx, predictor_cols, drop = FALSE]

#View(training_data)

navie_bayes_tree_model <- naiveBayes(
  x = training_data,
  y = as.factor(label_result)
)

# Predict the held-out rows.
testing_data <- training2[holdout_idx, ]
navie_bayes_pred <- predict(navie_bayes_tree_model, testing_data)

# True classes of the held-out rows; the predictions are re-levelled to the
# same levels so both factors can be compared and cross-tabulated.
golden_answer <- training2[holdout_idx, "label"]
navie_bayes_pred <- factor(navie_bayes_pred, levels = levels(golden_answer))

# Confusion table (rows: predicted, columns: actual), then row percentages.
right <- navie_bayes_pred == golden_answer
answer <- table(navie_bayes_pred, golden_answer)
answer
round(prop.table(answer, 1) * 100, 2)

# Accuracy: share of matching predictions; NA comparisons are dropped.
NB_accuracy <- mean(right, na.rm = TRUE)
NB_accuracy
```

            golden_answer
navie_bayes_pred   dos normal probe   r2l   u2r
          dos     8898    198    19     1     0
          normal   299  11505    62     3     0
          probe     71    576  2114    18     0
          r2l       75     35     0    33     0
          u2r      124   1584   198   152     9
> round(prop.table(answer,1)*100,2)
                golden_answer
navie_bayes_pred   dos normal probe   r2l   u2r
          dos    97.61   2.17  0.21  0.01  0.00
          normal  2.52  96.93  0.52  0.03  0.00
          probe   2.55  20.73 76.07  0.65  0.00
          r2l    52.45  24.48  0.00 23.08  0.00
          u2r     6.00  76.63  9.58  7.35  0.44
> 
> 
> NB_accuracy <- mean(golden_answer == navie_bayes_pred,na.rm = TRUE)
> NB_accuracy
[1] 0.8685224

Evaluation on the `test.final` dataset:
```{r}
# Naive Bayes model fitted on all rows of training2 and evaluated on the
# separate `test.final` data.

# The class column of training2 is named "label" (its last column is renamed
# from "result" earlier in this notebook), so "result" cannot be indexed here.
label_result <- training2[, "label"]

# Exclude the class column from the predictors: fitting on every column of
# training2 would feed the answer itself to the model (target leakage).
predictor_cols <- names(training2) != "label"
training_data <- training2[, predictor_cols, drop = FALSE]

#View(training_data)

navie_bayes_tree_model <- naiveBayes(
  x = training_data,
  y = as.factor(label_result)
)

# Predict the test data.
# NOTE(review): `test.final` is not created in this part of the notebook; it
# is assumed to be a data frame holding the training predictors plus a
# "result" column with the true classes -- confirm.
testing_data <- test.final[, ]
navie_bayes_pred <- predict(navie_bayes_tree_model, testing_data)

# True classes, coerced to a factor so that levels() is never NULL; the
# predictions are re-levelled to the same levels for comparison.
golden_answer <- as.factor(test.final[, "result"])
navie_bayes_pred <- factor(navie_bayes_pred, levels = levels(golden_answer))

# Confusion table (rows: predicted, columns: actual), then row percentages.
right <- navie_bayes_pred == golden_answer
answer <- table(navie_bayes_pred, golden_answer)
answer
round(prop.table(answer, 1) * 100, 2)

# Accuracy: share of matching predictions; NA comparisons are dropped.
NB_accuracy <- mean(right, na.rm = TRUE)
NB_accuracy
```
round(prop.table(answer,1)*100,2)
                golden_answer
navie_bayes_pred   dos normal probe   r2l   u2r
          dos    98.60   0.99  0.02  0.33  0.06
          normal  7.43  86.63  0.00  5.94  0.00
          probe   5.39  13.79 80.02  0.80  0.00
          r2l    14.40  40.16  0.00 44.96  0.47
          u2r     1.56  28.76  9.73 58.49  1.46
> 
> 
> NB_accuracy <- mean(golden_answer == navie_bayes_pred,na.rm = TRUE)
> NB_accuracy
[1] 0.7811951