In [67]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, recall_score, precision_score, auc, precision_recall_curve, accuracy_score, f1_score, confusion_matrix, average_precision_score
from sklearn.metrics import confusion_matrix

In [8]:
dataset = pd.read_csv('dataset/capture-scenario10.pcap.netflow.labeled', delim_whitespace=True,skiprows=1,header=None)
dataset.columns = ["Date","Start","Duration","Protocol", "Source_IP","->","Destination_IP", "Flags","Tos","Packets"
                            ,"Bytes", "Flows","Label"]

# convert to datetime
dataset['Start'] = dataset['Date'] + ' ' + dataset['Start']
dataset['Start'] = pd.to_datetime(dataset['Start'])

# split port information
dataset['Source_Port'] = dataset['Source_IP'].apply(lambda x: x.split(":")[1] if len(x.split(":")) > 1 else None)
dataset['Source_IP'] = dataset['Source_IP'].apply(lambda x: x.split(":")[0])
dataset['Destination_Port'] = dataset['Destination_IP'].apply(lambda x: x.split(":")[1] if len(x.split(":")) > 1 else None)
dataset['Destination_IP'] = dataset['Destination_IP'].apply(lambda x: x.split(":")[0])

dataset.head()

Unnamed: 0,Date,Start,Duration,Protocol,Source_IP,->,Destination_IP,Flags,Tos,Packets,Bytes,Flows,Label,Source_Port,Destination_Port
0,2011-08-18,2011-08-18 10:19:13.328,0.002,TCP,147.32.86.166,->,212.24.150.110,FRPA_,0,4,321,1,Background,33426,25443
1,2011-08-18,2011-08-18 10:19:13.328,4.995,UDP,82.39.2.249,->,147.32.84.59,INT,0,617,40095,1,Background,41915,43087
2,2011-08-18,2011-08-18 10:19:13.329,4.996,UDP,147.32.84.59,->,82.39.2.249,INT,0,1290,1909200,1,Background,43087,41915
3,2011-08-18,2011-08-18 10:19:13.330,0.0,TCP,147.32.86.166,->,147.32.192.34,A_,0,1,66,1,Background,42020,993
4,2011-08-18,2011-08-18 10:19:13.330,0.0,TCP,212.24.150.110,->,147.32.86.166,FPA_,0,2,169,1,Background,25443,33426


In [9]:
# remove background flows 
dataset = dataset[dataset['Label'] != 'Background']

In [34]:
# sort by values and group by source_ip
dataset.sort_values(by='Start',inplace=True)
dataset.sort_values(by='Source_IP',inplace=True)

dataset['Source_Port'] = dataset['Source_Port'].astype('category')
dataset['Destination_Port'] = dataset['Destination_Port'].astype('category')
dataset.head()

Unnamed: 0,Date,Start,Duration,Protocol,Source_IP,->,Destination_IP,Flags,Tos,Packets,Bytes,Flows,Label,Source_Port,Destination_Port
2384616,2011-08-18,2011-08-18 12:41:18.621,0.0,TCP,10.10.20.233,->,147.32.80.13,FA_,0,1,66,1,LEGITIMATE,46315,80
2609166,2011-08-18,2011-08-18 12:53:50.639,4.042,TCP,10.10.20.233,->,147.32.80.13,FA_,0,5,330,1,LEGITIMATE,36301,80
3510251,2011-08-18,2011-08-18 13:40:12.130,3.747,TCP,10.10.20.233,->,147.32.80.13,FA_,0,5,330,1,LEGITIMATE,44255,80
1326222,2011-08-18,2011-08-18 11:40:26.372,0.0,TCP,10.10.20.233,->,147.32.80.13,FA_,0,1,66,1,LEGITIMATE,43960,80
71185,2011-08-18,2011-08-18 10:23:50.526,0.0,TCP,10.10.20.233,->,147.32.80.13,FA_,0,1,66,1,LEGITIMATE,52007,80


In [62]:
features = ['Duration', 'Protocol', 'Flags', 'Packets', 'Bytes']
discrete_features = ['Protocol', 'Flags']
dataset[discrete_features] = dataset[discrete_features].apply(LabelEncoder().fit_transform)

X = dataset[features].values
y = np.array([1 if x=="Botnet" else 0 for x in dataset['Label'].values])

array([0, 0, 0, ..., 0, 0, 0])

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)
print("Training Data")
print("Botnet: ", len([label for label in y_train if label == 1]))
print("Legitimate: ", len([label for label in y_train if label == 0]))
    
print("Test Data")
print("Botnet: ", len([label for label in y_test if label == 1]))
print("Legitimate: ", len([label for label in y_test if label == 0]))

Training Data
Botnet:  193935
Legitimate:  193279
Test Data
Botnet:  129506
Legitimate:  128638


In [65]:
# instantiate learning model
clf = RandomForestClassifier()

# fitting the model
clf.fit(X_train, y_train)

# predict the response
pred = clf.predict(X_test)

In [66]:
# evaluate performance
print("Random Forest")
print("Precision: ", precision_score(y_test, pred))
print("Recall: ", recall_score(y_test, pred))
print("F1 Score: ", f1_score(y_test, pred))
print("Accuracy: ", accuracy_score(y_test, pred))

Random Forest
Precision:  0.9990343656794872
Recall:  0.9906027519960465
F1 Score:  0.9948006932409011
Accuracy:  0.9948052249907029


In [68]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print("TP: ", tp)
print("FP: ", fp)
print("TN: ", tn)
print("FN: ", fn)

TP:  128289
FP:  124
TN:  128514
FN:  1217
