In [1]:
import pandas as pd
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [2]:
# read csv
# https://research.unsw.edu.au/projects/unsw-nb15-dataset
# According to the website above csv is wrongly saved, so I change the name
# The number of records in the training set is 175,341 records and the testing set is 82,332 records from the different types, attack and normal.
df = pd.read_csv('../../Data/UNSW_NB15_testing-set.csv')

In [3]:
# fix seed
# Pytorch
torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [4]:
# nominal to numeric of data
# proto                 object
# service               object
# state                 object

# proto to numeric
# proto_mapping = {'xxx':2, 'xxx':1, 'xxx':0}
# data['proto'] = data['proto'].map(proto_mapping)

# proto to numeric
proto_le = LabelEncoder()
df['proto'] = proto_le.fit_transform(df['proto'])
 
# service to numeric
service_le = LabelEncoder()
df['service'] = service_le.fit_transform(df['service'])

# state to numeric
state_le = LabelEncoder()
df['state'] = state_le.fit_transform(df['state'])

# nominal to numeric of data
# attack_cat            object

# target to numeric
attack_cat_le = LabelEncoder()
df['attack_cat'] = attack_cat_le.fit_transform(df['attack_cat'])

df.head(10)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,113,0,2,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,6,0
1,2,0.649902,113,0,2,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,6,0
2,3,1.623129,113,0,2,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,6,0
3,4,1.681642,113,3,2,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,6,0
4,5,0.449454,113,0,2,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,6,0
5,6,0.380537,113,0,2,10,6,534,268,39.41798,...,1,40,0,0,0,2,39,0,6,0
6,7,0.637109,113,0,2,10,8,534,354,26.683033,...,1,40,0,0,0,1,39,0,6,0
7,8,0.521584,113,0,2,10,8,534,354,32.593026,...,1,40,0,0,0,3,39,0,6,0
8,9,0.542905,113,0,2,10,8,534,354,31.313031,...,1,40,0,0,0,3,39,0,6,0
9,10,0.258687,113,0,2,10,6,534,268,57.985135,...,1,40,0,0,0,3,39,0,6,0


In [5]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=0)

In [6]:
#df_train.dtypes
df_train.shape

(122738, 45)

In [7]:
data_train = df_train.drop(['id', 'label', 'attack_cat'], axis=1)
label_train = df_train.iloc[:,-1]
data_train.head(10)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
41479,0.581798,113,0,2,10,6,588,268,25.78214,254,...,2,1,1,1,0,0,0,1,5,0
174119,9e-06,119,2,3,2,0,114,0,111111.1,254,...,15,15,15,31,0,0,0,15,31,0
39585,0.294033,113,4,2,8,12,424,8824,64.6186,31,...,11,1,1,5,0,0,0,3,6,0
23269,0.001059,119,2,0,2,2,146,178,2832.861,31,...,2,1,1,1,0,0,0,4,2,0
35227,1.433471,113,3,2,52,54,2934,3742,73.24878,31,...,4,1,1,5,1,1,0,6,2,0
157804,3e-06,119,2,3,2,0,114,0,333333.3,254,...,12,12,12,14,0,0,0,12,14,0
170646,1.434152,113,4,2,10,8,450,782,11.8537,62,...,2,1,1,2,0,0,0,4,1,0
102714,1.725872,113,5,2,10,10,798,1730,11.00893,62,...,1,1,1,1,0,0,1,1,1,0
138365,1e-06,119,2,3,2,0,114,0,1000000.0,254,...,16,16,16,18,0,0,0,16,18,0
7840,0.528823,113,9,2,52,42,37372,3380,175.8622,31,...,2,1,1,3,0,0,0,4,1,0


In [8]:
#min-max scaling
data_train_norm = (data_train - data_train.min()) / (data_train.max() - data_train.min())
data_train_norm = data_train_norm.fillna(0)

In [9]:
data_train_norm.shape
data_train_norm.head(10)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
41479,0.009696635,0.856061,0.0,0.285714,0.000936,0.000553,4.2e-05,1.8e-05,2.6e-05,0.996078,...,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065574,0.0
174119,1.5e-07,0.901515,0.166667,0.428571,0.000104,0.0,5e-06,0.0,0.111111,0.996078,...,0.28,0.28,0.311111,0.483871,0.0,0.0,0.0,0.237288,0.491803,0.0
39585,0.004900551,0.856061,0.333333,0.285714,0.000728,0.001106,2.9e-05,0.000609,6.5e-05,0.121569,...,0.2,0.0,0.0,0.064516,0.0,0.0,0.0,0.033898,0.081967,0.0
23269,1.765e-05,0.901515,0.166667,0.0,0.000104,0.000184,8e-06,1.2e-05,0.002833,0.121569,...,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.050847,0.016393,0.0
35227,0.02389119,0.856061,0.25,0.285714,0.005304,0.004977,0.000223,0.000258,7.3e-05,0.121569,...,0.06,0.0,0.0,0.064516,0.25,0.25,0.0,0.084746,0.016393,0.0
157804,5.000001e-08,0.901515,0.166667,0.428571,0.000104,0.0,5e-06,0.0,0.333333,0.996078,...,0.22,0.22,0.244444,0.209677,0.0,0.0,0.0,0.186441,0.213115,0.0
170646,0.02390254,0.856061,0.333333,0.285714,0.000936,0.000737,3.1e-05,5.4e-05,1.2e-05,0.243137,...,0.02,0.0,0.0,0.016129,0.0,0.0,0.0,0.050847,0.0,0.0
102714,0.02876454,0.856061,0.416667,0.285714,0.000936,0.000922,5.8e-05,0.000119,1.1e-05,0.243137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
138365,1.666667e-08,0.901515,0.166667,0.428571,0.000104,0.0,5e-06,0.0,1.0,0.996078,...,0.3,0.3,0.333333,0.274194,0.0,0.0,0.0,0.254237,0.278689,0.0
7840,0.008813718,0.856061,0.75,0.285714,0.005304,0.003871,0.002879,0.000233,0.000176,0.121569,...,0.02,0.0,0.0,0.032258,0.0,0.0,0.0,0.050847,0.0,0.0


In [10]:
train_X = torch.tensor(data_train_norm.values, dtype=torch.float32)
train_Y = torch.tensor(label_train.values, dtype=torch.long) 
train = TensorDataset(train_X, train_Y)

In [11]:
train_loader = DataLoader(train, batch_size=100, shuffle=True)
# drop_last = True

In [12]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(42, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 100)
        self.fc4 = nn.Linear(100, 2)
        self.bc1 = nn.BatchNorm1d(100)
        self.bc2 = nn.BatchNorm1d(100)
        self.bc3 = nn.BatchNorm1d(100)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bc1(x)
        x = F.relu(x) # ReLU: max(x, 0)
        x = self.fc2(x)
        x = self.bc2(x)
        x = F.relu(x) # ReLU: max(x, 0)
        x = self.fc3(x)
        x = self.bc3(x)
        x = F.relu(x) # ReLU: max(x, 0)
        x = self.fc4(x)
        # return F.log_softmax(x, dim=1)
        return x

model = Net()

In [13]:
print(torch.cuda.is_available())

True


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(device)

cuda


In [15]:
criterion = nn.CrossEntropyLoss()

In [16]:
#optimizer = torch.optim.SGD(model.parameters(), lr=0.03)
optimizer = torch.optim.Adam(model.parameters(), lr=0.03)

In [17]:
model.train()

Net(
  (fc1): Linear(in_features=42, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=100, bias=True)
  (fc4): Linear(in_features=100, out_features=2, bias=True)
  (bc1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bc2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bc3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [18]:
#Training the machine learning model
batch_loss_list=[]
for epoch in range(300): #learning 300 times
    #total_loss = 0
    model.train()
    loss_list=[]
    for train_x, train_y in train_loader:
        train_x, train_y = Variable(train_x), Variable(train_y)
        train_x = train_x.to(device)
        train_y = train_y.to(device)
        optimizer.zero_grad()
        output = model(train_x)
        loss = criterion(output, train_y)
        loss.backward()
        optimizer.step()
        loss_list.append(loss.data)
        #print('epoch {}, loss {}'.format(epoch, loss.item()))
    batch_loss = sum(loss_list)/len(train_loader)
    batch_loss_list.append(batch_loss.to('cpu').detach().numpy().tolist())

In [19]:
batch_loss_list

[0.15238837897777557,
 0.13725042343139648,
 0.13062803447246552,
 0.1265948861837387,
 0.12472110241651535,
 0.12473628669977188,
 0.12198960036039352,
 0.12027113884687424,
 0.1202104389667511,
 0.11896917968988419,
 0.11872575432062149,
 0.11795059591531754,
 0.11725261807441711,
 0.11676426231861115,
 0.11697987467050552,
 0.11647798866033554,
 0.11630872637033463,
 0.11592301726341248,
 0.1157388836145401,
 0.11504937708377838,
 0.11485551297664642,
 0.11515390127897263,
 0.114412322640419,
 0.11409960687160492,
 0.11397819221019745,
 0.11401350796222687,
 0.11313078552484512,
 0.11346150934696198,
 0.11269600689411163,
 0.11330129206180573,
 0.1120123341679573,
 0.11203563213348389,
 0.11192967742681503,
 0.11195848882198334,
 0.11200349777936935,
 0.11170470714569092,
 0.11122240126132965,
 0.1108516976237297,
 0.11143998056650162,
 0.11053893715143204,
 0.11064495146274567,
 0.1099187433719635,
 0.11022674292325974,
 0.11028346419334412,
 0.11071258783340454,
 0.110677868127822

In [20]:
#Calculating correct_rate using the training data
pred_list = list()
test_y_list = list()
with torch.no_grad():
    model.eval()
    correct = 0
    for test_x, test_y in train_loader:
        test_y_list.extend(test_y.to('cpu').detach().numpy().tolist())
        test_x, test_y = Variable(test_x), Variable(test_y)
        test_x = test_x.to(device)
        test_y = test_y.to(device)
        output = model(test_x)
        pred = torch.max(output.data, 1)[1]
        #print(pred)
        pred_list.extend(pred.to('cpu').detach().numpy().tolist())
        correct += pred.eq(test_y.data.view_as(pred)).sum() 
    # correct_rate
    data_num = len(train_loader.dataset)
    print('correct _rate: {}/{} ({:.0f}%)\n'.format(correct, data_num, 100. * correct / data_num))

correct _rate: 116750/122738 (95%)



In [21]:
testdata = df_test.drop(['id', 'label', 'attack_cat'], axis=1)
testlabel = df_test.iloc[:,-1]

#min-max scaling
testdata_norm = (testdata - testdata.min()) / (testdata.max() - testdata.min())
testdata_norm = testdata_norm.fillna(0)

testdata_norm.head(10)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
87395,0.01928715,0.856061,0.416667,0.25,0.000948,0.000729,7.4e-05,2.4e-05,1.5e-05,0.996078,...,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0
171525,6.666668e-08,0.901515,0.166667,0.375,0.000105,0.0,7e-06,0.0,0.25,0.996078,...,0.28,0.26,0.288889,0.203125,0.0,0.0,0.0,0.26,0.213115,0.0
100997,0.003536851,0.856061,0.0,0.0,0.000527,0.000182,7.6e-05,6e-06,3.3e-05,0.243137,...,0.06,0.06,0.0,0.09375,0.0,0.0,0.0,0.08,0.098361,0.0
106304,1.5e-07,0.909091,0.0,0.375,0.000105,0.0,1.4e-05,0.0,0.111111,0.996078,...,0.04,0.04,0.044444,0.046875,0.0,0.0,0.0,0.04,0.04918,0.0
170606,0.01929705,0.856061,0.416667,0.25,0.000948,0.000729,4e-05,2.4e-05,1.5e-05,0.996078,...,0.0,0.02,0.0,0.0,0.0,0.0,0.033333,0.36,0.0,0.0
113031,1.333334e-07,0.901515,0.0,0.375,0.000105,0.0,1.1e-05,0.0,0.125,0.996078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.016393,0.0
70458,1.333334e-07,0.022727,0.0,0.375,0.000105,0.0,1.4e-05,0.0,0.125,0.996078,...,0.02,0.02,0.022222,0.03125,0.0,0.0,0.0,0.04,0.032787,0.0
138025,1.5e-07,0.901515,0.166667,0.375,0.000105,0.0,7e-06,0.0,0.111111,0.996078,...,0.54,0.52,0.266667,0.40625,0.0,0.0,0.0,0.52,0.42623,0.0
135044,1.166667e-07,0.901515,0.166667,0.375,0.000105,0.0,7e-06,0.0,0.142857,0.996078,...,0.3,0.3,0.333333,0.375,0.0,0.0,0.0,0.32,0.393443,0.0
161530,5.000001e-08,0.901515,0.166667,0.375,0.000105,0.0,7e-06,0.0,0.333333,0.996078,...,0.06,0.06,0.066667,0.296875,0.0,0.0,0.0,0.06,0.311475,0.0


In [22]:
test_X = torch.tensor(testdata_norm.values, dtype=torch.float32)
test_Y = torch.tensor(testlabel.values, dtype=torch.long) 

test = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test, batch_size=100)

In [23]:
with torch.no_grad():
    model.eval()
    correct = 0
    pred_list = list()
    test_y_list = list()
    for test_x, test_y in test_loader:
        test_x, test_y = Variable(test_x), Variable(test_y)
        test_y_list.extend(test_y.to('cpu').detach().numpy().tolist())
        test_x = test_x.to(device)
        test_y = test_y.to(device)
        output = model(test_x)
        pred = torch.max(output.data, 1)[1]
        pred_list.extend(pred.to('cpu').detach().numpy().tolist())
        correct += pred.eq(test_y.data.view_as(pred)).sum() 
    # correct_rate
    data_num = len(test_loader.dataset)
    print('correct _rate: {}/{} ({:.0f}%)\n'.format(correct, data_num, 100. * correct / data_num))

correct _rate: 49682/52603 (94%)



In [24]:
accuracy_score(test_y_list, pred_list)

0.944470847670285

In [25]:
precision_score(test_y_list, pred_list)

0.9496020497165286

In [26]:
recall_score(test_y_list, pred_list)

0.970148422488931

In [27]:
f1_score(test_y_list, pred_list)

0.9597652860232234

In [28]:
cm = confusion_matrix(test_y_list, pred_list, labels=[0,1])
print(cm)

[[14843  1849]
 [ 1072 34839]]
