In [20]:
import pandas as pd
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [21]:
# Load the UNSW-NB15 partition used throughout this notebook.
# Dataset home page: https://research.unsw.edu.au/projects/unsw-nb15-dataset
# According to that page the partition CSVs were saved under the wrong names;
# the original author notes the file was renamed because of this.
# The page lists 175,341 records for the training set and 82,332 records for
# the testing set, each containing both attack and normal records.
# NOTE(review): the train/test split further down adds up to 175,341 rows, so
# this "testing-set" file appears to hold the training-set records -- confirm.
CSV_PATH = '../../Data/UNSW_NB15_testing-set.csv'
df = pd.read_csv(CSV_PATH)

In [22]:
# Pin the PyTorch random number generators so that weight initialisation and
# DataLoader shuffling are repeatable from run to run.
SEED = 0
torch.manual_seed(SEED)       # default (CPU) generator
torch.cuda.manual_seed(SEED)  # generator of the current CUDA device

In [23]:
# Convert the nominal (object-typed) columns to integer codes.
#   features: proto, service, state
#   target:   attack_cat
# Each column gets its own LabelEncoder, kept under its own name so that it
# remains available to later cells.
proto_le = LabelEncoder()
service_le = LabelEncoder()
state_le = LabelEncoder()
attack_cat_le = LabelEncoder()

for column_name, encoder in [
    ('proto', proto_le),
    ('service', service_le),
    ('state', state_le),
    ('attack_cat', attack_cat_le),
]:
    # Fit on the column's values and overwrite the column with the codes.
    df[column_name] = encoder.fit_transform(df[column_name])

df.head(10)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,113,0,2,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,6,0
1,2,0.649902,113,0,2,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,6,0
2,3,1.623129,113,0,2,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,6,0
3,4,1.681642,113,3,2,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,6,0
4,5,0.449454,113,0,2,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,6,0
5,6,0.380537,113,0,2,10,6,534,268,39.41798,...,1,40,0,0,0,2,39,0,6,0
6,7,0.637109,113,0,2,10,8,534,354,26.683033,...,1,40,0,0,0,1,39,0,6,0
7,8,0.521584,113,0,2,10,8,534,354,32.593026,...,1,40,0,0,0,3,39,0,6,0
8,9,0.542905,113,0,2,10,8,534,354,31.313031,...,1,40,0,0,0,3,39,0,6,0
9,10,0.258687,113,0,2,10,6,534,268,57.985135,...,1,40,0,0,0,3,39,0,6,0


In [24]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
)

In [25]:
# Sanity check: show the (rows, columns) size of the training split.
#df_train.dtypes
df_train.shape

(122738, 45)

In [26]:
# Training features: every column except the record id and the two targets
# ('label' and 'attack_cat').
data_train = df_train.drop(['id', 'label', 'attack_cat'], axis=1)
# Training target: the encoded attack category. It is selected by name rather
# than by position (previously iloc[:, -2]) so that the cell does not silently
# pick up a different column if the column order ever changes.
label_train = df_train['attack_cat']
# NOTE(review): 'Unnamed: 0' (apparently the row index written out with the
# CSV) is still part of the features. Dropping it would also require changing
# the input size of Net.fc1 (currently 42) -- confirm whether it is wanted.
data_train.head(10)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
41479,0.581798,113,0,2,10,6,588,268,25.78214,254,...,2,1,1,1,0,0,0,1,5,0
174119,9e-06,119,2,3,2,0,114,0,111111.1,254,...,15,15,15,31,0,0,0,15,31,0
39585,0.294033,113,4,2,8,12,424,8824,64.6186,31,...,11,1,1,5,0,0,0,3,6,0
23269,0.001059,119,2,0,2,2,146,178,2832.861,31,...,2,1,1,1,0,0,0,4,2,0
35227,1.433471,113,3,2,52,54,2934,3742,73.24878,31,...,4,1,1,5,1,1,0,6,2,0
157804,3e-06,119,2,3,2,0,114,0,333333.3,254,...,12,12,12,14,0,0,0,12,14,0
170646,1.434152,113,4,2,10,8,450,782,11.8537,62,...,2,1,1,2,0,0,0,4,1,0
102714,1.725872,113,5,2,10,10,798,1730,11.00893,62,...,1,1,1,1,0,0,1,1,1,0
138365,1e-06,119,2,3,2,0,114,0,1000000.0,254,...,16,16,16,18,0,0,0,16,18,0
7840,0.528823,113,9,2,52,42,37372,3380,175.8622,31,...,2,1,1,3,0,0,0,4,1,0


In [27]:
# This run trains without min-max scaling; the disabled scaling code is kept below for reference.
#data_train_norm = (data_train - data_train.min()) / (data_train.max() - data_train.min())
#data_train_norm = data_train_norm.fillna(0)

In [28]:
#data_train_norm.shape
#data_train_norm.head(10)

In [29]:
# Wrap the training split as tensors: float32 feature rows and int64 class
# indices, then pair them row-by-row in a dataset.
train_X = torch.tensor(
    data_train.values,
    dtype=torch.float32,
)
train_Y = torch.tensor(
    label_train.values,
    dtype=torch.int64,
)
train = TensorDataset(train_X, train_Y)

In [30]:
# Serve the training set in mini-batches of 100 rows, reshuffled every epoch.
# drop_last is left at its default, so the final, smaller batch is kept.
train_loader = DataLoader(
    train,
    batch_size=100,
    shuffle=True,
)

In [31]:
class Net(nn.Module):
    """Single-hidden-layer classifier: Linear -> ReLU -> Linear.

    The default sizes reproduce the architecture used in this notebook
    (42 input features, 850 hidden units, 10 output classes), so ``Net()``
    builds exactly the same network as before.

    Args:
        in_features: Number of input features per sample.
        hidden_features: Width of the hidden layer.
        num_classes: Number of output logits (one per class).
    """

    def __init__(self, in_features=42, hidden_features=850, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, num_classes)

    def forward(self, x):
        """Return the raw class logits for a batch of feature rows.

        No softmax is applied: the logits are returned as-is.
        """
        hidden = F.relu(self.fc1(x))  # ReLU: max(x, 0)
        return self.fc2(hidden)


model = Net()

In [32]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [33]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU, then move
# the model's parameters to the chosen device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)
print(device)

cuda


In [34]:
# Multi-class loss applied to the model's raw logits and the integer class targets.
criterion = nn.CrossEntropyLoss()

In [35]:
# Optimizer over all model parameters. The SGD variant is kept for reference.
# NOTE(review): lr=0.03 is well above Adam's default of 1e-3; together with the
# unscaled inputs it may contribute to the model predicting a single class --
# confirm this value is intended.
#optimizer = torch.optim.SGD(model.parameters(), lr=0.03)
optimizer = torch.optim.Adam(model.parameters(), lr=0.03)

In [36]:
# Switch the model to training mode; as the cell's last expression, the call
# also displays the module's layer summary.
model.train()

Net(
  (fc1): Linear(in_features=42, out_features=850, bias=True)
  (fc2): Linear(in_features=850, out_features=10, bias=True)
)

In [37]:
# Train the model: 100 passes over the training loader, one optimizer step per
# mini-batch.
# loss_list collects the loss of every batch as a plain Python float. Storing
# loss.item() instead of loss.data avoids keeping one device tensor per batch
# alive for the whole run and makes the list directly usable for plotting.
loss_list = []
for epoch in range(100):
    model.train()
    for train_x, train_y in train_loader:
        # Tensors no longer need the deprecated Variable wrapper; just move
        # the batch to the model's device.
        train_x = train_x.to(device)
        train_y = train_y.to(device)
        optimizer.zero_grad()
        output = model(train_x)
        loss = criterion(output, train_y)
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())

In [38]:
# Accuracy on the training data.
# The loop variables are named test_x / test_y although the batches come from
# train_loader. Labels and predictions are collected batch by batch, so the two
# lists stay aligned even though the loader shuffles.
pred_list = []
test_y_list = []
with torch.no_grad():
    model.eval()
    correct = 0  # running count of correct predictions, as a Python int
    for test_x, test_y in train_loader:
        test_y_list.extend(test_y.tolist())
        test_x = test_x.to(device)
        test_y = test_y.to(device)
        output = model(test_x)
        # Index of the largest logit = predicted class.
        pred = output.argmax(dim=1)
        pred_list.extend(pred.cpu().tolist())
        correct += (pred == test_y).sum().item()
    # correct_rate
    data_num = len(train_loader.dataset)
    print('correct _rate: {}/{} ({:.0f}%)\n'.format(correct, data_num, 100. * correct / data_num))

correct _rate: 39326/122738 (32%)



In [39]:
# Held-out features: every column except the record id and the two targets.
testdata = df_test.drop(['id', 'label', 'attack_cat'], axis=1)
# Held-out target: the encoded attack category, selected by name rather than
# by position (previously iloc[:, -2]) so it cannot drift if columns move.
testlabel = df_test['attack_cat']

#without min-max scaling
# NOTE(review): the disabled code below rescales with the test split's own
# min/max; if scaling is re-enabled, consider reusing the training split's
# statistics instead.
#testdata_norm = (testdata - testdata.min()) / (testdata.max() - testdata.min())
#testdata_norm = testdata_norm.fillna(0)

testdata.head(10)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
87395,1.157229,113,5,2,10,8,966,354,14.690265,254,...,1,1,1,5,0,0,0,1,1,0
171525,4e-06,119,2,3,2,0,114,0,250000.0006,254,...,15,14,14,14,0,0,0,14,14,0
100997,0.212211,113,0,0,6,2,986,86,32.986038,62,...,4,4,1,7,0,0,0,5,7,0
106304,9e-06,120,0,3,2,0,200,0,111111.1072,254,...,3,3,3,4,0,0,0,3,4,0
170606,1.157823,113,5,2,10,8,526,354,14.682728,254,...,1,2,1,1,0,0,1,19,1,0
113031,8e-06,119,0,3,2,0,168,0,125000.0003,254,...,1,1,1,1,0,0,0,2,2,0
70458,8e-06,3,0,3,2,0,200,0,125000.0003,254,...,2,2,2,3,0,0,0,3,3,0
138025,9e-06,119,2,3,2,0,114,0,111111.1072,254,...,28,27,13,27,0,0,0,27,27,0
135044,7e-06,119,2,3,2,0,114,0,142857.1409,254,...,16,16,16,25,0,0,0,17,25,0
161530,3e-06,119,2,3,2,0,114,0,333333.3215,254,...,4,4,4,20,0,0,0,4,20,0


In [40]:
# Wrap the held-out split as tensors (float32 features, int64 class indices)
# and serve it in order, 100 rows per batch.
test_X = torch.tensor(
    testdata.values,
    dtype=torch.float32,
)
test_Y = torch.tensor(
    testlabel.values,
    dtype=torch.int64,
)

test = TensorDataset(test_X, test_Y)
test_loader = DataLoader(
    test,
    batch_size=100,
)

In [41]:
# Accuracy on the held-out data. pred_list / test_y_list are rebuilt here and
# feed the metric cells that follow.
with torch.no_grad():
    model.eval()
    correct = 0  # running count of correct predictions, as a Python int
    pred_list = []
    test_y_list = []
    for test_x, test_y in test_loader:
        test_y_list.extend(test_y.tolist())
        # Tensors no longer need the deprecated Variable wrapper; just move
        # the batch to the model's device.
        test_x = test_x.to(device)
        test_y = test_y.to(device)
        output = model(test_x)
        # Index of the largest logit = predicted class.
        pred = output.argmax(dim=1)
        pred_list.extend(pred.cpu().tolist())
        correct += (pred == test_y).sum().item()
    # correct_rate
    data_num = len(test_loader.dataset)
    print('correct _rate: {}/{} ({:.0f}%)\n'.format(correct, data_num, 100. * correct / data_num))

correct _rate: 16695/52603 (32%)



In [42]:
# Overall accuracy on the held-out data (fraction of exactly matching labels).
accuracy_score(test_y_list, pred_list)

0.3173773358933901

In [43]:
# Macro-averaged precision on the held-out data (unweighted mean over classes).
# Classes the model never predicts have an undefined precision. zero_division=0
# scores them as 0 -- the same value the default uses -- but states the choice
# explicitly and avoids the UndefinedMetricWarning.
precision_score(test_y_list, pred_list, average='macro', zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


0.16506708240647439

In [44]:
# Macro-averaged recall on the held-out data (unweighted mean over classes).
recall_score(test_y_list, pred_list, average='macro')

0.10006783671428676

In [45]:
# Macro-averaged F1 score on the held-out data (unweighted mean over classes).
f1_score(test_y_list, pred_list, average='macro')

0.048325477335097525

In [46]:
# Confusion matrix over the ten encoded attack_cat classes (0-9).
# Rows are the true classes, columns the predicted classes.
class_ids = list(range(10))
cm = confusion_matrix(
    test_y_list,
    pred_list,
    labels=class_ids,
)
print(cm)

[[    0     0     0     0     0     0   612     0     0     0]
 [    0     0     0     1     0     0   519     0     0     0]
 [    0     0     2     0     0     0  3697     0     0     0]
 [    0     0     0     2     0     0 10120     0     0     0]
 [    0     0     0     0     0     0  5339     0     0     0]
 [    0     0     0     0     0     0 12121     0     0     0]
 [    0     0     1     0     0     0 16691     0     0     0]
 [    0     0     0     0     0     0  3126     0     0     0]
 [    0     0     0     0     0     0   337     0     0     0]
 [    0     0     0     0     0     0    35     0     0     0]]
