In [1]:
import pandas as pd
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [2]:
# Load the UNSW-NB15 data from CSV.
# Dataset page: https://research.unsw.edu.au/projects/unsw-nb15-dataset
# Author's note: according to that page the CSV files were saved under the
# wrong names, which is why the file named "testing-set" is the one read here.
# Record counts quoted from the page: training set 175,341, testing set 82,332
# (both contain attack and normal traffic).
csv_path = '../../Data/UNSW_NB15_testing-set.csv'
df = pd.read_csv(csv_path)

In [3]:
# Fix the PyTorch random seeds (CPU generator, then CUDA) for repeatable runs.
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [4]:
# Convert the nominal (object-typed) columns to numbers:
#   proto, service, state -> one-hot indicator columns
#   attack_cat            -> integer class id (used as the target)
#
# A hand-written mapping would also work for a feature column, e.g.
#   proto_mapping = {'xxx': 2, 'xxx': 1, 'xxx': 0}
#   data['proto'] = data['proto'].map(proto_mapping)
# but one-hot encoding via pandas.get_dummies is used here.
#
# dtype='uint8' pins the indicator columns to numeric 0/1. Newer pandas releases
# default to bool, and a frame mixing bool with numeric columns yields an object
# array from `.values`, which torch.tensor cannot convert.
df_processed = pd.get_dummies(df, columns=['proto', 'service', 'state'], dtype='uint8')

# Target to numeric. The fitted encoder is kept so that class ids can be mapped
# back to category names with attack_cat_le.inverse_transform(...).
attack_cat_le = LabelEncoder()
df_processed['attack_cat'] = attack_cat_le.fit_transform(df_processed['attack_cat'])

df_processed.head(10)

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,service_ssl,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
0,1,0.121478,6,4,258,172,74.08749,252,254,14158.94238,...,0,0,0,1,0,0,0,0,0,0
1,2,0.649902,14,38,734,42014,78.473372,62,252,8395.112305,...,0,0,0,1,0,0,0,0,0,0
2,3,1.623129,8,16,364,13186,14.170161,62,252,1572.271851,...,0,0,0,1,0,0,0,0,0,0
3,4,1.681642,12,12,628,770,13.677108,62,252,2740.178955,...,0,0,0,1,0,0,0,0,0,0
4,5,0.449454,10,6,534,268,33.373826,254,252,8561.499023,...,0,0,0,1,0,0,0,0,0,0
5,6,0.380537,10,6,534,268,39.41798,254,252,10112.02539,...,0,0,0,1,0,0,0,0,0,0
6,7,0.637109,10,8,534,354,26.683033,254,252,6039.783203,...,0,0,0,1,0,0,0,0,0,0
7,8,0.521584,10,8,534,354,32.593026,254,252,7377.527344,...,0,0,0,1,0,0,0,0,0,0
8,9,0.542905,10,8,534,354,31.313031,254,252,7087.796387,...,0,0,0,1,0,0,0,0,0,0
9,10,0.258687,10,6,534,268,57.985135,254,252,14875.12012,...,0,0,0,1,0,0,0,0,0,0


In [10]:
# Hold out 30% of the rows for evaluation; random_state makes the split repeatable.
df_train, df_test = train_test_split(
    df_processed,
    test_size=0.3,
    random_state=0,
)

In [11]:
# Sanity check: (rows, columns) of the training split.
# (Swap in df_train.dtypes to inspect the column types instead.)
df_train.shape

(122738, 197)

In [12]:
# Features: every column except 'id', 'label' and 'attack_cat'.
data_train = df_train.drop(['id', 'label', 'attack_cat'], axis=1)
# Target: the label-encoded attack category, selected by name. A positional
# lookup (iloc[:, -157]) only lands on this column while the number of one-hot
# columns stays exactly the same, and would silently pick another column otherwise.
label_train = df_train['attack_cat']
data_train.head(10)

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,service_ssl,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
41479,0.581798,10,6,588,268,25.78214,254,252,7287.752,3080.106689,...,0,0,0,1,0,0,0,0,0,0
174119,9e-06,2,0,114,0,111111.1,254,0,50666660.0,0.0,...,0,0,0,0,1,0,0,0,0,0
39585,0.294033,8,12,424,8824,64.6186,31,29,10094.11,220084.1406,...,0,0,0,1,0,0,0,0,0,0
23269,0.001059,2,2,146,178,2832.861,31,29,551463.6,672332.375,...,0,1,0,0,0,0,0,0,0,0
35227,1.433471,52,54,2934,3742,73.24878,31,29,16061.71,20498.49609,...,0,0,0,1,0,0,0,0,0,0
157804,3e-06,2,0,114,0,333333.3,254,0,152000000.0,0.0,...,0,0,0,0,1,0,0,0,0,0
170646,1.434152,10,8,450,782,11.8537,62,252,2259.175,3821.073242,...,0,0,0,1,0,0,0,0,0,0
102714,1.725872,10,10,798,1730,11.00893,62,252,3332.808,7217.221191,...,0,0,0,1,0,0,0,0,0,0
138365,1e-06,2,0,114,0,1000000.0,254,0,456000000.0,0.0,...,0,0,0,0,1,0,0,0,0,0
7840,0.528823,52,42,37372,3380,175.8622,31,29,554499.3,49922.18359,...,0,0,0,1,0,0,0,0,0,0


In [13]:
# Min-max scaling is disabled in this run; the code is kept below for reference.
#data_train_norm = (data_train - data_train.min()) / (data_train.max() - data_train.min())
#data_train_norm = data_train_norm.fillna(0)

In [14]:
#data_train_norm.shape
#data_train_norm.head(10)

In [15]:
# Pair the training features (float32) with their class ids (int64) row by row.
train = TensorDataset(
    torch.tensor(data_train.values, dtype=torch.float32),
    torch.tensor(label_train.values, dtype=torch.long),
)
# Keep the individual tensors available under their usual names.
train_X, train_Y = train.tensors

In [16]:
# Mini-batches of 100 rows with shuffling enabled. drop_last is not set,
# so a final smaller batch is kept rather than discarded.
train_loader = DataLoader(dataset=train, batch_size=100, shuffle=True)

In [17]:
class Net(nn.Module):
    """Fully connected classifier with a single hidden ReLU layer.

    Args:
        in_features: Width of each input row (default 194).
        hidden_features: Number of units in the hidden layer (default 850).
        n_classes: Number of output logits (default 10).

    The defaults reproduce the fixed 194 -> 850 -> 10 layout, so ``Net()``
    builds the same network as before.
    """

    def __init__(self, in_features=194, hidden_features=850, n_classes=10):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, n_classes)

    def forward(self, x):
        """Return raw (un-normalised) class scores, one row per input row."""
        x = F.relu(self.fc1(x))  # ReLU: max(x, 0)
        # No (log-)softmax is applied here; the raw scores are returned as-is.
        return self.fc2(x)

model = Net()

In [18]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [19]:
# Use the GPU when one is available, otherwise the CPU, and move the model there.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model = model.to(device)
print(device)

cuda


In [20]:
# Multi-class classification loss; it is fed the model's raw output scores
# and the integer class ids in the training loop.
criterion = nn.CrossEntropyLoss()

In [21]:
# Adam over all model parameters (the plain-SGD alternative is kept commented out).
# NOTE(review): lr=0.03 is well above Adam's default of 1e-3, and the recorded
# confusion matrix at the end of the notebook shows nearly every prediction in a
# single class. A smaller learning rate (and feature scaling) is worth trying;
# confirm experimentally.
#optimizer = torch.optim.SGD(model.parameters(), lr=0.03)
optimizer = torch.optim.Adam(model.parameters(), lr=0.03)

In [22]:
# Switch the model to training mode; as the cell's last expression this also
# displays the module structure.
model.train()

Net(
  (fc1): Linear(in_features=194, out_features=850, bias=True)
  (fc2): Linear(in_features=850, out_features=10, bias=True)
)

In [23]:
# Train the model.
N_EPOCHS = 100  # number of full passes over the training data
loss_list = []  # one Python float per mini-batch, in training order
for epoch in range(N_EPOCHS):
    model.train()
    for train_x, train_y in train_loader:
        # Tensors are used directly; the deprecated Variable wrapper is a no-op.
        train_x = train_x.to(device)
        train_y = train_y.to(device)
        optimizer.zero_grad()
        output = model(train_x)
        loss = criterion(output, train_y)
        loss.backward()
        optimizer.step()
        # .item() extracts the scalar as a plain float, so the history does not
        # hold on to one device tensor per optimisation step.
        loss_list.append(loss.item())

In [24]:
# Accuracy on the training data.
pred_list = []     # predicted class id for every row, in loader order
test_y_list = []   # matching true class ids
correct = 0        # plain Python int, so the ratio below is ordinary arithmetic
model.eval()
with torch.no_grad():
    for batch_x, batch_y in train_loader:
        # batch_y stays on the CPU; only the features go to the model's device.
        test_y_list.extend(batch_y.tolist())
        output = model(batch_x.to(device))
        # Index of the largest score in each row = predicted class id.
        pred = output.argmax(dim=1).cpu()
        pred_list.extend(pred.tolist())
        correct += (pred == batch_y).sum().item()
# correct_rate
data_num = len(train_loader.dataset)
print('correct _rate: {}/{} ({:.0f}%)\n'.format(correct, data_num, 100. * correct / data_num))

correct _rate: 39314/122738 (32%)



In [25]:
# Test features / target, built the same way as for the training split.
testdata = df_test.drop(['id', 'label', 'attack_cat'], axis=1)
# Select the encoded attack category by name rather than by position
# (iloc[:, -157]), which depends on the exact number of one-hot columns.
testlabel = df_test['attack_cat']

# Min-max scaling is disabled in this run; kept for reference:
#testdata_norm = (testdata - testdata.min()) / (testdata.max() - testdata.min())
#testdata_norm = testdata_norm.fillna(0)
# NOTE(review): if scaling is re-enabled, reuse the training split's min/max here
# so that both splits go through the same transform.

testdata.head(10)

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,service_ssl,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
87395,1.157229,10,8,966,354,14.690265,254,252,6014.367,2143.050537,...,0,0,0,1,0,0,0,0,0,0
171525,4e-06,2,0,114,0,250000.0006,254,0,114000000.0,0.0,...,0,0,0,0,1,0,0,0,0,0
100997,0.212211,6,2,986,86,32.986038,62,252,30988.03,1621.028198,...,0,1,0,0,0,0,0,0,0,0
106304,9e-06,2,0,200,0,111111.1072,254,0,88888890.0,0.0,...,0,0,0,0,1,0,0,0,0,0
170606,1.157823,10,8,526,354,14.682728,254,252,3275.112,2141.950928,...,0,0,0,1,0,0,0,0,0,0
113031,8e-06,2,0,168,0,125000.0003,254,0,84000000.0,0.0,...,0,0,0,0,1,0,0,0,0,0
70458,8e-06,2,0,200,0,125000.0003,254,0,100000000.0,0.0,...,0,0,0,0,1,0,0,0,0,0
138025,9e-06,2,0,114,0,111111.1072,254,0,50666660.0,0.0,...,0,0,0,0,1,0,0,0,0,0
135044,7e-06,2,0,114,0,142857.1409,254,0,65142860.0,0.0,...,0,0,0,0,1,0,0,0,0,0
161530,3e-06,2,0,114,0,333333.3215,254,0,152000000.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [26]:
# Pair the test features (float32) with their class ids (int64) row by row.
test = TensorDataset(
    torch.tensor(testdata.values, dtype=torch.float32),
    torch.tensor(testlabel.values, dtype=torch.long),
)
# Keep the individual tensors available under their usual names.
test_X, test_Y = test.tensors

# Batches of 100 in the stored order (no shuffling for evaluation).
test_loader = DataLoader(dataset=test, batch_size=100)

In [27]:
# Accuracy on the held-out test data; also collects the true / predicted class
# ids that the metric cells below consume.
pred_list = []     # predicted class id for every row, in loader order
test_y_list = []   # matching true class ids
correct = 0        # plain Python int, so the ratio below is ordinary arithmetic
model.eval()
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        # batch_y stays on the CPU; only the features go to the model's device.
        test_y_list.extend(batch_y.tolist())
        output = model(batch_x.to(device))
        # Index of the largest score in each row = predicted class id.
        pred = output.argmax(dim=1).cpu()
        pred_list.extend(pred.tolist())
        correct += (pred == batch_y).sum().item()
# correct_rate
data_num = len(test_loader.dataset)
print('correct _rate: {}/{} ({:.0f}%)\n'.format(correct, data_num, 100. * correct / data_num))

correct _rate: 16691/52603 (32%)



In [28]:
# Overall accuracy of the most recently collected predictions
# (the test split when the cells are run in order).
accuracy_score(test_y_list, pred_list)

0.3173012946029694

In [29]:
# Macro-averaged precision (unweighted mean over classes). Classes that are
# never predicted have an undefined precision; zero_division=0 scores them as 0
# explicitly, the same value the default setting falls back to, without
# emitting UndefinedMetricWarning.
precision_score(test_y_list, pred_list, average='macro', zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


0.13173064126694425

In [30]:
# Macro-averaged recall: the unweighted mean of the per-class recalls.
recall_score(test_y_list, pred_list, average='macro')

0.10018032590462497

In [31]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_score(test_y_list, pred_list, average='macro')

0.04855752164416275

In [32]:
# Confusion matrix over the ten encoded class ids 0..9
# (rows: true class, columns: predicted class).
class_ids = list(range(10))
cm = confusion_matrix(test_y_list, pred_list, labels=class_ids)
print(cm)

[[    0     0     0     0     0     0   612     0     0     0]
 [    0     1     0     0     0     0   519     0     0     0]
 [    0     0     0     0     0     0  3699     0     0     0]
 [    0     0     1     0     0     0 10121     0     0     0]
 [    0     0     0     0     0     0  5339     0     0     0]
 [    0     0     0     0     0     0 12121     0     0     0]
 [    0     0     1     1     0     0 16690     0     0     0]
 [    0     0     0     0     0     0  3126     0     0     0]
 [    0     0     0     0     0     0   337     0     0     0]
 [    0     0     0     0     0     0    35     0     0     0]]
