In [1]:
import os
import sys

from collections import Counter
import pandas as pd

import torch
import torch.nn as nn

seed = 5487
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [2]:
# = def count_event(root, event_collect):

def count_event(path, event_collect):
    #print(path)
    dataset = pd.read_json(path, lines=True)
    event_count = dict()
    for i in range(0, len(dataset)):
        event_id = dataset['winlog'][i]['event_id']
        if event_id not in event_count:
            event_count[event_id] = 1
        else:
            event_count[event_id] += 1
            
    event_collect.update(event_count)
    return event_count

In [3]:
def count_event_test(path, event_collect):
    #print(path)
    dataset = pd.read_json(path, lines=True)
    event_count = dict()
    for i in range(0, len(dataset)):
        event_id = dataset['winlog'][i]['event_id']
        if event_id not in event_count and event_id in event_collect:
            event_count[event_id] = 1
        elif event_id in event_count and event_id in event_collect:
            event_count[event_id] += 1
            
    event_collect.update(event_count)
    return event_count

In [4]:
def read_records(path, event_collect, test_data = False):
    data = list()
    for user in sorted(os.listdir(path)):
        event_count = Counter()
        atk = os.path.join(path, user, 'winlogbeat.json')
        
        if not test_data:
            event_count.update(count_event(atk, event_collect))
        else:
            event_count.update(count_event_test(atk, event_collect))
        
        
        data.append(event_count.most_common())
        
    return data

In [5]:
def normalize(data, event_collect, print_info=True):
    normalized_data = list()
    for i, user_record in enumerate(data):
        event_set = dict()
        total_events = 0
        for event, freq in user_record:
            event_set[event] = freq
            total_events += freq
            
        if print_info:
            print(f"User {i + 1}'s event distribution: \n{[(event, event_set[event]) for event in sorted(event_set)]}\n")
        
        for event in event_set:
            if event not in event_set:
                event_set[event] = 1
                total_events += 1
        
        normalized_events = list()
        for event in sorted(event_collect):
            if event not in event_set:
                normalized_events.append(1 / total_events)
            else:
                normalized_events.append(event_set[event] / total_events)
        
        normalized_data.append(normalized_events)

    return normalized_data

In [6]:
def test(model, test_path, event_collect):
    with torch.no_grad():
        test_data = read_records(test_path,  event_collect,True)
        #here here 原本的training data只有35種 可是testing有37種
        test_data = torch.FloatTensor(normalize(test_data, event_collect, False)).to(device)
        #print(test_data.shape)
        predict = model.forward(test_data)
#         print(predict, '\n')
        
    return torch.argmax(predict, dim=1) + 1

In [7]:
class FullyConnect(nn.Module):
    def __init__(self, num_events):
        super(FullyConnect, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(num_events, 100),
            nn.ReLU(True),
            nn.Linear(100, 5)
        )
    
    def forward(self, x):
        return self.classifier(x)

In [8]:
#mode = str(sys.argv[2])
data_root = os.path.join(os.getcwd(), 'Logs')
train_path = os.path.join(data_root, 'Train')
test_path = os.path.join(str(sys.argv[1]))
event_collect = set()
train_path

'/root/NS_project2/Logs/Train'

In [9]:
train_data = read_records(train_path, event_collect)
train_data = torch.FloatTensor(normalize(train_data, event_collect)).to(device)
print(train_data.shape)
label = torch.LongTensor([0, 1, 2, 3, 4]).to(device)

User 1's event distribution: 
[(4624, 11), (4656, 4569), (4658, 2167), (4663, 919), (4672, 11), (4688, 79), (4689, 78), (4690, 1061), (4702, 1), (4703, 22), (4798, 1), (4799, 4), (5379, 26), (7040, 4), (10016, 1)]

User 2's event distribution: 
[(4624, 4), (4656, 5859), (4658, 4533), (4663, 2135), (4672, 4), (4688, 58), (4689, 56), (4690, 2244), (4703, 15)]

User 3's event distribution: 
[(26, 1), (4624, 4), (4656, 943), (4658, 1855), (4660, 2), (4663, 731), (4670, 8), (4672, 4), (4688, 79), (4689, 75), (4690, 919), (4698, 1), (4703, 28), (4798, 5), (5156, 357), (5158, 167), (7045, 1)]

User 4's event distribution: 
[(15, 2), (4624, 19), (4625, 1), (4634, 8), (4648, 2), (4656, 3571), (4658, 7193), (4660, 4), (4663, 2632), (4672, 21), (4688, 106), (4689, 101), (4690, 3625), (4702, 1), (4703, 26), (4719, 6), (4798, 25), (5058, 1), (5061, 1), (5156, 790), (5158, 146), (5379, 1), (5381, 2), (5382, 5), (6416, 17), (10016, 1), (16384, 1), (16394, 1)]

User 5's event distribution: 
[(1001, 1)

In [10]:
model = FullyConnect(train_data.shape[1]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [11]:
test_path = '/root/NS_project2/Logs/Example_Test'

for epoch in range(1, 2000):
        total_loss = 0
        optimizer.zero_grad()

        predict = model(train_data)
        loss = criterion(predict, label)
        loss.backward()
        total_loss += loss.item()
        optimizer.step()

        if epoch % 100 == 0:
            model.eval()
            predict = test(model, test_path, event_collect).cpu().tolist()
            print(total_loss, predict)

            model.train()
torch.save(model.state_dict(), 'weights.pth')

0.4104757308959961 [1, 2, 3, 4, 5]
0.07392624765634537 [1, 2, 3, 4, 5]
0.024139681831002235 [1, 2, 3, 4, 5]
0.01182499434798956 [1, 2, 3, 4, 5]
0.0070427716709673405 [1, 2, 3, 4, 5]
0.0046871500089764595 [1, 2, 3, 4, 5]
0.003348772879689932 [1, 2, 3, 4, 5]
0.0025125571992248297 [1, 2, 3, 4, 5]
0.0019541876390576363 [1, 2, 3, 4, 5]
0.0015619860496371984 [1, 2, 3, 4, 5]
0.001275727991014719 [1, 2, 3, 4, 5]
0.0010602741967886686 [1, 2, 3, 4, 5]
0.0008937337552197278 [1, 2, 3, 4, 5]
0.0007624734425917268 [1, 2, 3, 4, 5]
0.0006570810219272971 [1, 2, 3, 4, 5]
0.000571254757232964 [1, 2, 3, 4, 5]
0.0005003082333132625 [1, 2, 3, 4, 5]
0.0004411012923810631 [1, 2, 3, 4, 5]
0.0003911109524779022 [1, 2, 3, 4, 5]


In [12]:
model.load_state_dict(torch.load('weights.pth'))

<All keys matched successfully>

In [13]:
predict = test(model, test_path, event_collect).cpu().tolist()
for i, label in enumerate(predict):
    print(f"TestCase {i + 1}: Attack{label}")

TestCase 1: Attack1
TestCase 2: Attack2
TestCase 3: Attack3
TestCase 4: Attack4
TestCase 5: Attack5
