This notebook takes the events that were flagged as suspicious and splits them into train/val/test sets, as described in Section 4.2.

In [1]:
import gzip
import torch
import pandas as pd
from collections import defaultdict
import uuid
import numpy as np
from tqdm import tqdm
from torch.nn.functional import softmax, cross_entropy
import networkx as nx
import pickle

In [2]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

# Load data:

In [3]:
ds_name = "PublicArena"  # one of: "Theia", "Cadets", "PublicArena"

# Folder of each supported dataset. The trailing slash is kept on purpose:
# file paths in this notebook are built by plain string concatenation.
dataset_folders = {
    "Theia": "dataset/theia/",
    "Cadets": "dataset/cadets/",
    "PublicArena": "dataset/PublicArena/",
}

# Fail loudly on a typo instead of leaving `data_folder` undefined (or, in a
# live kernel, silently reusing the folder of a previously selected dataset).
if ds_name not in dataset_folders:
    raise ValueError(
        f"Unknown dataset name {ds_name!r}; expected one of {sorted(dataset_folders)}"
    )
data_folder = dataset_folders[ds_name]

events_df = pd.read_csv(data_folder + f"{ds_name}_Krystal_transformation.csv")


sequence_len = 10  # number of events per sequence built by make_sequences_dict

In [4]:
# Map every event-type name to an integer code: the code is simply the
# position of the name in the tuple for the selected dataset.
if ds_name == "PublicArena":
    event2code = {
        type_name: code
        for code, type_name in enumerate((
            'TcpIp/Recv',
            'TcpIp/Send',
            'FileIO/Read',
            'Process/Start',
            'Image/Load',
        ))
    }
    # PublicArena keeps the event type in the 'event' column; mirror it into
    # 'type' so the following cells can read 'type' for every dataset.
    events_df['type'] = events_df['event']
else:
    event2code = {
        type_name: code
        for code, type_name in enumerate((
            'EVENT_WRITE',
            'EVENT_MODIFY_FILE_ATTRIBUTES',
            'EVENT_EXECUTE',
            'EVENT_SENDTO',
            'EVENT_RECVFROM',
            'EVENT_CLONE',
        ))
    }
event2code

{'TcpIp/Recv': 0,
 'TcpIp/Send': 1,
 'FileIO/Read': 2,
 'Process/Start': 3,
 'Image/Load': 4}

In [5]:
# Integer-encode the event type; an event type missing from `event2code` raises KeyError.
events_df['encoded_type'] = events_df['type'].apply(event2code.__getitem__)

In [6]:
# Preview the first rows of the events table, including the 'type' and
# 'encoded_type' columns.
events_df.head()

Unnamed: 0,label,PID,PName,event,eid,user,timestamp,date,saddr,sport,...,ParentID,ImageFileName,CommandLine,PPName,techniques,subject,object,anomaly_label,type,encoded_type
0,,5260,,TcpIp/Recv,107516c49f25f834f303d259f0cd069b,admin,1652108220,5/9/2022 22:57,192.168.0.110,49784.0,...,,,nil,,T1589,5260#nan,"49,784:42.81.86.68:443",0,TcpIp/Recv,0
1,,5260,iexplore,TcpIp/Recv,2adc6537eb5533876f20120cf5c963c1,admin,1652108220,5/9/2022 22:57,192.168.0.110,49784.0,...,,,nil,,T1589,5260#iexplore,"49,784:42.81.86.68:443",0,TcpIp/Recv,0
2,,1240,devenv,TcpIp/Send,e4f93e96a4fd137ee1eab1f4ca61eecd,admin,1652108243,5/9/2022 22:57,192.168.0.110,49846.0,...,,,nil,,T1020,1240#devenv,"49,846:13.94.47.61:9,354",0,TcpIp/Send,1
3,,2980,MsMpEng,FileIO/Read,1e852ace593ef6bfd6c386da5dadb3d2,admin,1652108265,5/9/2022 22:57,,,...,,,nil,,T1005,2980#MsMpEng,D:\distdet-log\benign_collect_time.txt,0,FileIO/Read,2
4,,6512,LogonUI,Process/Start,2e446d6b15964401882cb1643ebd7623,admin,1652108277,5/9/2022 22:57,,,...,2156.0,LogonUI.exe,&quot;LogonUI.exe&quot; /flags:0x0 /state0:0xa...,unknown,T1204.002,2156#unknown,6512#LogonUI,0,Process/Start,3


In [7]:
# Show the ten rows at positions 5150-5159.
# NOTE(review): presumably chosen to inspect the rows around the PublicArena
# train/test boundary (train_size = 5153 in the split cell) - confirm.
events_df.iloc[5150:5160]

Unnamed: 0,label,PID,PName,event,eid,user,timestamp,date,saddr,sport,...,ParentID,ImageFileName,CommandLine,PPName,techniques,subject,object,anomaly_label,type,encoded_type
5150,,7396,AtBroker,Process/Start,a07c6ee6b6102325e1403516d9d85771,admin,1652423009,5/13/2022 14:23,,,...,2156.0,AtBroker.exe,atbroker.exe,unknown,T1204.002,2156#unknown,7396#AtBroker,0,Process/Start,3
5151,,1240,devenv,TcpIp/Send,1afb8a88eb0dd2aaea8c82a3f0d2610a,admin,1652423009,5/13/2022 14:23,192.168.0.110,53842.0,...,,,nil,,T1020,1240#devenv,"53,842:13.94.47.61:9,354",0,TcpIp/Send,1
5152,,1240,devenv,TcpIp/Recv,07d3c158ee4bff9795fdb92d1fb6b700,admin,1652423010,5/13/2022 14:23,192.168.0.110,53842.0,...,,,nil,,T1589,1240#devenv,"53,842:13.94.47.61:9,354",0,TcpIp/Recv,0
5153,,3592,powershell,Process/Start,3cb3087021b78d960805fcd3b3918cba,admin,1652423148,5/13/2022 14:25,,,...,5548.0,powershell.exe,powershell.exe -nop -w hidden -c &quot;IEX ((...,cmd,T1105,5548#cmd,3592#powershell,0,Process/Start,3
5154,,3592,powershell,Image/Load,8edbbfcfa2b8a314a7fe5217123dc2a9,admin,1652423148,5/13/2022 14:25,,,...,,,nil,,T1059,3592#powershell,C:\Windows\System32\WindowsPowerShell\v1.0\pow...,0,Image/Load,4
5155,,3592,powershell,TcpIp/Send,943af3ea7a0ed86e93fa011bdfb5f135,admin,1652423149,5/13/2022 14:25,192.168.0.110,53860.0,...,,,nil,,T1020,3592#powershell,"53,860:124.223.85.207:8,900",0,TcpIp/Send,1
5156,,3592,powershell,TcpIp/Recv,b1507143afc892b65a062d5e912ef847,admin,1652423149,5/13/2022 14:25,192.168.0.110,53860.0,...,,,nil,,T1589,3592#powershell,"53,860:124.223.85.207:8,900",0,TcpIp/Recv,0
5157,a,4312,powershell,Process/Start,712b3d9040922f413c0017bf38c2f5d8,admin,1652423280,5/13/2022 14:28,,,...,888.0,powershell.exe,powershell.exe -nop -w hidden -c &quot;IEX ((...,cmd,T1105,888#cmd,4312#powershell,0,Process/Start,3
5158,a,4312,powershell,Image/Load,2c4b4175c3d67dff82f4136bccb9a8a4,admin,1652423280,5/13/2022 14:28,,,...,,,nil,,T1059,4312#powershell,C:\Windows\System32\WindowsPowerShell\v1.0\pow...,0,Image/Load,4
5159,a,4312,powershell,TcpIp/Send,f74397f6d76a84d3e879de40a5594a62,admin,1652423281,5/13/2022 14:28,192.168.0.110,53872.0,...,,,nil,,T1020,4312#powershell,"53,872:124.223.85.207:8,900",0,TcpIp/Send,1


In [8]:
def _flows_object_to_subject(e_type_string):
    """Tell whether the edge of an event points from its object to its subject.

    The event vocabulary is chosen from the module-level ``ds_name``.

    Args:
        e_type_string: the raw (un-stripped) event type name.

    Returns:
        True for an object -> subject edge, False for subject -> object.

    Raises:
        Exception: if the event type is not known for the selected dataset.
    """
    if ds_name == "PublicArena":
        subject_to_object = ("TcpIp/Send", "Process/Start")
        object_to_subject = ("TcpIp/Recv", "FileIO/Read", "Image/Load")
    else:
        subject_to_object = ("EVENT_EXECUTE", "EVENT_MODIFY_FILE_ATTRIBUTES", "EVENT_SENDTO", "EVENT_CLONE")
        object_to_subject = ("EVENT_RECVFROM", "EVENT_WRITE")

    if e_type_string in subject_to_object:
        return False
    if e_type_string in object_to_subject:
        return True
    # Never fall through silently: an unknown type would otherwise reuse the
    # direction of the previous event.
    raise Exception("Unfamiliar event type")


def _build_edge_labels(edge2info_dict):
    """Build the printable label of every edge from its collected info.

    A label is the event position(s) on the edge (``e3`` for one event,
    ``(e3, e7)`` for several) followed by ``, <type>, <ttp>``.
    """
    edge2label_dict = {}
    for edge, info in edge2info_dict.items():
        time_stamps = info[0]
        type_ttp_info = info[1:]
        if len(time_stamps) < 2:  # a single event on this edge
            total_label = f'e{time_stamps[0]}'
        else:  # at least two events share this edge
            total_label = '(' + ', '.join([f'e{i}' for i in time_stamps]) + ')'
        # The leading '' makes join() emit the ', ' separator before the type.
        total_label += ', '.join([''] + type_ttp_info)
        edge2label_dict[edge] = total_label
    return edge2label_dict


def _label_sequence(nodes_ls, edges_ls, malicious_events_ls, ttps):
    """Return 1 if the sequence is labelled malicious, else 0.

    Every edge is added in both directions. The label is 1 when some weakly
    connected component contains at least 3 edges found in
    ``malicious_events_ls`` and either the component's diameter is >= 3 or
    ``ttps`` (the techniques of the whole window) holds >= 4 unique values.
    """
    cur_g = nx.DiGraph()
    cur_g.add_nodes_from(nodes_ls)
    cur_g.add_edges_from([(edge[0], edge[1]) for edge in edges_ls])
    cur_g.add_edges_from([(edge[1], edge[0]) for edge in edges_ls])

    label_for_sequence = 0
    # The label is only ever raised to 1, so the visiting order of the
    # components does not matter.
    for component_nodes in nx.weakly_connected_components(cur_g):
        subg = cur_g.subgraph(component_nodes)
        malicious_events_counter = sum(1 for event in subg.edges if event in malicious_events_ls)
        if malicious_events_counter >= 3:
            diameter = nx.diameter(subg.to_undirected(as_view=False))
            if diameter >= 3:
                label_for_sequence = 1
            elif len(np.unique(ttps)) >= 4:
                label_for_sequence = 1
    return label_for_sequence


def make_sequences_dict(df):
    """Turn an events table into per-sequence model inputs and graph data.

    A window of ``sequence_len + 1`` consecutive rows is slid over ``df`` one
    row at a time; the extra row only supplies the "next" subject, object and
    event type of the last event. Reads the module-level ``sequence_len`` and
    ``ds_name``.

    Args:
        df: events DataFrame with the columns 'subject', 'object',
            'encoded_type', 'type', 'techniques' and 'label' (plus
            'object_name' and 'subject_name' when ``ds_name`` is not
            "PublicArena").

    Returns:
        A tuple ``(data_for_model, data_for_graph)``.

        data_for_model["data_for_sequences"] is a list with one dict per
        sequence holding:
            seq_ind,
            logs_for_sequence - per event: the encoded type followed by four
                0/1 vectors marking which events of the window share the
                present subject, next subject, present object, next object,
            next_event_types - the encoded type of each following event,
            label - 1 if the sequence is considered malicious, else 0.

        data_for_graph["node2label_dict"] maps a node to the label to print
        in the graph; data_for_graph["data_for_sequences"] is a list with one
        dict per sequence holding:
            seq_ind,
            edges_ls,
            nodes_ls,
            edge2info_dict - edge -> [event positions, type, ttp]
                (attention - an edge may cover more than one event; type and
                ttp are those of the first one),
            edge2label_dict - edge -> label to print in the graph.

    Raises:
        Exception: if an event type is unknown for the selected dataset.
    """
    data_for_model = {"data_for_sequences": []}
    data_for_graph = {"node2label_dict": {}, "data_for_sequences": []}

    # iterate sequences
    for seq_ind in range(len(df) - sequence_len):
        relevant_df = df.iloc[seq_ind:seq_ind + sequence_len + 1]

        sid_list = relevant_df['subject'].values
        oid_list = relevant_df['object'].values
        etype_ls = relevant_df['encoded_type'].values
        etype_string_ls = relevant_df['type'].values
        ttps = relevant_df["techniques"].values
        event_labels = relevant_df["label"].values
        if ds_name == "PublicArena":
            # PublicArena has no separate name columns: ids double as names.
            object_resource_name = oid_list
            subject_resource_name = sid_list
        else:
            object_resource_name = relevant_df["object_name"].values
            subject_resource_name = relevant_df["subject_name"].values

        # Subjects / objects of the events in the sequence (extra row excluded).
        window_subjects = sid_list[:-1]
        window_objects = oid_list[:-1]

        logs_for_sequence = []
        next_event_types = etype_ls[1:]  # the labels are the next event types

        nodes_ls = []
        edges_ls = []
        edge2info_dict = {}
        # Malicious events as edges; used to label the sequence for the model.
        malicious_events_ls = []

        # -1 because the last row is not an event of the sequence; only its
        # subject / object ids and type are used.
        for event_ind in range(len(relevant_df) - 1):
            subject_id = sid_list[event_ind]
            object_id = oid_list[event_ind]
            next_subject_id = sid_list[event_ind + 1]
            next_object_id = oid_list[event_ind + 1]

            # --------------- data for model ---------------
            log = [etype_ls[event_ind]]
            log.extend(int(subject_id == x) for x in window_subjects)
            log.extend(int(next_subject_id == x) for x in window_subjects)
            log.extend(int(object_id == x) for x in window_objects)
            log.extend(int(next_object_id == x) for x in window_objects)
            logs_for_sequence.append(log)

            # --------------- data for graph ---------------
            ttp = ttps[event_ind]
            e_type_string = etype_string_ls[event_ind]
            direction_object_to_subject = _flows_object_to_subject(e_type_string)

            # Drop the "EVENT_" prefix only where it exists; PublicArena names
            # (e.g. 'FileIO/Read') have no such prefix and must stay intact.
            if e_type_string.startswith("EVENT_"):
                e_type_string = e_type_string[len("EVENT_"):]

            data_for_graph["node2label_dict"][object_id] = object_resource_name[event_ind]
            data_for_graph["node2label_dict"][subject_id] = subject_resource_name[event_ind]

            edge = (object_id, subject_id) if direction_object_to_subject else (subject_id, object_id)
            time_stamp = event_ind  # position inside the sequence, not wall-clock time
            if edge not in edge2info_dict:
                edge2info_dict[edge] = [[time_stamp], e_type_string, ttp]
            else:  # add only the current time stamp to the list
                edge2info_dict[edge][0].append(time_stamp)

            edges_ls.append(edge)
            nodes_ls.append(object_id)
            nodes_ls.append(subject_id)
            # an event with a non-missing label is malicious
            if not pd.isna(event_labels[event_ind]):
                malicious_events_ls.append(edge)

        edge2label_dict = _build_edge_labels(edge2info_dict)
        label_for_sequence = _label_sequence(nodes_ls, edges_ls, malicious_events_ls, ttps)

        info_for_model = {"seq_ind": seq_ind,
                          "logs_for_sequence": logs_for_sequence,
                          "next_event_types": next_event_types,
                          "label": label_for_sequence
                          }
        data_for_model["data_for_sequences"].append(info_for_model)

        info_for_graph = {"seq_ind": seq_ind,
                          "edges_ls": edges_ls,
                          "nodes_ls": nodes_ls,
                          "edge2info_dict": edge2info_dict,
                          "edge2label_dict": edge2label_dict,
                          }
        data_for_graph["data_for_sequences"].append(info_for_graph)

    return data_for_model, data_for_graph



# Split dataset:

In [9]:
# (train_size, test_size) in rows for every supported dataset.
split_sizes = {
    "Cadets": (6510, 360),
    "Theia": (2910, 410),
    "PublicArena": (5153, 200),
}

# Fail loudly on an unknown dataset instead of leaving the sizes undefined
# (or, in a live kernel, reusing the sizes of a previously selected dataset).
if ds_name not in split_sizes:
    raise ValueError(
        f"No split sizes defined for dataset {ds_name!r}; expected one of {sorted(split_sizes)}"
    )
train_size, test_size = split_sizes[ds_name]

# Contiguous split in file order: the first `train_size` rows are train, the
# next `test_size` rows are test, and all remaining rows are validation.
train_df = events_df.iloc[:train_size, :]
test_df = events_df.iloc[train_size:train_size+test_size, :]
val_df = events_df.iloc[train_size+test_size:, ]

In [10]:
# Report how many rows ended up in each split.
f"train_df size:{len(train_df)}, test_df size:{len(test_df)}, val_df size:{len(val_df)}"

'train_df size:5153, test_df size:200, val_df size:84'

# Write data to disk:

In [11]:
# make_sequences_dict(test_df)

In [12]:
# Which splits to build and pickle; any subset of "train", "val", "test".
sets_to_make = ["train", "val", "test"] #train, val, test
# Master switch: when False, no split is built or written at all.
write_data_to_disk = True



def write_to_disk(ds, set_type):
    """Build the sequence data of one split and pickle it into the dataset folder.

    Calls ``make_sequences_dict(ds)`` and writes its two results to
    ``<data_folder><ds_name>_<set_type>_data_for_model.pkl`` and
    ``<data_folder><ds_name>_<set_type>_data_for_graph.pkl``. Reads the
    module-level ``ds_name`` and ``data_folder``.

    Args:
        ds: events DataFrame of the split.
        set_type: split name used in the file names (e.g. "train").
    """
    print("make:", set_type)
    data_for_model, data_for_graph = make_sequences_dict(ds)
    fname_data_for_model = f"{ds_name}_{set_type}_data_for_model.pkl"
    fname_data_for_graph = f"{ds_name}_{set_type}_data_for_graph.pkl"

    outputs = (
        (fname_data_for_model, data_for_model),
        (fname_data_for_graph, data_for_graph),
    )
    for fname, payload in outputs:
        # `with` closes (and flushes) the file even if pickling fails.
        with open(data_folder + fname, "wb") as out_file:
            # The protocol belongs to pickle.dump; it used to be passed to
            # open(), where it was interpreted as the buffer size.
            pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("finish", set_type)
    


# Build and pickle every requested split, always in the order train, val, test.
if write_data_to_disk:
    for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
        if split_name in sets_to_make:
            write_to_disk(ds = split_df, set_type = split_name)

make: train
finish train
make: val
finish val
make: test
finish test
