This notebook takes the events that were flagged as suspicious and splits them into train/val/test sets, as described in Section 4.2.

In [2]:
import gzip
import torch
import pandas as pd
from collections import defaultdict
import uuid
import numpy as np
from tqdm import tqdm
from torch.nn.functional import softmax, cross_entropy
import networkx as nx
import pickle


In [3]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

# Load data:

In [4]:
ds_name = "Theia"  # one of: "Theia", "Cadets"

# Folder that holds each dataset's input CSV; the trailing slash matters because
# file names are appended to this string directly.
dataset_folders = {
    "Theia": "dataset/theia/",
    "Cadets": "dataset/cadets/",
}
if ds_name not in dataset_folders:
    # Fail here with a clear message instead of a NameError on `data_folder` below.
    raise ValueError(
        f"Unknown dataset name {ds_name!r}; expected one of {sorted(dataset_folders)}"
    )
data_folder = dataset_folders[ds_name]

events_df = pd.read_csv(data_folder + f"{ds_name}_Krystal_transformation.csv")
sequence_len = 10  # number of events in each sequence window

In [5]:
# Integer class id of every supported event type; the id is the position in this list.
event2code = {
    event_name: code
    for code, event_name in enumerate([
        'EVENT_WRITE',
        'EVENT_MODIFY_FILE_ATTRIBUTES',
        'EVENT_EXECUTE',
        'EVENT_SENDTO',
        'EVENT_RECVFROM',
        'EVENT_CLONE',
    ])
}
event2code

{'EVENT_WRITE': 0,
 'EVENT_MODIFY_FILE_ATTRIBUTES': 1,
 'EVENT_EXECUTE': 2,
 'EVENT_SENDTO': 3,
 'EVENT_RECVFROM': 4,
 'EVENT_CLONE': 5}

In [6]:
# Encode each event type as its integer class id; a type missing from event2code raises KeyError.
events_df['encoded_type'] = events_df['type'].apply(event2code.__getitem__)

In [7]:
# Preview the first five rows, including the new encoded_type column.
events_df.head(n=5)

Unnamed: 0,label,timestamp,eid,type,subject,object,machine,subject_name,object_name,techniques,anomaly_label,encoded_type
0,,1.52e+18,899938CD-6674-2415-CDFD-630100000010,EVENT_WRITE,85049578-0100-0000-0000-000000000020,0F000000-BE1D-0700-0000-000000000000,0A00063C-5254-00F0-0D60-000000000070,dash,/run/motd.new,Corrupt_File,0,0
1,,1.52e+18,84A09787-7ED0-2415-D82C-C20200000010,EVENT_WRITE,931D1E04-0300-0000-0000-000000000020,0F000000-A346-1000-0000-000000000000,0A00063C-5254-00F0-0D60-000000000070,run-parts,/run/motd.new,Corrupt_File,0,0
2,,1.52e+18,2B3410C6-7976-2415-DDD1-740100000010,EVENT_WRITE,70177E81-0100-0000-0000-000000000020,0F000000-CA79-0700-0000-000000000000,0A00063C-5254-00F0-0D60-000000000070,dash,/run/motd.new,Corrupt_File,0,0
3,,1.52e+18,716823A0-66D0-2415-09D6-C00200000010,EVENT_WRITE,0717B803-0300-0000-0000-000000000020,0F000000-453C-1000-0000-000000000000,0A00063C-5254-00F0-0D60-000000000070,run-parts,/run/motd.new,Corrupt_File,0,0
4,,1.52e+18,BCBA3BBD-43CA-2415-9437-9B0200000010,EVENT_WRITE,70715DE9-0200-0000-0000-000000000020,0F000000-4835-0F00-0000-000000000000,0A00063C-5254-00F0-0D60-000000000070,dash,/run/motd.new,Corrupt_File,0,0


In [9]:
# Event types whose edge is drawn subject -> object, and those drawn object -> subject.
_SUBJECT_TO_OBJECT_EVENTS = frozenset(
    ["EVENT_EXECUTE", "EVENT_MODIFY_FILE_ATTRIBUTES", "EVENT_SENDTO", "EVENT_CLONE"]
)
_OBJECT_TO_SUBJECT_EVENTS = frozenset(["EVENT_RECVFROM", "EVENT_WRITE"])


def _format_edge_label(edge_info):
    """Build the printable label of one edge from its ``[time_stamps, type, ttp]`` info."""
    time_stamps, *details = edge_info
    if len(time_stamps) < 2:  # a single event on this edge
        label = f'e{time_stamps[0]}'
    else:  # several events share this edge
        label = '(' + ', '.join(f'e{i}' for i in time_stamps) + ')'
    # A missing technique (NaN from an empty CSV cell) is skipped; joining it
    # directly would raise TypeError because str.join only accepts strings.
    details = [str(part) for part in details if not pd.isna(part)]
    return label + ', '.join([''] + details)


def _label_sequence(nodes_ls, edges_ls, malicious_events_ls, min_diameter=3, min_malicious=3):
    """Return 1 if some connected component of the sequence graph looks malicious, else 0.

    Every edge is inserted in both directions so the directed graph can be treated
    as undirected. A component qualifies when its diameter is at least
    ``min_diameter`` and at least ``min_malicious`` of its directed edges appear in
    ``malicious_events_ls``. The graph is a ``DiGraph``, so several events on the
    same edge count once.
    """
    malicious_edges = set(malicious_events_ls)  # O(1) membership instead of a list scan
    graph = nx.DiGraph()
    graph.add_nodes_from(nodes_ls)
    graph.add_edges_from(edges_ls)
    graph.add_edges_from((dst, src) for src, dst in edges_ls)

    for component in nx.weakly_connected_components(graph):
        subg = graph.subgraph(component)
        if nx.diameter(subg.to_undirected(as_view=False)) < min_diameter:
            continue
        malicious_count = sum(1 for event in subg.edges if event in malicious_edges)
        if malicious_count >= min_malicious:
            return 1
    return 0


def make_sequences_dict(df, seq_len=None):
    """Turn an event table into sliding-window sequences for the model and for graph plots.

    Every window holds ``seq_len + 1`` consecutive rows: the first ``seq_len`` rows
    are the events of the sequence, and the extra row only supplies the "next"
    subject, object and event type of the last event.

    Parameters
    ----------
    df : pandas.DataFrame
        Events in order, with the columns ``subject``, ``object``, ``encoded_type``,
        ``type``, ``techniques``, ``label``, ``object_name`` and ``subject_name``.
    seq_len : int, optional
        Number of events per sequence. Defaults to the notebook-level
        ``sequence_len`` setting.

    Returns
    -------
    data_for_model : dict
        ``data_for_model["data_for_sequences"]`` is a list with one dict per sequence:
        ``seq_ind`` (start row of the window), ``logs_for_sequence`` (one feature
        list per event: the encoded type followed by four 0/1 blocks marking which
        events of the sequence share the present subject, the next subject, the
        present object and the next object), ``next_event_types`` (encoded type of
        the event that follows each event) and ``label`` (0 or 1, see
        ``_label_sequence``).
    data_for_graph : dict
        ``data_for_graph["node2label_dict"]`` maps every node id to the name printed
        in the graph. ``data_for_graph["data_for_sequences"]`` is a list with one
        dict per sequence: ``seq_ind``, ``edges_ls``, ``nodes_ls``,
        ``edge2info_dict`` (edge -> ``[time_stamps, type, technique]``; an edge may
        carry several events, but only the first event's type and technique are
        kept) and ``edge2label_dict`` (edge -> label printed in the graph).

    Raises
    ------
    ValueError
        If an event type is in neither direction table.
    """
    if seq_len is None:
        seq_len = sequence_len

    data_for_model = {"data_for_sequences": []}
    data_for_graph = {"node2label_dict": {}, "data_for_sequences": []}
    node2label = data_for_graph["node2label_dict"]

    # Extract every column once; each window below is just a slice of these arrays.
    all_subjects = df['subject'].values
    all_objects = df['object'].values
    all_codes = df['encoded_type'].values
    all_types = df['type'].values
    all_ttps = df["techniques"].values
    all_labels = df["label"].values
    all_object_names = df["object_name"].values
    all_subject_names = df["subject_name"].values

    for seq_ind in range(len(df) - seq_len):
        window = slice(seq_ind, seq_ind + seq_len + 1)
        sid_list = all_subjects[window]
        oid_list = all_objects[window]
        etype_ls = all_codes[window]
        etype_string_ls = all_types[window]
        ttps = all_ttps[window]
        event_labels = all_labels[window]
        object_resource_name = all_object_names[window]
        subject_resource_name = all_subject_names[window]

        # The trailing lookahead row is never compared against.
        sequence_subjects = sid_list[:-1]
        sequence_objects = oid_list[:-1]

        logs_for_sequence = []
        next_event_types = etype_ls[1:]  # the labels are the next event types

        nodes_ls = []
        edges_ls = []
        edge2info_dict = {}
        malicious_events_ls = []  # edges of labelled events, used to label the sequence

        for event_ind in range(seq_len):
            subject_id = sid_list[event_ind]
            object_id = oid_list[event_ind]

            # ----- model features -----
            log = [etype_ls[event_ind]]
            for reference_id, candidates in (
                (subject_id, sequence_subjects),               # present subject
                (sid_list[event_ind + 1], sequence_subjects),  # next subject
                (object_id, sequence_objects),                 # present object
                (oid_list[event_ind + 1], sequence_objects),   # next object
            ):
                log.extend(int(reference_id == other) for other in candidates)
            logs_for_sequence.append(log)

            # ----- graph data -----
            e_type_string = etype_string_ls[event_ind]
            if e_type_string in _SUBJECT_TO_OBJECT_EVENTS:
                edge = (subject_id, object_id)
            elif e_type_string in _OBJECT_TO_SUBJECT_EVENTS:
                edge = (object_id, subject_id)
            else:
                raise ValueError(f"Unfamiliar event type: {e_type_string!r}")
            short_type = e_type_string[len("EVENT_"):]  # drop the "EVENT_" prefix

            node2label[object_id] = object_resource_name[event_ind]
            node2label[subject_id] = subject_resource_name[event_ind]

            # The position of the event inside the sequence acts as its time stamp.
            if edge not in edge2info_dict:
                edge2info_dict[edge] = [[event_ind], short_type, ttps[event_ind]]
            else:  # edge already known: record only the additional time stamp
                edge2info_dict[edge][0].append(event_ind)

            edges_ls.append(edge)
            nodes_ls.append(object_id)
            nodes_ls.append(subject_id)
            # A non-missing label marks the event as malicious.
            if not pd.isna(event_labels[event_ind]):
                malicious_events_ls.append(edge)

        edge2label_dict = {
            edge: _format_edge_label(info) for edge, info in edge2info_dict.items()
        }
        label_for_sequence = _label_sequence(nodes_ls, edges_ls, malicious_events_ls)

        data_for_model["data_for_sequences"].append({
            "seq_ind": seq_ind,
            "logs_for_sequence": logs_for_sequence,
            "next_event_types": next_event_types,
            "label": label_for_sequence,
        })
        data_for_graph["data_for_sequences"].append({
            "seq_ind": seq_ind,
            "edges_ls": edges_ls,
            "nodes_ls": nodes_ls,
            "edge2info_dict": edge2info_dict,
            "edge2label_dict": edge2label_dict,
        })

    return data_for_model, data_for_graph



# Split dataset:

In [10]:
# (train_size, test_size) in number of events per dataset; the rows left after
# the train and test slices form the validation set.
split_sizes = {
    "Cadets": (6510, 360),
    "Theia": (2910, 410),
}
if ds_name not in split_sizes:
    # Fail here with a clear message instead of a NameError on `train_size` below.
    raise ValueError(
        f"No split sizes defined for dataset {ds_name!r}; expected one of {sorted(split_sizes)}"
    )
train_size, test_size = split_sizes[ds_name]

# Consecutive, non-overlapping slices in file order: train, then test, then val.
train_df = events_df.iloc[:train_size, :]
test_df = events_df.iloc[train_size:train_size + test_size, :]
val_df = events_df.iloc[train_size + test_size:, :]

In [11]:
# Summarise how many events ended up in each split.
"train_df size:{}, test_df size:{}, val_df size:{}".format(len(train_df), len(test_df), len(val_df))

'train_df size:2910, test_df size:410, val_df size:114'

# Write data to disk:

In [69]:
# Splits to (re)generate; any subset of "train", "val", "test".
sets_to_make = [
    "train",
    "val",
    "test",
]
# Master switch for the writing step; nothing is written when it is False.
write_data_to_disk = True



def write_to_disk(ds, set_type):
    """Build the sequences of one split and pickle them into ``data_folder``.

    Two files are written: ``{ds_name}_{set_type}_data_for_model.pkl`` and
    ``{ds_name}_{set_type}_data_for_graph.pkl``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Events of the split, passed unchanged to ``make_sequences_dict``.
    set_type : str
        Name of the split; used in the output file names and progress messages.
    """
    print("make:", set_type)
    data_for_model, data_for_graph = make_sequences_dict(ds)
    outputs = {
        f"{ds_name}_{set_type}_data_for_model.pkl": data_for_model,
        f"{ds_name}_{set_type}_data_for_graph.pkl": data_for_graph,
    }
    for fname, payload in outputs.items():
        # The protocol belongs to pickle.dump; as the third argument of open() it
        # was interpreted as the buffering size. The context manager also closes
        # the file, so the data is flushed before the function returns.
        with open(data_folder + fname, "wb") as out_file:
            pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("finish", set_type)
    


# Write every requested split, always in the order train, val, test.
if write_data_to_disk:
    for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
        if split_name in sets_to_make:
            write_to_disk(ds=split_df, set_type=split_name)

make: train
finish train
make: val
finish val
make: test
finish test
