## Data Prep

Some of the concepts and functions are taken from the safekit source code

In [1]:
# Input: authentication log (a subset of auth.txt used for trial runs).
auth_fname = "./data/auth_100M.txt"
# Input: red-team events; each stripped line is matched against entries built from the auth log.
redteam_fname = "./data/redteam.txt"
# Output: a header plus one translated (character-encoded, zero-padded) row per kept log line.
char_features_fname = "./data/char_features.txt"
# Output: red-team events found in the auth log, each prefixed with its auth line number.
redteam_lines_fname = "./data/redteam_lines.txt"

# Output directory: the contents of char_features_fname split into one file per day.
char_feats_dir = './data/char_feats/'


In [2]:
def translate_line(long_line, pad_len):
    '''
    Encode a log line as a space-separated sequence of integer tokens.

    long_line: the log line to translate; every character c becomes ord(c) - 30.
    pad_len  : number of "0" tokens appended after the END marker so that all
               translated lines in the dataset end up with the same length.

    Returns the encoded line terminated by a newline. The sequence starts with
    the START marker 0 and the encoded characters are followed by the END marker 1.
    '''
    char_codes = " ".join(str(ord(ch) - 30) for ch in long_line)
    padding = " ".join("0" for _ in range(pad_len))
    return "0 {0} 1 {1}\n".format(char_codes, padding)

### Note1
*There is a mismatch between what the source notebook (LANL_LM_data.ipynb) is trying to filter and the format of the auth.txt file. For simplicity, we just use the whole line length.*

*Here is the original code as a reference*
```
max_len = 0
with open("auth_proc.txt", "r") as infile:
    for line in infile:
        tmp = line.strip().split(',')
        line_minus_time = tmp[0] + ',' + ','.join(tmp[2:])
        if len(line_minus_time) > max_len:
            max_len = len(line)
print (max_len)
```

In [3]:
# Scan the auth log once to find the longest log line and the total line count
# (note: we are using the subset auth_100M.txt for trials).
# Lengths are measured on the raw line as read, i.e. including the time field
# and any trailing newline.

max_len = 0
num_lines = 0
with open(auth_fname, "r") as infile:
    # enumerate(..., start=1) leaves num_lines at the number of lines read;
    # for an empty file the loop never runs and both values stay 0.
    for num_lines, line in enumerate(infile, start=1):
        max_len = max(max_len, len(line))
print('max_line_length: ', max_len)
print('number of lines: ', num_lines)

max_line_length:  113
number of lines:  100000000


In [4]:
# Day indices that the original notebook treated as weekend days and filtered out.
# We are not sure where the original authors got these exact numbers from.
# NOTE: currently unused by the parsing cell, which keeps all days.
weekend_days = [3, 4, 10, 11, 17, 18, 24, 25, 31, 32, 38, 39, 45, 46, 47, 52, 53]

### Note2
*It's not clear how the original notebook deals with the file segments, which are described here:*
```
Each event is on a separate line in the form of "time,user@domain,computer,process name,start/end" and represents a process event at the given time.
```
So we decided to follow that description. For now we do not mark the redteam events in the output, to see whether we can discover them during inference, and we include all days.

In [5]:
import math
from tqdm import trange 

# Load the red-team events into a set for fast membership tests.
with open(redteam_fname, 'r') as redfile:
    redevents = {event.strip() for event in redfile}

# Parse the data file, reading in (raw) and writing out (translated) log lines.
# All three files are managed by a single `with`, so they are flushed and closed
# even if a line fails to parse part-way through the run.
with open(auth_fname, 'r') as infile, \
        open(char_features_fname, 'w') as outfile, \
        open(redteam_lines_fname, 'w') as red_outfile:
    outfile.write('line_num sec day user red seq_len sentence\n') # header

    # The `red` column is deliberately always 0: red-team events are not marked in
    # the features file. Matching events are recorded in redteam_lines_fname instead.
    # Unlike the original notebook, there is also no weekend filter: all days are kept.
    red = 0

    # num_lines comes from the earlier scan of auth_fname and drives the progress bar.
    for line_num in trange(num_lines):
        line = infile.readline()
        if not line:
            # The file is shorter than num_lines (e.g. it changed since the scan).
            break
        line = line.strip()
        if not line:
            # Blank line: nothing to parse, but it still consumes a line number.
            continue

        raw_line = line.split(',')
        sec = raw_line[0]
        line_minus_time = ','.join(raw_line[1:])
        # max_len was measured on whole raw lines, so this padding is always positive.
        diff = max_len - len(line_minus_time)
        day = int(sec) // 86400  # 86400 seconds per day
        user = raw_line[1].strip().split('@')[0]

        # Only keep entries whose user name starts with 'U'.
        if not user.startswith('U'):
            continue

        translated_line = translate_line(line_minus_time, diff)
        # NOTE(review): seq_len is the character count plus one, presumably to
        # account for a START/END marker -- confirm against the consumer.
        outfile.write("%s %s %s %s %s %s %s" % (line_num, sec, day,
                                                user.replace("U", ""),
                                                red, len(line_minus_time) + 1, translated_line))

        # Tag the redteam entry with the line number of the processed line.
        # The key is built from fields 0, 1, 3 and 4 of the auth line
        # (presumably time, user, source computer, destination computer -- confirm).
        redentry = "{0},{1},{2},{3}".format(raw_line[0], raw_line[1], raw_line[3], raw_line[4])
        if redentry in redevents:
            red_outfile.write("{0},{1}\n".format(line_num, redentry))
                

100%|██████████| 100000000/100000000 [19:53<00:00, 83804.21it/s]


In [None]:
# The final preprocessing step is to split the translated data into multiple files; one for each day.

import os
from tqdm import trange


# Create the output directory if needed (no error when it already exists).
os.makedirs(char_feats_dir, exist_ok=True)

print("get number of lines in {0} file".format(char_features_fname))

# First pass: count the lines (header included) so the progress bar has a total.
with open(char_features_fname, 'r') as data:
    num_data_lines = sum(1 for _ in data)

# Second pass: copy every data line into the file named after its day column.
# Assumes lines are grouped by day: each day's file is opened in 'w' mode, so a day
# that reappeared later in the input would overwrite what was written for it earlier.
with open(char_features_fname, 'r') as data:
    current_day = '0'
    data.readline() # skip the header
    outfile = open(char_feats_dir + current_day + '.txt', 'w')
    try:
        # The header has already been consumed, hence one line fewer to process.
        for line_idx in trange(max(num_data_lines - 1, 0)):
            line = data.readline()
            if not line:
                break  # reached end of file early
            try:
                # Column 2 of the space-separated row is the day.
                line_day = line.strip().split(' ')[2]
                same_day = int(line_day) == int(current_day)
            except (IndexError, ValueError):
                # Malformed row: report it and keep going. Anything else
                # (I/O errors, KeyboardInterrupt) is allowed to propagate.
                print('error processing file. (data line {0})'.format(line_idx))
                continue
            if not same_day:
                # Day changed: switch to the new day's output file.
                outfile.close()
                current_day = line_day
                outfile = open(char_feats_dir + current_day + '.txt', 'w')
            outfile.write(line)
    finally:
        # Always close the last open per-day file, even on failure.
        outfile.close()

