## Data processing phase 2 for LANL auth.txt file 

In this notebook we do a few things:
1. Tag red team events in the file of each of the following users, each of which has 10M or more events:
   - U12, U13, U24, U66, U78, U207, U293, U453, U679, U1289, U1480
2. Process each file to convert the text to character features with some metadata.
3. Split each file into multiple files based on event day, stored in a per-user directory.

Once all of this is done, we can use the character feature data for our RNN model processing.


In [1]:
# Input/output locations used by the processing cells below.
users_in_dir = 'data/users'          # one '<user>.txt' event file per user is read from here
users_out_dir = 'data/char_feats'    # '<user>_feats.txt' files and per-user day directories go here
redteam_fname = 'data/redteam.txt'   # red team events, one per line

# Complete list of candidate users:
# user_names = ['U12', 'U13', 'U24', 'U66', 'U78', 'U207', 'U293', 'U453', 'U679', 'U1289', 'U1480']
# U66 is left out for now; it has 11M events.
user_names = [
    'U12', 'U13', 'U24', 'U78', 'U207',
    'U293', 'U453', 'U679', 'U1289', 'U1480',
]

In [14]:
def translate_line(line, pad_len):
    '''
        Replace every ',' in a log sentence with '|'.

        line    : the log sentence to translate.
        pad_len : currently unused. It was the number of 0's appended by the disabled
                  character-code encoding (see the commented-out line below) so that all
                  translated lines had the same length.

        returns : `line` with each ',' swapped for '|'; nothing is appended.

        note - presumably the swap keeps the sentence a single field when it is written
               into a comma-separated row; confirm against the consumers of the output.
               In the disabled encoding, 0 and 1 marked START and END of the sentence.
    '''
    #return "0 " + " ".join([str(ord(c) - 30) for c in line]) + " 1 " + " ".join(["0"] * pad_len) + "\n"
    pieces = line.split(',')
    return '|'.join(pieces)

In [15]:
import math
import os

# Reference sentence length; only used to derive the pad length handed to translate_line.
max_len = 120

# Load the red team events into a set for constant-time membership tests.
# Each stripped line is stored verbatim and later compared against a key built
# from fields 0, 1, 3 and 4 of an auth event.
with open(redteam_fname, 'r') as red:
    redevents = {line.strip() for line in red}

# The feature files are written here; make sure the directory exists on a fresh checkout.
os.makedirs(users_out_dir, exist_ok=True)

for u in user_names:
    user_infile = '{0}/{1}.txt'.format(users_in_dir, u)
    user_outfile = '{0}/{1}_feats.txt'.format(users_out_dir, u)
    print('processing: ', u, '...', end='')
    redcount = 0
    # Both files are closed by the `with` block, so no explicit close() is needed.
    with open(user_infile, 'r') as infile, open(user_outfile, 'w') as outfile:
        outfile.write('sec,day,red,seq_len,sentence\n') # header
        # Iterate lazily: these files hold millions of events, and readlines()
        # would pull the whole file into memory at once.
        for line in infile:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a stray newline at the end of the file
            line = line.split(',')
            sentence = ','.join(line[1:])      # everything after the timestamp
            diff = max_len - len(sentence)     # pad length (may be negative for long sentences)
            sec = line[0]
            day = int(sec) // 86400            # 86400 seconds per day; exact integer division
            redentry = "{0},{1},{2},{3}".format(line[0], line[1], line[3], line[4])
            red = int(redentry in redevents)   # 1 if line is red event
            redcount += red
            translated = translate_line(sentence, diff)
            outfile.write("%s,%s,%s,%s,%s\n" % (sec, day,
                                                red, len(sentence), translated))
    print('done - red team events:', redcount)



processing:  U12 ...done - red team events: 6
processing:  U13 ...done - red team events: 2
processing:  U24 ...done - red team events: 5
processing:  U78 ...done - red team events: 2
processing:  U207 ...done - red team events: 2
processing:  U293 ...done - red team events: 31
processing:  U453 ...done - red team events: 2
processing:  U679 ...done - red team events: 2
processing:  U1289 ...done - red team events: 3
processing:  U1480 ...done - red team events: 12


In [16]:
# The final preprocessing step is to split the translated data into multiple files; one for each day.

import os
from tqdm import trange


# Split each user's feature file into one file per day: <users_out_dir>/<user>/<day>.txt.
# Lines are copied verbatim (the header row is dropped). A new output file is opened
# with mode 'w' every time the day column changes, so the input is assumed to be
# ordered by day; a day that reappeared later would overwrite its earlier file.
for u in user_names:
    user_infile = '{0}/{1}_feats.txt'.format(users_out_dir, u)
    user_outdir = '{0}/{1}/'.format(users_out_dir, u)

    os.makedirs(user_outdir, exist_ok=True)

    with open(user_infile, 'r') as data:
        # The day-0 file is always created up front, even if it ends up empty.
        current_day = 0
        outfile = open(user_outdir + str(current_day) + '.txt', 'w')
        print('processing:', u, '...', current_day, end='')
        try:
            next(data, None)  # skip the header row (no-op on an empty file)
            # Stream the file instead of readlines(): it can hold millions of rows.
            for line in data:
                # Only parsing problems are reported and skipped; I/O errors and
                # KeyboardInterrupt propagate instead of being swallowed.
                try:
                    line_items = line.strip().split(',')
                    day = int(line_items[1])
                except (ValueError, IndexError):
                    print('error processing file.... line: ', line)
                    continue
                if day != current_day:
                    # Day changed: roll over to that day's file.
                    outfile.close()
                    current_day = day
                    outfile = open(user_outdir + str(current_day) + '.txt', 'w')
                    print(',', current_day, end='')
                outfile.write(line)
        finally:
            # Close the current day file even if an error escapes the loop.
            outfile.close()
        print(' ...Done!')



processing: U12 ... 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57 ...Done!
processing: U13 ... 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57 ...Done!
processing: U24 ... 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57 ...Done!
processing: U78 ... 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57 ...Done!
