## Data processing phase 2 for LANL auth.txt file 

In this notebook we do a few things:
1. Tag redteam events in the file of each of the following users; each file has 10M or more events
   - U12, U13, U24, U66, U78, U207, U293, U453, U679, U1289, U1480
2. Process each file to convert its text to character features with some metadata.
3. Split each file into multiple files, one per event day, stored in a per-user directory.

Once all is done, we can use the character feature data for our RNN model processing


In [None]:
# Full candidate list, kept for reference:
# user_names = ['U12', 'U13', 'U24', 'U66', 'U78', 'U207', 'U293', 'U453', 'U679', 'U1289', 'U1480']
# For simple testing we initially leave out U66, which has 11M events.
user_names = [
    'U12', 'U13', 'U24', 'U78', 'U207',
    'U293', 'U453', 'U679', 'U1289', 'U1480',
]

# Directory holding the per-user input files, read as `<users_indir>/<user>.txt`.
users_indir = 'data/users'

# Directory receiving the generated `<user>_feats.txt` files and the
# per-user, per-day sub-directories.
users_outdir = 'data/users_feats'

# Red-team events file; it is read one event per line.
redteam_fname = 'data/redteam.txt'

In [None]:
def transform_line(line):
    '''
        Swap the field separator of a log line.

        line: the log line (string) to be transformed.

        Returns the same text with every ',' replaced by '|', which makes
        each sentence easier to process later during training.
    '''
    # Splitting on ',' and re-joining with '|' swaps every separator.
    pieces = line.split(',')
    return '|'.join(pieces)

In [None]:
import os
import math

# Tag red-team events and write one feature file per user.
#
# For every user in `user_names`, reads `<users_indir>/<user>.txt` and writes
# `<users_outdir>/<user>_feats.txt` with the columns
# `sec,day,red,seq_len,sentence`:
#   sec      - first field of the input line, copied verbatim
#   day      - int(sec) // 86400 (86400 seconds per day)
#   red      - 1 if the event appears in the red-team file, otherwise 0
#   seq_len  - length of the sentence (everything after the first field)
#   sentence - the remaining fields, with ',' replaced by '|'
#
# Relies on `user_names`, `users_indir`, `users_outdir`, `redteam_fname` and
# `transform_line` being defined by earlier cells.

# Not used in this cell; kept because later cells may read it.
max_len = 120

# make sure we have the output dir
os.makedirs(users_outdir, exist_ok=True)

# Load the red-team events once; set membership keeps the per-line check O(1).
with open(redteam_fname, 'r') as red:
    redevents = {line.strip() for line in red}

for u in user_names:
    user_infile = '{0}/{1}.txt'.format(users_indir, u)
    user_outfile = '{0}/{1}_feats.txt'.format(users_outdir, u)
    print('processing: ', u, '...', end='')
    redcount = 0
    # The `with` statement closes both files, so no explicit close() is needed.
    with open(user_infile, 'r') as infile, open(user_outfile, 'w') as outfile:
        outfile.write('sec,day,red,seq_len,sentence\n')  # header
        # Stream the file line by line: these inputs hold millions of events,
        # so materialising them with readlines() wastes a lot of memory.
        for line in infile:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing on int('')
            fields = line.split(',')
            sec = fields[0]
            sentence = ','.join(fields[1:])
            day = int(sec) // 86400
            # Key used to look the event up in the red-team set: fields 0, 1,
            # 3 and 4 of the log line (presumably time, user, source computer
            # and destination computer - confirm against the dataset docs).
            redentry = "{0},{1},{2},{3}".format(
                fields[0], fields[1], fields[3], fields[4])
            red = int(redentry in redevents)  # 1 if line is red event
            redcount += red
            translated = transform_line(sentence)
            outfile.write("%s,%s,%s,%s,%s\n" % (
                sec, day, red, len(sentence), translated))
    print('done - red team events:', redcount)



In [None]:
# The final preprocessing step is to split the translated data into multiple
# files; one for each day.
#
# For every user in `user_names`, reads `<users_outdir>/<user>_feats.txt`
# (skipping its header line) and writes each data line to
# `<users_outdir>/<user>/<day>.txt`, where <day> is the second
# comma-separated field of the line.
#
# Relies on `user_names`, `users_outdir` and `os` from earlier cells.

for u in user_names:
    user_infile = '{0}/{1}_feats.txt'.format(users_outdir, u)
    user_outdir = '{0}/{1}/'.format(users_outdir, u)

    os.makedirs(user_outdir, exist_ok=True)

    current_day = None   # day of the file currently open for writing
    outfile = None       # opened lazily so no empty 0.txt is created
    days_written = set() # days whose file was already created in this run
    print('processing:', u, '...', end='')
    try:
        with open(user_infile, 'r') as data:
            next(data, None)  # skip the header line (if any)
            # Stream the file instead of loading every line with readlines().
            for line in data:
                try:
                    day = int(line.strip().split(',')[1])
                except (IndexError, ValueError):
                    # Malformed line: report it and carry on. Only parse
                    # errors are handled here; I/O errors still propagate.
                    print('error processing file.... line: ', line)
                    continue
                if day != current_day:
                    if outfile is not None:
                        outfile.close()
                    # First time a day is seen its file is (re)created; if
                    # the same day shows up again later (unsorted input),
                    # append rather than truncate what was already written.
                    mode = 'a' if day in days_written else 'w'
                    outfile = open(user_outdir + str(day) + '.txt', mode)
                    # Progress output: "... 0, 1, 2" - the first day has no
                    # leading comma.
                    print(',' if days_written else '', day, end='')
                    days_written.add(day)
                    current_day = day
                outfile.write(line)
    finally:
        # Always release the last day file, even when an error escapes.
        if outfile is not None:
            outfile.close()
    print(' ...Done!')

