# LANL Research

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gzip
import shutil
import datetime
import networkx as nx

from IPython.display import clear_output

In [2]:
pd.options.display.float_format = '{:,}'.format

### Authentication and Process Data

We neeed to import the data we will use. Below we import the authentication and process summaries.

In [4]:
rootdir = 'C:/Users/corri/OneDrive/Documents/Uni/Postgraduate/Final Project/LANL/ATI Data/Summaries/wls'
unzippeddir = 'C:/Users/corri/OneDrive/Documents/Uni/Postgraduate/Final Project/LANL/ATI Data/Summaries/wls/Unzipped'
frames = []

count = 0

for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if file[-3:] == '.gz':
            filedir = rootdir + '/' + file
            with gzip.open(filedir) as f:
                df = pd.read_csv(filedir, header=None)
                frames.append(df)
            if 'authentications' in str(file):
                count = count + len(df)

df = pd.concat(frames)

authentication_data = df[:count]
authentication_data.columns = ['UserName', 'SrcDevice','DstDevice', 'Authent Type', 'Failure', 'DailyCount']

process_data = df[count:]
process_data = process_data[[0,1,2,3,4]]
process_data.columns = ['UserName', 'Device', 'ProcessName', 'ParentProcessName', 'DailyCount']

authentication_data.to_csv('../Data/Authentication data.gz', header=True, compression='gzip')
process_data.to_csv('../Data/Process data.gz', header=True, compression='gzip')

In [5]:
authentication_data

Unnamed: 0,UserName,SrcDevice,DstDevice,Authent Type,Failure,DailyCount
0,User035855,Comp808475,Comp081330,TGS,0,17.0
1,Comp655251$,Comp655251,ActiveDirectory,NetworkLogon,0,350.0
2,User762066,Comp306129,ActiveDirectory,TGS,0,22.0
3,User384215,Comp095190,EnterpriseAppServer,NetworkLogon,0,35.0
4,User043263,Comp883307,Comp384394,TGS,0,2.0
...,...,...,...,...,...,...
80187,Comp955366$,Comp955366,ActiveDirectory,TGS,0,23.0
80188,Comp589380$,Comp589380,ActiveDirectory,TGT,0,37.0
80189,Comp629793$,Comp629793,ActiveDirectory,TGS,0,21.0
80190,User104387,Comp917362,ActiveDirectory,NetworkLogon,0,387.0


In [6]:
process_data

Unnamed: 0,UserName,Device,ProcessName,ParentProcessName,DailyCount
0,Comp748297$,Comp748297,Proc391839.exe,Proc387473,1
1,Comp563664$,Comp563664,rundll32.exe,services,1
2,User607396,Comp609111,Proc417435.exe,Proc417435,1
3,Comp641702$,Comp641702,Proc249569.exe,services,1
4,Comp157389$,Comp157389,Proc402696.exe,services,1
...,...,...,...,...,...
251545,Comp738970$,Comp738970,Proc207472.exe,svchost,2
251546,User925794,Comp097857,cmd.exe,Proc174492,2
251547,Comp996714$,Comp996714,Proc612297.exe,services,5
251548,Comp391736$,Comp391736,rundll32.exe,Proc247259,2


### Clustering

UserName, Device, ProcessName, ParentProcessName, DailyCount

In [None]:
# source computers dictionary
user_names = process_data[''].unique()
user_names_dict = {}
for i, key in enumerate(user_names):
    user_names_dict[key] = i
    
# destination computers ditionary
devices = process_data[1].unique()
devices_dict = {}
for i, key in enumerate(devices):
    devices_dict[key] = i
    
# source port dictionary
processes = process_data[2].unique()
processes_dict = {}
for i, key in enumerate(processes):
    processes_dict[key] = i
    
# destination port dictionary
parent_processes = process_data[3].unique()
parent_processes_dict= {}
for i, key in enumerate(parent_processes):
    parent_processes_dict[key] = i
    
process_data[0] = process_data[0].map(user_names_dict)
process_data[1] = process_data[1].map(devices_dict)
process_data[2] = process_data[2].map(processes_dict)
process_data[3] = process_data[3].map(parent_processes_dict)

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

db = DBSCAN(eps=1, min_samples=10).fit(process_data)
labels = db.labels_
no_clust.append(len(np.unique(labels)))
no_noise.append(np.sum(np.array(labels) == -1, axis=0))

### Graph Drawing

In [None]:
def draw_day(data,i):

    # split the data into the selected time period
    df_day = data

    # print day we're working on
    print(i)

    # get all unqiue users for that period
    userlist_1 = list(list(df_day[0].unique()))
    userlist_2 = list(list(df_day[1].unique()))
    unique_users = set(userlist_1 + userlist_2)

    # get all connections made for that period
    connections = zip(df_day[0], df_day[1])

    # create the graph
    G = nx.DiGraph()

    # create the nodes of the graph
    for u in unique_users:
        G.add_node(u)

    # create the edges of the graph
    for a in connections:
        G.add_edge(*a)

    # draw the network
    plt.figure(figsize=(15,15), dpi=400)
    nx.draw(G, node_size=20, linewidths=0.8)
    plt.savefig('day {}.png'.format(i))

    return G

In [None]:
index_list = authentication_data.index.tolist()
start_days = [i for i, e in enumerate(index_list) if e == 0]
start_days.append(len(authentication_data))

data_ = [(authentication_data[start_days[i]:start_days[i+1]],i) for i in range(len(start_days)-1)]

In [None]:
begin_time = datetime.datetime.now()
draw_day(data_[0][0], data_[0][1])
print("Finished creating networks!")
end_time = datetime.datetime.now()
print(end_time - begin_time)

Creating a single network takes about 23 minutes. We do this for all 91 days so this is a large process which takes multiple hours so I would not recommend running the below line of code since this has parallelised this process for 8 cores but will still take hours.

In [None]:
# %run -i Parallelised-Drawing.py

### Netflow Data

This has currently hit a dead end. The files are hard to work with due to their size and therefore temporal analysis isn't feasible.

In [None]:
a = 'hihi.gz'
a[:3]

In [None]:
def read_data(day):
    path = 'C:\\Users\\corri\\OneDrive\\Documents\\Uni\\Postgraduate\\Final Project\\LANL\\Netflow Data'

    sh_file = os.getcwd()[:-4] + 'Data\\session_hosts.txt'
    rt_sh = list(pd.read_csv(sh_file, header=None)[0])

    try:
        print('Reading entire data set.')
        df_netflow = pd.read_csv(path + '/netflow_day-' + day + '.bz2', header=None)

        # adding headers
        headers = (['Time', 'Duration', 'SrcDevice', 'DstDevice', 'Protocol',
                'SrcPort', 'DstPort', 'SrcPackets', 'DstPackets', 'SrcBytes', 'DstBytes'])
        df_netflow.columns = headers

        # isolating red team computers
        rt_src_sh = df_netflow[df_netflow['SrcDevice'].isin(rt_sh)]
        print('The red team session host reduction process reduced the data to: {}%.'.format(rt_src_sh.shape[0]/df_netflow.shape[0]))
        pickle.dump(rt_src_sh, open(path + '/Reduced-day' + day + '.p','wb'))

    except:
        clear_output()
        print('Unable to read entire data set, reading iteratively with red team selection.')

        # iteratively reading while seperating red team data
        iter_csv = pd.read_csv(path + '/netflow_day-' + day + '.bz2', iterator=True, chunksize=1000)
        rt_src_sh = pd.concat([chunk[chunk['2'].isin(rt_sh)] for chunk in iter_csv])
        print('Reduced data down to {} lines.'.format(rt_src_sh.shape[0]))
        pickle.dump(rt_src_sh, open(path + '/Reduced-day' + day + '.p','wb'))

In [None]:
read_data('02')

We have over 115 million data points and therefore can't run anything without a massive amount of storage - functions such as pd.describe and pd.get_dummies wont work on this. The first steps are therefore to reduce the size of the data we are obtaining.

### Session Host Data

One of the useful pieces of information provided to us by Dan is the session host data. This is a list of computers that were used during the attacks. A first step we can take is to reduce the data set by only keeping the data from these source computers.

In [None]:
sh_file = os.getcwd()[:-4] + 'Data\\session_hosts.txt'
rt_sh = list(pd.read_csv(sh_file, header=None)[0])

In [None]:
rt_src_sh = df_netflow[df_netflow['SrcDevice'].isin(rt_sh)]
#rt_dest_sh = df_netflow[df_netflow['DstDevice'].isin(rt_sh)]
#rt_all = pd.concat([rt_src_sh, rt_dest_sh]).drop_duplicates()

In [None]:
rt_src_sh

In [None]:
print('The red team session host reduction process reduced the data to: {}%.'.format(rt_src_sh.shape[0]/df_netflow.shape[0])

In [None]:
pickle.dump(rt_src_sh, open(path + '/Reduced-day03.p','wb'))

### Clustering Red Team Session Host Data

In [None]:
# source computers dictionary
src_comps = rt_src_sh['SrcDevice'].unique()
src_comps_dict= {}
for i, key in enumerate(src_comps):
    src_comps_dict[key] = i
    
# destination computers ditionary
dst_comps = rt_src_sh['DstDevice'].unique()
dst_comps_dict= {}
for i, key in enumerate(dst_comps):
    dst_comps_dict[key] = i
    
# source port dictionary
src_port = rt_src_sh['SrcPort'].unique()
src_port_dict= {}
for i, key in enumerate(src_port):
    src_port_dict[key] = i
    
# destination port dictionary
dst_port = rt_src_sh['DstPort'].unique()
dst_port_dict= {}
for i, key in enumerate(dst_port):
    dst_port_dict[key] = i
    
rt_src_sh['SrcDevice'] = rt_src_sh['SrcDevice'].map(src_comps_dict)
rt_src_sh['DstDevice'] = rt_src_sh['DstDevice'].map(dst_comps_dict)
rt_src_sh['SrcPort'] = rt_src_sh['SrcPort'].map(src_port_dict)
rt_src_sh['DstPort'] = rt_src_sh['DstPort'].map(dst_port_dict)

In [None]:
rt_src_sh.dtypes

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

eps = [1,5,10,15,20,25,30]
min_samples = [1,2,5,10,15,20]

no_clust = []
no_noise = []
sil_co = []
grid_vals = []

for e in eps:
    for ms in min_samples:
        
        clear_output(wait=True)
        print('Working with {} eps and {} min_samples.'.format(e,ms))
        
        grid_vals.append((e,ms))
        db = DBSCAN(eps=e, min_samples=ms).fit(rt_src_sh)
        labels = db.labels_
        no_clust.append(len(np.unique(labels)))
        no_noise.append(np.sum(np.array(labels) == -1, axis=0))
        sil_co.append(metrics.silhouette_score(rt_src_sh, labels, metric='euclidean'))

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=2)

In [None]:
pca.fit(rt_src_sh)
rt_sh_trans = pca.transform(rt_src_sh)

In [None]:
fig = plt.figure(figsize=(15, 8))
plt.scatter(rt_sh_trans[:, 0], rt_sh_trans[:, 1])
plt.show()

#### Source Devices

In [None]:
src_device_counts = df_netflow.groupby('SrcDevice').size().sort_values(ascending=False)

In [None]:
src_device_counts.index

In [None]:
src_device_20l = src_device_counts.nlargest(20)

In [None]:
plt.figure(figsize=(12,12))
plt.pie(src_device_20l, labels=src_device_20l.index, autopct = '%.2f')
plt.title('Pie chart showing the percentage contribution of the top 20 source computers')
plt.show()

### Time

In [None]:
duration_data = df_netflow['Duration']
fig1, ax1 = plt.subplots()
ax1.set_title('Box Plot of Duration data')
ax1.boxplot(duration_data)

In [None]:
dur_mean = df_netflow['Duration'].mean()
dur_75 = np.percentile(df_netflow['Duration'], 75)
dur_25 = np.percentile(df_n)