## Loading XML 

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from lxml import etree

FILE = 'logs.xml'

# Read the file line by line: the 1st line is the XML declaration,
# the 2nd line (data[1]) is the one that gets parsed below.
with open(FILE) as handle:
    data = list(handle)

# Manual analysis showed that the file is truncated (i.e. it does not have
# a correct XML structure), so lxml's XMLParser is created with recover=True.
parser = etree.XMLParser(recover=True)

# skip the XML declaration in data[0] and parse only data[1]
raw = etree.fromstring(data[1], parser=parser)

In [None]:
# Serialise the recovered tree once and show its first and last 100 bytes;
# here we can see that the parser appended XML tokens to the file.
serialized = etree.tostring(raw)
print(serialized[:100])
print(serialized[-100:])

### `.iter()` iterates over every element in the XML tree, without grouping them

In [None]:
# keep only the first 21 elements yielded by .iter() for a quick look
elements = [element for _, element in zip(range(21), raw.iter())]

In [None]:
# element type: https://lxml.de/api/lxml.etree._Element-class.html
first_element = elements[0]
print(first_element)
print(first_element.tag)

In [None]:
# explore the sampled elements and the syntax of their tags
for element in elements:
    print(element.tag, ':', element.text)

In [None]:
# every tag carries this namespace prefix, so define it once
tag = '{http://schemas.microsoft.com/win/2004/08/events/event}'

# collect all Event elements into a list
events = list(raw.iter(tag + 'Event'))

In [None]:
# .iterchildren() yields only the direct sub-elements of the first event
for child in events[0].iterchildren():
    print(child.tag)
    print(child.text)

In [None]:
# .iterdescendants() yields every element nested inside the first event;
# print tag, text and attributes of each, separated by a blank line
for descendant in events[0].iterdescendants():
    print(descendant.tag, ':', descendant.text, ':', descendant.items())
    print()

## Creating DataFrame

First, we build a dictionary for every event by iterating over its XML elements and collecting their data into that dictionary.  
Then we turn these dictionaries into the rows of a single pandas DataFrame.

In [None]:
# Flatten every event into one {column: value} record and build the DataFrame
# with a single constructor call. DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row re-copies all previous rows on every step.
tag = '{http://schemas.microsoft.com/win/2004/08/events/event}'

# elements whose attributes (rather than text) carry the data
attribute_tags = ('TimeCreated', 'Execution', 'Security')
# elements that are deliberately skipped (filter out empty fields)
skipped_tags = ('Provider', 'System', 'Correlation')

records = []
for event in events:
    edict = {}
    for element in event.iterdescendants():
        # comments / processing instructions have a non-string .tag; skip them
        if not isinstance(element.tag, str):
            continue
        if any(x in element.tag for x in attribute_tags):
            # every attribute becomes its own column
            edict.update(element.items())
        elif any(x in element.tag for x in skipped_tags):
            continue
        elif 'Data' in element.tag:
            # the attribute value names the column, the element text is the value
            for _, column in element.items():
                edict[column] = element.text
        else:
            # plain element: tag (without namespace prefix) -> text
            edict[element.tag.replace(tag, '')] = element.text

    # add raw text event to have ability always access full value of eventlog
    edict['raw'] = etree.tostring(event, pretty_print=True).decode()
    records.append(edict)

# one row per event, indexed 0..len(events)-1 in file order
df = pd.DataFrame(records)

In [None]:
# take a look at the first rows of the created dataset
df.head(n=5)

In [None]:
# list the columns (fields) that were collected from the events
df.columns

In [None]:
# the data is now much easier to work with than through the XML parser,
# e.g. filter the events with EventID == 20
df.loc[df.EventID == '20']

## Query dataframe for suspicious events (manual heuristics)

In [None]:
# list the distinct event IDs - all come from sysmon
df['EventID'].unique()

In [None]:
# how often each event ID appears in the dataset
df['EventID'].value_counts()

### WMI Event activity (Sysmon EventID: 19-21)

In [None]:
# Show all WMI events (Sysmon EventIDs 19, 20 and 21) in a single table.
# A bare expression inside a `for` loop is evaluated and discarded, so the
# filter has to be the last expression of the cell to be displayed.
wmi_event_ids = [str(i) for i in range(19, 22)]
df[df.EventID.isin(wmi_event_ids)]

In [None]:
# E.g. persistence attempts via WMI Event Subscriptions:
# print the raw XML of the first event with EventID 20
print(df.loc[df.EventID == '20', 'raw'].iloc[0])

### Suspicious Process creation activity

In [None]:
# Sysmon EventID 1 has information about created processes.
# Print every distinct CommandLine used during process creation.
command_lines = df.loc[df.EventID == '1', 'CommandLine'].unique()
for command_line in command_lines:
    print(command_line)

Many suspicious processes are visible.
The detection heuristics are implemented in `detections.py` and are not described here.

In [None]:
# print the raw XML of the first event whose CommandLine contains 'wScript'
uses_wscript = df['CommandLine'].str.contains('wScript', na=False)
print(df[uses_wscript].iloc[0]['raw'])

### Analyse powershell network activity

In [None]:
# keep only the events whose raw XML mentions powershell
mentions_powershell = df['raw'].str.contains('powershell', na=False)
psdf = df[mentions_powershell]

In [None]:
# destination IPs that the powershell-related events connected to
psdf['DestinationIp'].value_counts()

In [None]:
# only one IP address shows up above -
# which destination ports were used?
psdf['DestinationPort'].value_counts()

In [None]:
# all connections go to '192.168.124.135:8080';
# summary statistics of the time gaps between consecutive connections
c2_times = pd.to_datetime(psdf.loc[psdf.DestinationIp == '192.168.124.135', 'UtcTime'])
c2_times.sort_values().diff().iloc[1:].describe()

In [None]:
# time gap between each connection to 192.168.124.135 and the previous one
c2_times = pd.to_datetime(psdf.loc[psdf.DestinationIp == '192.168.124.135', 'UtcTime'])
c2_times.sort_values().diff()

In [None]:
# Visualise the connections to port 8080 to see their distribution across time
# - it's obviously an automated process with almost the same delay between them.
# (seaborn and matplotlib are imported together with the other dependencies
# in the notebook's first cell.)
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x='UtcTime', data=psdf[psdf.DestinationPort == '8080'],
              palette=sns.color_palette("Blues"), ax=ax);

We see that almost all connections have the same time delay of ~5 seconds, which may indicate Command and Control (C&C) communication with a C&C server at 192.168.124.135:8080.  
We may assume that the C&C delay setting is ~5 seconds and the jitter is ~10%.

## Other events

In [None]:
# positions (index labels) of the events with EventID 15
df.index[df.EventID == '15']

In [None]:
# For every event with EventID 15 print its number in the file, the image
# and the target filename. A plain loop is used instead of a list
# comprehension so the cell does not also output a list of None values.
for event_nr, event in df[df.EventID == '15'].iterrows():
    print('Event nr in file:', event_nr, '\n', event.Image, 'created', event.TargetFilename)

In [None]:
# print the raw XML of the third event with EventID 2
print(df.loc[df.EventID == '2', 'raw'].iloc[2])

## Persistence attempts via registry

In [None]:
# print the events with EventID 13 whose raw XML mentions
# powershell (case-insensitive match)
registry_events = df[df.EventID == '13']
for _, event in registry_events.iterrows():
    if 'powershell' in event['raw'].lower():
        print(event['raw'])

##  DNS activity

We see that only legitimate services such as F-Secure's sensor, Cortana or Sysmon are using DNS.  
There are no attempts to establish C&C via DNS.

In [None]:
# For every event with EventID 22 print the image and the queried name.
# A plain loop is used instead of a list comprehension so the cell does not
# also output a list of None values.
for _, event in df[df.EventID == '22'].iterrows():
    print(event.Image, ':', event.QueryName)

In [None]:
# number of events per event ID
df['EventID'].value_counts()

## File creations

In [None]:
# For every event with EventID 11 print the image and the target filename.
# A plain loop is used instead of a list comprehension so the cell does not
# also output a list of None values.
for _, event in df[df.EventID == '11'].iterrows():
    print(event.Image, ':', event.TargetFilename)

In [None]:
# Index labels of the EventID 11 events whose Image contains 'suspicious_binary'.
# The EventID filter is computed once, and na=False (as in the other
# str.contains filters of this notebook) treats a missing Image as "no match"
# instead of producing a NaN mask that cannot be used for indexing.
file_events = df[df.EventID == '11']
file_events[file_events.Image.str.contains('suspicious_binary', na=False)].index

In [None]:
# Print the target filename of every EventID 11 event whose Image contains
# 'suspicious_binary'. The EventID filter is computed once, na=False treats a
# missing Image as "no match", and a plain loop avoids the list of None values
# that a print-only list comprehension would output.
file_events = df[df.EventID == '11']
from_suspicious_binary = file_events.Image.str.contains('suspicious_binary', na=False)
for target_filename in file_events.loc[from_suspicious_binary, 'TargetFilename']:
    print(target_filename)