In [1]:
import pandas as pd 
from bs4 import BeautifulSoup

import re

def camel_to_snake(name):
    """Convert a CamelCase identifier to lower-case snake_case.

    Two substitution passes insert underscores, then the result is
    lower-cased:

    1. before any capitalised word (an upper-case letter followed by
       lower-case letters) that is preceded by some character;
    2. between a lower-case letter or digit and a following upper-case letter.

    Example: ``"ParentProcessID"`` -> ``"parent_process_id"``.
    """
    snake = name
    # Order matters: the word-boundary pass runs before the lower/upper pass.
    for pattern in ('(.)([A-Z][a-z]+)', '([a-z0-9])([A-Z])'):
        snake = re.sub(pattern, r'\1_\2', snake)
    return snake.lower()



# Read the provider manifest and parse it as XML.
with open("Microsoft-Windows-Kernel-Process.xml") as infile:
    data = infile.read()

soup = BeautifulSoup(data, "xml")

# The first <provider> element plus every <task>, <event> and <template> element.
provider_data = soup.find("provider")
tasks = soup.find_all("task")
events = soup.find_all("event")
templates = soup.find_all("template")

# One record per <task>: its value and name attributes.
taskdata = [
    {"event_id": task['value'], "event_description": task['name']}
    for task in tasks
]

# One record per <event> that carries value, template and symbol attributes.
eventdata = []
for event in events:
    try:
        eventdata.append({"event_id": event["value"], "template": event["template"], "symbol": event["symbol"]})
    except KeyError:
        # Indexing a tag with a missing attribute raises KeyError. Such events
        # are skipped on purpose; any other error is allowed to surface
        # instead of being silently swallowed.
        continue

# One record per <data> field of every <template>, with snake_case variants of
# the template id and the field name.
templatedata = []
for template in templates:
    for datafield in template.find_all("data"):
        templatedata.append({
            "template_name": template["tid"],
            "template_name_sc": camel_to_snake(template["tid"]),
            "datafield_name": datafield["name"],
            "datafield_name_sc": camel_to_snake(datafield["name"]),
            "datafield_type": datafield["inType"],
        })

tdf = pd.DataFrame(templatedata)
edf = pd.DataFrame(eventdata)

In [2]:
# Build the text of a Rust `match` on record.event_id() with one arm per
# parsed event (event id => symbol), then write it next to the notebook.
match_arms = [
    "\t" + entry["event_id"] + ' => "' + entry["symbol"] + '",\n'
    for entry in eventdata
]

eid_match_str = (
    "\nlet event_desc = match record.event_id() {\n\n"
    + "".join(match_arms)
    + "}"
)

# The output file name is derived from the provider's name attribute.
eid_match_path = f"{provider_data['name']}_eid_match.txt"
with open(eid_match_path, "w") as outfile:
    outfile.write(eid_match_str)


In [7]:
# Manifest inType -> Rust field type written into the generated structs.
# Pointer, Binary, GUID and FILETIME are all carried as raw bytes.
rdtype_by_intype = {
    'win:UInt8': 'Option<u8>',
    'win:UInt16': 'Option<u16>',
    'win:UInt32': 'Option<u32>',
    'win:UInt64': 'Option<u64>',
    'win:Int32': 'Option<i32>',
    'win:Int64': 'Option<i64>',
    'win:AnsiString': 'Option<String>',
    'win:UnicodeString': 'Option<String>',
    'win:Pointer': 'Option<Vec<u8>>',
    'win:Binary': 'Option<Vec<u8>>',
    'win:GUID': 'Option<Vec<u8>>',
    'win:FILETIME': 'Option<Vec<u8>>',
}

# Types missing from the table get an empty string, so they show up as their
# own row in the summary displayed at the end of this cell.
tdf['rdtype'] = tdf['datafield_type'].map(lambda intype: rdtype_by_intype.get(intype, ''))

# Distinct template names, one row each.
template_index = tdf[['template_name']].copy().drop_duplicates()

# Field count per (inType, Rust type) pair. Kept as the cell's last expression
# so the notebook actually renders it.
tdf.groupby(['datafield_type','rdtype']).agg({'template_name':['count']})

In [39]:
# Generate one Rust struct definition per template and write them to a file.
#
# Templates whose name contains "_" are skipped.
# NOTE(review): presumably these are versioned variants (e.g. *_V1) - confirm.
template_index = template_index[~template_index['template_name'].str.contains("_")].copy()

struct_defs = []
for template_name in template_index['template_name']:
    datafields = tdf[tdf['template_name'] == template_name]
    # The derive attribute is emitted exactly once per struct; a second copy
    # on the first struct would be a duplicate derive in the generated Rust.
    struct_text = "#[derive(Serialize, Deserialize, Debug)]"
    struct_text += "\npub struct " + template_name + '{\n'
    # Fixed header fields; the trailing newline keeps the first data field on
    # its own line.
    struct_text += '\tpub timestamp: String,\n\tpub event_id: u16,\n\tpub event_desc: String,\n'
    for field_name, rust_type in zip(datafields['datafield_name_sc'], datafields['rdtype']):
        struct_text += "\tpub " + field_name + ": " + rust_type + ",\n"
    struct_text += "} "
    struct_defs.append(struct_text)

str1 = "\n".join(struct_defs)

with open(f"{provider_data['name']}_structs.txt","w") as outfile:
    outfile.write(str1)

In [37]:
# Generate a Rust `if / else if` chain with one branch per event id. Each
# branch fills the matching template struct from the record, and the chain is
# written to a file.
eid_datafield_lookup = pd.merge(left=edf, right=tdf, left_on='template', right_on='template_name').copy()

# Sorting by (length, text) gives numeric order for plain decimal id strings
# and makes the output independent of set iteration order.
eids = sorted(set(edf['event_id'].to_list()), key=lambda eid: (len(eid), eid))

# Loop-invariant filter: only templates without "_" in their name are used.
candidate_rows = eid_datafield_lookup[~eid_datafield_lookup['template'].str.contains("_")]

branches = []
for eid in eids:
    data_fields = candidate_rows[candidate_rows['event_id'] == eid]
    if data_fields.empty:
        # No usable template rows for this id. Skip it rather than fail on
        # iloc[0] below.
        continue
    template_name = data_fields.iloc[0]['template']
    # The struct literal targets a single template, so keep only that
    # template's fields and emit each field once, even if the merge produced
    # repeated rows.
    data_fields = data_fields[data_fields['template'] == template_name].drop_duplicates(subset='datafield_name')

    branch = "if record.event_id() == " + eid + " { \n"
    branch += "\t" + "let mut eventdata = templates::" + template_name + "{"
    branch += "\n\t\tevent_id: record.event_id(),\n\t\tevent_desc: event_desc.to_string(),\n\t\ttimestamp: record.timestamp().to_string()"
    for field_name_sc, field_name in zip(data_fields['datafield_name_sc'], data_fields['datafield_name']):
        branch += ",\n\t\t" + field_name_sc + ": parser.try_parse(" + '"' + field_name + '").ok()'
    branch += "\t\n\t} \n}"
    branches.append(branch)

# Joining with " else " chains the branches without leaving a dangling
# `else if record.event_id() == ` at the end of the output.
str2 = " else ".join(branches)

with open(f"{provider_data['name']}_struct_assignment.txt","w") as outfile:
    outfile.write(str2)
    

In [22]:
# Inspect the merged event/template rows for event id '1', leaving out
# templates whose name contains "_V".
is_event_one = eid_datafield_lookup['event_id'] == '1'
has_v_suffix = eid_datafield_lookup['template'].str.contains("_V")
eid_datafield_lookup[is_event_one & ~has_v_suffix]

Unnamed: 0,event_id,template,symbol,template_name,template_name_sc,datafield_name,datafield_name_sc,datafield_type,rdtype
0,1,ProcessStartArgs,ProcessStart,ProcessStartArgs,process_start_args,ProcessID,process_id,win:UInt32,Option<u32>
1,1,ProcessStartArgs,ProcessStart,ProcessStartArgs,process_start_args,CreateTime,create_time,win:FILETIME,Option<Vec<u8>>
2,1,ProcessStartArgs,ProcessStart,ProcessStartArgs,process_start_args,ParentProcessID,parent_process_id,win:UInt32,Option<u32>
3,1,ProcessStartArgs,ProcessStart,ProcessStartArgs,process_start_args,SessionID,session_id,win:UInt32,Option<u32>
4,1,ProcessStartArgs,ProcessStart,ProcessStartArgs,process_start_args,ImageName,image_name,win:UnicodeString,Option<String>
