In [112]:
neo4j_url = 'http://localhost:7474/'
user = 'neo4j'
pwd = 'xmxujie13'
graph = Graph(neo4j_url,  auth=(user, pwd), name = "neo4j")
node_matcher = NodeMatcher(graph)

In [113]:
graph.run("MATCH (n) DETACH DELETE n")

### Read Data

In [114]:
logFile = '../logs/auditbeat-20221131_filtered.json'

In [115]:
# read
fr = open(logFile, "r")
logs = [json.loads(line) for line in fr.readlines()]
fr.close()


In [116]:
# 过滤只有program相关操作
filterLogs = []
filesLogs = []
netLogs = []

for log in logs:
    ruletype = RuleType().parse_jsons(log['rule'])
    
    if ruletype.tag == "sys_access":
        filesLogs.append(json.loads(log['log']))
        
    if ruletype.tag == "socket_as_client" or ruletype.tag == "sys_curl":
        netLogs.append(json.loads(log['log']))


### File Access Provenance graph
* Open File
* Read File
* Write File

In [117]:
for log in filesLogs:
    
    pLocation = log['executable']
    
    if 'program' not in pLocation:
        continue
    
    fLocation = log['accessed_file']
    pName = pLocation.split('/')[-1]
    fName = fLocation.split('/')[-1]
    fType = 'File' if fName != 'secret.txt' else 'Secret'

    op = log['syscall']
    
    # additional information
    pid = log['pid'] 
    pUser = log['user']
    ts = log['timestamp']

    
    node1 = node_matcher.match("Process").where(name=pName).first()
    node2 = node_matcher.match("File").where(name=fName).first()
    
    if node1 is None:
        node1 = Node("Process", name=pName, location = pLocation, pid = pid, user = pUser['name'])
        graph.create(node1)
    
    if node2 is None:
        node2 = Node(fType, name=fName, location = fLocation)
        graph.create(node2)
        
    # build relationship
    relation1 = Relationship(node1, op, node2, timeStamp = ts)
    
    graph.create(relation1)
    

In [118]:
print(len(netLogs))

6285


### NetWork Provenance Graph

1 . build the relationship between end-hosts

In [119]:
for log in netLogs:
    pLocation = log['executable']
    
    if 'socket' not in log:
        continue;
    
    socket_addr = ast.literal_eval(log['socket'])
    
    
    if 'addr' not in socket_addr:
        continue
    
    print(type(socket_addr))
    
    pName = pLocation.split('/')[-1]
    ip = socket_addr['addr']
    port = socket_addr['port']
    protocol = socket_addr['family']
    
    op = log['syscall']
    
     # additional information
    pid = log['pid'] 
    pUser = log['user']
    ts = log['timestamp']

    node1 = node_matcher.match("Process").where(name = pName).first()
    node2 = node_matcher.match("NetWork").where(ip = ip).where(port = port).first()
    
    
    if node1 is None:
        node1 = Node("Process", name=pName, location = pLocation, pid = pid, user = pUser['name'])
        graph.create(node1)
    
    if node2 is None:
        node2 = Node("Network", name = "Host", pid = pid, ip = ip, port = port)
        graph.create(node2)
    
    if node2 is None:
        continue
    
     # build relationship
    relation1 = Relationship(node1, op, node2, timeStamp = ts)
    
    graph.create(relation1)


    

<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>


2. build the realtion between program and download file

In [120]:
for log in filesLogs:
    pLocation = log['executable']
    
    if 'accessed_file' not in log: 
        continue
    
    fLocation = log['accessed_file']
    pName = pLocation.split('/')[-1]
    fName = fLocation.split('/')[-1]
    op = log['syscall']
    
    if pName != 'curl':
        continue
    
    if op != 'openat':
        continue
    
    if 'program' not in fName:
        continue
    
    
     # additional information
    pid = log['pid'] 
    pUser = log['user']
    ts = log['timestamp']

    node1 = node_matcher.match("Process").where(name = pName).first()
    node2 = node_matcher.match("Process").where(name = fName).first()
    
    
    if node1 is None:
        node1 = Node("Process", name=pName, location = pLocation, pid = pid, user = pUser['name'])
        graph.create(node1)
    
     # build relationship
    relation1 = Relationship(node1, 'download', node2, timeStamp = ts)
    
    graph.create(relation1)


    

3. build recevm graph

In [121]:
for log in netLogs:
    pLocation = log['executable']
    
    if 'syscall' not in log:
        continue;
    
    op = log['syscall']
    
    if op != 'recvfrom':
        continue    
    
    pName = pLocation.split('/')[-1]
    
    if pName != 'curl':
        continue
    
    print(pid)
    
    pid = log['pid'] 
    pUser = log['user']
    ts = log['timestamp']
    length = log['length of received message']

    node1 = node_matcher.match("Process").where(name = pName).first()
    node2 = node_matcher.match("Network").where(pid=pid).first()
    
    print(node2)
    
    if node2 is None:
        continue
    
     # build relationship
    relation1 = Relationship(node2, op, node1, timeStamp = ts, reveive_byte = length)
    
    graph.create(relation1)

286590
(_109:Network {ip: '127.0.0.1', name: 'Host', pid: '286560', port: '8080'})
286560
(_110:Network {ip: '127.0.0.1', name: 'Host', pid: '286563', port: '8080'})
286563
(_111:Network {ip: '127.0.0.1', name: 'Host', pid: '286566', port: '8080'})
286566
(_112:Network {ip: '127.0.0.1', name: 'Host', pid: '286569', port: '8080'})
286569
(_113:Network {ip: '127.0.0.1', name: 'Host', pid: '286572', port: '8080'})
286572
(_114:Network {ip: '127.0.0.1', name: 'Host', pid: '286575', port: '8080'})
286575
(_115:Network {ip: '127.0.0.1', name: 'Host', pid: '286578', port: '8080'})
286578
(_116:Network {ip: '127.0.0.1', name: 'Host', pid: '286581', port: '8080'})
286581
(_117:Network {ip: '127.0.0.1', name: 'Host', pid: '286584', port: '8080'})
286584
(_118:Network {ip: '127.0.0.1', name: 'Host', pid: '286587', port: '8080'})
286587
(_119:Network {ip: '127.0.0.1', name: 'Host', pid: '286590', port: '8080'})


4. build sendto graph

In [123]:
for log in netLogs:
    pLocation = log['executable']
    
    if 'syscall' not in log:
        continue;
    
    op = log['syscall']
    
    if op != 'sendto':
        continue    
    
    pName = pLocation.split('/')[-1]
    
    if pName != 'curl':
        continue
    
    print(pid)
    
    pid = log['pid'] 
    pUser = log['user']
    ts = log['timestamp']
    length = log['length of sent message']

    node1 = node_matcher.match("Process").where(name = pName).first()
    node2 = node_matcher.match("Network").where(pid=pid).first()
    
    print(node2)
    
    if node2 is None:
        continue
    
     # build relationship
    relation1 = Relationship(node1, op, node2, timeStamp = ts, reveive_byte = length)
    
    graph.create(relation1)

286560
(_109:Network {ip: '127.0.0.1', name: 'Host', pid: '286560', port: '8080'})
286560
(_109:Network {ip: '127.0.0.1', name: 'Host', pid: '286560', port: '8080'})
286560
(_110:Network {ip: '127.0.0.1', name: 'Host', pid: '286563', port: '8080'})
286563
(_110:Network {ip: '127.0.0.1', name: 'Host', pid: '286563', port: '8080'})
286563
(_111:Network {ip: '127.0.0.1', name: 'Host', pid: '286566', port: '8080'})
286566
(_111:Network {ip: '127.0.0.1', name: 'Host', pid: '286566', port: '8080'})
286566
(_112:Network {ip: '127.0.0.1', name: 'Host', pid: '286569', port: '8080'})
286569
(_112:Network {ip: '127.0.0.1', name: 'Host', pid: '286569', port: '8080'})
286569
(_113:Network {ip: '127.0.0.1', name: 'Host', pid: '286572', port: '8080'})
286572
(_113:Network {ip: '127.0.0.1', name: 'Host', pid: '286572', port: '8080'})
286572
(_114:Network {ip: '127.0.0.1', name: 'Host', pid: '286575', port: '8080'})
286575
(_114:Network {ip: '127.0.0.1', name: 'Host', pid: '286575', port: '8080'})
2865