# Knowledge Graph & Alternatives for Event Chains
2/22/2025, Dave Sisk, https://github.com/davidcsisk, https://www.linkedin.com/in/davesisk-doctordatabase/

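For this notebook to work, you will need to install Neo4j Desktop locally, then create and start an empty graph database. By default the code cells connect to `bolt://localhost:7687` as user `neo4j` with password `password`, so either configure the database to match or adjust those values. Alternatively, you could use a free cloud-based AuraDB instance (update the connection URI and credentials accordingly).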

In [1]:
# Check that we can get a connection to the Neo4j database
from neo4j import GraphDatabase

# Define the connection details.
# Environment variables take precedence so real credentials never have to be
# saved in the notebook; the fallbacks are the values this cell originally
# hard-coded, so an unconfigured local setup behaves exactly as before.
import os

uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(user, password))

# Verify the connection
try:
    with driver.session() as session:
        # consume() drains the result, so a server-side failure surfaces here
        # instead of being left in an unread result object.
        session.run("RETURN 1").consume()
        print("Connection successful")
except Exception as e:
    # Best-effort probe: report the problem and let the notebook continue.
    print(f"Connection failed: {e}")
finally:
    # This driver exists only for the probe; release its connections.
    driver.close()

Connection successful


In [43]:
import pandas as pd

# Names for the 23 tab-separated fields, listed in file order
dns_column_names = [
    'ts', 'uid', 'src_ip', 'src_port', 'dest_ip', 'dest_port',
    'proto', 'port', 'query', 'qclass', 'qclass_name', 'qtype',
    'qtype_name', 'rcode', 'rcode_name', 'qr', 'aa', 'tc',
    'rd', 'z', 'answers', 'ttls', 'rejected',
]

# Read the gzip-compressed, headerless log; columns arrive labelled 0..22
df = pd.read_csv('dns.log.gz', compression='gzip', sep='\t', header=None)

# Map each positional label to its name (position -> name)
df = df.rename(columns=dict(enumerate(dns_column_names)))

In [44]:
# What size? Displays (row count, column count) of the frame loaded above.
df.shape

(427935, 23)

In [45]:
# Let's first start with a smaller subset of the data
# The commented-out sizes are the other sample sizes that appear in the
# performance notes further down; enable exactly one line per run.
# NOTE: this rebinds `df` to its own first N rows, so moving to a LARGER N
# requires re-running the load cell first (the dropped rows are gone from `df`).
#df = df.head(10000)
#df = df.head(20000)
#df = df.head(40000)
df = df.head(80000)

# Confirm the subset size
df.shape

(80000, 23)

In [46]:
# Peek at 3 random rows. No random_state is passed, so a different sample is
# shown on every run.
df.sample(3)
# Alternative view: sample only rows whose 'rejected' flag is not 'F'
#df[df['rejected'] != 'F'].sample(3)

Unnamed: 0,ts,uid,src_ip,src_port,dest_ip,dest_port,proto,port,query,qclass,qclass_name,qtype,qtype_name,rcode,rcode_name,qr,aa,tc,rd,z,answers,ttls,rejected
70095,1331915000.0,C5t5RIJZuYAa3L6el,192.168.202.121,51727,192.168.207.4,53,udp,45801,teredo.ipv6.microsoft.com,1,C_INTERNET,1,A,-,-,F,F,T,F,0,-,-,F
75357,1331915000.0,CQX0cW23Ci1D08eA78,10.10.117.210,19198,192.168.207.4,53,udp,14848,teredo.ipv6.microsoft.com,1,C_INTERNET,1,A,-,-,F,F,T,F,0,-,-,F
58894,1331914000.0,CZ6P023bXFwrV0Waxj,10.10.117.210,19198,192.168.207.4,53,udp,7325,teredo.ipv6.microsoft.com,1,C_INTERNET,1,A,-,-,F,F,T,F,0,-,-,F


Can access Neo4j in browser here:
http://localhost:7474

In [47]:
# Load the graph database
from neo4j import GraphDatabase

# Define the connection details.
# Environment variables take precedence so real credentials never have to be
# saved in the notebook; the fallbacks are the values this cell originally
# hard-coded, so an unconfigured local setup behaves exactly as before.
import os

uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")

# Create a driver instance.
# The cells below keep using this `driver`; no close() is visible in this part
# of the notebook, so call driver.close() once the analysis is finished.
driver = GraphDatabase.driver(uri, auth=(user, password))

# Clear the existing graph
def clear_graph(driver):
    """Delete every node, together with its relationships, in one write transaction.

    Args:
        driver: An open Neo4j driver; a session is opened and closed internally.
    """
    def _wipe(tx):
        # DETACH DELETE removes each node's relationships along with the node.
        return tx.run("MATCH (n) DETACH DELETE n")

    with driver.session() as session:
        session.execute_write(_wipe)

# Function to list current indexes
def list_indexes(driver):
    """Print one line per index reported by ``SHOW INDEXES``.

    Each index is printed as the dictionary produced by the result's
    ``.data()`` call. Nothing is returned.

    Args:
        driver: An open Neo4j driver; a session is opened and closed internally.
    """
    def _fetch_indexes(tx):
        return tx.run("SHOW INDEXES").data()

    with driver.session() as session:
        for index_info in session.execute_read(_fetch_indexes):
            print(index_info)

# Function to drop existing indexes
def drop_indexes(driver, drop_lookup=False):
    """Drop the user-created indexes reported by ``SHOW INDEXES``.

    Two kinds of index are deliberately left alone:

    * Token lookup indexes (``type == 'LOOKUP'``). ``SHOW INDEXES`` lists them
      next to user-created indexes, and label/relationship-type scans rely on
      them, so removing them slows down every label-based ``MATCH``/``MERGE``.
      Pass ``drop_lookup=True`` to drop them as well.
    * Indexes that back a constraint (non-empty ``owningConstraint``).
      ``DROP INDEX`` is rejected for those; the constraint has to be dropped
      instead, so attempting it would abort the whole clean-up.

    Args:
        driver: An open Neo4j driver; a session is opened and closed internally.
        drop_lookup: When True, token lookup indexes are dropped too.
            Defaults to False.
    """
    def _show_indexes(tx):
        return tx.run("SHOW INDEXES").data()

    def _drop(tx, statement):
        tx.run(statement)

    with driver.session() as session:
        for index in session.execute_read(_show_indexes):
            if index.get('type') == 'LOOKUP' and not drop_lookup:
                continue
            # .get(): the column may be absent depending on server version.
            if index.get('owningConstraint'):
                continue
            # Backtick-quote the name (doubling embedded backticks) so names
            # that are not plain identifiers still produce valid Cypher.
            quoted_name = "`" + index['name'].replace("`", "``") + "`"
            session.execute_write(_drop, f"DROP INDEX {quoted_name} IF EXISTS")

# Function to create indexes
def create_indexes(driver):
    """Create the four single-property indexes on ``:Event`` nodes.

    The indexed properties are ``uid``, ``src_ip``, ``rcode_name`` and
    ``rejected``. Every statement uses ``IF NOT EXISTS`` and runs in its own
    write transaction.

    Args:
        driver: An open Neo4j driver; a session is opened and closed internally.
    """
    statements = (
        "CREATE INDEX IF NOT EXISTS FOR (n:Event) ON (n.uid)",
        "CREATE INDEX IF NOT EXISTS FOR (n:Event) ON (n.src_ip)",
        "CREATE INDEX IF NOT EXISTS FOR (n:Event) ON (n.rcode_name)",
        "CREATE INDEX IF NOT EXISTS FOR (n:Event) ON (n.rejected)",
    )

    def _run(tx, statement):
        return tx.run(statement)

    with driver.session() as session:
        for statement in statements:
            session.execute_write(_run, statement)

# Create nodes and relationships
def create_graph(driver, df):
    """Load ``df`` into Neo4j as ``:Event`` nodes chained by ``SUCCESSIVE_EVENT``.

    Rows are processed in DataFrame order:

    * Each row is MERGEd into an ``:Event`` node keyed on ``uid``; every other
      column is written as a node property. Rows that share a ``uid`` therefore
      collapse into one node holding the values of the last such row.
    * When a row has the same ``src_ip`` as the row immediately before it, a
      ``SUCCESSIVE_EVENT`` relationship is MERGEd from the previous row's node
      to this row's node.

    NOTE(review): if two consecutive rows share both ``src_ip`` and ``uid``,
    both ends of the relationship match the same node, so a self-loop is
    created. Confirm whether that is intended.

    Args:
        driver: An open Neo4j driver. A single session is used for the load.
        df: DataFrame with at least the columns ``uid`` and ``src_ip``.
    """
    # All non-key columns travel as one map parameter, so no Cypher text is
    # assembled from column names. This keeps the statement valid for column
    # names that are not plain identifiers and for a frame with no extra columns.
    create_node_query = """
    MERGE (n:Event {uid: $uid})
    SET n += $props
    """

    create_relationship_query = """
    MATCH (a:Event {uid: $uid1}), (b:Event {uid: $uid2})
    MERGE (a)-[:SUCCESSIVE_EVENT]->(b)
    """

    def _run(tx, query, parameters):
        tx.run(query, parameters)

    property_columns = [col for col in df.columns if col != 'uid']

    previous_row = None
    # One session for the whole load instead of a new one per row and per
    # edge; each statement still runs in its own write transaction.
    with driver.session() as session:
        for _, row in df.iterrows():
            node_parameters = {
                'uid': row['uid'],
                'props': {col: row[col] for col in property_columns},
            }
            session.execute_write(_run, create_node_query, node_parameters)

            # Link to the preceding row only when both came from the same source IP.
            if previous_row is not None and row['src_ip'] == previous_row['src_ip']:
                relationship_parameters = {
                    'uid1': previous_row['uid'],
                    'uid2': row['uid'],
                }
                session.execute_write(_run, create_relationship_query, relationship_parameters)

            previous_row = row

# Clear the existing graph, drop existing indexes, create new indexes, and load the new data
# Order matters: the wipe and index reset happen before the load so each run
# starts from an empty database.
clear_graph(driver)
drop_indexes(driver)
# Presumably toggled by hand to produce the "0 indexes" vs "4 indexes" timings
# in the performance notes below -- as written, the load runs WITHOUT the indexes.
#create_indexes(driver)
create_graph(driver, df)

# List current indexes to verify
list_indexes(driver)

Performance notes:
- 10K rows, 0 indexes...import time: 54s
- 10K rows, 4 indexes...import time: 47s
- 20K rows, 0 indexes...import time: 2m 5s
- 20K rows, 4 indexes...import time: 49s
- 40K rows, 0 indexes...import time: 7m 5s
- 40K rows, 4 indexes...import time: 1m 35s
- 80K rows, 0 indexes...import time: 23m 59s
- 80K rows, 4 indexes...import time: (not yet measured)

In [48]:
# Function to list current indexes
def list_indexes(driver):
    """Print one line per index reported by ``SHOW INDEXES``.

    This cell redefines ``list_indexes`` with the same behaviour as the
    definition in the graph-loading cell, so it can be run on its own.

    Args:
        driver: An open Neo4j driver; a session is opened and closed internally.
    """
    def _fetch_indexes(tx):
        return tx.run("SHOW INDEXES").data()

    with driver.session() as session:
        for index_info in session.execute_read(_fetch_indexes):
            print(index_info)

# List current indexes
list_indexes(driver)

In [29]:
#drop_indexes(driver)

In [49]:
# Helper function to execute a query
def execute_query(driver, query, parameters=None):
    """Run a read query and return the result of ``.single()`` on it.

    Args:
        driver: An open Neo4j driver; a session is opened and closed internally.
        query: Cypher text to run in a read transaction.
        parameters: Optional query parameters passed through to ``tx.run``.

    Returns:
        Whatever ``.single()`` yields for the query, or None when any
        exception is raised (the error is printed rather than propagated).
    """
    def _single_record(tx):
        return tx.run(query, parameters).single()

    try:
        with driver.session() as session:
            return session.execute_read(_single_record)
    except Exception as e:
        print(f"Failed to execute query: {e}")
    return None

# Helper function to execute a query that returns multiple records
def execute_query_multiple(driver, query, parameters=None):
    """Run a read query and return all of its records as a list.

    Args:
        driver: An open Neo4j driver; a session is opened and closed internally.
        query: Cypher text to run in a read transaction.
        parameters: Optional query parameters passed through to ``tx.run``.

    Returns:
        A list of the records produced by the query (materialised inside the
        transaction), or None when any exception is raised (the error is
        printed rather than propagated).
    """
    def _all_records(tx):
        return [record for record in tx.run(query, parameters)]

    try:
        with driver.session() as session:
            return session.execute_read(_all_records)
    except Exception as e:
        print(f"Failed to execute query: {e}")
    return None

In [50]:
# Count every node in the graph, regardless of label.
query_num_nodes = """
MATCH (n)
RETURN count(n) AS num_nodes
"""

num_nodes_result = execute_query(driver, query_num_nodes)
# execute_query returns None when the query fails; nothing is printed in that case.
if num_nodes_result:
    print(f"Number of nodes: {num_nodes_result['num_nodes']}")

Number of nodes: 35208


In [51]:
# Count every directed relationship in the graph, regardless of type.
query_num_relationships = """
MATCH ()-[r]->()
RETURN count(r) AS num_relationships
"""

num_relationships_result = execute_query(driver, query_num_relationships)
# execute_query returns None when the query fails; nothing is printed in that case.
if num_relationships_result:
    print(f"Number of relationships: {num_relationships_result['num_relationships']}")

Number of relationships: 26228


In [52]:
# Count nodes per label set. labels(n) returns a list, so each printed
# "label" is a list of labels rather than a single string.
query_distinct_node_labels = """
MATCH (n)
RETURN DISTINCT labels(n) AS label, count(n) AS count
"""

distinct_node_labels_result = execute_query_multiple(driver, query_distinct_node_labels)
# A failed query (None) and an empty result ([]) both skip the printout.
if distinct_node_labels_result:
    print("Distinct node labels and their counts:")
    for record in distinct_node_labels_result:
        label = record["label"]
        count = record["count"]
        print(f"Label: {label}, Count: {count}")

Distinct node labels and their counts:
Label: ['Event'], Count: 35208


In [53]:
# Count relationships per relationship type.
query_distinct_relationships = """
MATCH ()-[r]->()
RETURN type(r) AS relationship_type, count(r) AS count
"""

distinct_relationships_result = execute_query_multiple(driver, query_distinct_relationships)
# A failed query (None) and an empty result ([]) both skip the printout.
if distinct_relationships_result:
    print("Distinct relationship types and their counts:")
    for record in distinct_relationships_result:
        relationship_type = record["relationship_type"]
        count = record["count"]
        print(f"Relationship Type: {relationship_type}, Count: {count}")

Distinct relationship types and their counts:
Relationship Type: SUCCESSIVE_EVENT, Count: 26228


In [None]:
# Capture exactly 3 related events
# Matches every chain e1 -> e2 -> e3 of SUCCESSIVE_EVENT relationships in which
# all three events share a src_ip and each consecutive pair is less than 1
# apart in ts (ts looks like epoch seconds -- TODO confirm the unit).
# Chains that are part of a longer run are returned once per 3-event stretch.
query_successive_events = """
MATCH (e1:Event)-[:SUCCESSIVE_EVENT]->(e2:Event)-[:SUCCESSIVE_EVENT]->(e3:Event)
WHERE e1.src_ip = e2.src_ip AND e2.src_ip = e3.src_ip AND
      e2.ts - e1.ts < 1 AND e3.ts - e2.ts < 1
RETURN e1, e2, e3
"""

successive_events_result = execute_query_multiple(driver, query_successive_events)
# Both an empty result ([]) and a failed query (None) fall through to the
# "No successive events found." branch.
if successive_events_result:
    print('Result count', len(successive_events_result))
    for record in successive_events_result:
        e1 = record["e1"]
        e2 = record["e2"]
        e3 = record["e3"]
        print(f"Event 1: {e1} \nEvent 2: {e2} \nEvent 3: {e3} \n")
else:
    print("No successive events found.")

Performance notes:
- 10K rows, 0 indexes...query time: 1s
- 10K rows, 4 indexes...query time: 1s
- 20K rows, 0 indexes...query time: 1.4s
- 20K rows, 4 indexes...query time: 1.4s
- 40K rows, 0 indexes...query time: 3.3s
- 40K rows, 4 indexes...query time: 3.2s
- 80K rows, 0 indexes...query time: 3.8s

The indexes aren't being used by this query.

In [None]:
# Capture exactly 10 related events
# Same pattern as the 3-event query, extended to a 10-node chain: all ten
# events share a src_ip and each consecutive pair is less than 1 apart in ts
# (ts looks like epoch seconds -- TODO confirm the unit).
# NOTE: this rebinds query_successive_events / successive_events_result from
# the 3-event cell.
query_successive_events = """
MATCH (e1:Event)-[:SUCCESSIVE_EVENT]->(e2:Event)-[:SUCCESSIVE_EVENT]->(e3:Event)-[:SUCCESSIVE_EVENT]->(e4:Event)-[:SUCCESSIVE_EVENT]->(e5:Event)-[:SUCCESSIVE_EVENT]->(e6:Event)-[:SUCCESSIVE_EVENT]->(e7:Event)-[:SUCCESSIVE_EVENT]->(e8:Event)-[:SUCCESSIVE_EVENT]->(e9:Event)-[:SUCCESSIVE_EVENT]->(e10:Event)
WHERE e1.src_ip = e2.src_ip AND e2.src_ip = e3.src_ip AND e3.src_ip = e4.src_ip AND e4.src_ip = e5.src_ip AND e5.src_ip = e6.src_ip AND e6.src_ip = e7.src_ip AND e7.src_ip = e8.src_ip AND e8.src_ip = e9.src_ip AND e9.src_ip = e10.src_ip AND
      e2.ts - e1.ts < 1 AND e3.ts - e2.ts < 1 AND e4.ts - e3.ts < 1 AND e5.ts - e4.ts < 1 AND e6.ts - e5.ts < 1 AND e7.ts - e6.ts < 1 AND e8.ts - e7.ts < 1 AND e9.ts - e8.ts < 1 AND e10.ts - e9.ts < 1
RETURN e1, e2, e3, e4, e5, e6, e7, e8, e9, e10
"""

successive_events_result = execute_query_multiple(driver, query_successive_events)
# Both an empty result ([]) and a failed query (None) fall through to the
# "No successive events found." branch.
if successive_events_result:
    print('Result count', len(successive_events_result))
    for record in successive_events_result:
        e1 = record["e1"]
        e2 = record["e2"]
        e3 = record["e3"]
        e4 = record["e4"]
        e5 = record["e5"]
        e6 = record["e6"]
        e7 = record["e7"]
        e8 = record["e8"]
        e9 = record["e9"]
        e10 = record["e10"]
        print(f" Event 1: {e1}\n Event 2: {e2}\n Event 3: {e3}\n Event 4: {e4}\n Event 5: {e5}\n Event 6: {e6}\n Event 7: {e7}\n Event 8: {e8}\n Event 9: {e9}\n Event 10: {e10}\n\n")
else:
    print("No successive events found.")

Performance notes: 
- 10K rows, 0 indexes...query time: 4.3s
- 10K rows, 4 indexes...query time: 3.9s
- 20K rows, 0 indexes...query time: 5.7s
- 20K rows, 4 indexes...query time: 5.7s
- 40K rows, 0 indexes...query time:  11.4s
- 40K rows, 4 indexes...query time: 12.3s
- 80K rows, 0 indexes...query time: 20.1s

Indexes don't appear to be used by this query.

In [57]:
# What combinations and counts of rcode_name and rejected do we have?
df.groupby(['rcode_name', 'rejected']).size().reset_index(name='count').sort_values(by='count', ascending=False)

# Success/failure definition used by the queries and functions that follow:
# rcode_name = '-' and rejected = 'F' => success
# rcode_name = 'NOERROR' and rejected = 'F' => success
# All other combinations => failure

Unnamed: 0,rcode_name,rejected,count
0,-,F,62040
7,NXDOMAIN,F,16099
2,NOERROR,F,1456
10,REFUSED,T,217
8,NXRRSet,F,98
11,SERVFAIL,T,31
6,NOTIMP,T,19
1,-,T,18
4,NOTAUTH,T,7
5,NOTIMP,F,7


In [58]:
# Identify 9 failed attempts followed by 1 successful attempt.
# The pattern is a 10-node SUCCESSIVE_EVENT chain where:
#   * all ten events share the same src_ip,
#   * e1..e9 are failures and e10 is a success, with "success" meaning
#     rejected = 'F' together with rcode_name '-' or 'NOERROR',
#   * each consecutive pair is less than 1 apart in ts
#     (ts looks like epoch seconds -- TODO confirm the unit).
query_attempts = """
MATCH (e1:Event)-[:SUCCESSIVE_EVENT]->(e2:Event)-[:SUCCESSIVE_EVENT]->(e3:Event)-[:SUCCESSIVE_EVENT]->(e4:Event)-[:SUCCESSIVE_EVENT]->(e5:Event)-[:SUCCESSIVE_EVENT]->(e6:Event)-[:SUCCESSIVE_EVENT]->(e7:Event)-[:SUCCESSIVE_EVENT]->(e8:Event)-[:SUCCESSIVE_EVENT]->(e9:Event)-[:SUCCESSIVE_EVENT]->(e10:Event)
WHERE e1.src_ip = e2.src_ip AND e2.src_ip = e3.src_ip AND e3.src_ip = e4.src_ip AND 
      e4.src_ip = e5.src_ip AND e5.src_ip = e6.src_ip AND e6.src_ip = e7.src_ip AND 
      e7.src_ip = e8.src_ip AND e8.src_ip = e9.src_ip AND e9.src_ip = e10.src_ip AND
      NOT ((e1.rcode_name = '-' AND e1.rejected = 'F') OR (e1.rcode_name = 'NOERROR' AND e1.rejected = 'F')) AND
      NOT ((e2.rcode_name = '-' AND e2.rejected = 'F') OR (e2.rcode_name = 'NOERROR' AND e2.rejected = 'F')) AND
      NOT ((e3.rcode_name = '-' AND e3.rejected = 'F') OR (e3.rcode_name = 'NOERROR' AND e3.rejected = 'F')) AND
      NOT ((e4.rcode_name = '-' AND e4.rejected = 'F') OR (e4.rcode_name = 'NOERROR' AND e4.rejected = 'F')) AND
      NOT ((e5.rcode_name = '-' AND e5.rejected = 'F') OR (e5.rcode_name = 'NOERROR' AND e5.rejected = 'F')) AND
      NOT ((e6.rcode_name = '-' AND e6.rejected = 'F') OR (e6.rcode_name = 'NOERROR' AND e6.rejected = 'F')) AND
      NOT ((e7.rcode_name = '-' AND e7.rejected = 'F') OR (e7.rcode_name = 'NOERROR' AND e7.rejected = 'F')) AND
      NOT ((e8.rcode_name = '-' AND e8.rejected = 'F') OR (e8.rcode_name = 'NOERROR' AND e8.rejected = 'F')) AND
      NOT ((e9.rcode_name = '-' AND e9.rejected = 'F') OR (e9.rcode_name = 'NOERROR' AND e9.rejected = 'F')) AND
      ((e10.rcode_name = '-' AND e10.rejected = 'F') OR (e10.rcode_name = 'NOERROR' AND e10.rejected = 'F')) AND
      e2.ts - e1.ts < 1 AND e3.ts - e2.ts < 1 AND e4.ts - e3.ts < 1 AND 
      e5.ts - e4.ts < 1 AND e6.ts - e5.ts < 1 AND e7.ts - e6.ts < 1 AND 
      e8.ts - e7.ts < 1 AND e9.ts - e8.ts < 1 AND e10.ts - e9.ts < 1
RETURN e1, e2, e3, e4, e5, e6, e7, e8, e9, e10
"""

attempts_result = execute_query_multiple(driver, query_attempts)
# Both an empty result ([]) and a failed query (None) fall through to the
# "No matching attempts found." branch.
if attempts_result:
    print('Result count:', len(attempts_result))
    for record in attempts_result:
        e1 = record["e1"]
        e2 = record["e2"]
        e3 = record["e3"]
        e4 = record["e4"]
        e5 = record["e5"]
        e6 = record["e6"]
        e7 = record["e7"]
        e8 = record["e8"]
        e9 = record["e9"]
        e10 = record["e10"]
        # The separator after Event 1 was "/n" (a literal slash-n), which glued
        # Event 1 and Event 2 onto one output line; it is now a real newline.
        print(f" Event 1: {e1} \n Event 2: {e2} \n Event 3: {e3} \n Event 4: {e4} \n Event 5: {e5} \n Event 6: {e6} \n Event 7: {e7} \n Event 8: {e8} \n Event 9: {e9} \n Event 10: {e10} \n")
else:
    print("No matching attempts found.")

Result count: 8
 Event 1: <Node element_id='4:03a97bf0-2a61-4e40-81ae-f6bb2b4ee47f:512' labels=frozenset({'Event'}) properties={'aa': 'F', 'qclass_name': 'C_INTERNET', 'qr': 'F', 'qtype_name': 'AAAA', 'rejected': 'F', 'qtype': '28', 'query': 'commondatastorage.googleapis.com', 'answers': '-', 'rcode': '3', 'rcode_name': 'NXDOMAIN', 'tc': 'T', 'src_ip': '192.168.202.103', 'src_port': 57525, 'uid': 'CBCUgY198eLJJ84Aya', 'ttls': '-', 'rd': 'F', 'port': 58785, 'dest_ip': '192.168.207.4', 'proto': 'udp', 'z': 0, 'dest_port': 53, 'qclass': '1', 'ts': 1331901776.43}> /n Event 2: <Node element_id='4:03a97bf0-2a61-4e40-81ae-f6bb2b4ee47f:513' labels=frozenset({'Event'}) properties={'aa': 'F', 'qclass_name': 'C_INTERNET', 'qr': 'F', 'qtype_name': 'A', 'rejected': 'F', 'qtype': '1', 'query': 'www.tomclancy.com', 'answers': '-', 'rcode': '3', 'rcode_name': 'NXDOMAIN', 'tc': 'T', 'src_ip': '192.168.202.103', 'src_port': 48680, 'uid': 'Cm3wDfuuYEJ8ldXr2', 'ttls': '-', 'rd': 'F', 'port': 19846, 'dest_

Performance notes:  
- 10K rows, 0 indexes...query time: 1.2s, 4 results
- 10K rows, 4 indexes...query time: 0.9s, 4 results
- 20K rows, 0 indexes...query time: 0.7s, 6 results
- 20K rows, 4 indexes...query time: 0.8s, 6 results
- 40K rows, 0 indexes...query time: 0.1s, 8 results
- 40K rows, 4 indexes...query time: 0.7s, 8 results
- 80K rows, 0 indexes...query time: 0.7s, 8 results
- 80K rows, 4 indexes...query time:

Indexes are probably being used by the above query. TODO: confirm this by inspecting the execution plan (prefix the query with `EXPLAIN` or `PROFILE`).

In [59]:
# Double-check the current dataframe size, so the pandas timings below are
# compared against the same number of rows that was loaded into the graph
df.shape

(80000, 23)

In [60]:
# Try the same analysis directly on the dataframe

# Define success and failure conditions
def is_success(row):
    """Return True when ``row`` counts as a successful lookup.

    A row is a success when its ``rcode_name`` is ``'-'`` or ``'NOERROR'``
    and its ``rejected`` flag is ``'F'``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``rcode_name`` and ``rejected``.
    """
    return row['rcode_name'] in ('-', 'NOERROR') and row['rejected'] == 'F'

def is_failure(row):
    """Return True when ``row`` is not a success as defined by ``is_success``."""
    succeeded = is_success(row)
    return not succeeded

# Add success and failure columns
# These are temporary helper columns written onto `df` itself; they are
# removed again by the drop() at the end of this cell. If the cell is
# interrupted before that point, the two columns stay on `df`.
df['is_success'] = df.apply(is_success, axis=1)
df['is_failure'] = df.apply(is_failure, axis=1)

# Identify unique, non-overlapping sequences of 9 failures followed by 1 success from the same source IP
def find_unique_sequences(df, n_failures=9):
    """Find non-overlapping runs of failures followed by one success.

    The frame is scanned top to bottom. A window of ``n_failures + 1``
    consecutive rows matches when its first ``n_failures`` rows are all
    failures, its last row is a success, and all rows share one ``src_ip``.
    After a match the scan resumes right after the window, so matches never
    overlap; otherwise it advances by one row.

    Args:
        df: DataFrame with boolean columns ``is_failure`` and ``is_success``
            and a ``src_ip`` column.
        n_failures: Number of leading failures required. Defaults to 9,
            i.e. a 10-row window.

    Returns:
        A list of DataFrame slices, one per matching window, in scan order.

    Raises:
        ValueError: If ``n_failures`` is negative.
    """
    if n_failures < 0:
        raise ValueError("n_failures must be non-negative")

    window_len = n_failures + 1
    # Pull the flags out once as plain boolean arrays so that the common case
    # (no match at this position) never has to build a DataFrame slice.
    failed = df['is_failure'].to_numpy(dtype=bool)
    succeeded = df['is_success'].to_numpy(dtype=bool)

    sequences = []
    last_start = len(df) - window_len
    i = 0
    while i <= last_start:
        success_pos = i + n_failures  # position of the candidate success row
        if failed[i:success_pos].all() and succeeded[success_pos]:
            window = df.iloc[i:success_pos + 1]
            if window['src_ip'].nunique() == 1:
                sequences.append(window)
                i += window_len  # Skip the rows consumed by this sequence
                continue
        i += 1
    return sequences

# Find sequences
# Relies on the temporary is_success / is_failure columns added above.
sequences = find_unique_sequences(df)

# Display the output in a readable manner
# NOTE: set_option changes pandas' global display settings, so it also affects
# every cell executed after this one.
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)        # Set display width

# Print each matching window as plain text, numbered from 1
for idx, seq in enumerate(sequences):
    print(f"\nSequence {idx + 1}:\n")
    print(seq.to_string(index=False))

# Drop the temporary columns
# In-place so `df` returns to the column layout it had before this cell ran.
df.drop(columns=['is_success', 'is_failure'], inplace=True)



Sequence 1:

          ts                uid          src_ip  src_port       dest_ip  dest_port proto  port                            query qclass qclass_name qtype qtype_name rcode rcode_name qr aa tc rd  z answers ttls rejected  is_success  is_failure
1.331902e+09 CBCUgY198eLJJ84Aya 192.168.202.103     57525 192.168.207.4         53   udp 58785 commondatastorage.googleapis.com      1  C_INTERNET    28       AAAA     3   NXDOMAIN  F  F  T  F  0       -    -        F       False        True
1.331902e+09  Cm3wDfuuYEJ8ldXr2 192.168.202.103     48680 192.168.207.4         53   udp 19846                www.tomclancy.com      1  C_INTERNET     1          A     3   NXDOMAIN  F  F  T  F  0       -    -        F       False        True
1.331902e+09 Ca06Vl3um7Ovtc1phe 192.168.202.103     40570 192.168.207.4         53   udp 64853                  mail.google.com      1  C_INTERNET    28       AAAA     3   NXDOMAIN  F  F  T  F  0       -    -        F       False        True
1.331902e+09 CuDQA

Performance notes: 
- 10K rows...exec time: 0.5s, 4 results
- 20K rows...exec time: 1.0s, 6 results
- 40K rows...exec time: 2.1s, 8 results 
- 80K rows...exec time: 4.1s, 8 results