In [5]:
pip install scapy numpy pandas

Collecting scapy
  Downloading scapy-2.5.0.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scapy
  Building wheel for scapy (setup.py) ... [?25ldone
[?25h  Created wheel for scapy: filename=scapy-2.5.0-py2.py3-none-any.whl size=1444347 sha256=ddda180bd44f95105d7853243c247c1b2584b5d32f86719245fb90fc9efd5da6
  Stored in directory: /home/seojin929_gmail_com/.cache/pip/wheels/82/b7/03/8344d8cf6695624746311bc0d389e9d05535ca83c35f90241d
Successfully built scapy
Installing collected packages: scapy
Successfully installed scapy-2.5.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install numpy networkx

Collecting networkx
  Downloading networkx-3.3-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: networkx
Successfully installed networkx-3.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import networkx as nx
import numpy as np
from collections import defaultdict

# --- Parameters Setup ---
# Sizes the random vectors in H (CHUNK_LENGTH + 2 entries each), which bounds
# the shingle length that hash_shingle can index without running off the end.
CHUNK_LENGTH = 50
L = 1000  # Number of hash functions (sketch size).
SEED = 23  # Seed for random number generation to ensure reproducibility.
# NOTE(review): CLUSTER_UPDATE_INTERVAL is not referenced anywhere in the
# visible code; main() uses MAX_EDGES as its clustering interval instead.
CLUSTER_UPDATE_INTERVAL = 10000  # Number of edges after which clusters are updated.
# A graph whose distance (1 - similarity) to its nearest centroid exceeds this
# value is reported as an anomaly by update_clusters.
GLOBAL_THRESHOLD = 0.75
# NOTE(review): CLUSTER_THRESHOLD is not referenced anywhere in the visible code.
CLUSTER_THRESHOLD = 0.5  # Threshold for clustering
# main() runs a clustering/anomaly pass each time the number of processed
# edges is a multiple of this value.
MAX_EDGES = 5000

# --- Initialize Random Vectors for StreamHash ---
# Seeding the global NumPy RNG first makes the contents of H reproducible;
# the draws below must stay in this order to keep producing the same vectors.
np.random.seed(SEED)
MAX_UINT64 = np.iinfo(np.uint64).max  # Largest value representable as an unsigned 64-bit integer.
# H is a list of L arrays, one per hash function. In hash_shingle, entry 0 of a
# vector is the additive offset and entry i+1 multiplies the i-th character.
H = [np.random.randint(0, MAX_UINT64, CHUNK_LENGTH+2, dtype=np.uint64) for _ in range(L)]


# --- Function to Hash Shingles ---
def hash_shingle(shingle, randbits):
    """Map a shingle to a sign (+1 or -1) using one vector of random integers.

    The hash is ``randbits[0] + sum(randbits[i + 1] * ord(shingle[i]))``,
    evaluated with Python's arbitrary-precision integers. The result is +1
    when bit 63 of that total is set and -1 otherwise.

    Args:
        shingle: String whose characters are hashed.
        randbits: Indexable sequence of integers with at least
            ``len(shingle) + 1`` entries.

    Returns:
        int: ``1`` or ``-1``.

    Raises:
        IndexError: If ``randbits`` is too short for ``shingle``.
    """
    # Position 0 is reserved for the offset, so characters start at index 1.
    weighted_terms = (
        int(randbits[position]) * ord(symbol)
        for position, symbol in enumerate(shingle, start=1)
    )
    total = int(randbits[0]) + sum(weighted_terms)
    top_bit = (total >> 63) & 1
    return 1 if top_bit else -1


# --- Function to Create a StreamHash Sketch for a Graph ---
def update_streamhash_sketch(projection, shingle_vector):
    """Fold a shingle-count vector into a projection and derive its bit sketch.

    For every shingle in ``shingle_vector`` and every hash function index in
    ``range(L)``, the shingle's count times its +/-1 hash sign is added to the
    matching entry of ``projection``. ``projection`` is modified in place.

    Args:
        projection: Array with (at least) ``L`` numeric entries; updated in place.
        shingle_vector: Mapping from shingle string to the count to add.

    Returns:
        tuple: ``(sketch, projection)`` where ``sketch`` holds 1 wherever the
        updated projection is >= 0 and 0 elsewhere, and ``projection`` is the
        same object that was passed in.
    """
    hash_rows = range(L)
    for token in shingle_vector:
        weight = shingle_vector[token]
        for row in hash_rows:
            sign = hash_shingle(token, H[row])
            projection[row] = projection[row] + weight * sign

    # Threshold the accumulated projection at zero to obtain the binary sketch.
    bits = np.where(projection >= 0, 1, 0)
    return bits, projection


# --- Function to Process Edges into Graphs and Update Sketches ---
def process_edge(graphs, shingle_vectors, projections, edge_line):
    """Apply one streamed edge to its graph, shingle counts and sketch.

    ``edge_line`` must contain six whitespace-separated fields:
    ``src_id src_type dst_id dst_type e_type gid``.

    Args:
        graphs: Mapping from graph id to a graph object exposing ``add_edge``;
            indexed with the parsed integer graph id.
        shingle_vectors: Mapping from graph id to a shingle -> count mapping;
            the count of this edge's shingle is incremented by one.
        projections: Mapping from graph id to that graph's projection array;
            the array is updated in place.
        edge_line: Raw text line describing the edge.

    Returns:
        The graph's updated binary sketch, as returned by
        ``update_streamhash_sketch``.

    Raises:
        ValueError: If the line does not have exactly six fields or the graph
            id is not an integer.
    """
    fields = edge_line.split()
    if len(fields) != 6:
        raise ValueError(
            f"expected 6 whitespace-separated fields "
            f"(src_id src_type dst_id dst_type e_type gid), "
            f"got {len(fields)}: {edge_line!r}"
        )
    src_id, src_type, dst_id, dst_type, e_type, gid = fields
    gid = int(gid)  # Graph ID

    # Nodes are keyed by (id, type); the edge type is stored as an attribute.
    graphs[gid].add_edge((src_id, src_type), (dst_id, dst_type), e_type=e_type)

    # The shingle for an edge is the concatenation of its three type labels.
    shingle = f"{src_type}{e_type}{dst_type}"
    shingle_vectors[gid][shingle] += 1

    # The projection already contains every earlier edge, so only this edge's
    # contribution (one occurrence of `shingle`) may be added. Passing the
    # whole cumulative shingle vector here would re-add all previous counts on
    # every edge and skew the sketch.
    sketch, _ = update_streamhash_sketch(projections[gid], {shingle: 1})
    return sketch


# --- Function to Compute Similarity Between Two Sketches ---
def streamhash_similarity(sketch1, sketch2):
    """Return the fraction of positions at which two sketches agree.

    The result is normalised by the number of compared positions rather than
    by a module-level constant, so it stays correct for sketches of any
    length.

    Args:
        sketch1: Array-like bit sketch.
        sketch2: Array-like bit sketch, expected to have the same shape as
            ``sketch1``.

    Returns:
        float: Value in ``[0, 1]``; ``0.0`` when there is nothing to compare.
    """
    matches = np.asarray(np.asarray(sketch1) == np.asarray(sketch2))
    if matches.size == 0:
        # Avoid the NaN (and warning) that a mean over no elements would give.
        return np.float64(0.0)
    return np.mean(matches)


# --- Function for Locality-Sensitive Hashing (LSH) ---
def lsh_buckets(sketch, band_size=10):
    """Split a sketch into consecutive bands and hash each band.

    Bands are taken over the actual length of ``sketch``; the final band is
    shorter when the length is not a multiple of ``band_size``.

    Args:
        sketch: Sliceable sequence of hashable values (for example a bit array).
        band_size: Number of sketch positions per band. Defaults to 10.

    Returns:
        list: One ``hash(tuple(band))`` value per band, in band order.

    Raises:
        ValueError: If ``band_size`` is smaller than 1.
    """
    if band_size < 1:
        raise ValueError("band_size must be a positive integer")
    return [
        hash(tuple(sketch[start:start + band_size]))
        for start in range(0, len(sketch), band_size)
    ]


# --- Online Clustering ---
def update_clusters(gid, graph_sketches, centroid_sketches, global_threshold, clusters):
    """Assign a graph to its nearest cluster or report it as an anomaly.

    Distance to a centroid is ``1 - streamhash_similarity``. The graph is an
    anomaly when no centroid is closer than distance 1.0 or when the smallest
    distance exceeds ``global_threshold``.

    Membership in ``clusters`` is kept consistent: the graph is removed from
    every cluster it no longer belongs to, a cluster left without members is
    deleted from ``clusters``, and a graph is never listed twice in the same
    cluster.

    Args:
        gid: Id of the graph to (re)classify.
        graph_sketches: Mapping from graph id to its sketch.
        centroid_sketches: Mapping from cluster id to its centroid sketch.
        global_threshold: Maximum distance for a graph to count as a member.
        clusters: Mapping from cluster id to a list of member graph ids;
            modified in place.

    Returns:
        tuple: ``(nearest_cluster, min_distance)``; ``nearest_cluster`` is
        ``None`` when no centroid was closer than distance 1.0.
    """
    sketch = graph_sketches[gid]  # Get sketch of the current graph
    min_distance = 1.0
    nearest_cluster = None

    # Compare the sketch to each cluster centroid to find the nearest cluster
    for cluster_id, centroid_sketch in centroid_sketches.items():
        distance = 1.0 - streamhash_similarity(sketch, centroid_sketch)
        if distance < min_distance:
            min_distance = distance
            nearest_cluster = cluster_id

    # Without any usable centroid there is nothing to assign to, whatever the
    # threshold is; never create a cluster keyed by None.
    is_anomaly = nearest_cluster is None or min_distance > global_threshold

    # Drop stale memberships so a graph that moved (or turned anomalous) stops
    # contributing to the centroid of a cluster it has left.
    for cluster_id in list(clusters):
        if not is_anomaly and cluster_id == nearest_cluster:
            continue
        members = clusters[cluster_id]
        if gid in members:
            members[:] = [member for member in members if member != gid]
            if not members:
                # An empty cluster has no centroid to average; remove it.
                del clusters[cluster_id]

    if is_anomaly:
        print(f"Graph {gid} is an anomaly with score: {min_distance}")
    else:
        print(f"Graph {gid} assigned to cluster {nearest_cluster} with score: {min_distance}")
        members = clusters.setdefault(nearest_cluster, [])
        # This function runs repeatedly for the same graph; avoid duplicates.
        if gid not in members:
            members.append(gid)
    return nearest_cluster, min_distance


# --- Main Function to Process Streaming Edges and Perform Clustering/Anomaly Detection ---
def main(edge_path='test_edges.txt'):
    """Stream edges from a file, maintain per-graph sketches and cluster them.

    Each non-blank line is handed to ``process_edge``. A clustering/anomaly
    pass runs every ``MAX_EDGES`` edges and once more at the end of the file
    when edges remain that no pass has covered yet.

    NOTE(review): ``clusters`` starts empty and nothing in this function seeds
    it, so until clusters are populated every graph is reported as an anomaly.
    Confirm how clusters are meant to be bootstrapped.

    Args:
        edge_path: Path of the edge file to read. Defaults to
            ``'test_edges.txt'``.
    """
    graphs = defaultdict(nx.DiGraph)  # Store multiple graphs
    shingle_vectors = defaultdict(lambda: defaultdict(int))  # Shingle vectors for each graph
    projections = defaultdict(lambda: np.zeros(L, dtype=int))  # Sketch projection vectors for each graph
    graph_sketches = {}  # Store the sketch for each graph
    centroid_sketches = {}  # Store the centroid sketches for each cluster
    clusters = defaultdict(list)  # Store clusters of graphs
    edge_count = 0  # Count the number of edges processed

    # Open the edge file (streaming edges)
    with open(edge_path, 'r') as f:
        for edge_line in f:
            # Blank lines (e.g. a trailing newline) carry no edge; skip them
            # instead of failing while unpacking the fields.
            if not edge_line.strip():
                continue
            edge_count += 1

            # Process each edge and update the corresponding graph and its sketch
            sketch = process_edge(graphs, shingle_vectors, projections, edge_line)
            gid = int(edge_line.strip().split()[-1])  # Graph ID

            # Store the sketch for the graph
            graph_sketches[gid] = sketch

            # Perform clustering/anomaly detection every MAX_EDGES
            if edge_count % MAX_EDGES == 0:
                _refresh_clusters(edge_count, projections, graph_sketches,
                                  centroid_sketches, clusters)

    # Edges after the last full batch would otherwise never be scored.
    if edge_count % MAX_EDGES != 0:
        _refresh_clusters(edge_count, projections, graph_sketches,
                          centroid_sketches, clusters)


def _refresh_clusters(edge_count, projections, graph_sketches, centroid_sketches, clusters):
    """Recompute cluster centroids, then re-score every graph seen so far."""
    print(f"\nProcessing Clusters after {edge_count} edges...")

    # Update the centroid sketches for each cluster
    for cluster_id, gids in clusters.items():
        if not gids:
            # The mean of no projections is NaN; keep the previous centroid.
            continue
        centroid_projection = np.mean([projections[gid] for gid in gids], axis=0)
        centroid_sketches[cluster_id] = np.where(centroid_projection >= 0, 1, 0)

    # Check and update clusters for all graphs processed so far
    for gid in list(graph_sketches):
        update_clusters(gid, graph_sketches, centroid_sketches, GLOBAL_THRESHOLD, clusters)


# Run the pipeline only when this code executes as the top-level program
# (not when it is imported as a module).
if __name__ == "__main__":
    main()
