In [None]:
import itertools
import os
from collections import deque

import gdown
import networkx as nx
import pandas as pd

In [None]:

# Download the three input CSVs from Google Drive.
#
# Each *_FILE constant below holds the LOCAL PATH of a downloaded CSV, not a
# parsed DataFrame. The loading cell validates these constants with
# os.path.exists() and parses them with pd.read_csv(); binding DataFrames here
# makes that cell fail and would parse the multi-gigabyte node file twice.


def _download_csv(url, label):
    """Download `url` with gdown and return the local path of the saved file.

    Raises RuntimeError if gdown reports no output path, so a failed download
    stops the notebook here instead of surfacing later as a confusing error.
    """
    local_path = gdown.download(url, quiet=False)
    if not local_path:
        raise RuntimeError(f"Download of {label} file failed: {url}")
    return local_path


# Malicious seed transaction hashes
mal_file_id = "1oLeb5fhxxpINkFARaqWU3us4dqmA5JR7"
mal_url = f"https://drive.google.com/uc?id={mal_file_id}"
mal_file = _download_csv(mal_url, "malicious hash")
MALICIOUS_HASH_FILE = mal_file

# Full node feature table
dg_file_id = "1XOkeFjqjBxI_1t7uGPh8S_-QZPN5disk"
dg_url = f"https://drive.google.com/uc?id={dg_file_id}"
dg_file = _download_csv(dg_url, "full node")
FULL_NODE_FILE = dg_file

# Full transaction edge list
txn_file_id = "1QLleaYa0Z0wxzKWiUue94Q99ziPNsa9H"
txn_url = f"https://drive.google.com/uc?id={txn_file_id}"
txn_file = _download_csv(txn_url, "full edge")
FULL_EDGE_FILE = txn_file

# Target size for each BFS search
NODES_PER_BFS = 5000

Downloading...
From: https://drive.google.com/uc?id=1oLeb5fhxxpINkFARaqWU3us4dqmA5JR7
To: /content/malicious_hash.csv
100%|██████████| 3.11k/3.11k [00:00<00:00, 8.81MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1XOkeFjqjBxI_1t7uGPh8S_-QZPN5disk
From (redirected): https://drive.google.com/uc?id=1XOkeFjqjBxI_1t7uGPh8S_-QZPN5disk&confirm=t&uuid=3d5c9ea7-b9be-4704-aac2-9fb9f1457ca6
To: /content/DG_out.csv
100%|██████████| 4.16G/4.16G [01:28<00:00, 47.3MB/s]


In [None]:
def _require_file(path, description):
    """Return `path` unchanged, raising FileNotFoundError if it does not exist."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"Could not find {description} file at: {path}")
    return path


# --- Malicious seed hashes -------------------------------------------------
# The 'hash' column supplies the start nodes for the BFS sampling.
malicious_txn = pd.read_csv(_require_file(MALICIOUS_HASH_FILE, "malicious hash"))
malicious_hashes_list = malicious_txn['hash'].tolist()
print(f"Loaded {len(malicious_hashes_list)} malicious seed hashes.")

# --- Node feature table ----------------------------------------------------
print(f"Loading full node file: {FULL_NODE_FILE}...")
nodes_df = pd.read_csv(_require_file(FULL_NODE_FILE, "full node"))

# --- Edge list -------------------------------------------------------------
# Read with header=None, so column names are supplied explicitly here.
print(f"Loading full edge file: {FULL_EDGE_FILE}...")
_require_file(FULL_EDGE_FILE, "full edge")
column_names = ["tx_hash_from", "tx_hash_to", "datetime", "amount_bitcoins"]
txn_df = pd.read_csv(FULL_EDGE_FILE, names=column_names, header=None)

In [None]:
# Assemble the directed transaction graph from the edge table: every row
# contributes an edge pointing from 'tx_hash_from' to 'tx_hash_to'.
G = nx.from_pandas_edgelist(txn_df, 'tx_hash_from', 'tx_hash_to', create_using=nx.DiGraph)

# Report the graph size as a quick sanity check.
print(
    "Graph built successfully.",
    f"- Total Nodes: {G.number_of_nodes():,}",
    f"- Total Edges: {G.number_of_edges():,}",
    sep="\n",
)

In [None]:
# DEFINE BFS SAMPLER
# ===================================================================

def bfs_bidirectional(G, start_tx, target_n):
    """
    Collect up to 'target_n' nodes around 'start_tx' with a breadth-first search.

    Edge direction is ignored while expanding: for every dequeued node both its
    predecessors (nodes pointing TO it) and its successors (nodes it points TO)
    are explored, predecessors first.

    Parameters
    ----------
    G : graph object
        Must provide has_node(), predecessors() and successors()
        (e.g. a networkx DiGraph).
    start_tx : hashable
        Node at which the search starts.
    target_n : int
        Stop as soon as this many unique nodes (including 'start_tx')
        have been collected.

    Returns
    -------
    set
        The visited nodes. Empty if 'start_tx' is not in the graph (a warning
        is printed in that case). Otherwise it always contains 'start_tx', so
        a 'target_n' below 1 still yields {start_tx}. It holds fewer than
        'target_n' nodes when the reachable neighborhood is smaller than that.
    """
    # Guard first: nothing to search (or allocate) if the seed is unknown.
    if not G.has_node(start_tx):
        print(f"  Warning: Start node {start_tx} not found in graph.")
        return set() # Return empty set

    visited = {start_tx}
    q = deque([start_tx])

    while q and len(visited) < target_n:
        node = q.popleft()

        # Lazily walk predecessors, then successors, without building a list.
        for nb in itertools.chain(G.predecessors(node), G.successors(node)):
            if nb not in visited:
                visited.add(nb)
                q.append(nb)
                # Stop mid-expansion so the result never exceeds target_n;
                # the while condition then ends the search.
                if len(visited) >= target_n:
                    break

    return visited

In [None]:
# Union of every node collected by the per-seed BFS runs.
all_subgraph_nodes = set()
seed_count = len(malicious_hashes_list)

for position, start_hash in enumerate(malicious_hashes_list, start=1):
    # Single-line progress readout (carriage return rewrites the same line).
    print(f"Processing {position}/{seed_count}: {start_hash}...", end="\r")

    # A seed already swallowed by an earlier neighborhood is not searched
    # again; only unseen seeds trigger a new bidirectional BFS.
    if start_hash not in all_subgraph_nodes:
        node_set = bfs_bidirectional(G, start_hash, target_n=NODES_PER_BFS)
        all_subgraph_nodes |= node_set

print("\nSampling complete.")
print(f"Total unique nodes selected for final dataset: {len(all_subgraph_nodes):,}")

In [None]:
# Step 1 - node features: keep rows whose tx_hash was selected by the sampling.
print("Filtering node feature table...")
node_is_selected = nodes_df["tx_hash"].isin(all_subgraph_nodes)
nodes_filtered = nodes_df.loc[node_is_selected].copy()
print(f"- Filtered node table size: {len(nodes_filtered):,} rows")

# Step 2 - edges: keep an edge only when BOTH endpoints are in the selected set.
print("Filtering transaction edge table...")
source_is_selected = txn_df["tx_hash_from"].isin(all_subgraph_nodes)
target_is_selected = txn_df["tx_hash_to"].isin(all_subgraph_nodes)
txn_filtered = txn_df.loc[source_is_selected & target_is_selected].copy()
print(f"- Filtered edge table size: {len(txn_filtered):,} rows")

# Step 3 - write both tables to CSV; file names carry the node count in thousands.
size_label = len(all_subgraph_nodes) // 1000
output_node_file = f"nodes_filtered_{size_label}k.csv"
output_edge_file = f"txn_filtered_{size_label}k.csv"

for table, destination in (
    (nodes_filtered, output_node_file),
    (txn_filtered, output_edge_file),
):
    print(f"Saving {destination}...")
    table.to_csv(destination, index=False)

In [None]:
# Summarize where the filtered node and edge tables were written.
print(
    f"1. Node Features: {output_node_file}",
    f"2. Edge List:     {output_edge_file}",
    sep="\n",
)