In [15]:
# Ethereum Transaction Graph Analysis
# Load required libraries
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# Load the Ethereum transaction data
df = pd.read_csv('ethereum_medium2.csv')

print("Data loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

print(f" num nodes: {len(df['from_address'].unique())} and num edges: {len(df)}")



Data loaded successfully!
Dataset shape: (36182, 8)
Columns: ['hash', 'block_number', 'from_address', 'to_address', 'value', 'transaction_day', 'root_node', 'logic_path_taken']
 num nodes: 4480 and num edges: 36182


In [22]:
# Step 1: Preprocess data - sort chronologically and handle unseen nodes
import torch
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx
from torch_geometric.nn import TransformerConv
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import tqdm

# Sort by transaction_day and block_number for chronological ordering
df['transaction_day'] = pd.to_datetime(df['transaction_day'])
df = df.sort_values(['transaction_day', 'block_number']).reset_index(drop=True)

print(f"Sorted data: {len(df)} transactions")
print(f"Date range: {df['transaction_day'].min()} to {df['transaction_day'].max()}")

# Track seen nodes and replace unseen nodes with 'unknown'
seen_nodes = set()
UNKNOWN_NODE = 'unknown'

# Create new columns for processed addresses
df['from_processed'] = df['from_address'].copy()
df['to_processed'] = df['to_address'].copy()

# Process each transaction chronologically
for idx, row in df.iterrows():
    from_addr = row['from_address']
    to_addr = row['to_address']
    
    # Replace unseen nodes with 'unknown'
    if from_addr not in seen_nodes:
        df.at[idx, 'from_processed'] = UNKNOWN_NODE
        seen_nodes.add(from_addr)  # Mark as seen after processing
    
    if to_addr not in seen_nodes:
        df.at[idx, 'to_processed'] = UNKNOWN_NODE
        seen_nodes.add(to_addr)  # Mark as seen after processing
    
    # Mark both as seen for future transactions
    seen_nodes.add(from_addr)
    seen_nodes.add(to_addr)

print(f"\nProcessed addresses:")
print(f"  Total unique original addresses: {len(set(df['from_address'].unique()) | set(df['to_address'].unique()))}")
print(f"  Unique processed addresses (including 'unknown'): {len(set(df['from_processed'].unique()) | set(df['to_processed'].unique()))}")
print(f"  Number of 'unknown' replacements: {sum((df['from_processed'] == UNKNOWN_NODE) | (df['to_processed'] == UNKNOWN_NODE))}")


Sorted data: 36182 transactions
Date range: 2025-11-06 00:00:00 to 2025-12-04 00:00:00

Processed addresses:
  Total unique original addresses: 20574
  Unique processed addresses (including 'unknown'): 4959
  Number of 'unknown' replacements: 20560


In [23]:
# Step 2: Build temporal graph snapshots incrementally
# Normalize edge attributes (transaction values)
value_mean = df['value'].astype(float).mean()
value_std = df['value'].astype(float).std()
print(f"Edge attribute normalization: mean={value_mean:.2e}, std={value_std:.2e}")

def build_graph_snapshot(transactions_up_to_idx, node_to_idx):
    """
    Build a graph snapshot from transactions up to a given index.
    Returns edge_index and edge_attr tensors.
    """
    edges = []
    edge_attrs = []
    
    for idx in range(transactions_up_to_idx):
        row = df.iloc[idx]
        from_node = row['from_processed']
        to_node = row['to_processed']
        
        # Get node indices
        if from_node not in node_to_idx:
            node_to_idx[from_node] = len(node_to_idx)
        if to_node not in node_to_idx:
            node_to_idx[to_node] = len(node_to_idx)
        
        from_idx = node_to_idx[from_node]
        to_idx = node_to_idx[to_node]
        
        edges.append([from_idx, to_idx])
        
        # Use transaction value as edge attribute (normalized)
        value = float(row['value'])
        normalized_value = (value - value_mean) / (value_std + 1e-8)
        edge_attrs.append([normalized_value])
    
    if len(edges) == 0:
        return None, None, node_to_idx
    
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attrs, dtype=torch.float)
    
    return edge_index, edge_attr, node_to_idx

# Test the function
print("\nTesting graph snapshot building...")
test_node_to_idx = {}
test_edge_index, test_edge_attr, test_node_to_idx = build_graph_snapshot(100, test_node_to_idx)
if test_edge_index is not None:
    print(f"  Snapshot with 100 transactions: {test_edge_index.shape[1]} edges, {len(test_node_to_idx)} nodes")


Edge attribute normalization: mean=9.72e+18, std=2.78e+20

Testing graph snapshot building...
  Snapshot with 100 transactions: 100 edges, 19 nodes


In [24]:
# Step 4: Define Graph Transformer Model
class GraphTransformerLinkPredictor(nn.Module):
    def __init__(self, num_nodes, hidden_dim=128, num_heads=4, num_layers=2, edge_dim=1):
        super(GraphTransformerLinkPredictor, self).__init__()
        self.num_nodes = num_nodes
        self.hidden_dim = hidden_dim
        
        # Node embedding layer
        self.node_embedding = nn.Embedding(num_nodes, hidden_dim)
        
        # Graph Transformer layers
        self.convs = nn.ModuleList()
        self.convs.append(TransformerConv(
            hidden_dim, hidden_dim, heads=num_heads, 
            edge_dim=edge_dim, dropout=0.1, concat=True
        ))
        
        # If multiple layers, need to adjust dimensions
        for _ in range(num_layers - 1):
            self.convs.append(TransformerConv(
                hidden_dim * num_heads, hidden_dim, heads=num_heads,
                edge_dim=edge_dim, dropout=0.1, concat=True
            ))
        
        # Final layer to get node embeddings
        self.final_conv = TransformerConv(
            hidden_dim * num_heads, hidden_dim, heads=1,
            edge_dim=edge_dim, dropout=0.1, concat=False
        )
        
        # Link prediction head
        self.link_predictor = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, node_indices, edge_index, edge_attr):
        """
        Forward pass for link prediction.
        node_indices: tensor of node indices to get embeddings for
        edge_index: graph edge indices
        edge_attr: edge attributes
        """
        # Get node embeddings
        x = self.node_embedding(node_indices)
        
        # Apply graph transformer layers
        for conv in self.convs:
            x = conv(x, edge_index, edge_attr)
            x = F.relu(x)
        
        # Final layer
        x = self.final_conv(x, edge_index, edge_attr)
        
        return x
    
    def predict_link(self, source_emb, target_emb):
        """Predict probability of link between source and target embeddings."""
        # Concatenate source and target embeddings
        pair_emb = torch.cat([source_emb, target_emb], dim=-1)
        # Get logit
        logit = self.link_predictor(pair_emb)
        # Return probability
        return torch.sigmoid(logit)

# Initialize model (will set num_nodes after processing data)
print("Graph Transformer model defined")


Graph Transformer model defined


In [25]:
# Step 5: Create temporal train/validation split
# Use first 80% of transactions for training, last 20% for validation
split_idx = int(len(df) * 0.8)
train_df = df.iloc[:split_idx].copy()
val_df = df.iloc[split_idx:].copy()

print(f"Temporal split:")
print(f"  Training: {len(train_df)} transactions ({train_df['transaction_day'].min()} to {train_df['transaction_day'].max()})")
print(f"  Validation: {len(val_df)} transactions ({val_df['transaction_day'].min()} to {val_df['transaction_day'].max()})")

# Build node mapping from training data only
train_node_to_idx = {}
train_edge_index, train_edge_attr, train_node_to_idx = build_graph_snapshot(len(train_df), train_node_to_idx)

print(f"\nNode mapping from training data:")
print(f"  Total nodes: {len(train_node_to_idx)}")
print(f"  Includes 'unknown': {UNKNOWN_NODE in train_node_to_idx}")

# Initialize model with correct number of nodes
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

# We'll need to handle dynamic node addition, so we'll use a larger embedding table
# Estimate max nodes (all unique nodes in dataset)
max_nodes = len(set(df['from_processed'].unique()) | set(df['to_processed'].unique()))
print(f"  Maximum nodes (for embedding table): {max_nodes}")

model = GraphTransformerLinkPredictor(
    num_nodes=max_nodes,
    hidden_dim=128,
    num_heads=4,
    num_layers=2,
    edge_dim=1
).to(device)

print(f"\nModel initialized:")
print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")


Temporal split:
  Training: 28945 transactions (2025-11-06 00:00:00 to 2025-12-04 00:00:00)
  Validation: 7237 transactions (2025-12-04 00:00:00 to 2025-12-04 00:00:00)

Node mapping from training data:
  Total nodes: 4015
  Includes 'unknown': True

Using device: cpu
  Maximum nodes (for embedding table): 4959

Model initialized:
  Parameters: 2,246,401


In [26]:
# Step 3: Create training pairs with positive and negative samples
def create_training_pairs(transaction_idx, node_to_idx, snapshot_node_to_idx=None, negative_ratio=1):
    """
    Create positive and negative training pairs for a transaction.
    Returns list of (source_node, target_node, label) tuples (using node names, not indices).
    If snapshot_node_to_idx is provided, only uses nodes in the snapshot for negatives.
    """
    row = df.iloc[transaction_idx]
    source = row['from_processed']
    actual_target = row['to_processed']
    
    # Ensure nodes are in node_to_idx
    if source not in node_to_idx:
        node_to_idx[source] = len(node_to_idx)
    if actual_target not in node_to_idx:
        node_to_idx[actual_target] = len(node_to_idx)
    
    # Positive sample (using node names)
    pairs = [(source, actual_target, 1)]
    
    # Negative samples: sample random nodes that are NOT connected to source
    # Use snapshot nodes if provided, otherwise all nodes
    if snapshot_node_to_idx is not None:
        candidate_nodes = set(snapshot_node_to_idx.keys())
    else:
        candidate_nodes = set(node_to_idx.keys())
    
    # Get nodes already connected to source (in current snapshot)
    connected_nodes = set()
    for idx in range(transaction_idx):
        prev_row = df.iloc[idx]
        if prev_row['from_processed'] == source:
            target = prev_row['to_processed']
            connected_nodes.add(target)
    
    # Sample negative targets (nodes not connected to source)
    available_negatives = list(candidate_nodes - connected_nodes - {source})
    
    # If we don't have enough negatives, we can use any node except source
    if len(available_negatives) == 0:
        available_negatives = list(candidate_nodes - {source})
    
    # Sample negative examples
    num_negatives = negative_ratio
    if len(available_negatives) > 0:
        negative_targets = np.random.choice(
            available_negatives, 
            size=min(num_negatives, len(available_negatives)), 
            replace=False
        )
        for neg_target in negative_targets:
            pairs.append((source, neg_target, 0))
    
    return pairs, node_to_idx

# Test the function
print("Testing training pair creation...")
test_node_to_idx = {}
# Build snapshot up to index 100
test_edge_index, _, test_node_to_idx = build_graph_snapshot(100, test_node_to_idx)
# Create pairs for transaction at index 100
test_pairs, test_node_to_idx = create_training_pairs(100, test_node_to_idx, snapshot_node_to_idx=test_node_to_idx, negative_ratio=1)
print(f"  Created {len(test_pairs)} pairs for transaction 100: {sum(1 for _, _, label in test_pairs if label == 1)} positive, {sum(1 for _, _, label in test_pairs if label == 0)} negative")


Testing training pair creation...
  Created 2 pairs for transaction 100: 1 positive, 1 negative


In [None]:
# Step 6: Training loop with temporal constraints
def train_epoch(model, optimizer, train_df, node_to_idx, device, batch_size=32, negative_ratio=1):
    """Train for one epoch, processing transactions chronologically."""
    model.train()
    criterion = nn.BCEWithLogitsLoss()
    
    all_losses = []
    current_node_to_idx = node_to_idx.copy()
    
    # Process transactions in batches
    num_batches = (len(train_df) + batch_size - 1) // batch_size
    
    for batch_idx in tqdm(range(num_batches), desc="Training batches"):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, len(train_df))
        
        # Build graph snapshot up to current batch start
        snapshot_end = start_idx
        edge_index, edge_attr, snapshot_node_to_idx = build_graph_snapshot(
            snapshot_end, current_node_to_idx.copy()
        )
        
        if edge_index is None or edge_index.shape[1] == 0:
            # Still need to update node_to_idx for pairs creation
            for trans_idx in range(start_idx, end_idx):
                _, current_node_to_idx = create_training_pairs(
                    trans_idx, current_node_to_idx, negative_ratio=negative_ratio
                )
            continue
        
        # Move to device
        edge_index = edge_index.to(device)
        edge_attr = edge_attr.to(device)
        
        # Get all node indices in current snapshot
        num_nodes_in_snapshot = len(snapshot_node_to_idx)
        all_node_indices = torch.tensor(list(range(num_nodes_in_snapshot)), dtype=torch.long).to(device)
        
        # Get node embeddings for all nodes in snapshot
        node_embeddings = model(all_node_indices, edge_index, edge_attr)
        
        # Create training pairs for this batch
        batch_pairs = []
        batch_labels = []
        
        for trans_idx in range(start_idx, end_idx):
            pairs, current_node_to_idx = create_training_pairs(
                trans_idx, current_node_to_idx, snapshot_node_to_idx=snapshot_node_to_idx, negative_ratio=negative_ratio
            )
            
            for source_node, target_node, label in pairs:
                # Map node names to snapshot indices
                if source_node in snapshot_node_to_idx and target_node in snapshot_node_to_idx:
                    snapshot_source_idx = snapshot_node_to_idx[source_node]
                    snapshot_target_idx = snapshot_node_to_idx[target_node]
                    
                    if snapshot_source_idx < num_nodes_in_snapshot and snapshot_target_idx < num_nodes_in_snapshot:
                        batch_pairs.append((snapshot_source_idx, snapshot_target_idx))
                        batch_labels.append(label)
        
        if len(batch_pairs) == 0:
            continue
        
        # Convert to tensors
        source_indices = torch.tensor([p[0] for p in batch_pairs], dtype=torch.long).to(device)
        target_indices = torch.tensor([p[1] for p in batch_pairs], dtype=torch.long).to(device)
        labels = torch.tensor(batch_labels, dtype=torch.float).to(device)
        
        # Get embeddings
        source_embs = node_embeddings[source_indices]
        target_embs = node_embeddings[target_indices]
        
        # Predict links
        pair_embs = torch.cat([source_embs, target_embs], dim=-1)
        logits = model.link_predictor(pair_embs).squeeze()
        
        # Handle single sample case
        if logits.dim() == 0:
            logits = logits.unsqueeze(0)
        
        # Compute loss
        loss = criterion(logits, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        all_losses.append(loss.item())
        
    return np.mean(all_losses), current_node_to_idx

# Initialize optimizer once
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train for a few epochs to test
print("Starting training...")
train_losses = []
for epoch in range(3):
    print(f"\nEpoch {epoch + 1}/3")
    loss, updated_node_to_idx = train_epoch(
        model, optimizer, train_df, train_node_to_idx, device, batch_size=64, negative_ratio=1
    )
    train_losses.append(loss)
    train_node_to_idx = updated_node_to_idx
    print(f"  Average loss: {loss:.4f}")


Starting training...

Epoch 1/3


Training batches:  23%|██▎       | 105/453 [06:10<39:48,  6.86s/it]

In [None]:
# Step 7: Evaluation on validation set
def evaluate(model, val_df, train_node_to_idx, device, batch_size=32, negative_ratio=1):
    """Evaluate model on validation set."""
    model.eval()
    
    all_predictions = []
    all_labels = []
    
    # Start with node mapping from training
    current_node_to_idx = train_node_to_idx.copy()
    
    # Build initial graph snapshot from all training data
    train_end = len(train_df)
    edge_index, edge_attr, snapshot_node_to_idx = build_graph_snapshot(
        train_end, current_node_to_idx.copy()
    )
    
    if edge_index is None:
        return 0.0, 0.0
    
    edge_index = edge_index.to(device)
    edge_attr = edge_attr.to(device)
    
    # Process validation transactions
    num_batches = (len(val_df) + batch_size - 1) // batch_size
    
    with torch.no_grad():
        for batch_idx in tqdm(range(num_batches), desc="Validation batches"):
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, len(val_df))
            
            # Rebuild snapshot up to this validation batch (including all training + previous validation)
            snapshot_end = train_end + start_idx
            edge_index, edge_attr, snapshot_node_to_idx = build_graph_snapshot(
                snapshot_end, current_node_to_idx.copy()
            )
            
            if edge_index is None or edge_index.shape[1] == 0:
                # Still update node_to_idx
                for trans_idx in range(start_idx, end_idx):
                    orig_idx = train_end + trans_idx
                    _, current_node_to_idx = create_training_pairs(
                        orig_idx, current_node_to_idx, negative_ratio=negative_ratio
                    )
                continue
            
            edge_index = edge_index.to(device)
            edge_attr = edge_attr.to(device)
            
            # Get all node indices in current snapshot
            num_nodes_in_snapshot = len(snapshot_node_to_idx)
            all_node_indices = torch.tensor(
                list(range(num_nodes_in_snapshot)), dtype=torch.long
            ).to(device)
            
            # Get node embeddings
            node_embeddings = model(all_node_indices, edge_index, edge_attr)
            
            # Create evaluation pairs
            batch_pairs = []
            batch_labels = []
            
            for trans_idx in range(start_idx, end_idx):
                # Map to original dataframe index
                orig_idx = train_end + trans_idx
                pairs, current_node_to_idx = create_training_pairs(
                    orig_idx, current_node_to_idx, snapshot_node_to_idx=snapshot_node_to_idx, negative_ratio=negative_ratio
                )
                
                for source_node, target_node, label in pairs:
                    # Map node names to snapshot indices
                    if source_node in snapshot_node_to_idx and target_node in snapshot_node_to_idx:
                        snapshot_source_idx = snapshot_node_to_idx[source_node]
                        snapshot_target_idx = snapshot_node_to_idx[target_node]
                        
                        if snapshot_source_idx < num_nodes_in_snapshot and snapshot_target_idx < num_nodes_in_snapshot:
                            batch_pairs.append((snapshot_source_idx, snapshot_target_idx))
                            batch_labels.append(label)
            
            if len(batch_pairs) == 0:
                continue
            
            # Convert to tensors
            source_indices = torch.tensor([p[0] for p in batch_pairs], dtype=torch.long).to(device)
            target_indices = torch.tensor([p[1] for p in batch_pairs], dtype=torch.long).to(device)
            labels = torch.tensor(batch_labels, dtype=torch.float).to(device)
            
            # Get embeddings and predict
            source_embs = node_embeddings[source_indices]
            target_embs = node_embeddings[target_indices]
            
            pair_embs = torch.cat([source_embs, target_embs], dim=-1)
            logits = model.link_predictor(pair_embs).squeeze()
            
            # Handle single sample case
            if logits.dim() == 0:
                logits = logits.unsqueeze(0)
            
            probs = torch.sigmoid(logits)
            
            all_predictions.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Compute metrics
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)
    
    if len(all_predictions) == 0:
        return 0.0, 0.0
    
    # Binary predictions (threshold at 0.5)
    binary_preds = (all_predictions >= 0.5).astype(int)
    accuracy = accuracy_score(all_labels, binary_preds)
    
    # AUC-ROC
    if len(np.unique(all_labels)) > 1:
        auc = roc_auc_score(all_labels, all_predictions)
    else:
        auc = 0.0
    
    return accuracy, auc

# Evaluate
print("\nEvaluating on validation set...")
val_accuracy, val_auc = evaluate(model, val_df, train_node_to_idx, device, batch_size=64, negative_ratio=1)
print(f"\nValidation Results:")
print(f"  Accuracy: {val_accuracy:.4f}")
print(f"  AUC-ROC: {val_auc:.4f}")


In [None]:
# Visualization of training progress
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Training Loss', marker='o')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"\nFinal Training Statistics:")
print(f"  Initial loss: {train_losses[0]:.4f}")
print(f"  Final loss: {train_losses[-1]:.4f}")
print(f"  Improvement: {((train_losses[0] - train_losses[-1]) / train_losses[0] * 100):.2f}%")
