In [None]:
import networkx as nx
import numpy as np
import pandas as pd
from datetime import timedelta, datetime
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split

In [28]:
data = pd.read_csv("data/test.csv")
data.rename(columns={"Account": "From Account", "Account.1": "To Account"}, inplace=True)

# Extract relevant columns
from_accounts = data['From Account']
to_accounts = data['To Account']
amount_paid = data['Amount Paid']

# Map account IDs to unique indices
account_map = {account: idx for idx, account in enumerate(set(from_accounts) | set(to_accounts))}

In [29]:

# Generate edge index using the mapped account indices
edge_index = torch.tensor([[account_map[from_account], account_map[to_account]] 
                           for from_account, to_account in zip(from_accounts, to_accounts)], dtype=torch.long).t().contiguous()

# Generate edge-level features (amount paid)
edge_amounts = torch.tensor(data['Amount Paid'].values, dtype=torch.float)

In [32]:

# Construct node features based on total transaction amount per node (as before)
node_features = {}
for node in account_map.values():
    node_features[node] = sum([edge_amounts[i] for i, (u, v) in enumerate(zip(from_accounts, to_accounts)) if account_map[u] == node or account_map[v] == node])

In [34]:
# Convert node features to tensor
node_feature_values = list(node_features.values())
node_features_tensor = torch.tensor(node_feature_values, dtype=torch.float).view(-1, 1)

In [35]:
# Add labels (Is Laundering) as target
labels = torch.tensor(data['Is Laundering'].values, dtype=torch.float)

# Construct graph data for PyTorch Geometric
from torch_geometric.data import Data
graph_data = Data(x=node_features_tensor, edge_index=edge_index, edge_attr=edge_amounts, y=labels)

# Check the shapes of the generated data
graph_data.x.shape, graph_data.edge_index.shape, graph_data.y.shape

(torch.Size([17051, 1]), torch.Size([2, 10000]), torch.Size([10000]))

In [46]:
# Train and test mask based on number of nodes (data.y length)
num_nodes = graph_data.x.shape[0]
train_mask, test_mask = train_test_split(range(num_nodes), test_size=0.2, random_state=42)

# Create masks for train and test sets
graph_data.train_mask = torch.tensor(train_mask, dtype=torch.bool)
graph_data.test_mask = torch.tensor(test_mask, dtype=torch.bool)


In [43]:
# Define the GNN model
class SimpleGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleGNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.fc = nn.Linear(output_dim, 1)  # Binary output (money laundering or not)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = self.fc(x)
        return x
    

# Training the model
def train_model(data, model, criterion, optimizer, epochs=100):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output[data.train_mask].squeeze(), data.y[data.train_mask].float())
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Evaluate the model on the test set
def test_model(model, data):
    model.eval()  # Switch to evaluation mode
    with torch.no_grad():  # No gradients are needed during testing
        output = model(data)
        # Apply sigmoid to get probabilities
        pred = torch.sigmoid(output[data.test_mask].squeeze())
        # Calculate accuracy
        correct = (pred.round() == data.y[data.test_mask].float()).sum()
        accuracy = correct / len(data.y[data.test_mask])
        print(f"Test Accuracy: {accuracy.item():.4f}")

In [44]:
# Initialize the model
input_dim = node_features_tensor.shape[1]
hidden_dim = 16
output_dim = 8
model = SimpleGNN(input_dim, hidden_dim, output_dim)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [47]:
# Train the model
train_model(graph_data, model, criterion, optimizer, epochs=100)

IndexError: The shape of the mask [13640] at index 0 does not match the shape of the indexed tensor [17051, 1] at index 0

In [None]:
# Test the model
test_model(model, graph_data)

## Different Approach

In [None]:
# Create a graph
G = nx.DiGraph()

# Add edges based on From Account -> To Account
for i in range(len(from_accounts)):
    G.add_edge(from_accounts[i], to_accounts[i], amount=amount_paid[i])

# Generate node-level features (for simplicity, we'll use the total amount involved in transactions as a feature)
node_features = {}
for node in G.nodes():
    # Sum of transaction amounts related to each node
    node_features[node] = sum([G[u][v]['amount'] for u, v in G.in_edges(node)] + 
                              [G[u][v]['amount'] for u, v in G.out_edges(node)])

# Convert node features to tensor
node_feature_values = list(node_features.values())
node_features_tensor = torch.tensor(node_feature_values, dtype=torch.float).view(-1, 1)

# Generate edge-level features (amount paid)
edges = list(G.edges())
edge_index = torch.tensor([[from_accounts.tolist().index(u), from_accounts.tolist().index(v)] for u, v in edges], dtype=torch.long).t().contiguous()
edge_amounts = [G[u][v]['amount'] for u, v in edges]
edge_features = torch.tensor(edge_amounts, dtype=torch.float)

# Add labels (Is Laundering) as target
labels = torch.tensor(data['Is Laundering'].values, dtype=torch.float)

# Construct graph data for PyTorch Geometric
from torch_geometric.data import Data
graph_data = Data(x=node_features_tensor, edge_index=edge_index, edge_attr=edge_features, y=labels)
