## Imports
---

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import time
import numpy as np
np.random.seed(42)

import torch
torch.manual_seed(42)

from torch_geometric.data import Data
from torch_geometric.nn import GATConv, BatchNorm, Linear
import torch_geometric.transforms as T
from torch_geometric.loader import NeighborLoader
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.model_selection import ParameterGrid

from src.utils.dataset import get_full_transactions_dataset 

In [None]:
# Report the PyTorch build and the CUDA runtime it can see.
print(f"Using Torch version {torch.__version__}")
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
# torch.cuda.current_device() raises on machines without a usable GPU, so the
# device is only queried when CUDA is available (the notebook selects a CPU
# fallback device later on).
if torch.cuda.is_available():
    cuda_id = torch.cuda.current_device()
    print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
else:
    cuda_id = None
    print("No CUDA device available; running on CPU.")

## Dataset loading
---

In [None]:
# Load the transactions table through the project helper.
# NOTE(review): presumably a pandas DataFrame with one row per transaction
# (later cells index it by column name) - confirm against src.utils.dataset.
df = get_full_transactions_dataset()

## Data preparation
---

### Preprocessing

In [None]:
label_encoder_columns = ["receiving_currency", "payment_currency", "payment_format"]

# The two currency columns are later unioned into a single `currencies` set and
# filtered by the same code, so they must share one fitted encoder. Fitting
# each column independently can assign the same currency a different integer
# in each column whenever their sets of distinct values differ.
currency_columns = ["receiving_currency", "payment_currency"]
currency_encoder = LabelEncoder()
currency_encoder.fit(pd.concat([df[column] for column in currency_columns], ignore_index=True))
for column in currency_columns:
    df[column] = currency_encoder.transform(df[column])

# payment_format has its own vocabulary and keeps an independent encoder.
label_encoder = LabelEncoder()
df["payment_format"] = label_encoder.fit_transform(df["payment_format"])

In [None]:
# Parse the timestamp column and replace every entry by its integer
# `Timestamp.value` so it can be scaled like any other numeric feature.
parsed_timestamps = pd.to_datetime(df["timestamp"])
df["timestamp"] = parsed_timestamps
df["timestamp"] = df["timestamp"].apply(lambda stamp: stamp.value)

In [None]:
# Order the transactions by sender account (row labels are kept, not reset).
df = df.sort_values(by="sender")

In [None]:
# Standardise the continuous columns in place with a single fitted scaler.
scaler = StandardScaler()
scaled_columns = ['amount_received', 'amount_paid', 'timestamp']
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

In [None]:
# Preview the preprocessed transactions.
df.head()

### Node features

In [None]:
# Every account that appears as either endpoint of a transaction is a node.
all_account_ids = set(df['sender']) | set(df['receiver'])

# One row per account, in ascending account order with a fresh 0..n-1 index.
nodes_df = (
    pd.DataFrame({'account': list(all_account_ids)})
    .sort_values(by="account")
    .reset_index(drop=True)
)

# An account is labelled 1 when it takes part in at least one laundering
# transaction, as sender or as receiver.
laundering_df = df[df["is_laundering"] == 1]
laundering_accounts = set(laundering_df['sender']) | set(laundering_df['receiver'])
nodes_df["is_laundering"] = nodes_df["account"].isin(laundering_accounts).astype("int64")
# (The frame is already sorted by account, so no second sort is needed.)

In [None]:
# Per-account activity counts; accounts missing from a group get 0.
sent_groups = df.groupby('sender')
received_groups = df.groupby('receiver')

nodes_df["transactions_sent"] = nodes_df['account'].map(sent_groups.size()).fillna(0)
nodes_df['transactions_received'] = nodes_df['account'].map(received_groups.size()).fillna(0)
nodes_df['unique_currencies_sent'] = nodes_df['account'].map(sent_groups['payment_currency'].nunique()).fillna(0)
# The receiver side counts the currencies the account received in
# (`receiving_currency`), matching the receiver-side aggregation used for the
# per-currency features; the original counted `payment_currency` here.
nodes_df['unique_currencies_received'] = nodes_df['account'].map(received_groups['receiving_currency'].nunique()).fillna(0)

In [None]:
# All currency codes seen on either side of a transaction.
currencies = set(df['payment_currency']).union(set(df['receiving_currency']))

# Iterate in sorted order so the feature columns (and therefore the columns of
# the node feature matrix) come out in a deterministic order; plain set
# iteration order is not guaranteed.
for currency in sorted(currencies):
    # Mean amount paid by each sender in this currency.
    paid_mean = df[df['payment_currency'] == currency].groupby('sender')['amount_paid'].mean()
    nodes_df[f'average_paid_{currency}'] = nodes_df['account'].map(paid_mean).fillna(0)

    # NOTE(review): despite the "total_" column name this is a mean, not a sum.
    # Kept as-is to avoid changing the features; rename or switch to .sum()
    # once the intent is confirmed.
    received_mean = df[df['receiving_currency'] == currency].groupby('receiver')['amount_received'].mean()
    nodes_df[f'total_received_{currency}'] = nodes_df['account'].map(received_mean).fillna(0)

In [None]:
# Labels come from the is_laundering column; every remaining column except the
# account identifier becomes a node feature. Both are stored as float32.
label_array = nodes_df["is_laundering"].to_numpy()
node_labels = torch.from_numpy(label_array).to(torch.float)

nodes_df = nodes_df.drop(columns=["account", "is_laundering"])
feature_array = nodes_df.to_numpy()
node_features = torch.from_numpy(feature_array).to(torch.float)

### Edge features

In [None]:
# Row i of node_features / node_labels is the i-th account in ascending order
# (nodes_df is sorted by account before the tensors are built). The index map
# must use that same ordering: enumerating the raw set would follow the set's
# arbitrary iteration order and attach edges to the wrong nodes.
account_to_index = {acc: idx for idx, acc in enumerate(sorted(all_account_ids))}

In [None]:
# Work on a copy so the account columns of `df` keep their original identifiers.
edges_df = df.copy()

In [None]:
# Replace the account identifiers at both ends of every edge by node indices.
for endpoint in ('sender', 'receiver'):
    edges_df[endpoint] = edges_df[endpoint].map(account_to_index)

In [None]:
# Preview the edge table after mapping accounts to node indices.
edges_df.head()

In [None]:
# Connectivity: row 0 holds sender indices, row 1 receiver indices.
sender_idx = torch.from_numpy(edges_df['sender'].values)
receiver_idx = torch.from_numpy(edges_df['receiver'].values)
edge_index = torch.stack([sender_idx, receiver_idx], dim=0)

# Edge features: every column except the endpoints and the label, as float32.
edge_feature_frame = edges_df.drop(columns=["sender", "receiver", "is_laundering"])
edge_attr = torch.from_numpy(edge_feature_frame.values).to(torch.float)

### Graph data

In [None]:
# Bundle node features, connectivity, edge features and per-node labels into
# a single PyG Data object.
graph_data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=node_labels)

In [None]:
# Display the summary of the assembled graph.
graph_data

### Undersampling
---

In [None]:
# Node count per label before undersampling (position 0 = label 0, position 1 = label 1).
torch.bincount(graph_data.y.to(torch.long))

In [None]:
def remove_non_fraudulent_nodes(data, fraud_label=1, removal_percentage=0.5):
    """Randomly drop a fraction of the non-fraud nodes and re-index the graph.

    Nodes whose label differs from ``fraud_label`` are candidates for removal;
    ``int(len(candidates) * removal_percentage)`` of them are drawn without
    replacement using NumPy's global random state. Edges touching a removed
    node are dropped and the remaining edges are renumbered so node indices
    stay contiguous.

    Args:
        data: Graph object exposing ``x``, ``y``, ``edge_index``, ``edge_attr``
            and ``num_nodes``; its class must accept ``x``, ``edge_index``,
            ``edge_attr`` and ``y`` as keyword arguments.
        fraud_label: Label value of the nodes that must always be kept.
        removal_percentage: Fraction in ``[0, 1]`` of non-fraud nodes to drop.

    Returns:
        A new object of the same class as ``data`` holding only ``x``,
        ``edge_index``, ``edge_attr`` and ``y`` of the reduced graph.

    Raises:
        ValueError: If ``removal_percentage`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= removal_percentage <= 1.0:
        raise ValueError(
            f"removal_percentage must be within [0, 1], got {removal_percentage}"
        )

    device = data.edge_index.device
    num_nodes = data.num_nodes

    non_fraud_nodes = torch.where(data.y != fraud_label)[0]
    num_to_remove = int(len(non_fraud_nodes) * removal_percentage)

    # NumPy only works on host memory, so move the candidates to the CPU first.
    nodes_to_remove = np.random.choice(
        non_fraud_nodes.cpu().numpy(), num_to_remove, replace=False
    )
    nodes_to_remove = torch.as_tensor(nodes_to_remove, dtype=torch.long, device=device)

    keep_mask = torch.ones(num_nodes, dtype=torch.bool, device=device)
    keep_mask[nodes_to_remove] = False
    remaining_nodes = torch.nonzero(keep_mask, as_tuple=True)[0]

    # Lookup table old index -> new index (-1 marks removed nodes). Indexing it
    # with the surviving edges renumbers them in one vectorised step instead of
    # a Python-level loop over every edge endpoint.
    new_index = torch.full((num_nodes,), -1, dtype=torch.long, device=device)
    new_index[remaining_nodes] = torch.arange(remaining_nodes.numel(), device=device)

    # Keep an edge only when both of its endpoints survive.
    mask_edges = keep_mask[data.edge_index[0]] & keep_mask[data.edge_index[1]]
    new_edge_index = new_index[data.edge_index[:, mask_edges]]

    new_edge_attr = data.edge_attr[mask_edges] if data.edge_attr is not None else None

    return data.__class__(
        x=data.x[remaining_nodes],
        edge_index=new_edge_index,
        edge_attr=new_edge_attr,
        y=data.y[remaining_nodes],
    )

In [None]:
# Drop 25% of the nodes whose label is not 1.
# NOTE: graph_data is overwritten with the reduced graph, so re-running this
# cell removes a further 25% of the remaining non-fraud nodes each time.
removal_percentage = 0.25
graph_data = remove_non_fraudulent_nodes(graph_data, fraud_label=1, removal_percentage=removal_percentage)

In [None]:
# Node count per label after undersampling.
torch.bincount(graph_data.y.to(torch.long))

## Training
---

### Model definition

In [None]:
class GATModel(torch.nn.Module):
    """Three stacked GATConv layers followed by a linear output head.

    ``forward`` returns raw logits. The training code pairs this model with
    ``torch.nn.BCEWithLogitsLoss``, which applies the sigmoid itself, so the
    model must not squash its output a second time. Apply ``torch.sigmoid``
    to the output when probabilities are needed; ranking metrics such as
    ROC AUC are unaffected because the sigmoid is monotonic.

    Args:
        in_feats: Number of input features per node.
        hidden_dim: Per-head width of the first layer and width of the second
            layer; the third layer uses ``hidden_dim // 2``.
        out_feats: Number of output logits per node.
        heads: Number of attention heads of the first layer (their outputs are
            concatenated).
        edge_dim: Optional edge feature dimensionality passed to every
            ``GATConv``. Per the PyG ``GATConv`` documentation, edge features
            only take part in the attention computation when ``edge_dim`` is
            set; with the default ``None`` the ``edge_attr`` given to
            ``forward`` has no effect on the result.
    """

    def __init__(self, in_feats, hidden_dim, out_feats, heads, edge_dim=None):
        super().__init__()
        half_dim = hidden_dim // 2

        self.conv1 = GATConv(in_feats, hidden_dim, heads, dropout=0.2, edge_dim=edge_dim)
        self.conv2 = GATConv(hidden_dim * heads, hidden_dim, heads=1, concat=False, dropout=0.2, edge_dim=edge_dim)
        self.conv3 = GATConv(hidden_dim, half_dim, heads=1, concat=False, dropout=0.2, edge_dim=edge_dim)

        self.bn1 = BatchNorm(hidden_dim * heads)
        self.bn2 = BatchNorm(hidden_dim)
        self.bn3 = BatchNorm(half_dim)

        self.lin = Linear(half_dim, out_feats)

    def forward(self, x, edge_index, edge_attr):
        """Return one row of ``out_feats`` logits per node in ``x``."""
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.bn1(F.leaky_relu(self.conv1(x, edge_index, edge_attr)))
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.bn2(F.leaky_relu(self.conv2(x, edge_index, edge_attr)))
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.bn3(F.leaky_relu(self.conv3(x, edge_index, edge_attr)))

        # No sigmoid here: the loss used for training expects logits.
        return self.lin(x)

### Grid search

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Hyper-parameter grid explored by the search loop below; every combination of
# the listed values is trained once.
param_grid = {
    "lr": [1e-4],  # learning rate passed to the Adam optimizer
    "batch_size": [1024],  # seed nodes per NeighborLoader mini-batch
    "num_neighbors": [[64,64], [256,256]],  # neighbours sampled per hop (two hops)
    "hidden_dim": [16, 32, 64],  # GATModel hidden width
    "heads": [16, 32]  # attention heads of the first GAT layer
}

In [None]:
# ParameterGrid knows its own size, so there is no need to materialise it.
num_combinations = len(ParameterGrid(param_grid))

In [None]:
# Number of hyper-parameter combinations that will be trained.
num_combinations

In [None]:
# Random node-level split: num_val=0.1 and num_test=0 with split='train_rest'.
# NOTE(review): per the PyG RandomNodeSplit docs this should put 10% of the
# nodes in val_mask and all remaining nodes in train_mask - confirm.
# The transform result is moved to `device` and replaces graph_data.
train_test_split = T.RandomNodeSplit(split='train_rest', num_val=0.1, num_test=0)
graph_data = train_test_split(graph_data).to(device)

In [None]:
# 'balanced' weights are n_samples / (n_classes * count(class)) for each class.
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=graph_data.y.cpu().numpy())
# BCEWithLogitsLoss keeps the negative term at weight 1 and multiplies only the
# positive term by pos_weight, so the balanced setting is the ratio of the two
# class weights (equal to n_negative / n_positive), not the weight of class 1
# on its own, which would under-weight positives by the factor class_weights[0].
pos_weight = torch.tensor([class_weights[1] / class_weights[0]], dtype=torch.float).to(device)

In [None]:
def create_loader(graph_data, mask, params):
    """Build a NeighborLoader over ``graph_data`` seeded from ``mask``.

    Args:
        graph_data: Graph to sample mini-batches from.
        mask: Passed as ``input_nodes``; selects the seed nodes.
        params: Mapping providing the ``"num_neighbors"`` and ``"batch_size"``
            entries forwarded to the loader.

    Returns:
        The configured ``NeighborLoader``.
    """
    loader_options = {
        "num_neighbors": params["num_neighbors"],
        "batch_size": params["batch_size"],
        "input_nodes": mask,
    }
    return NeighborLoader(graph_data, **loader_options)

In [None]:
# Grid-search bookkeeping: best configuration seen so far, its validation AUC,
# and the 1-based counter of the combination currently being tested.
best_params, best_auc, param_combination = None, 0, 1

In [None]:
# Grid search: train one model per hyper-parameter combination and remember the
# combination that reaches the highest validation ROC AUC.
#
# NeighborLoader mini-batches contain the seed nodes *and* their sampled
# neighbours; only the first `batch.batch_size` nodes are the seed nodes drawn
# from the requested mask. Loss and metrics are therefore restricted to that
# prefix - otherwise validation nodes pulled in as neighbours would contribute
# to the training loss and training nodes would be counted in the validation AUC.
for params in ParameterGrid(param_grid):
    print(f"\nTesting parameters ({param_combination}/{num_combinations}): {params}")
    param_combination += 1
    print(f"Best AUC until now: {best_auc:.2f}")

    train_loader = create_loader(graph_data, graph_data.train_mask, params)
    test_loader = create_loader(graph_data, graph_data.val_mask, params)

    model = GATModel(
        in_feats=graph_data.num_features,
        hidden_dim=params["hidden_dim"],
        out_feats=1,
        heads=params["heads"]
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    num_epochs = 100
    for epoch in range(1, num_epochs + 1):
        start_time = time.time()
        model.train()
        running_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()
            out = model(batch.x, batch.edge_index, batch.edge_attr)
            # Supervise the seed (training) nodes only.
            seed_out = out[:batch.batch_size]
            seed_targets = batch.y[:batch.batch_size].unsqueeze(1)
            loss = criterion(seed_out, seed_targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        epoch_duration = time.time() - start_time

        # Validate every 10th epoch and on the final epoch.
        if epoch % 10 == 0 or epoch == num_epochs:
            model.eval()
            y_true, y_pred_probs = [], []

            with torch.no_grad():
                for batch in test_loader:
                    out = model(batch.x, batch.edge_index, batch.edge_attr)
                    # Score the seed (validation) nodes only.
                    y_true.extend(batch.y[:batch.batch_size].cpu().numpy())
                    y_pred_probs.extend(out[:batch.batch_size].cpu().numpy())

            y_pred_probs = np.array(y_pred_probs).flatten()
            auc_score = roc_auc_score(y_true, y_pred_probs)

            if auc_score > best_auc:
                best_params = params
                best_auc = auc_score
            print(f"Epoch {epoch}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | AUC: {auc_score:.4f} | Time: {epoch_duration:.2f}s")

{'batch_size': 4096, 'heads': 16, 'hidden_dim': 16, 'lr': 0.0001, 'num_neighbors': [64, 64]}

### Evaluation

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_true, y_pred_probs):
    """Plot the ROC curve for the given labels and scores and show the figure.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_probs: Predicted scores for the positive class.

    The AUC is shown in the legend; axis labels and title are in Portuguese.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_probs)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='blue',
        lw=2,
        label=f'ROC curve (AUC = {area_under_curve:.4f})',
    )
    # Diagonal reference line of a random classifier.
    ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Taxa de Falsos Positivos (FPR)')
    ax.set_ylabel('Taxa de Verdadeiros Positivos (TPR)')
    ax.set_title('Curva ROC')
    ax.legend(loc="lower right")
    ax.grid()
    plt.show()

In [None]:
# Collect validation labels and model scores for the ROC plot.
# NOTE: `model` and `test_loader` are whatever the grid-search loop assigned
# last, i.e. the final parameter combination, not necessarily the best one.
model.eval()
y_true, y_pred_probs = [], []

with torch.no_grad():
    for batch in test_loader:
        out = model(batch.x, batch.edge_index, batch.edge_attr)
        # A NeighborLoader batch also contains sampled neighbours; only the
        # first `batch.batch_size` nodes are the validation seed nodes, so the
        # evaluation is restricted to them.
        y_true.extend(batch.y[:batch.batch_size].cpu().numpy())
        y_pred_probs.extend(out[:batch.batch_size].cpu().numpy())

y_pred_probs = np.array(y_pred_probs).flatten()
#y_pred = (y_pred_probs >= 0.1).astype(int)  # Convert to 0/1

In [None]:
# ROC curve of the collected validation scores.
plot_roc_curve(y_true, y_pred_probs)