## Imports
---

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import time
import numpy as np
import networkx as nx
from sklearn.utils import resample
from tqdm import tqdm
from src.utils.dataset import get_full_transactions_dataset 

import torch
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, Linear
import torch_geometric.transforms as T
from torch_geometric.loader import NeighborLoader
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, auc

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Report the PyTorch build and the CUDA runtime visible to this kernel.
print(f"Using Torch version {torch.__version__}")
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
# Querying the current device fails on hosts without a usable GPU, so the
# device lookup is only attempted when CUDA is actually available.
if torch.cuda.is_available():
    cuda_id = torch.cuda.current_device()
    print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
else:
    cuda_id = None
    print("No CUDA device available; computations will run on the CPU")

## Data set load
---

In [None]:
# Load the transactions table through the project helper.
# NOTE(review): later cells treat `df` as a pandas DataFrame with the columns
# sender, receiver, timestamp, payment_currency, receiving_currency,
# payment_format, amount_paid, amount_received and is_laundering — confirm
# against the helper's implementation.
df = get_full_transactions_dataset()

## Data preparation
---

### Preprocessing

In [None]:
# Replace the categorical columns with integer codes.
label_encoder_columns = ["receiving_currency", "payment_currency", "payment_format"]
currency_columns = ["receiving_currency", "payment_currency"]

# Both currency columns share ONE encoder fitted on the union of their values.
# Fitting them separately gives the same currency different codes whenever the
# two columns do not contain exactly the same set of currencies, which breaks
# any later logic that treats the codes as interchangeable.
currency_encoder = LabelEncoder().fit(
    pd.concat([df[column] for column in currency_columns], ignore_index=True)
)

# Keep every fitted encoder so the codes can be mapped back to their labels.
label_encoders = {}
for column in label_encoder_columns:
    if column in currency_columns:
        label_encoders[column] = currency_encoder
    else:
        label_encoders[column] = LabelEncoder().fit(df[column])
    df[column] = label_encoders[column].transform(df[column])

# As before, `label_encoder` ends up being the encoder of the last column.
label_encoder = label_encoders[label_encoder_columns[-1]]

In [None]:
# Convert timestamps to a min-max normalised float in [0, 1].
df["timestamp"] = pd.to_datetime(df["timestamp"])
# Vectorised cast to integer ticks instead of a per-row Python lambda.
timestamp_ticks = df["timestamp"].astype("int64")
first_tick = timestamp_ticks.min()
tick_span = timestamp_ticks.max() - first_tick
if tick_span > 0:
    df["timestamp"] = (timestamp_ticks - first_tick) / tick_span
else:
    # All timestamps are identical (or the frame is empty): dividing by a zero
    # span would fill the column with NaN, so fall back to a constant instead.
    df["timestamp"] = 0.0

In [None]:
# Order the transactions by sender account. The original index labels are kept
# (no reset_index), so positional and label-based access differ from here on.
df = df.sort_values(by="sender")

In [None]:
# Preview the first rows after encoding, timestamp scaling and sorting.
df.head()

### Node features

In [None]:
# One node per account that appears as sender or receiver of any transaction.
all_account_ids = set(df['sender']).union(set(df['receiver']))

# Rows are ordered by ascending account id with a fresh 0..N-1 index; this row
# order defines the node index used by the feature matrix built later on.
nodes_df = (
    pd.DataFrame({'account': list(all_account_ids)})
    .sort_values(by="account")
    .reset_index(drop=True)
)

# An account is labelled 1 when it takes part (on either side) in at least one
# transaction flagged as laundering, 0 otherwise.
laundering_df = df[df["is_laundering"] == 1]
laundering_accounts = set(laundering_df['sender']).union(set(laundering_df['receiver']))
# Vectorised membership test instead of a per-row Python lambda; the frame is
# already sorted above, so no second sort is needed.
nodes_df["is_laundering"] = nodes_df["account"].isin(laundering_accounts).astype("int64")

In [None]:
# Per-account activity counts. Accounts that never sent (or never received)
# are missing from the corresponding group-by result and get 0.
sent_groups = df.groupby('sender')
received_groups = df.groupby('receiver')

nodes_df["transactions_sent"] = nodes_df['account'].map(sent_groups.size()).fillna(0)
nodes_df['transactions_received'] = nodes_df['account'].map(received_groups.size()).fillna(0)
nodes_df['unique_currencies_sent'] = nodes_df['account'].map(sent_groups['payment_currency'].nunique()).fillna(0)
# The receiving side is counted on `receiving_currency`; counting
# `payment_currency` here would report the currencies the counterparties paid
# in rather than the currencies this account actually received.
nodes_df['unique_currencies_received'] = nodes_df['account'].map(received_groups['receiving_currency'].nunique()).fillna(0)

In [None]:
# For every currency, add the mean amount each account paid and the mean
# amount each account received in that currency (0 when it never did).
currencies = set(df['payment_currency']).union(df['receiving_currency'])

# Iterate in sorted order: the insertion order of these columns becomes the
# feature order of the node feature matrix, and set iteration order is not
# guaranteed to be the same from one run to the next.
for currency in sorted(currencies):
    paid_in_currency = df[df['payment_currency'] == currency]
    nodes_df[f'average_paid_{currency}'] = nodes_df['account'].map(
        paid_in_currency.groupby('sender')['amount_paid'].mean()
    ).fillna(0)

    # Named "average_..." because the aggregation is a mean; the previous
    # "total_received_" prefix did not match the computed value.
    received_in_currency = df[df['receiving_currency'] == currency]
    nodes_df[f'average_received_{currency}'] = nodes_df['account'].map(
        received_in_currency.groupby('receiver')['amount_received'].mean()
    ).fillna(0)

In [None]:
# Split the node table into a float label vector and a float feature matrix.
# The identifier and the label are removed so they do not leak into the features.
label_values = nodes_df["is_laundering"].values
node_labels = torch.from_numpy(label_values).to(torch.float)

nodes_df = nodes_df.drop(["account", "is_laundering"], axis=1)
feature_values = nodes_df.values
node_features = torch.from_numpy(feature_values).to(torch.float)

### Edge features

In [None]:
# Map every account id to its node index. The indices MUST follow ascending
# account order, because the rows of the node feature matrix and label vector
# were built from the accounts sorted that way. Enumerating the raw set would
# assign indices in arbitrary set-iteration order and point edges at the wrong
# node features and labels.
account_to_index = {acc: idx for idx, acc in enumerate(sorted(all_account_ids))}

In [None]:
# Work on a copy so that re-indexing the endpoints below leaves `df` untouched.
edges_df = df.copy()

In [None]:
# Replace the account ids of both edge endpoints with their node indices.
for endpoint_column in ('sender', 'receiver'):
    edges_df[endpoint_column] = edges_df[endpoint_column].map(account_to_index)

In [None]:
# Preview the edge table: sender/receiver now hold node indices, not account ids.
edges_df.head()

In [None]:
# Connectivity: row 0 holds the sender index and row 1 the receiver index of
# every transaction, giving a tensor of shape (2, number_of_transactions).
sender_indices = torch.from_numpy(edges_df['sender'].values)
receiver_indices = torch.from_numpy(edges_df['receiver'].values)
edge_index = torch.stack([sender_indices, receiver_indices], dim=0)

# Edge features: every remaining column except the endpoints and the label.
edge_feature_frame = edges_df.drop(columns=["sender", "receiver", "is_laundering"])
edge_attr = torch.from_numpy(edge_feature_frame.values).to(torch.float)

### Graph data

In [None]:
# Bundle node features, connectivity, edge features and node labels into a
# single PyG graph object. `y` holds one label per node (accounts), not per edge.
graph_data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=node_labels)

In [None]:
# Display the graph summary (attribute names with their tensor shapes).
graph_data

## Training
---

### Model definition

In [None]:
class GATModel(torch.nn.Module):
    """Two-layer graph attention network with a linear head and sigmoid output.

    Args:
        in_feats: Number of input features per node.
        hidden_dim: Output size of each attention head in the first layer.
            The second layer produces ``hidden_dim // 4`` features.
        out_feats: Output size of the final linear layer.
        heads: Number of attention heads in the first layer; their outputs are
            concatenated, so the second layer receives ``hidden_dim * heads``
            features.
        edge_dim: Number of features per edge. The default ``None`` builds the
            attention layers exactly as before, without an edge-feature
            projection. Pass the edge feature width (for example
            ``graph_data.num_edge_features``) so that the ``edge_attr`` given
            to ``forward`` can take part in the attention scores.
    """

    def __init__(self, in_feats, hidden_dim, out_feats, heads, edge_dim=None):
        super().__init__()
        reduced_dim = hidden_dim // 4
        self.conv1 = GATConv(in_feats, hidden_dim, heads, dropout=0.6, edge_dim=edge_dim)
        # Single averaged head (concat=False) that shrinks the representation.
        self.conv2 = GATConv(hidden_dim * heads, reduced_dim, heads=1, concat=False, dropout=0.6, edge_dim=edge_dim)
        self.lin = Linear(reduced_dim, out_feats)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, edge_index, edge_attr):
        """Return one sigmoid-activated output row per node in ``x``.

        Args:
            x: Node feature matrix.
            edge_index: Edge connectivity of the (sub)graph.
            edge_attr: Edge feature matrix aligned with ``edge_index``.

        Returns:
            Tensor with ``out_feats`` values in (0, 1) per node. Because the
            sigmoid is applied here, the matching loss is ``BCELoss`` rather
            than ``BCEWithLogitsLoss``.
        """
        # Dropout is only active in training mode (self.training).
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index, edge_attr))
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv2(x, edge_index, edge_attr))
        x = self.lin(x)
        x = self.sigmoid(x)

        return x

### Train and test split

In [None]:
# Node-level random split transform. Despite its name, this configuration
# requests no test set (num_test=0): it yields a training and a validation mask.
# NOTE(review): per the PyG RandomNodeSplit docs, split='train_rest' puts every
# node outside the validation/test sets into training and a float num_val is
# read as a fraction of all nodes (here 10%) — confirm for the installed version.
train_test_split = T.RandomNodeSplit(split='train_rest', num_val=0.1, num_test=0)

In [None]:
# Seed torch's global RNG before drawing the random split so that the
# train/validation masks are the same on every Restart & Run All.
SPLIT_SEED = 42
torch.manual_seed(SPLIT_SEED)
# Adds the boolean node masks (train_mask / val_mask) used by the loaders.
graph_data = train_test_split(graph_data)

In [None]:
# Mini-batch neighbour sampling: each batch starts from `batch_size` seed nodes
# and samples up to 30 neighbours per node for each of 2 hops (one hop per
# attention layer of the model).
batch_size = 2048
num_neighbors = [30] * 2

# Training seeds are reshuffled every epoch; without shuffling, every epoch
# visits the seed nodes in the same fixed order and batch composition.
train_loader = NeighborLoader(
    graph_data,
    num_neighbors=num_neighbors,
    batch_size=batch_size,
    input_nodes=graph_data.train_mask,
    shuffle=True,
)

# Validation seeds are iterated in fixed order (no shuffling needed).
test_loader = NeighborLoader(
    graph_data,
    num_neighbors=num_neighbors,
    batch_size=batch_size,
    input_nodes=graph_data.val_mask,
)

### Train

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Single-output GAT (one probability per node) with 8 heads of width 16.
model = GATModel(
    in_feats=graph_data.num_features,
    hidden_dim=16,
    out_feats=1,
    heads=8,
)
model = model.to(device)

In [None]:
# Adam optimiser over all model parameters with a small learning rate.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Binary cross-entropy on probabilities (the model already applies a sigmoid).
criterion = torch.nn.BCELoss()
# Move the full graph to the selected device.
graph_data = graph_data.to(device)

In [None]:
# Train for a fixed number of epochs over the neighbour-sampled mini-batches.
num_epochs = 10
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    loss_sum, seed_count = 0.0, 0
    for batch in train_loader:
        # Make sure the sampled sub-graph lives on the same device as the model.
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index, batch.edge_attr)
        # A NeighborLoader batch contains the seed nodes first, followed by the
        # sampled neighbours. Only the first `batch.batch_size` rows are the
        # training nodes of this batch; the neighbours are context only (and
        # may include validation nodes), so they are excluded from the loss.
        num_seeds = batch.batch_size
        loss = criterion(out[:num_seeds], batch.y[:num_seeds].unsqueeze(1))
        loss.backward()
        optimizer.step()
        loss_sum += loss.item() * num_seeds
        seed_count += num_seeds
    end_time = time.time()
    epoch_duration = end_time - start_time
    # Report the seed-weighted mean loss of the epoch rather than the loss of
    # whichever batch happened to be last; also safe when the loader is empty.
    epoch_loss = loss_sum / max(seed_count, 1)
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss:.4f} | time (s): {epoch_duration:.2f}")

### Evaluation

In [None]:
# Collect labels and predicted probabilities for the validation nodes.
model.eval()
y_true, y_pred_probs = [], []

with torch.no_grad():
    for batch in test_loader:
        # Make sure the sampled sub-graph lives on the same device as the model.
        batch = batch.to(device)
        out = model(batch.x, batch.edge_index, batch.edge_attr)
        # Keep only the seed nodes (the first `batch.batch_size` rows). The
        # remaining rows are sampled neighbours, which can be training nodes
        # and can repeat across batches, so scoring them would distort metrics.
        num_seeds = batch.batch_size
        y_true.extend(batch.y[:num_seeds].cpu().numpy())
        y_pred_probs.extend(out[:num_seeds].cpu().numpy())

# Flatten the per-node (1,)-shaped outputs into a 1-D probability vector.
y_pred_probs = np.array(y_pred_probs).flatten()
#y_pred = (y_pred_probs >= 0.1).astype(int)  # Convert to 0/1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_true, y_pred_probs):
    """Plot the ROC curve of binary labels against predicted scores.

    Args:
        y_true: Ground-truth binary labels, one per sample.
        y_pred_probs: Predicted scores/probabilities aligned with ``y_true``.

    The area under the curve is shown in the legend and the figure is
    displayed immediately; nothing is returned.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, y_pred_probs)
    area_under_curve = auc(false_positive_rate, true_positive_rate)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(
        false_positive_rate,
        true_positive_rate,
        color='blue',
        lw=2,
        label=f'ROC curve (AUC = {area_under_curve:.4f})',
    )
    # Diagonal reference line: performance of a random classifier.
    ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Taxa de Falsos Positivos (FPR)')
    ax.set_ylabel('Taxa de Verdadeiros Positivos (TPR)')
    ax.set_title('Curva ROC')
    ax.legend(loc="lower right")
    ax.grid()
    plt.show()

In [None]:
# Draw the ROC curve for the labels and probabilities collected during evaluation.
plot_roc_curve(y_true, y_pred_probs)