## Imports
---

In [3]:
import pandas as pd
import time
import numpy as np
import plotly.express as px
from src.utils.dataset import get_full_transactions_dataset 

from sklearn import preprocessing

import torch
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, Linear
from torch_geometric.loader import NeighborLoader
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt



In [4]:
# Report the PyTorch / CUDA environment this notebook is running in.
print(f"Using Torch version {torch.__version__}")
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
# torch.cuda.current_device() raises when no CUDA device is usable, so only
# query the device when one is actually available (keeps the notebook runnable
# on CPU-only machines).
if torch.cuda.is_available():
    cuda_id = torch.cuda.current_device()
    print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
else:
    cuda_id = None
    print("No CUDA device available; running on CPU.")

Using Torch version 2.5.1+cu124
Is CUDA supported by this system? True
CUDA version: 12.4
Name of current CUDA device:NVIDIA GeForce RTX 2060


## Dataset loading
---

In [5]:
# Load the transactions table through the project helper imported from
# src.utils.dataset; every later cell derives its data from this frame.
df = get_full_transactions_dataset()

2025-02-26 09:22:00.381 
  command:

    streamlit run C:\Users\ferna\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(4484942, 9)

In [7]:
# Peek at the first three transactions.
df.head(3)

Unnamed: 0,timestamp,sender,receiver,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,2022/09/01 00:20,3208_8000F4580,1_8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
1,2022/09/01 00:26,12_8000EC280,2439_8017BF800,7.66,US Dollar,7.66,US Dollar,Credit Card,0
2,2022/09/01 00:21,1_8000EDEC0,211050_80AEF5310,383.71,US Dollar,383.71,US Dollar,Credit Card,0


In [8]:
# List the column names available in the transactions frame.
df.columns

Index(['timestamp', 'sender', 'receiver', 'amount_received',
       'receiving_currency', 'amount_paid', 'payment_currency',
       'payment_format', 'is_laundering'],
      dtype='object')

In [9]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

timestamp              object
sender                 object
receiver               object
amount_received       float64
receiving_currency     object
amount_paid           float64
payment_currency       object
payment_format         object
is_laundering           int64
dtype: object

In [10]:
# Count missing values per column.
df.isna().sum()

timestamp             0
sender                0
receiver              0
amount_received       0
receiving_currency    0
amount_paid           0
payment_currency      0
payment_format        0
is_laundering         0
dtype: int64

In [11]:
# Show the first five rows (pandas' default for head()).
df.head()

Unnamed: 0,timestamp,sender,receiver,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,2022/09/01 00:20,3208_8000F4580,1_8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
1,2022/09/01 00:26,12_8000EC280,2439_8017BF800,7.66,US Dollar,7.66,US Dollar,Credit Card,0
2,2022/09/01 00:21,1_8000EDEC0,211050_80AEF5310,383.71,US Dollar,383.71,US Dollar,Credit Card,0
3,2022/09/01 00:04,1_8000F4510,11813_8011305D0,9.82,US Dollar,9.82,US Dollar,Credit Card,0
4,2022/09/01 00:08,1_8000F4FE0,245335_812ED62E0,4.01,US Dollar,4.01,US Dollar,Credit Card,0


## Data preparation
---

### Node features

In [None]:
# Every account that appears as a sender or a receiver becomes one graph node.
accounts_df = set(df['sender']) | set(df['receiver'])
# Sorting gives each account a stable row position (0..n-1) in nodes_df.
nodes_df = pd.DataFrame({'account': sorted(accounts_df)})

In [None]:
# Transactions flagged as laundering, and every account taking part in one.
laundering_df = df.loc[df["is_laundering"].eq(1)]
laundering_accounts_df = set(laundering_df['sender']) | set(laundering_df['receiver'])

In [None]:
# Node label: 1 when the account took part in at least one laundering
# transaction, 0 otherwise (vectorised membership test, stored as int64).
nodes_df["is_laundering"] = nodes_df["account"].isin(laundering_accounts_df).astype("int64")

In [None]:
# Per-account activity counts. Accounts that never sent (or never received)
# are absent from the corresponding group-by result, hence the fillna(0).
sent_groups = df.groupby('sender')
received_groups = df.groupby('receiver')

nodes_df["transactions_sent"] = nodes_df['account'].map(sent_groups.size()).fillna(0)
nodes_df['transactions_received'] = nodes_df['account'].map(received_groups.size()).fillna(0)
nodes_df['unique_currencies_sent'] = nodes_df['account'].map(
    sent_groups['payment_currency'].nunique()
).fillna(0)
# The receiving side must be measured in the currency the account actually
# received (receiving_currency), not the currency the counterparty paid in.
nodes_df['unique_currencies_received'] = nodes_df['account'].map(
    received_groups['receiving_currency'].nunique()
).fillna(0)

In [None]:
# Per-account totals sent / received, one pair of columns per currency.
currencies = set(df['payment_currency']) | set(df['receiving_currency'])

# Iterate in sorted order: set iteration order for strings can differ between
# kernel sessions, which would shuffle the feature columns (and therefore the
# model's input layout) from one run to the next.
for currency in sorted(currencies):
    column_suffix = currency.lower()

    # Sent side: what the sender paid, in the payment currency.
    paid_in_currency = df[df['payment_currency'] == currency]
    nodes_df[f'total_sent_{column_suffix}'] = nodes_df['account'].map(
        paid_in_currency.groupby('sender')['amount_paid'].sum()
    ).fillna(0)

    # Received side: what the receiver actually got, in the receiving currency
    # (filtering on payment_currency / amount_paid would credit the receiver
    # with the sender's currency and amount).
    received_in_currency = df[df['receiving_currency'] == currency]
    nodes_df[f'total_received_{column_suffix}'] = nodes_df['account'].map(
        received_in_currency.groupby('receiver')['amount_received'].sum()
    ).fillna(0)

In [None]:
# Labels and features as float32 tensors; the account id and the label itself
# are excluded from the feature matrix.
label_values = nodes_df["is_laundering"].to_numpy()
node_labels = torch.as_tensor(label_values, dtype=torch.float)

node_features_df = nodes_df.drop(columns=["account", "is_laundering"])
node_features = torch.as_tensor(node_features_df.to_numpy(), dtype=torch.float)

### Edge features

In [None]:
# Work on an independent copy of the transaction columns used for edges.
edge_source_columns = [
    "timestamp",
    "sender",
    "receiver",
    "amount_paid",
    "payment_currency",
    "payment_format",
    "is_laundering",
]
edges_df = df.loc[:, edge_source_columns].copy()

# Replace each categorical column with integer codes. The same encoder object
# is refit per column, so after the loop `le` holds the classes of the last one.
le = preprocessing.LabelEncoder()
for categorical_column in ("payment_currency", "payment_format"):
    edges_df[categorical_column] = le.fit_transform(edges_df[categorical_column].astype(str))

In [None]:
# Account id -> row position in nodes_df, i.e. the node index used in the graph.
account_to_index = dict(zip(nodes_df['account'], range(len(nodes_df))))

In [None]:
# Swap account ids for node indices on both ends of every edge.
for endpoint in ('sender', 'receiver'):
    edges_df[endpoint] = edges_df[endpoint].map(account_to_index)

In [None]:
# The raw timestamp column holds strings such as "2022/09/01 00:20" (object
# dtype), so it must be parsed before it can become a number; casting the
# strings straight to int64 raises. Reading from `df` rather than `edges_df`
# keeps this cell safe to re-run after the column has been overwritten.
# The format string matches the rows shown by df.head(); a row in any other
# format will raise instead of being silently mis-parsed.
parsed_timestamps = pd.to_datetime(df['timestamp'], format='%Y/%m/%d %H:%M')

# Whole seconds since the Unix epoch, independent of the datetime resolution
# pandas picked for the parsed column.
edges_df['timestamp'] = (parsed_timestamps - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

In [None]:
# Standardise the continuous edge features to zero mean / unit variance.
numeric_edge_columns = ['timestamp', 'amount_paid']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(edges_df[numeric_edge_columns])
edges_df[numeric_edge_columns] = scaled_values

In [None]:
# Sanity-check the encoded, index-mapped and scaled edge table.
edges_df.head()

In [None]:
# Connectivity as a (2, num_edges) long tensor: row 0 = sender node index,
# row 1 = receiver node index. Building it from a single 2-D array avoids the
# very slow "tensor from a list of numpy arrays" path.
edge_endpoints = edges_df[['sender', 'receiver']].to_numpy().T
edge_index = torch.tensor(edge_endpoints, dtype=torch.long)

# Select edge features explicitly. The transaction-level `is_laundering` flag
# must NOT be an edge feature: node labels are derived from it, so including
# it would leak the target into the model's inputs.
edge_feature_columns = ['timestamp', 'amount_paid', 'payment_currency', 'payment_format']
edge_attr = torch.tensor(edges_df[edge_feature_columns].to_numpy(), dtype=torch.float)

### Graph data

In [None]:
# Bundle nodes, connectivity, edge features and node labels into one graph object.
graph_data = Data(
    x=node_features,
    edge_index=edge_index,
    edge_attr=edge_attr,
    y=node_labels,
)

In [None]:
# Display the graph summary (tensor shapes per attribute).
graph_data

## Training
---

### Model definition

In [None]:
class GATModel(torch.nn.Module):
    """Two-layer graph attention network that outputs a per-node probability.

    Architecture: dropout -> GATConv (multi-head, concatenated) -> ELU ->
    dropout -> GATConv (single head) -> ELU -> Linear -> Sigmoid.

    Args:
        in_feats: Number of input features per node.
        hidden_dim: Output channels per attention head of the first layer.
            The second layer uses ``hidden_dim // 4`` channels.
        out_feats: Size of the final linear output per node.
        heads: Number of attention heads in the first layer.
        edge_dim: Number of features per edge. GATConv only takes edge
            attributes into account when it is built with ``edge_dim``; with
            the default ``None`` any ``edge_attr`` passed to ``forward`` is
            ignored by the attention layers. Pass
            ``edge_dim=edge_attr.shape[1]`` to make the model use them.
        dropout: Dropout probability applied to the layer inputs and inside
            the attention layers.
    """

    def __init__(self, in_feats, hidden_dim, out_feats, heads, edge_dim=None, dropout=0.6):
        super().__init__()
        bottleneck_dim = hidden_dim // 4
        self.dropout = dropout
        self.conv1 = GATConv(in_feats, hidden_dim, heads=heads, dropout=dropout, edge_dim=edge_dim)
        # The first layer concatenates its heads, so its output width is
        # hidden_dim * heads.
        self.conv2 = GATConv(
            hidden_dim * heads,
            bottleneck_dim,
            heads=1,
            concat=False,
            dropout=dropout,
            edge_dim=edge_dim,
        )
        self.lin = Linear(bottleneck_dim, out_feats)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, edge_index, edge_attr=None):
        """Return sigmoid activations of shape (num_nodes, out_feats).

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed through to both GATConv layers.
            edge_attr: Optional edge features; only used when the model was
                constructed with ``edge_dim``.
        """
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv1(x, edge_index, edge_attr))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv2(x, edge_index, edge_attr))
        x = self.lin(x)
        # Output is already a probability; pair it with BCELoss, not
        # BCEWithLogitsLoss.
        return self.sigmoid(x)

### Train and test split

In [None]:
num_nodes = graph_data.x.shape[0]

# Stratify on the node label so both splits keep the same proportion of
# laundering accounts. An unstratified split can leave the test set with a
# different positive rate than the training set.
node_ids = np.arange(num_nodes, dtype=np.int64)
train_idx, test_idx = train_test_split(
    node_ids,
    test_size=0.2,
    random_state=42,
    stratify=graph_data.y.cpu().numpy(),
)

# Boolean masks over all nodes; indexing with tensors avoids converting
# large Python lists element by element.
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[torch.from_numpy(train_idx)] = True
test_mask[torch.from_numpy(test_idx)] = True

graph_data.train_mask = train_mask
graph_data.test_mask = test_mask

### Train

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Input width follows the node feature matrix; a single output unit per node.
num_node_features = graph_data.x.size(1)
model = GATModel(in_feats=num_node_features, hidden_dim=16, out_feats=1, heads=8)
model = model.to(device)

In [None]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# The model already applies a sigmoid, so plain BCELoss is the matching loss.
criterion = torch.nn.BCELoss()
graph_data = graph_data.to(device)

# Mini-batches of 256 training nodes, each expanded with up to 30 sampled
# neighbours per hop for 2 hops (one hop per GAT layer).
train_loader = NeighborLoader(
    graph_data,
    num_neighbors=[30] * 2,
    batch_size=256,
    input_nodes=graph_data.train_mask,
    # Reshuffle the seed nodes every epoch; without this SGD sees the
    # batches in the same fixed order on every pass.
    shuffle=True,
)

In [None]:
num_epochs = 100
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()

    # Running totals so the reported loss is the epoch mean, not just the
    # value of whichever batch happened to come last.
    epoch_loss_sum = 0.0
    batches_used = 0

    for batch in train_loader:
        optimizer.zero_grad()
        # Sanitise inputs so a stray NaN/inf cannot poison the forward pass.
        batch.x = torch.nan_to_num(batch.x, nan=0.0, posinf=1.0, neginf=-1.0)
        batch.edge_attr = torch.nan_to_num(batch.edge_attr, nan=0.0, posinf=1.0, neginf=-1.0)
        out = model(batch.x, batch.edge_index, batch.edge_attr)

        if torch.isnan(out).any():
            print("⚠️ NaN detectado na saída do modelo! Abortando batch...")
            continue

        # NeighborLoader places the seed (training) nodes first in each
        # batch; the remaining nodes are sampled neighbours and may belong to
        # the test split. Restrict the loss to the seed nodes so test labels
        # never contribute to training.
        seed_count = batch.batch_size
        loss = criterion(out[:seed_count], batch.y[:seed_count].unsqueeze(1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        epoch_loss_sum += loss.item()
        batches_used += 1

    end_time = time.time()
    epoch_duration = end_time - start_time
    # NaN when every batch of the epoch was skipped, instead of a NameError
    # or a stale value from a previous epoch.
    mean_loss = epoch_loss_sum / batches_used if batches_used else float("nan")
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {mean_loss:.4f} | time (s): {epoch_duration:.2f}", end='\r')

### Evaluation

In [None]:
model.eval()
with torch.no_grad():
    # Apply the same NaN/inf sanitising used during training so evaluation
    # sees inputs prepared the same way.
    eval_x = torch.nan_to_num(graph_data.x, nan=0.0, posinf=1.0, neginf=-1.0)
    eval_edge_attr = torch.nan_to_num(graph_data.edge_attr, nan=0.0, posinf=1.0, neginf=-1.0)

    # The model ends in a sigmoid, so despite the variable name these are
    # probabilities of shape (num_nodes, 1).
    logits = model(eval_x, graph_data.edge_index, eval_edge_attr)

    # argmax over a single output column is always 0, which would predict
    # "not laundering" for every node. Threshold the probability instead.
    predictions = (logits.squeeze(1) > 0.5).long().cpu().numpy()
    true_labels = graph_data.y.long().cpu().numpy()

# Keep only the held-out test nodes.
test_selector = graph_data.test_mask.cpu().numpy()
test_preds = predictions[test_selector]
test_labels = true_labels[test_selector]

accuracy = accuracy_score(test_labels, test_preds)
# zero_division=0 gives a defined score (instead of a warning) when the model
# predicts no positives at all.
precision = precision_score(test_labels, test_preds, zero_division=0)
recall = recall_score(test_labels, test_preds, zero_division=0)
f1 = f1_score(test_labels, test_preds, zero_division=0)
# Force a 2x2 matrix even if one class is missing from labels or predictions;
# the indexing below relies on that shape.
conf_matrix = confusion_matrix(test_labels, test_preds, labels=[0, 1])

labels = ["Não Lavador (0)", "Lavador (1)"]
conf_matrix_display = np.array([
    [f"TN: {conf_matrix[0,0]}", f"FP: {conf_matrix[0,1]}"],
    [f"FN: {conf_matrix[1,0]}", f"TP: {conf_matrix[1,1]}"]
])

print(f"📊 Acurácia: {accuracy:.4f}")
print(f"🎯 Precisão: {precision:.4f}")
print(f"📢 Recall: {recall:.4f}")
print(f"📈 F1-score: {f1:.4f}")

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Confusion-matrix heatmap with TN/FP/FN/TP annotations, drawn on an explicit
# Axes rather than through pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=conf_matrix_display,
    fmt="",
    cmap="Blues",
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_xlabel("Predição")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusão")
plt.show()