# Notebook 4 - Graph Machine Learning

### On importe les libraries nécessaires

In [1]:
import torch
from torch_geometric.data import Data
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader

## On importe les données nécessaires

In [2]:
# On importe seulement les colonnes utiles
accounts = pd.read_csv("accounts.csv")
transac = pd.read_csv("transactions.csv")
alerts = pd.read_csv("alerts.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## On prépare nous données pour qu'elles puissent être modélisée sous forme de graphe

In [None]:
# Prepare edge_index
sender_ids = torch.tensor(transac['SENDER_ACCOUNT_ID'].values, dtype=torch.long)
receiver_ids = torch.tensor(transac['RECEIVER_ACCOUNT_ID'].values, dtype=torch.long)
edge_index = torch.stack([sender_ids, receiver_ids], dim=0)

# Prepare node features (INIT_BALANCE)
node_features = torch.tensor(accounts[['ACCOUNT_ID']].values, dtype=torch.float)

# Prepare edge features (TX_AMOUNT and TIMESTAMP)
edge_attr = torch.tensor(transac[['TX_AMOUNT', 'TIMESTAMP']].values, dtype=torch.float)

# Prepare labels (IS_FRAUD)
labels = torch.tensor(accounts['IS_FRAUD'].astype(int).values, dtype=torch.long)

# Create the PyTorch Geometric data object
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=labels)

## On crée un ensemble d'entrainement, de validation et de test

In [None]:
import torch
import numpy as np
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

def random_split_transd_adapted(node_features, labels, edge_index, edge_attr, train_size, test_size, device='cpu', seed=42):
    """
    Split the graph into training, validation, and test sets based on node indices.
    
    Args:
        node_features (torch.Tensor): Tensor of node features.
        labels (torch.Tensor): Tensor of node labels.
        edge_index (torch.Tensor): Tensor representing the graph's edges (connectivity).
        edge_attr (torch.Tensor): Tensor of edge features.
        train_size (float): The proportion of nodes to include in the training set.
        test_size (float): The proportion of nodes to include in the test set.
        device (str): Device to store the data on ('cpu' or 'cuda').
        seed (int): Random seed for reproducibility.
    
    Returns:
        data (Data): A PyTorch Geometric Data object containing node features, edge_index, edge_attr, and labels.
        splits (dict): A dictionary with indices for the training, validation, and test sets.
    """
    # Ensure reproducibility
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Total number of nodes
    num_nodes = labels.size(0)

    # Generate node indices
    node_indices = np.arange(num_nodes)

    # Split indices into training, validation, and test sets
    train_index, temp_index = train_test_split(node_indices, train_size=train_size, random_state=seed, stratify=labels.numpy())
    val_index, test_index = train_test_split(temp_index, test_size=test_size / (1 - train_size), random_state=seed, stratify=labels[temp_index].numpy())

    # Construct the PyTorch Geometric Data object
    data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=labels).to(device)
    
    # Count the number of IS_FRAUD = 1 in each set
    num_fraud_train = torch.sum(labels[train_index] == 1).item()
    num_fraud_val = torch.sum(labels[val_index] == 1).item()
    num_fraud_test = torch.sum(labels[test_index] == 1).item()
    
    # Print the counts
    print(f"Training set size: {len(train_index)}, IS_FRAUD=1: {num_fraud_train}")
    print(f"Validation set size: {len(val_index)}, IS_FRAUD=1: {num_fraud_val}")
    print(f"Test set size: {len(test_index)}, IS_FRAUD=1: {num_fraud_test}")
    
    # Return the data object and the indices for each split
    return data, {'train': train_index, 'val': val_index, 'test': test_index}

# Example of how to use the function with your data
data, splits = random_split_transd_adapted(
    node_features=node_features, 
    labels=labels, 
    edge_index=edge_index, 
    edge_attr=edge_attr, 
    train_size=0.8, 
    test_size=0.1, 
    device='cuda' if torch.cuda.is_available() else 'cpu', 
    seed=42
)


In [None]:

# Dichotomy (binary search) function to find the optimal threshold
def find_optimal_threshold(y_prob, y_true, limit_recall, tol=1e-4):
    low, high = 0.0, 1.0
    best_threshold = 0.5
    while high - low > tol:
        mid = (low + high) / 2.0
        y_pred = (y_prob >= mid).astype(int)
        recall = recall_score(y_true, y_pred)
        
        if recall < limit_recall:
            high = mid
        else:
            best_threshold = mid
            low = mid
            
    return best_threshold


# Function to calculate SAR Conversion Rate
def sar_conversion_rate(y_true, y_pred):
    positive_predictions = np.sum(y_pred)
    if positive_predictions == 0:
        return 0
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    return true_positives / positive_predictions


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import DataLoader
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.metrics import recall_score, make_scorer
from sklearn.model_selection import StratifiedKFold
import numpy as np
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt

# Assuming NUM_EDGE_FEATURES = 2
NUM_EDGE_FEATURES = 2

# Define the GAT model with edge features
class GDPModel(nn.Module):
    def __init__(self, num_features=3, hidden_size=32, target_size=1):
        super(GDPModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_features = num_features
        self.target_size = target_size
        self.convs = nn.ModuleList([
            GATConv(self.num_features, self.hidden_size, edge_dim=NUM_EDGE_FEATURES),
            GATConv(self.hidden_size, self.hidden_size, edge_dim=NUM_EDGE_FEATURES)
        ])
        self.linear = nn.Linear(self.hidden_size, self.target_size)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        for conv in self.convs[:-1]:
            x = conv(x, edge_index, edge_attr=edge_attr)
            x = F.relu(x)
            x = F.dropout(x, training=self.training)
        x = self.convs[-1](x, edge_index, edge_attr=edge_attr)
        x = self.linear(x)
        return F.relu(x)

# Function to train the model
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out.view(-1), data.y.float())
    loss.backward()
    optimizer.step()
    return loss.item()

# Function to evaluate the model
def evaluate(model, data, threshold=0.5):
    model.eval()
    with torch.no_grad():
        out = model(data)
        preds = (out.view(-1) > threshold).long()
        correct = preds.eq(data.y).sum().item()
        return correct / data.num_nodes

# Hyperparameter search space
param_dist = {
    'hidden_size': randint(16, 128),
    'lr': uniform(0.001, 0.01),
    'dropout': uniform(0.1, 0.5),
    'epochs': randint(1, 50)
}

# Randomized search for hyperparameter tuning
def randomized_search(data, param_dist, n_iter=10, seed=42):
    results = []
    sampler = ParameterSampler(param_dist, n_iter=n_iter, random_state=seed)
    
    for params in sampler:
        print(f"Training with params: {params}")
        model = GDPModel(num_features=data.x.size(1), hidden_size=params['hidden_size']).to(data.x.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
        criterion = nn.BCEWithLogitsLoss()
        
        for epoch in range(params['epochs']):
            loss = train(model, data, optimizer, criterion)
            if epoch % 10 == 0:
                print(f"Epoch {epoch}: Loss = {loss:.4f}")
        
        acc = evaluate(model, data)
        results.append((acc, params))
        print(f"Validation Accuracy: {acc:.4f}\n")
    
    return sorted(results, key=lambda x: x[0], reverse=True)[0]

# Load data and splits (assuming you have already run the split code)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data, splits = random_split_transd_adapted(node_features, labels, edge_index, edge_attr, train_size=0.8, test_size=0.1, device=device)

# Perform randomized search
best_acc, best_params = randomized_search(data, param_dist)
print(f"Best Parameters: {best_params}")
print(f"Best Validation Accuracy: {best_acc:.4f}")

# Evaluate the best model on the test set
best_model = GDPModel(num_features=data.x.size(1), hidden_size=best_params['hidden_size']).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])
criterion = nn.BCEWithLogitsLoss()

# Train the best model
for epoch in range(best_params['epochs']):
    train(best_model, data, optimizer, criterion)

# Final evaluation on the test set
test_acc = evaluate(best_model, data)
print(f"Test Set Accuracy: {test_acc:.4f}")

In [None]:
import torch

# Define GCN model
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Assuming `data` is your Data object with x, edge_index, edge_attr, and y
model = GCN(num_node_features=data.num_node_features, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

# Testing
model.eval()
_, pred = model(data).max(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Test Accuracy: {acc:.4f}')


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.metrics import recall_score

# Calculate conversion rate
def sar_conversion_rate(y_true, y_pred):
    positive_predictions = (y_pred == 1).sum().item()
    if positive_predictions == 0:
        return 0
    true_positives = ((y_pred == 1) & (y_true == 1)).sum().item()
    return true_positives / positive_predictions


# Step 1: Create Masks
num_nodes = data.num_nodes
num_train = int(num_nodes * 0.8)
num_val = int(num_nodes * 0.1)
num_test = num_nodes - num_train - num_val

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

indices = torch.randperm(num_nodes)
train_mask[indices[:num_train]] = True
val_mask[indices[num_train:num_train + num_val]] = True
test_mask[indices[num_train + num_val:]] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

# Step 2: Define GCN Model
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Step 3: Train and Evaluate the Model
model = GCN(num_node_features=data.num_node_features, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(50):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Validation
    model.eval()
    probabilities = torch.softmax(out, dim=1)[:, 1]
    pred = (probabilities > 0.5).long()
    recall = recall_score(data.y[data.val_mask].cpu(), pred[data.val_mask].cpu(), pos_label=1)
    conversion_rate = sar_conversion_rate(data.y[data.val_mask], pred[data.val_mask])
    print(f'Epoch {epoch}: Train Loss: {loss.item()}, Recall (Class 1): {recall:.4f}, Conversion Rate: {conversion_rate:.4f}')


# Testing after all epochs are done
model.eval()
out = model(data)
probabilities = torch.softmax(out, dim=1)[:, 1]  # Probabilities for class 1
threshold = 0.5  # Set your decision threshold

# Make predictions based on threshold
pred = (probabilities > threshold).long()

# Calculate recall for class 1
recall = recall_score(data.y[data.test_mask].cpu(), pred[data.test_mask].cpu(), pos_label=1)

conversion_rate = sar_conversion_rate(data.y[data.test_mask], pred[data.test_mask])

print(f'Test Accuracy: {test_acc:.4f}')
print(f'Recall (Class 1): {recall:.4f}')
print(f'Conversion Rate: {conversion_rate:.4f}')


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import recall_score
import numpy as np
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt

# Define the GAT model with edge features
class GDPModel(nn.Module):
    def __init__(self, num_features=3, hidden_size=32, target_size=1):
        super(GDPModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_features = num_features
        self.target_size = target_size
        self.convs = nn.ModuleList([
            GATConv(self.num_features, self.hidden_size, edge_dim=NUM_EDGE_FEATURES),
            GATConv(self.hidden_size, self.hidden_size, edge_dim=NUM_EDGE_FEATURES)
        ])
        self.linear = nn.Linear(self.hidden_size, self.target_size)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        for conv in self.convs[:-1]:
            x = conv(x, edge_index, edge_attr=edge_attr)
            x = F.relu(x)
            x = F.dropout(x, training=self.training)
        x = self.convs[-1](x, edge_index, edge_attr=edge_attr)
        x = self.linear(x)
        return torch.sigmoid(x)  # Use sigmoid since it's binary classification

# Function to train the model
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out.view(-1), data.y.float())
    loss.backward()
    optimizer.step()
    return loss.item()

# Function to evaluate the model (focus on recall only)
def evaluate(model, data, threshold=0.5):
    model.eval()
    with torch.no_grad():
        out = model(data)
        preds = (out.view(-1) > threshold).long()
        recall = recall_score(data.y.cpu(), preds.cpu())
        return recall

# Hyperparameter search space
param_dist = {
    'hidden_size': randint(16, 128),
    'lr': uniform(0.0001, 0.01),
    'dropout': uniform(0.1, 0.5),
    'epochs': randint(10, 50)
}

# Randomized search for hyperparameter tuning
def randomized_search(data, param_dist, n_iter=10, seed=42):
    results = []
    sampler = ParameterSampler(param_dist, n_iter=n_iter, random_state=seed)
    
    for params in sampler:
        print(f"Training with params: {params}")
        model = GDPModel(num_features=data.x.size(1), hidden_size=params['hidden_size']).to(data.x.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
        criterion = nn.BCEWithLogitsLoss()
        
        for epoch in range(params['epochs']):
            train(model, data, optimizer, criterion)
        
        recall = evaluate(model, data)
        results.append((recall, params))
        print(f"Validation Recall: {recall:.4f}\n")
    
    return sorted(results, key=lambda x: x[0], reverse=True)[0]

# Load data and splits (assuming you have already run the split code)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data, splits = random_split_transd_adapted(node_features, labels, edge_index, edge_attr, train_size=0.8, test_size=0.1, device=device)

# Perform randomized search for hyperparameter tuning
best_recall, best_params = randomized_search(data, param_dist)
print(f"Best Parameters: {best_params}")
print(f"Best Validation Recall: {best_recall:.4f}")

# Train the best model
best_model = GDPModel(num_features=data.x.size(1), hidden_size=best_params['hidden_size']).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])
criterion = nn.BCEWithLogitsLoss()

for epoch in range(best_params['epochs']):
    train(best_model, data, optimizer, criterion)


In [None]:
# Function to calculate conversion rate
def conversion_rate(y_true, y_pred):
    positive_predictions = np.sum(y_pred)
    if positive_predictions == 0:
        return 0
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    return true_positives / positive_predictions

# Dichotomy (binary search) function to find the optimal threshold
def find_optimal_threshold(y_prob, y_true, limit_recall, tol=1e-4):
    low, high = 0.0, 1.0
    best_threshold = 0.5
    while high - low > tol:
        mid = (low + high) / 2.0
        y_pred = (y_prob >= mid).astype(int)
        recall = recall_score(y_true, y_pred)
        
        if recall < limit_recall:
            high = mid
        else:
            best_threshold = mid
            low = mid
            
    return best_threshold

# Predict probabilities on the test set
y_prob_test = best_model(data).view(-1).cpu().detach().numpy()

# Calculate SAR Conversion Rate and Recall for each threshold
thresholds = np.arange(0.0, 1.0, 0.01)
sar_rates = []
recalls = []
for threshold in thresholds:
    y_pred_test = (y_prob_test >= threshold).astype(int)
    sar_rate = conversion_rate(labels.cpu().numpy(), y_pred_test)
    recall = recall_score(labels.cpu().numpy(), y_pred_test)
    sar_rates.append(sar_rate)
    recalls.append(recall)

# Define the recall limit
limit_recall = 0.85  # Adjust as needed

# Plotting
fig, ax1 = plt.subplots()

ax1.set_xlabel('Threshold')
ax1.set_ylabel('SAR Conversion Rate', color='tab:blue')
ax1.plot(thresholds, sar_rates, color='tab:blue', label='SAR Conversion Rate')
ax1.tick_params(axis='y', labelcolor='tab:blue')

ax2 = ax1.twinx()
ax2.set_ylabel('Recall (Class 1)', color='tab:orange')
ax2.plot(thresholds, recalls, color='tab:orange', label='Recall (Class 1)')
ax2.tick_params(axis='y', labelcolor='tab:orange')

ax2.axhline(y=limit_recall, color='red', linestyle='--', linewidth=1, label='Recall Limit')
fig.tight_layout()
plt.title('Threshold vs SAR Conversion Rate & Recall (Class 1)')
plt.show()

# Find and print the optimal threshold and corresponding SAR Conversion Rate
optimal_threshold = find_optimal_threshold(y_prob_test, labels.cpu().numpy(), limit_recall)
y_pred_optimal = (y_prob_test >= optimal_threshold).astype(int)
optimal_sar = conversion_rate(labels.cpu().numpy(), y_pred_optimal)

print(f"Optimal Threshold for Recall Limit {limit_recall}: {optimal_threshold:.4f}")
print(f"SAR Conversion Rate at Optimal Threshold: {optimal_sar:.4f}")


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from sklearn.model_selection import ParameterSampler, StratifiedKFold
from sklearn.metrics import recall_score
import numpy as np
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt

# Define the GAT model with edge features
class GDPModel(nn.Module):
    def __init__(self, num_features=3, hidden_size=32, target_size=1):
        super(GDPModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_features = num_features
        self.target_size = target_size
        self.convs = nn.ModuleList([
            GATConv(self.num_features, self.hidden_size, edge_dim=NUM_EDGE_FEATURES),
            GATConv(self.hidden_size, self.hidden_size, edge_dim=NUM_EDGE_FEATURES)
        ])
        self.linear = nn.Linear(self.hidden_size, self.target_size)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        for conv in self.convs[:-1]:
            x = conv(x, edge_index, edge_attr=edge_attr)  # Adding edge features here
            x = F.relu(x)
            x = F.dropout(x, training=self.training)
        x = self.convs[-1](x, edge_index, edge_attr=edge_attr)  # Adding edge features here
        x = self.linear(x)
        return torch.sigmoid(x)  # Use sigmoid for binary classification

# Function to train the model
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out.view(-1), data.y.float())
    loss.backward()
    optimizer.step()
    return loss.item()

# Function to evaluate the model (focus on recall only)
def evaluate(model, data, threshold=0.5):
    model.eval()
    with torch.no_grad():
        out = model(data)
        preds = (out.view(-1) > threshold).long()
        recall = recall_score(data.y.cpu(), preds.cpu())
        return recall

# Hyperparameter search space
param_dist = {
    'hidden_size': randint(16, 128),
    'lr': uniform(0.0001, 0.01),
    'dropout': uniform(0.1, 0.5),
    'epochs': randint(10, 150)
}

# Randomized search for hyperparameter tuning
def randomized_search(data, param_dist, n_iter=10, seed=42):
    results = []
    sampler = ParameterSampler(param_dist, n_iter=n_iter, random_state=seed)
    
    for params in sampler:
        print(f"Training with params: {params}")
        model = GDPModel(num_features=data.x.size(1), hidden_size=params['hidden_size']).to(data.x.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
        criterion = nn.BCEWithLogitsLoss()
        
        for epoch in range(params['epochs']):
            train(model, data, optimizer, criterion)
        
        recall = evaluate(model, data)
        results.append((recall, params))
        print(f"Validation Recall: {recall:.4f}\n")
    
    return sorted(results, key=lambda x: x[0], reverse=True)[0]

# Load data and splits (assuming you have already run the split code)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data, splits = random_split_transd_adapted(node_features, labels, edge_index, edge_attr, train_size=0.8, test_size=0.1, device=device)

# Perform randomized search for hyperparameter tuning
best_recall, best_params = randomized_search(data, param_dist)
print(f"Best Parameters: {best_params}")
print(f"Best Validation Recall: {best_recall:.4f}")

# Train the best model
best_model = GDPModel(num_features=data.x.size(1), hidden_size=best_params['hidden_size']).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])
criterion = nn.BCEWithLogitsLoss()

for epoch in range(best_params['epochs']):
    train(best_model, data, optimizer, criterion)


In [None]:
# Function to calculate SAR Conversion Rate
def sar_conversion_rate(y_true, y_pred):
    positive_predictions = np.sum(y_pred)
    if positive_predictions == 0:
        return 0
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    return true_positives / positive_predictions

# Dichotomy (binary search) function to find the optimal threshold
def find_optimal_threshold(y_prob, y_true, limit_recall, tol=1e-4):
    low, high = 0.0, 1.0
    best_threshold = 0.5
    while high - low > tol:
        mid = (low + high) / 2.0
        y_pred = (y_prob >= mid).astype(int)
        recall = recall_score(y_true, y_pred)
        
        if recall < limit_recall:
            high = mid
        else:
            best_threshold = mid
            low = mid
            
    return best_threshold

# Cross-validation setup with stratification
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
optimal_thresholds = []

# Cross-validation process
for fold_num, (train_index, val_index) in enumerate(skf.split(data.x.cpu().numpy(), labels.cpu().numpy()), start=1):
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_index] = True
    val_mask[val_index] = True
    
    # Train model on the current fold
    best_model.train()
    for epoch in range(best_params['epochs']):
        train(best_model, data, optimizer, criterion)
    
    # Predict probabilities on the validation fold
    best_model.eval()
    with torch.no_grad():
        y_prob_val_fold = best_model(data).view(-1)[val_mask].cpu().detach().numpy()
        y_true_val_fold = labels[val_mask].cpu().detach().numpy()

    # Calculate SAR Conversion Rate and Recall for each threshold
    thresholds = np.arange(0.0, 1.0, 0.01)
    sar_rates = []
    recalls = []
    for threshold in thresholds:
        y_pred_val = (y_prob_val_fold >= threshold).astype(int)
        sar_rate = sar_conversion_rate(y_true_val_fold, y_pred_val)
        recall = recall_score(y_true_val_fold, y_pred_val)
        sar_rates.append(sar_rate)
        recalls.append(recall)

    # Objective recall for the current fold
    objective_recall = np.random.uniform(min_recall_tuning, 1.0)
    print(f"Fold {fold_num}: Objective Recall Target = {objective_recall:.4f}")
    
    # Find the optimal threshold using the dichotomy method for this fold
    optimal_threshold = find_optimal_threshold(y_prob_val_fold, y_true_val_fold, objective_recall)
    optimal_thresholds.append(optimal_threshold)

    # Plot SAR Conversion Rate and Recall for each threshold
    fig, ax1 = plt.subplots()

    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('SAR Conversion Rate', color='tab:blue')
    ax1.plot(thresholds, sar_rates, color='tab:blue', label='SAR Conversion Rate')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.set_ylabel('Recall (Class 1)', color='tab:orange')
    ax2.plot(thresholds, recalls, color='tab:orange', label='Recall (Class 1)')
    ax2.tick_params(axis='y', labelcolor='tab:orange')

    # Highlight the selected threshold and the objective recall
    ax1.axvline(x=optimal_threshold, color='red', linestyle='--', linewidth=1, label='Selected Threshold')
    ax2.axhline(y=objective_recall, color='green', linestyle='--', linewidth=1, label='Objective Recall')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.title(f'Threshold vs SAR Conversion Rate & Recall (Class 1) - Fold {fold_num}')
    plt.show()

# Average the thresholds obtained from all folds
average_threshold = np.mean(optimal_thresholds)
print(f"Averaged Optimal Threshold from Cross-Validation: {average_threshold:.4f}")

# Evaluate on the test set using the averaged threshold
best_model.eval()
with torch.no_grad():
    y_prob_test = best_model(data).view(-1)[splits['test']].cpu().detach().numpy()
    y_true_test = labels[splits['test']].cpu().detach().numpy()
    y_pred_test = (y_prob_test >= average_threshold).astype(int)

test_sar_conversion = sar_conversion_rate(y_true_test, y_pred_test)
test_recall = recall_score(y_true_test, y_pred_test)

print(f"Test Set SAR Conversion Rate at Averaged Threshold: {test_sar_conversion:.4f}")
print(f"Test Set Recall at Averaged Threshold: {test_recall:.4f}")


In [None]:
min_recall_tuning = 0.85

# Adding features

Alright, without node features, it seems like we cannot do anything. Let's add our tabular data features as node features and let's see if it works better  

In [None]:
df = pd.read_csv("AML_features.csv")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from sklearn.model_selection import ParameterSampler, StratifiedKFold
from sklearn.metrics import recall_score
import numpy as np
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt



# Prepare node features by using the feature columns from `df`
node_features = torch.tensor(df.drop(columns=['ACCOUNT_ID', 'IS_FRAUD']).values, dtype=torch.float)

# Prepare labels (IS_FRAUD)
labels = torch.tensor(df['IS_FRAUD'].astype(int).values, dtype=torch.long)

# Prepare edge_index
sender_ids = torch.tensor(transac['SENDER_ACCOUNT_ID'].values, dtype=torch.long)
receiver_ids = torch.tensor(transac['RECEIVER_ACCOUNT_ID'].values, dtype=torch.long)
edge_index = torch.stack([sender_ids, receiver_ids], dim=0)

# Prepare edge features (TX_AMOUNT and TIMESTAMP)
edge_attr = torch.tensor(transac[['TX_AMOUNT', 'TIMESTAMP']].values, dtype=torch.float)

# Create the PyTorch Geometric data object
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=labels)





# Define the GAT model with edge features
class GDPModel(nn.Module):
    def __init__(self, num_features=26, hidden_size=32, target_size=1):
        super(GDPModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_features = num_features
        self.target_size = target_size
        self.convs = nn.ModuleList([
            GATConv(self.num_features, self.hidden_size, edge_dim=2),
            GATConv(self.hidden_size, self.hidden_size, edge_dim=2)
        ])
        self.linear = nn.Linear(self.hidden_size, self.target_size)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        for conv in self.convs[:-1]:
            x = conv(x, edge_index, edge_attr=edge_attr)  # Adding edge features here
            x = F.relu(x)
            x = F.dropout(x, training=self.training)
        x = self.convs[-1](x, edge_index, edge_attr=edge_attr)  # Adding edge features here
        x = self.linear(x)
        return torch.sigmoid(x)  # Use sigmoid for binary classification

# Function to train the model
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out.view(-1), data.y.float())
    loss.backward()
    optimizer.step()
    return loss.item()

# Function to evaluate the model (focus on recall only)
def evaluate(model, data, threshold=0.5):
    model.eval()
    with torch.no_grad():
        out = model(data)
        preds = (out.view(-1) > threshold).long()
        recall = recall_score(data.y.cpu(), preds.cpu())
        return recall

# Hyperparameter search space
param_dist = {
    'hidden_size': randint(16, 128),
    'lr': uniform(0.0001, 0.01),
    'dropout': uniform(0.1, 0.5),
    'epochs': randint(10, 150)
}

# Randomized search for hyperparameter tuning
def randomized_search(data, param_dist, n_iter=10, seed=42):
    results = []
    sampler = ParameterSampler(param_dist, n_iter=n_iter, random_state=seed)
    
    for params in sampler:
        print(f"Training with params: {params}")
        model = GDPModel(num_features=data.x.size(1), hidden_size=params['hidden_size']).to(data.x.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
        criterion = nn.BCEWithLogitsLoss()
        
        for epoch in range(params['epochs']):
            train(model, data, optimizer, criterion)
        
        recall = evaluate(model, data)
        results.append((recall, params))
        print(f"Validation Recall: {recall:.4f}\n")
    
    return sorted(results, key=lambda x: x[0], reverse=True)[0]

# Load data and splits (assuming you have already run the split code)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data, splits = random_split_transd_adapted(node_features, labels, edge_index, edge_attr, train_size=0.8, test_size=0.1, device=device)

# Perform randomized search for hyperparameter tuning
best_recall, best_params = randomized_search(data, param_dist)
print(f"Best Parameters: {best_params}")
print(f"Best Validation Recall: {best_recall:.4f}")

# Train the best model
best_model = GDPModel(num_features=data.x.size(1), hidden_size=best_params['hidden_size']).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])
criterion = nn.BCEWithLogitsLoss()

for epoch in range(best_params['epochs']):
    train(best_model, data, optimizer, criterion)

    


# Cross-validation setup with stratification
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
optimal_thresholds = []

# Cross-validation process
for fold_num, (train_index, val_index) in enumerate(skf.split(data.x.cpu().numpy(), labels.cpu().numpy()), start=1):
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_index] = True
    val_mask[val_index] = True
    
    # Train model on the current fold
    best_model.train()
    for epoch in range(best_params['epochs']):
        train(best_model, data, optimizer, criterion)
    
    # Predict probabilities on the validation fold
    best_model.eval()
    with torch.no_grad():
        y_prob_val_fold = best_model(data).view(-1)[val_mask].cpu().detach().numpy()
        y_true_val_fold = labels[val_mask].cpu().detach().numpy()

    # Calculate SAR Conversion Rate and Recall for each threshold
    thresholds = np.arange(0.0, 1.0, 0.01)
    sar_rates = []
    recalls = []
    for threshold in thresholds:
        y_pred_val = (y_prob_val_fold >= threshold).astype(int)
        sar_rate = sar_conversion_rate(y_true_val_fold, y_pred_val)
        recall = recall_score(y_true_val_fold, y_pred_val)
        sar_rates.append(sar_rate)
        recalls.append(recall)

    # Objective recall for the current fold
    objective_recall = np.random.uniform(min_recall_tuning, 1.0)
    print(f"Fold {fold_num}: Objective Recall Target = {objective_recall:.4f}")
    
    # Find the optimal threshold using the dichotomy method for this fold
    optimal_threshold = find_optimal_threshold(y_prob_val_fold, y_true_val_fold, objective_recall)
    optimal_thresholds.append(optimal_threshold)

    # Plot SAR Conversion Rate and Recall for each threshold
    fig, ax1 = plt.subplots()

    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('SAR Conversion Rate', color='tab:blue')
    ax1.plot(thresholds, sar_rates, color='tab:blue', label='SAR Conversion Rate')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.set_ylabel('Recall (Class 1)', color='tab:orange')
    ax2.plot(thresholds, recalls, color='tab:orange', label='Recall (Class 1)')
    ax2.tick_params(axis='y', labelcolor='tab:orange')

    # Highlight the selected threshold and the objective recall
    ax1.axvline(x=optimal_threshold, color='red', linestyle='--', linewidth=1, label='Selected Threshold')
    ax2.axhline(y=objective_recall, color='green', linestyle='--', linewidth=1, label='Objective Recall')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.title(f'Threshold vs SAR Conversion Rate & Recall (Class 1) - Fold {fold_num}')
    plt.show()

# Average the thresholds obtained from all folds
average_threshold = np.mean(optimal_thresholds)
print(f"Averaged Optimal Threshold from Cross-Validation: {average_threshold:.4f}")

# Evaluate on the test set using the averaged threshold
best_model.eval()
with torch.no_grad():
    y_prob_test = best_model(data).view(-1)[splits['test']].cpu().detach().numpy()
    y_true_test = labels[splits['test']].cpu().detach().numpy()
    y_pred_test = (y_prob_test >= average_threshold).astype(int)

test_sar_conversion = sar_conversion_rate(y_true_test, y_pred_test)
test_recall = recall_score(y_true_test, y_pred_test)

print(f"Test Set SAR Conversion Rate at Averaged Threshold: {test_sar_conversion:.4f}")
print(f"Test Set Recall at Averaged Threshold: {test_recall:.4f}")
    

## Message passing node classification

Sans class imbalance management et entrainement avec une fonction qui pénalise plus la classe 1 que la classe 0

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
import torch.nn.functional as F
import torch.optim as optim

# Prepare edge_index, node features, edge features, and labels as before
# Assuming transac and accounts DataFrames are already defined

sender_ids = torch.tensor(transac['SENDER_ACCOUNT_ID'].values, dtype=torch.long)
receiver_ids = torch.tensor(transac['RECEIVER_ACCOUNT_ID'].values, dtype=torch.long)
edge_index = torch.stack([sender_ids, receiver_ids], dim=0)
node_features = torch.tensor(accounts[['INIT_BALANCE']].values, dtype=torch.float)
edge_attr = torch.tensor(transac[['TX_AMOUNT', 'TIMESTAMP']].values, dtype=torch.float)
labels = torch.tensor(accounts['IS_FRAUD'].astype(int).values, dtype=torch.long)
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=labels)

class MPNN(MessagePassing):
    def __init__(self, in_channels, out_channels, edge_dim):
        super(MPNN, self).__init__(aggr='mean')
        self.lin = torch.nn.Linear(in_channels + edge_dim, out_channels)

    def forward(self, x, edge_index, edge_attr):
        return self.propagate(edge_index, x=x, edge_attr=edge_attr)

    def message(self, x_j, edge_attr):
        return F.relu(self.lin(torch.cat([x_j, edge_attr], dim=-1)))

class NodeClassifier(torch.nn.Module):
    def __init__(self, num_node_features, num_edge_features, hidden_dim, num_classes):
        super(NodeClassifier, self).__init__()
        self.mpnn = MPNN(num_node_features, hidden_dim, num_edge_features)
        self.linear = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.mpnn(x, edge_index, edge_attr)
        return self.linear(x)

# Instantiate the model
num_node_features = 1  # INIT_BALANCE
num_edge_features = 2  # TX_AMOUNT and TIMESTAMP
hidden_dim = 32
num_classes = 2  # Binary classification

model = NodeClassifier(num_node_features, num_edge_features, hidden_dim, num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Weighted cross-entropy loss, giving more weight to class 1 (fraud)
class_weights = torch.tensor([1.0, 5.0])
loss_criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

def sar_conversion_rate(y_true, y_pred):
    positive_predictions = np.sum(y_pred)
    if positive_predictions == 0:
        return 0
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    return true_positives / positive_predictions

def find_optimal_threshold(y_prob, y_true, limit_recall, tol=1e-4):
    low, high = 0.0, 1.0
    best_threshold = 0.5
    while high - low > tol:
        mid = (low + high) / 2.0
        y_pred = (y_prob >= mid).astype(int)
        recall = recall_score(y_true, y_pred)
        if recall < limit_recall:
            high = mid
        else:
            best_threshold = mid
            low = mid
    return best_threshold

# Cross-validation setup with stratification
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
optimal_thresholds = []

for fold_num, (train_index, val_index) in enumerate(skf.split(data.x.cpu().numpy(), labels.cpu().numpy()), start=1):
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_index] = True
    val_mask[val_index] = True
    
    # Train the model on the current fold
    model.train()
    for epoch in range(50):  # Reduce the number of epochs for faster convergence
        optimizer.zero_grad()
        out = model(data)
        loss = loss_criterion(out[train_mask], labels[train_mask])
        loss.backward()
        optimizer.step()
    
    # Predict probabilities on the validation fold
    model.eval()
    with torch.no_grad():
        y_prob_val_fold = F.softmax(model(data), dim=1)[:, 1][val_mask].cpu().numpy()
        y_true_val_fold = labels[val_mask].cpu().numpy()

    # Calculate SAR Conversion Rate and Recall for each threshold
    thresholds = np.arange(0.0, 1.0, 0.01)
    sar_rates = []
    recalls = []
    for threshold in thresholds:
        y_pred_val = (y_prob_val_fold >= threshold).astype(int)
        sar_rate = sar_conversion_rate(y_true_val_fold, y_pred_val)
        recall = recall_score(y_true_val_fold, y_pred_val)
        sar_rates.append(sar_rate)
        recalls.append(recall)

    # Objective recall for the current fold
    objective_recall = np.random.uniform(0.8, 1.0)  # Adjust min_recall_tuning as needed
    print(f"Fold {fold_num}: Objective Recall Target = {objective_recall:.4f}")
    
    # Find the optimal threshold using the dichotomy method for this fold
    optimal_threshold = find_optimal_threshold(y_prob_val_fold, y_true_val_fold, objective_recall)
    optimal_thresholds.append(optimal_threshold)

    # Plot SAR Conversion Rate and Recall for each threshold
    fig, ax1 = plt.subplots()

    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('SAR Conversion Rate', color='tab:blue')
    ax1.plot(thresholds, sar_rates, color='tab:blue', label='SAR Conversion Rate')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.set_ylabel('Recall (Class 1)', color='tab:orange')
    ax2.plot(thresholds, recalls, color='tab:orange', label='Recall (Class 1)')
    ax2.tick_params(axis='y', labelcolor='tab:orange')

    # Highlight the selected threshold and the objective recall
    ax1.axvline(x=optimal_threshold, color='red', linestyle='--', linewidth=1, label='Selected Threshold')
    ax2.axhline(y=objective_recall, color='green', linestyle='--', linewidth=1, label='Objective Recall')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.title(f'Threshold vs SAR Conversion Rate & Recall (Class 1) - Fold {fold_num}')
    plt.show()

# Average the thresholds obtained from all folds
average_threshold = np.mean(optimal_thresholds)
print(f"Averaged Optimal Threshold from Cross-Validation: {average_threshold:.4f}")

# Evaluate on the test set using the averaged threshold
model.eval()
with torch.no_grad():
    y_prob_test = F.softmax(model(data), dim=1)[:, 1][splits['test']].cpu().numpy()
    y_true_test = labels[splits['test']].cpu().numpy()
    y_pred_test = (y_prob_test >= average_threshold).astype(int)

test_sar_conversion = sar_conversion_rate(y_true_test, y_pred_test)
test_recall = recall_score(y_true_test, y_pred_test)

print(f"Test Set SAR Conversion Rate at Averaged Threshold: {test_sar_conversion:.4f}")
print(f"Test Set Recall at Averaged Threshold: {test_recall:.4f}")


<u>Remarque :</u> Le rappel et très instable et le conversion rate reste globalement faible.

Il sera donc fondamental d'utiliser des méthdoes de imbalance management et de changer la fonction objectif dans le tuning des hyperparamètres.

# Message passing node classification

On utilisera la méthode de sur-échantillonage SMOTE, et l'optisation des hyperpatamètres se fera sur le F1-score 

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
import torch.nn.functional as F
import torch.optim as optim
from imblearn.over_sampling import SMOTE

# Prepare edge_index, node features, edge features, and labels
sender_ids = torch.tensor(transac['SENDER_ACCOUNT_ID'].values, dtype=torch.long)
receiver_ids = torch.tensor(transac['RECEIVER_ACCOUNT_ID'].values, dtype=torch.long)
edge_index = torch.stack([sender_ids, receiver_ids], dim=0)
node_features = torch.tensor(accounts[['INIT_BALANCE']].values, dtype=torch.float)
edge_attr = torch.tensor(transac[['TX_AMOUNT', 'TIMESTAMP']].values, dtype=torch.float)
labels = torch.tensor(accounts['IS_FRAUD'].astype(int).values, dtype=torch.long)

# Oversample the data using SMOTE
X_resampled, y_resampled = SMOTE().fit_resample(node_features.cpu().numpy(), labels.cpu().numpy())
node_features_resampled = torch.tensor(X_resampled, dtype=torch.float)
labels_resampled = torch.tensor(y_resampled, dtype=torch.long)

# Create the PyTorch Geometric data object with resampled data
data = Data(x=node_features_resampled, edge_index=edge_index, edge_attr=edge_attr, y=labels_resampled)

class MPNN(MessagePassing):
    def __init__(self, in_channels, out_channels, edge_dim):
        super(MPNN, self).__init__(aggr='mean')
        self.lin = torch.nn.Linear(in_channels + edge_dim, out_channels)

    def forward(self, x, edge_index, edge_attr):
        return self.propagate(edge_index, x=x, edge_attr=edge_attr)

    def message(self, x_j, edge_attr):
        return F.relu(self.lin(torch.cat([x_j, edge_attr], dim=-1)))

class NodeClassifier(torch.nn.Module):
    def __init__(self, num_node_features, num_edge_features, hidden_dim, num_classes):
        super(NodeClassifier, self).__init__()
        self.mpnn = MPNN(num_node_features, hidden_dim, num_edge_features)
        self.linear = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.mpnn(x, edge_index, edge_attr)
        return self.linear(x)

# Instantiate the model
num_node_features = 1  # INIT_BALANCE
num_edge_features = 2  # TX_AMOUNT and TIMESTAMP
hidden_dim = 32
num_classes = 2  # Binary classification

model = NodeClassifier(num_node_features, num_edge_features, hidden_dim, num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Standard cross-entropy loss
loss_criterion = torch.nn.CrossEntropyLoss()

def sar_conversion_rate(y_true, y_pred):
    positive_predictions = np.sum(y_pred)
    if positive_predictions == 0:
        return 0
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    return true_positives / positive_predictions

# Cross-validation setup with stratification
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
optimal_thresholds = []

for fold_num, (train_index, val_index) in enumerate(skf.split(data.x.cpu().numpy(), labels_resampled.cpu().numpy()), start=1):
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_index] = True
    val_mask[val_index] = True
    
    # Train the model on the current fold
    model.train()
    for epoch in range(50):  # Reduce the number of epochs for faster convergence
        optimizer.zero_grad()
        out = model(data)
        loss = loss_criterion(out[train_mask], labels_resampled[train_mask])
        loss.backward()
        optimizer.step()
    
    # Predict probabilities on the validation fold
    model.eval()
    with torch.no_grad():
        y_prob_val_fold = F.softmax(model(data), dim=1)[:, 1][val_mask].cpu().numpy()
        y_true_val_fold = labels_resampled[val_mask].cpu().numpy()

    # Calculate SAR Conversion Rate and Recall for each threshold
    thresholds = np.arange(0.0, 1.0, 0.01)
    sar_rates = []
    recalls = []
    for threshold in thresholds:
        y_pred_val = (y_prob_val_fold >= threshold).astype(int)
        sar_rate = sar_conversion_rate(y_true_val_fold, y_pred_val)
        recall = recall_score(y_true_val_fold, y_pred_val)
        sar_rates.append(sar_rate)
        recalls.append(recall)

    # Objective recall for the current fold
    objective_recall = np.random.uniform(0.8, 1.0)  # Adjust min_recall_tuning as needed
    print(f"Fold {fold_num}: Objective Recall Target = {objective_recall:.4f}")
    
    # Find the optimal threshold using the dichotomy method for this fold
    optimal_threshold = find_optimal_threshold(y_prob_val_fold, y_true_val_fold, objective_recall)
    optimal_thresholds.append(optimal_threshold)

    # Plot SAR Conversion Rate and Recall for each threshold
    fig, ax1 = plt.subplots()

    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('SAR Conversion Rate', color='tab:blue')
    ax1.plot(thresholds, sar_rates, color='tab:blue', label='SAR Conversion Rate')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.set_ylabel('Recall (Class 1)', color='tab:orange')
    ax2.plot(thresholds, recalls, color='tab:orange', label='Recall (Class 1)')
    ax2.tick_params(axis='y', labelcolor='tab:orange')

    # Highlight the selected threshold and the objective recall
    ax1.axvline(x=optimal_threshold, color='red', linestyle='--', linewidth=1, label='Selected Threshold')
    ax2.axhline(y=objective_recall, color='green', linestyle='--', linewidth=1, label='Objective Recall')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.title(f'Threshold vs SAR Conversion Rate & Recall (Class 1) - Fold {fold_num}')
    plt.show()

# Average the thresholds obtained from all folds
average_threshold = np.mean(optimal_thresholds)
print(f"Averaged Optimal Threshold from Cross-Validation: {average_threshold:.4f}")

# Evaluate on the test set using the averaged threshold
model.eval()
with torch.no_grad():
    y_prob_test = F.softmax(model(data), dim=1)[:, 1][splits['test']].cpu().numpy()
    y_true_test = labels_resampled[splits['test']].cpu().numpy()
    y_pred_test = (y_prob_test >= average_threshold).astype(int)

test_sar_conversion = sar_conversion_rate(y_true_test, y_pred_test)
test_recall = recall_score(y_true_test, y_pred_test)
test_f1_score = f1_score(y_true_test, y_pred_test)

print(f"Test Set SAR Conversion Rate at Averaged Threshold: {test_sar_conversion:.4f}")
print(f"Test Set Recall at Averaged Threshold: {test_recall:.4f}")
print(f"Test Set F1 Score at Averaged Threshold: {test_f1_score:.4f}")


<u>Remarque :</u> La méthode de tuning de threshold que l'on a utilisé en notebook 3 n'est pas adaptée ici. Notre fonction recall est en escalier, avec une marche nous menant à un recall élevée, un palier immense pour un recall un peu faible mais acceptable, puis une marche à recall casi 0. Il nous sera donc difficile de viser un recall extrêmement élevé, la pente étant très inclinée, on a intérêt à choisir le threshold le plus élevé sur la marche de rappel, pour essayer de maximiser le conversion rate.

A vu d'oeil, un threshold de 0.7 devrait nous assurer de rester sur ce palier de rappel, voyons donc ce que nous donne un seuil de 0.7

### MPNN avec SMOTE et optimisation sur f1-score, avec un seuil de décision de 0.7

In [3]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
import torch.nn.functional as F
import torch.optim as optim
from imblearn.over_sampling import SMOTE

# Prepare edge_index, node features, edge features, and labels
sender_ids = torch.tensor(transac['SENDER_ACCOUNT_ID'].values, dtype=torch.long)
receiver_ids = torch.tensor(transac['RECEIVER_ACCOUNT_ID'].values, dtype=torch.long)
edge_index = torch.stack([sender_ids, receiver_ids], dim=0)
node_features = torch.tensor(accounts[['INIT_BALANCE']].values, dtype=torch.float)
edge_attr = torch.tensor(transac[['TX_AMOUNT', 'TIMESTAMP']].values, dtype=torch.float)
labels = torch.tensor(accounts['IS_FRAUD'].astype(int).values, dtype=torch.long)

# Oversample the data using SMOTE
X_resampled, y_resampled = SMOTE().fit_resample(node_features.cpu().numpy(), labels.cpu().numpy())
node_features_resampled = torch.tensor(X_resampled, dtype=torch.float)
labels_resampled = torch.tensor(y_resampled, dtype=torch.long)

# Create the PyTorch Geometric data object with resampled data
data = Data(x=node_features_resampled, edge_index=edge_index, edge_attr=edge_attr, y=labels_resampled)

class MPNN(MessagePassing):
    def __init__(self, in_channels, out_channels, edge_dim):
        super(MPNN, self).__init__(aggr='mean')
        self.lin = torch.nn.Linear(in_channels + edge_dim, out_channels)

    def forward(self, x, edge_index, edge_attr):
        return self.propagate(edge_index, x=x, edge_attr=edge_attr)

    def message(self, x_j, edge_attr):
        return F.relu(self.lin(torch.cat([x_j, edge_attr], dim=-1)))

class NodeClassifier(torch.nn.Module):
    def __init__(self, num_node_features, num_edge_features, hidden_dim, num_classes):
        super(NodeClassifier, self).__init__()
        self.mpnn = MPNN(num_node_features, hidden_dim, num_edge_features)
        self.linear = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.mpnn(x, edge_index, edge_attr)
        return self.linear(x)

# Instantiate the model
num_node_features = 1  # INIT_BALANCE
num_edge_features = 2  # TX_AMOUNT and TIMESTAMP
hidden_dim = 32
num_classes = 2  # Binary classification

model = NodeClassifier(num_node_features, num_edge_features, hidden_dim, num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Standard cross-entropy loss
loss_criterion = torch.nn.CrossEntropyLoss()

def sar_conversion_rate(y_true, y_pred):
    positive_predictions = np.sum(y_pred)
    if positive_predictions == 0:
        return 0
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    return true_positives / positive_predictions

# Cross-validation setup with stratification
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

for fold_num, (train_index, val_index) in enumerate(skf.split(data.x.cpu().numpy(), labels_resampled.cpu().numpy()), start=1):
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_index] = True
    val_mask[val_index] = True
    
    # Train the model on the current fold
    model.train()
    for epoch in range(70): 
        optimizer.zero_grad()
        out = model(data)
        loss = loss_criterion(out[train_mask], labels_resampled[train_mask])
        loss.backward()
        optimizer.step()
    
    # Predict probabilities on the validation fold
    model.eval()
    with torch.no_grad():
        y_prob_val_fold = F.softmax(model(data), dim=1)[:, 1][val_mask].cpu().numpy()
        y_true_val_fold = labels_resampled[val_mask].cpu().numpy()

    # Set the fixed threshold at 0.7
    threshold = 0.7
    y_pred_val = (y_prob_val_fold >= threshold).astype(int)
    
    # Calculate SAR Conversion Rate and Recall
    sar_rate = sar_conversion_rate(y_true_val_fold, y_pred_val)
    recall = recall_score(y_true_val_fold, y_pred_val)
    f1 = f1_score(y_true_val_fold, y_pred_val)

    print(f"Fold {fold_num} - SAR Conversion Rate: {sar_rate:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Evaluate on the test set using the fixed threshold
#model.eval()
#with torch.no_grad():
#    y_prob_test = F.softmax(model(data), dim=1)[:, 1][val_mask].cpu().numpy()
#    y_true_test = labels_resampled[val_mask].cpu().numpy()
#    y_pred_test = (y_prob_test >= threshold).astype(int)

#test_sar_conversion = sar_conversion_rate(y_true_test, y_pred_test)
#test_recall = recall_score(y_true_test, y_pred_test)
#test_f1_score = f1_score(y_true_test, y_pred_test)

#print(f"Test Set SAR Conversion Rate at Threshold 0.7: {test_sar_conversion:.4f}")
#print(f"Test Set Recall at Threshold 0.7: {test_recall:.4f}")
#print(f"Test Set F1 Score at Threshold 0.7: {test_f1_score:.4f}")

Fold 1 - SAR Conversion Rate: 0.0000, Recall: 0.0000, F1 Score: 0.0000
Fold 2 - SAR Conversion Rate: 0.9677, Recall: 0.7143, F1 Score: 0.8219
Fold 3 - SAR Conversion Rate: 0.9722, Recall: 0.8333, F1 Score: 0.8974
Fold 4 - SAR Conversion Rate: 1.0000, Recall: 0.8571, F1 Score: 0.9231
Fold 5 - SAR Conversion Rate: 0.5676, Recall: 1.0000, F1 Score: 0.7241
Fold 6 - SAR Conversion Rate: 0.9250, Recall: 0.8810, F1 Score: 0.9024
Fold 7 - SAR Conversion Rate: 0.5385, Recall: 1.0000, F1 Score: 0.7000
Fold 8 - SAR Conversion Rate: 0.6452, Recall: 0.9524, F1 Score: 0.7692
Fold 9 - SAR Conversion Rate: 0.9714, Recall: 0.8095, F1 Score: 0.8831
Fold 10 - SAR Conversion Rate: 0.8500, Recall: 0.8095, F1 Score: 0.8293
Fold 11 - SAR Conversion Rate: 0.9655, Recall: 0.6667, F1 Score: 0.7887
Fold 12 - SAR Conversion Rate: 1.0000, Recall: 0.6667, F1 Score: 0.8000
Fold 13 - SAR Conversion Rate: 0.9500, Recall: 0.9048, F1 Score: 0.9268
Fold 14 - SAR Conversion Rate: 0.6724, Recall: 0.9286, F1 Score: 0.7800
F