In [3]:
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import InMemoryDataset, Data
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T
import torch.nn as nn
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import numpy as np
import itertools
from copy import deepcopy
from tqdm import tqdm
import logging
import os
from tabulate import tabulate
import matplotlib.pyplot as plt
import csv
from datetime import datetime
from EMP.metrics import empCreditScoring

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
os.chdir(r"C:\Users\guest\Desktop\OneDrive_2025-01-04\Van Kerschaver & Xu - Benchmarking graph neural networks for churn prediction - shared folder\Data\Transformed data (csv format)\Mobile Vikings")

In [5]:
# Load all nodes that churn in month 1
churners_m1 = pd.read_csv("L_M1.csv")
churners_list_m1 = churners_m1[churners_m1['churn_m1'] == 1]['USR'].tolist()
print("Number of churners in month 1: ", len(churners_list_m1))

Number of churners in month 1:  14055


In [6]:
import torch
from torch_geometric.data import Data

def remove_nodes(data, nodes_to_remove):
    """
    Remove specified nodes from a PyTorch Geometric Data object and update all related attributes.
    
    Args:
        data: PyTorch Geometric Data object
        nodes_to_remove: List or tensor of node indices to remove
    
    Returns:
        Updated Data object with nodes removed
    """
    # Convert nodes_to_remove to a set for faster lookup
    if not isinstance(nodes_to_remove, set):
        nodes_to_remove = set(nodes_to_remove.tolist() if torch.is_tensor(nodes_to_remove) else nodes_to_remove)
    
    # Create a mask for nodes to keep
    keep_nodes = torch.ones(data.num_nodes, dtype=torch.bool)
    keep_nodes[list(nodes_to_remove)] = False
    
    # Create a mapping from old to new indices
    new_indices = torch.cumsum(keep_nodes, dim=0) - 1
    
    # Filter edges: keep only edges where both source and target nodes are kept
    edge_mask = torch.ones(data.edge_index.size(1), dtype=torch.bool)
    for i in range(data.edge_index.size(1)):
        src, dst = data.edge_index[:, i]
        if src.item() in nodes_to_remove or dst.item() in nodes_to_remove:
            edge_mask[i] = False
    
    # Create new data object with filtered nodes and edges
    new_data = Data()
    
    # Update node features
    if hasattr(data, 'x') and data.x is not None:
        new_data.x = data.x[keep_nodes]
    
    # Update node labels
    if hasattr(data, 'y') and data.y is not None:
        new_data.y = data.y[keep_nodes]
    
    # Update edge indices and attributes
    if hasattr(data, 'edge_index') and data.edge_index is not None:
        # Filter edges
        new_data.edge_index = data.edge_index[:, edge_mask]
        # Update indices
        for i in range(new_data.edge_index.size(1)):
            new_data.edge_index[0, i] = new_indices[new_data.edge_index[0, i]]
            new_data.edge_index[1, i] = new_indices[new_data.edge_index[1, i]]
    
    # Update edge attributes
    if hasattr(data, 'edge_attr') and data.edge_attr is not None:
        new_data.edge_attr = data.edge_attr[edge_mask]
    
    # Preserve other attributes
    new_data.num_classes = data.num_classes
    new_data.num_features = data.num_features
    
    return new_data


# Training 

In [7]:
node_attr= pd.read_csv("train_rmf.csv", sep=",",  header=0 )
node_attr.index += 1 #to make sure the nodes start counting at 1
display(node_attr)

Unnamed: 0,USR,R_on,M_30_on,M_60_on,M_90_on,F_30_on,F_60_on,F_90_on,numDialing_30_on,numDialed_30_on,numDialing_60_on,numDialed_60_on,numDialing_90_on,numDialed_90_on
1,1.0,0,1389.019590,3101.536954,6720.267202,54,94,214,3,4,6,6,10,8
2,2.0,0,2847.647333,5487.314113,6696.261817,228,449,578,5,5,6,5,6,5
3,3.0,11,706.386715,2611.513490,4056.546870,29,118,218,5,6,5,6,5,6
4,4.0,15,837.415605,1107.172814,1153.624858,55,74,78,5,5,5,6,5,6
5,5.0,0,1059.638912,1395.772800,3136.134856,78,116,259,5,5,5,5,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168347,168351.0,1000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
168348,168353.0,1000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
168349,168354.0,1000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
168350,168355.0,1000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0


In [8]:
#don't need first column since this is USR 
node_attr= node_attr.iloc[:, 1:]
display(node_attr)
# List of columns you want to drop
columns_to_drop = ['M_60_on', 'M_90_on','F_60_on','F_90_on','numDialing_60_on','numDialed_60_on','numDialing_90_on','numDialed_90_on']
# Drop the specified columns
node_attr = node_attr.drop(columns=columns_to_drop)
# normalizing the attributes
scale = StandardScaler()
attrs_norm = scale.fit_transform(node_attr)
#to have more numbers after the comma
torch.set_printoptions(precision=10)
display(attrs_norm)
attrs_train = torch.tensor(attrs_norm, dtype=torch.float)

Unnamed: 0,R_on,M_30_on,M_60_on,M_90_on,F_30_on,F_60_on,F_90_on,numDialing_30_on,numDialed_30_on,numDialing_60_on,numDialed_60_on,numDialing_90_on,numDialed_90_on
1,0,1389.019590,3101.536954,6720.267202,54,94,214,3,4,6,6,10,8
2,0,2847.647333,5487.314113,6696.261817,228,449,578,5,5,6,5,6,5
3,11,706.386715,2611.513490,4056.546870,29,118,218,5,6,5,6,5,6
4,15,837.415605,1107.172814,1153.624858,55,74,78,5,5,5,6,5,6
5,0,1059.638912,1395.772800,3136.134856,78,116,259,5,5,5,5,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
168347,1000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
168348,1000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
168349,1000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
168350,1000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0


array([[-0.41872848,  0.0314685 , -0.05325674,  0.14897139,  0.52507858],
       [-0.41872848,  0.38266333,  1.18667903,  0.85624259,  0.89452061],
       [-0.38580757, -0.13288949, -0.23140843,  0.85624259,  1.26396265],
       ...,
       [ 2.57408188, -0.30296672, -0.4380644 , -0.91193541, -0.95268956],
       [ 2.57408188, -0.30296672, -0.4380644 , -0.91193541, -0.95268956],
       [ 2.57408188, -0.30296672, -0.4380644 , -0.91193541, -0.95268956]])

In [9]:
#read the adjacency matrix
adj_train = pd.read_csv("SN_M2_c.csv", sep=",",  header=0)
display(adj_train)
#make them into tensors
edge_idx_train_dir0=torch.tensor([adj_train['i'], adj_train['j']], dtype=torch.long)
edge_idx_train_dir1= torch.tensor([adj_train['j'], adj_train['i']], dtype=torch.long)
edge_idx_train= torch.cat((edge_idx_train_dir0, edge_idx_train_dir1), dim=1)
display(edge_idx_train)
print(len(edge_idx_train[0]))
#create edge_attr from SN_ data file (don't normalise!!)   
#should we create tensor with only the attributes or also the edges?
edge_attrs_train_1= torch.tensor(adj_train['x'], dtype=torch.float64) 

#need it twice since we needed the edges in both directions
edge_attrs_train= torch.cat((edge_attrs_train_1,edge_attrs_train_1), dim=-1)
print(edge_attrs_train)
print(len(edge_attrs_train))

Unnamed: 0,i,j,x
0,5491,1,1
1,29407,1,47
2,30662,1,1
3,47527,1,2
4,51309,1,61
...,...,...,...
497731,79862,167208,1
497732,145555,167243,1
497733,39480,167268,2
497734,92195,167681,3


tensor([[  5491,  29407,  30662,  ..., 167268, 167681, 167807],
        [     1,      1,      1,  ...,  39480,  92195, 140824]])

995472
tensor([ 1., 47.,  1.,  ...,  2.,  3.,  1.], dtype=torch.float64)
995472


In [10]:
labels_pd_train= pd.read_csv("L_M3.csv", sep=",",  header=0)
# now need to transform them to tensors
labels_train = torch.tensor(labels_pd_train['churn_m3'], dtype=torch.long)
print(labels_train) 

tensor([0, 0, 0,  ..., 0, 0, 0])


In [11]:
## create a PyTorch Geometric Data object for training a graph neural network
class MobileVikings_train:
    def __init__(self, attrs_train, edge_idx_train,edge_attrs_train,labels_train):
        
        self.data_train=Data(x=attrs_train, edge_index=edge_idx_train, edge_attr=edge_attrs_train,y=labels_train )
        
        self.data_train.num_classes=2
        self.data_train.num_features= len(node_attr.columns)
    
dataset= MobileVikings_train(attrs_train, edge_idx_train,edge_attrs_train,labels_train)

# we save this in data_train
data_train= dataset.data_train
print(data_train)

Data(x=[168351, 5], edge_index=[2, 995472], edge_attr=[995472], y=[168351], num_classes=2, num_features=5)


In [12]:
data_train= remove_nodes(data_train, churners_list_m1)
print(data_train)

Data(x=[154296, 5], y=[154296], edge_index=[2, 929204], edge_attr=[929204], num_classes=2, num_features=5)


# Validation

In [13]:
adj_val = pd.read_csv("SN_M3_c.csv", sep=",",  header=0)
display(adj_val)
#make them into tensors
edge_idx_val_dir0=torch.tensor([adj_val['i'], adj_val['j']], dtype=torch.long)
edge_idx_val_dir1= torch.tensor([adj_val['j'], adj_val['i']], dtype=torch.long)
edge_idx_val= torch.cat((edge_idx_val_dir0, edge_idx_val_dir1), dim=1)
display(edge_idx_val)
print(len(edge_idx_val[0]))

Unnamed: 0,i,j,x
0,29407,1,55
1,47527,1,1
2,51309,1,12
3,68448,1,29
4,114644,1,1
...,...,...,...
505620,135680,167507,3
505621,65340,167616,1
505622,20350,167652,3
505623,73780,167780,1


tensor([[ 29407,  47527,  51309,  ..., 167652, 167780, 168139],
        [     1,      1,      1,  ...,  20350,  73780,  21770]])

1011250


In [14]:
edge_attrs_val_1= torch.tensor(adj_val['x'], dtype=torch.float64) 
#need it twice since we needed the edges in both directions
edge_attrs_val= torch.cat((edge_attrs_val_1,edge_attrs_val_1))
print(edge_attrs_val)

tensor([55.,  1., 12.,  ...,  3.,  1.,  1.], dtype=torch.float64)


In [15]:
labels_pd_val= pd.read_csv("L_M4.csv", sep=",",  header=0)
labels_val = torch.tensor(labels_pd_val['churn_m4'], dtype=torch.long)
print(labels_val) 

tensor([0, 0, 0,  ..., 0, 0, 0])


In [16]:
class MobileVikings_val:
    def __init__(self, attrs_train,edge_idx_val,edge_attrs_val,labels_val):
        self.data_val=Data(x=attrs_train, edge_index=edge_idx_val, edge_attr=edge_attrs_val,y=labels_val )
        self.data_val.num_classes=2
        self.data_val.num_features= len(node_attr.columns)
dataset=MobileVikings_val(attrs_train,edge_idx_val,edge_attrs_val,labels_val)
data_val = dataset.data_val
print(data_val)

Data(x=[168351, 5], edge_index=[2, 1011250], edge_attr=[1011250], y=[168351], num_classes=2, num_features=5)


In [17]:
data_val= remove_nodes(data_val, churners_list_m1)
print(data_val)

Data(x=[154296, 5], y=[154296], edge_index=[2, 943718], edge_attr=[943718], num_classes=2, num_features=5)


# Testing

In [18]:
test_node_attr= pd.read_csv("test_rmf.csv", sep=",",  header=0 )
test_node_attr.index += 1 #to make sure the nodes start counting at 1

#don't need first column since they is USR 
test_node_attr= test_node_attr.iloc[:, 1:]
columns_to_drop = ['M_60_on', 'M_90_on','F_60_on','F_90_on','numDialing_60_on','numDialed_60_on','numDialing_90_on','numDialed_90_on']
# Drop the specified columns
test_node_attr = test_node_attr.drop(columns=columns_to_drop)

# normalizing the attributes
scale = StandardScaler()
test_attrs_norm = scale.fit_transform(test_node_attr)

#to have more numbers after the comma
torch.set_printoptions(precision=10)

#now need to transform them to tensors
attrs_test = torch.tensor(test_attrs_norm, dtype=torch.float) #this can also be done in class
display(attrs_test)

tensor([[-0.3715719879, -0.2578698695, -0.3550667465,  0.1016500667,
          0.8493221998],
        [-0.3875837326,  0.2734937668,  0.7645937204,  0.4552221894,
          0.8493221998],
        [-0.3875837326, -0.1037445739,  0.0371710025,  0.4552221894,
          0.8493221998],
        ...,
        [ 2.8147616386, -0.2963555157, -0.4406458735, -0.9590663314,
         -1.0071097612],
        [ 2.8147616386, -0.2963555157, -0.4406458735, -0.9590663314,
         -1.0071097612],
        [ 2.8147616386, -0.2963555157, -0.4406458735, -0.9590663314,
         -1.0071097612]])

In [19]:
adj_test = pd.read_csv("SN_M4_c.csv", sep=",",  header=0)
display(adj_test)
edge_idx_test_dir0=torch.tensor([adj_test['i'], adj_test['j']], dtype=torch.long)
edge_idx_test_dir1= torch.tensor([adj_test['j'], adj_test['i']], dtype=torch.long)
edge_idx_test= torch.cat((edge_idx_test_dir0, edge_idx_test_dir1), dim=1)
print(edge_idx_test)
print(len(edge_idx_test[0]))

Unnamed: 0,i,j,x
0,29407,1,29
1,34359,1,2
2,47527,1,10
3,48775,1,48
4,51309,1,11
...,...,...,...
476166,94740,167584,8
476167,100201,167617,3
476168,75692,167767,3
476169,102971,168267,1


tensor([[ 29407,  34359,  47527,  ..., 167767, 168267, 168344],
        [     1,      1,      1,  ...,  75692, 102971,  60109]])
952342


In [20]:
edge_attrs_test_1= torch.tensor(adj_test['x'], dtype=torch.float64) 

#need it twice since we needed the edges in both directions
edge_attrs_test= torch.cat((edge_attrs_test_1,edge_attrs_test_1))
print(edge_attrs_test)

tensor([29.,  2., 10.,  ...,  3.,  1.,  1.], dtype=torch.float64)


In [21]:
labels_pd_test= pd.read_csv("L_test.csv", sep=",",  header=0)

#now need to transform them to tensors
labels_test = torch.tensor(labels_pd_test['churn_test'], dtype=torch.long)
print(labels_test)

tensor([0, 0, 0,  ..., 1, 1, 1])


In [22]:
#creating the dataset
from torch_geometric.data import InMemoryDataset, Data

class MobileVikings_test:
    def __init__(self, attrs_test, edge_idx_test,edge_attrs_test,labels_test):
        
        self.data_test=Data(x=attrs_test, edge_index=edge_idx_test, edge_attr=edge_attrs_test,y=labels_test )
        
        self.data_test.num_classes=2
        self.data_test.num_features= len(node_attr.columns)
    
dataset= MobileVikings_test(attrs_test, edge_idx_test,edge_attrs_test,labels_test)
data_test= dataset.data_test
print(data_test)

Data(x=[168351, 5], edge_index=[2, 952342], edge_attr=[952342], y=[168351], num_classes=2, num_features=5)


In [23]:
data_test= remove_nodes(data_test, churners_list_m1)
print(data_test)

Data(x=[154296, 5], y=[154296], edge_index=[2, 890852], edge_attr=[890852], num_classes=2, num_features=5)


# GCN

In [None]:

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers, dropout_rate=0.5, out_channels=1):
        super(GCN, self).__init__()
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        
        # Input layer
        self.convs = nn.ModuleList()
        self.convs.append(GCNConv(in_channels, hidden_channels))
        
        # Hidden layers
        for _ in range(num_layers - 1):
            self.convs.append(GCNConv(hidden_channels, hidden_channels))
        
        # Output layer
        self.lin = nn.Linear(hidden_channels, out_channels)

        def forward(self, x, edge_index):
        # Graph convolution layers with ELU activation and dropout
            for i in range(self.num_layers):
                x = self.convs[i](x, edge_index)
                x = F.elu(x)  # Using ELU activation as requested
                x = F.dropout(x, p=self.dropout_rate, training=self.training)
            
            # Final linear layer
            x = self.lin(x)
            return x

    

In [25]:
#calculating the weights for the loss function
total_weight_class_0 = 0
total_weight_class_1 = 0

# Iterate over the first 4 months (since test data looks different)
for month in range(1, 5):
    labels_pd = pd.read_csv(f"L_M{month}.csv", sep=",", header=0)
    
    #weight_for_class_0 = labels_pd.shape[0] / ((labels_pd[f'churn_m{month}'] == 0).sum() * 2)
    weight_for_class_1 = labels_pd.shape[0] / ((labels_pd[f'churn_m{month}'] == 1).sum() * 2)
    
    # Add the weights to the total
    #total_weight_class_0 += weight_for_class_0
    total_weight_class_1 += weight_for_class_1

#only test month needs to be done seperately
labels_pd_m5 = pd.read_csv(f"L_test.csv", sep=",", header=0)
    
#weight_for_class_0_m5 = labels_pd.shape[0] / ((labels_pd_m5['churn_test'] == 0).sum() * 2)
weight_for_class_1_m5 = labels_pd.shape[0] / ((labels_pd_m5['churn_test'] == 1).sum() * 2)

#total_weight_class_0 +=  weight_for_class_0_m5
total_weight_class_1 += weight_for_class_1_m5

#avg_weight_for_class_0 = total_weight_class_0 / 5
avg_weight_for_class_1 = total_weight_class_1 / 5


# Create the weight tensor
weight = torch.tensor([  avg_weight_for_class_1])
print(weight)

tensor([5.9431902660], dtype=torch.float64)


In [26]:
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out.squeeze(), data.y.float())
    loss.backward()
    optimizer.step()
    return loss.item()

In [27]:
def calculate_emp_score(y_true,y_prob):
        output = empCreditScoring(y_prob, y_true, return_output=True,print_output=False)
        emp_credit_score = float(output.EMPC)
        return emp_credit_score


In [28]:
def calculate_lift(y_true, y_prob, percentage):
    """
    Calculate the lift at a specific percentage
    
    Parameters:
    - y_true: True labels
    - y_prob: Predicted probabilities
    - percentage: Percentage of population (0.005 for 0.5%, 0.05 for 5%)
    """
    # Convert to numpy arrays
    y_true = np.array(y_true)
    y_prob = np.array(y_prob)
    
    # Sort indices by predicted probability (descending)
    sorted_indices = np.argsort(y_prob)[::-1]
    
    # Calculate number of samples to consider
    n_samples = len(y_true)
    n_top = max(1, int(n_samples * percentage))
    
    # Get top n_top samples
    top_indices = sorted_indices[:n_top]
    
    # Calculate lift
    top_positive_rate = np.mean(y_true[top_indices])
    overall_positive_rate = np.mean(y_true)
    
    # Avoid division by zero
    if overall_positive_rate == 0:
        return 0
    
    lift = top_positive_rate / overall_positive_rate
    return lift

In [42]:
def evaluate(model, data, criterion):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        loss = criterion(out.squeeze(), data.y.float())
        
        # Get probabilities
        probs = torch.sigmoid(out).squeeze().cpu().numpy()
        y_true = data.y.cpu().numpy()
        # AUC
        auc = roc_auc_score(y_true, probs)
        
        # EMP using EMP-PY library
        emp_score = 0

        # Lift metrics
        lift_005 = calculate_lift(y_true, probs, 0.005)  # 0.5% lift
        lift_05 = calculate_lift(y_true, probs, 0.05)    # 5% lift
        

        # Binary predictions at threshold 0.5 for basic accuracy
        pred_05 = (probs >= 0.5).astype(int)
        accuracy = np.mean(pred_05 == y_true)
        
    return {
        'loss': loss.item(),
        'accuracy': accuracy,
        'auc': auc,
        'emp': emp_score,
        'lift_005': lift_005,
        'lift_05': lift_05
    }

In [None]:
def hyperparameter_tuning(data_train, data_val, data_test, learning_rates, hidden_channels, layers, device, weight):
    results_df = []
    best_val_auc = 0
    best_val_emp = 0
    best_hyperparams = {}
    best_model = None
    
    print(f"Class imbalance weight: {weight.item()}")
    
    # Create timestamp for unique CSV filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"gcn_results_{timestamp}.csv"
    
    # Set up CSV file with headers
    with open(csv_filename, 'w', newline='') as csvfile:
        fieldnames = ['lr', 'hidden_channels', 'layers', 'epoch', 
                     'train_loss', 'train_accuracy', 'train_auc', 'train_emp', 'train_lift_005', 'train_lift_05',
                     'val_loss', 'val_accuracy', 'val_auc', 'val_emp', 'val_lift_005', 'val_lift_05']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
    
    for lr in learning_rates:
        for hidden in hidden_channels:
            for num_layers in layers:
                print(f"\nTraining with lr={lr}, hidden_channels={hidden}, layers={num_layers}")
                
                # Initialize model
                model = GCN(data_train.num_features, hidden, num_layers).to(device)
                optimizer = torch.optim.Adam(model.parameters(), lr=lr)
                criterion = nn.BCEWithLogitsLoss(pos_weight=weight.to(device))
                
                # Track validation performance
                best_epoch = 0
                best_val_loss = float('inf')
                best_val_auc_for_config = 0
                best_val_emp_for_config = 0
                patience = 15
                epochs_no_improve = 0
                
                # Train for a maximum of 200 epochs
                for epoch in range(1, 201):
                    loss = train(model, data_train.to(device), optimizer, criterion)
                    
                    if epoch % 5 == 0 or epoch == 1:
                        # Evaluate on training and validation sets
                        train_metrics = evaluate(model, data_train.to(device), criterion)
                        val_metrics = evaluate(model, data_val.to(device), criterion)
                        
                        # Print metrics
                        print(f'Epoch: {epoch:03d}, Train Loss: {train_metrics["loss"]:.4f}, '
                              f'Val Loss: {val_metrics["loss"]:.4f}, Val AUC: {val_metrics["auc"]:.4f}, '
                              f'Val EMP: {val_metrics["emp"]:.4f}, Val 0.5% Lift: {val_metrics["lift_005"]:.4f}')
                        
                        # Save to CSV
                        with open(csv_filename, 'a', newline='') as csvfile:
                            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                            writer.writerow({
                                'lr': lr,
                                'hidden_channels': hidden,
                                'layers': num_layers,
                                'epoch': epoch,
                                'train_loss': train_metrics["loss"],
                                'train_accuracy': train_metrics["accuracy"],
                                'train_auc': train_metrics["auc"],
                                'train_emp': train_metrics["emp"],
                                'train_lift_005': train_metrics["lift_005"],
                                'train_lift_05': train_metrics["lift_05"],
                                'val_loss': val_metrics["loss"],
                                'val_accuracy': val_metrics["accuracy"],
                                'val_auc': val_metrics["auc"],
                                'val_emp': val_metrics["emp"],
                                'val_lift_005': val_metrics["lift_005"],
                                'val_lift_05': val_metrics["lift_05"]
                            })
                        
                        # Track best model by both validation AUC and EMP
                        improved = False
                        
                        if val_metrics["auc"] > best_val_auc_for_config:
                            best_val_auc_for_config = val_metrics["auc"]
                            improved = True
                            
                        if val_metrics["emp"] > best_val_emp_for_config:
                            best_val_emp_for_config = val_metrics["emp"]
                            improved = True
                            
                        if improved:
                            best_val_loss = val_metrics["loss"]
                            best_epoch = epoch
                            epochs_no_improve = 0
                            
                        else:
                            epochs_no_improve += 1
                            
                        # Early stopping check
                        if epochs_no_improve >= patience:
                            print(f"Early stopping at epoch {epoch}! Best epoch: {best_epoch}")
                            break
    
    # Print best hyperparameters
    print("\n=== Best Hyperparameters ===")
    print(f"Learning Rate: {best_hyperparams['lr']}")
    print(f"Hidden Channels: {best_hyperparams['hidden_channels']}")
    print(f"Number of Layers: {best_hyperparams['layers']}")
    print(f"Selected based on: {best_hyperparams.get('best_by', 'AUC')}")
    print(f"Best Validation AUC: {best_hyperparams['val_auc']:.4f}")
    print(f"Best Validation EMP: {best_hyperparams['val_emp']:.4f}")
    print(f"Best Validation 0.5% Lift: {best_hyperparams['val_lift_005']:.4f}")
    print(f"Best Validation 5% Lift: {best_hyperparams['val_lift_05']:.4f}")
    
    # Test with best model
    test_metrics = evaluate(best_model, data_test.to(device), criterion)
    
    print("\n=== Test Performance ===")
    print(f"Test Loss: {test_metrics['loss']:.4f}")
    print(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"Test AUC: {test_metrics['auc']:.4f}")
    print(f"Test EMP: {test_metrics['emp']:.4f}")
    print(f"Test 0.5% Lift: {test_metrics['lift_005']:.4f}")
    print(f"Test 5% Lift: {test_metrics['lift_05']:.4f}")
    
    # Write test results to CSV
    test_results_filename = f"gcn_test_results_{timestamp}.csv"
    with open(test_results_filename, 'w', newline='') as csvfile:
        fieldnames = ['lr', 'hidden_channels', 'layers', 'selection_criteria', 'test_loss', 'test_accuracy', 
                     'test_auc', 'test_emp', 'test_lift_005', 'test_lift_05']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerow({
            'lr': best_hyperparams['lr'],
            'hidden_channels': best_hyperparams['hidden_channels'],
            'layers': best_hyperparams['layers'],
            'selection_criteria': best_hyperparams.get('best_by', 'AUC'),
            'test_loss': test_metrics['loss'],
            'test_accuracy': test_metrics['accuracy'],
            'test_auc': test_metrics['auc'],
            'test_emp': test_metrics['emp'],
            'test_lift_005': test_metrics['lift_005'],
            'test_lift_05': test_metrics['lift_05']
        })
    
    print(f"Training results saved to: {csv_filename}")
    print(f"Test results saved to: {test_results_filename}")
    
    return best_model, best_hyperparams

In [44]:
def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Hyperparameters to tune - using the specified parameters
    learning_rates = [0.01, 0.001, 0.0001]
    hidden_channels = [32, 128, 256]
    layers = [1, 3]
    
    # Run hyperparameter tuning
    best_model, best_hyperparams = hyperparameter_tuning(
        data_train, data_val, data_test, 
        learning_rates, hidden_channels, layers, 
        device, weight
    )
    
    return best_model, best_hyperparams

# Function to analyze and visualize results
def analyze_results(results_file):
    """
    Analyze hyperparameter tuning results and create visualizations
    
    Parameters:
    - results_file: Path to the CSV file with training results
    """
    results = pd.read_csv(results_file)
    
    # Create directory for plots
    plots_dir = "tuning_plots"
    os.makedirs(plots_dir, exist_ok=True)
    
    # Plot validation metrics over epochs for each configuration
    for lr in results['lr'].unique():
        for hidden in results['hidden_channels'].unique():
            for layer in results['layers'].unique():
                subset = results[(results['lr'] == lr) & 
                                 (results['hidden_channels'] == hidden) & 
                                 (results['layers'] == layer)]
                
                if len(subset) == 0:
                    continue
                
                # Skip if fewer than 3 data points
                if len(subset) < 3:
                    continue
                
                import matplotlib.pyplot as plt
                
                # Create plot
                fig, axs = plt.subplots(2, 2, figsize=(15, 10))
                fig.suptitle(f'Metrics for lr={lr}, hidden={hidden}, layers={layer}')
                
                # Plot metrics
                axs[0, 0].plot(subset['epoch'], subset['val_auc'], 'b-', label='Validation')
                axs[0, 0].plot(subset['epoch'], subset['train_auc'], 'r--', label='Training')
                axs[0, 0].set_title('AUC')
                axs[0, 0].set_xlabel('Epoch')
                axs[0, 0].set_ylabel('AUC')
                axs[0, 0].legend()
                
                axs[0, 1].plot(subset['epoch'], subset['val_emp'], 'b-', label='Validation')
                axs[0, 1].plot(subset['epoch'], subset['train_emp'], 'r--', label='Training')
                axs[0, 1].set_title('EMP')
                axs[0, 1].set_xlabel('Epoch')
                axs[0, 1].set_ylabel('EMP')
                axs[0, 1].legend()
                
                axs[1, 0].plot(subset['epoch'], subset['val_lift_005'], 'b-', label='Validation')
                axs[1, 0].plot(subset['epoch'], subset['train_lift_005'], 'r--', label='Training')
                axs[1, 0].set_title('0.5% Lift')
                axs[1, 0].set_xlabel('Epoch')
                axs[1, 0].set_ylabel('Lift')
                axs[1, 0].legend()
                
                axs[1, 1].plot(subset['epoch'], subset['val_lift_05'], 'b-', label='Validation')
                axs[1, 1].plot(subset['epoch'], subset['train_lift_05'], 'r--', label='Training')
                axs[1, 1].set_title('5% Lift')
                axs[1, 1].set_xlabel('Epoch')
                axs[1, 1].set_ylabel('Lift')
                axs[1, 1].legend()
                
                plt.tight_layout()
                plt.savefig(f"{plots_dir}/metrics_lr{lr}_hidden{hidden}_layers{layer}.png")
                plt.close()
    
    # Create summary plots for best configurations
    best_configs = []
    
    # Find best config for each metric
    metrics = ['val_auc', 'val_emp', 'val_lift_005', 'val_lift_05']
    for metric in metrics:
        # Group by hyperparameters and find maximum value for the metric
        grouped = results.groupby(['lr', 'hidden_channels', 'layers'])[metric].max().reset_index()
        best_config = grouped.loc[grouped[metric].idxmax()]
        best_configs.append({
            'metric': metric,
            'lr': best_config['lr'],
            'hidden_channels': best_config['hidden_channels'],
            'layers': best_config['layers'],
            'value': best_config[metric]
        })
    
    # Print summary of best configurations
    print("\n=== Best Configurations by Metric ===")
    for config in best_configs:
        print(f"{config['metric']}: lr={config['lr']}, hidden={config['hidden_channels']}, layers={config['layers']}, value={config['value']:.4f}")
    
    return best_configs

# Example usage
if __name__ == "__main__":
    # Run the main training and hyperparameter tuning
    best_model, best_hyperparams = main()
    
    # After training is done, analyze the results
    # (Uncomment this line after you have results to analyze)
    best_configs = analyze_results("gcn_results_YYYYMMDD_HHMMSS.csv")

Using device: cpu
Class imbalance weight: 5.9431902659718165

Training with lr=0.01, hidden_channels=32, layers=1
Epoch: 001, Train Loss: 0.8819, Val Loss: 0.9439, Val AUC: 0.5940, Val EMP: 0.0000, Val 0.5% Lift: 0.6353
Epoch: 005, Train Loss: 0.8302, Val Loss: 0.9013, Val AUC: 0.6198, Val EMP: 0.0000, Val 0.5% Lift: 1.5047
Epoch: 010, Train Loss: 0.7701, Val Loss: 0.8608, Val AUC: 0.6209, Val EMP: 0.0000, Val 0.5% Lift: 2.3072


KeyboardInterrupt: 