In [30]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
import torch_geometric as T
from torch_geometric.data.data import BaseData, Data
from torch_geometric.utils import from_networkx
from torch_geometric.utils import dense_to_sparse
from torch_geometric.nn import GCNConv
import networkx as nx

In [2]:
# Read from csvs without second list in daughter columns (30 s)
training_df = pd.read_csv('train_data.csv')
testing_df = pd.read_csv('test_data.csv')

In [3]:
# Duplicate Jet PT and Eta elements as a list the same length as the number of daughters in each row
def duplicate(df, cols, n_col):
    def duplicate_value(row, col, n_col):
        value = row[col]
        if isinstance(value, float):
            num_daughters = row[n_col]
            return [value] * num_daughters
        return value

    for col in cols:
        df[col] = df.apply(lambda row: duplicate_value(row, col, n_col), axis=1)
    return df

In [4]:
# 17 s
duplicate(training_df, ['Jet0_PT', 'Jet0_Eta'], 'Jet0_nDaughters');
duplicate(testing_df, ['Jet1_PT', 'Jet1_Eta'], 'Jet1_nDaughters');

In [5]:
# Rename DataFrames
df = training_df
df2 = testing_df

In [6]:
# Converts (already duplicated and dropped second array) string into list of floats
def convert_to_lists(df, d_cols):
    for col in d_cols:
        df[col] = df[col].apply(lambda x: [float(num) for num in x.strip('[]').split(',')] if isinstance(x, str) else x)
    return df

In [7]:
# Define features from daughter columns + overall kinematics (already duplicated)
train_d_cols = list(df[df.columns[df.columns.str.contains("_Daughters")]])
test_d_cols = list(df2[df2.columns[df2.columns.str.contains("_Daughters")]])

train_f_cols = train_d_cols + ['Jet0_Eta', 'Jet0_PT']
test_f_cols = test_d_cols + ['Jet1_Eta', 'Jet1_PT']

In [8]:
# Clean up data from str to list of floats (2 min)
convert_to_lists(df, train_d_cols);
convert_to_lists(df2, test_d_cols);

In [9]:
# Creates list of all node features (100115647)
def features_list(df, features_cols):
    node_features = []
    for i, row in df.iterrows():
        for col in features_cols:
            node = df.at[i, col]
            if isinstance(node, list):
                node_features.extend(node)
            else:
                node_features.append(node)
    return node_features

In [10]:
# Gets [num_nodes, num_features] to create tensor
def node_features(df, features_cols):
    node_features_list = []
    node_features = []
    for i, row in df.iterrows():
        row_nodes = []
        for col in features_cols:
            feature = df.at[i, col]
            if isinstance(feature, list):
                node_features.extend(feature)
                if len(row_nodes) < len(feature):
                    row_nodes.extend([[] for _ in range(len(feature) - len(row_nodes))])
                for j, val in enumerate(feature):
                    row_nodes[j].append(val)
            else:
                node_features.append(feature)
                for node in row_nodes:
                        node.append(feature)
        node_features_list.extend(row_nodes)
    return node_features_list

In [11]:
features = node_features(df, train_f_cols)

In [12]:
x = torch.tensor(features, dtype = torch.float)

In [13]:
x.shape # [num_nodes, num_features]

torch.Size([3229537, 31])

In [14]:
# Number of nodes
len(features)

3229537

In [15]:
# Number of nodes = total number of daughters
num_nodes = df['Jet0_nDaughters'].sum()
print(num_nodes)

3229537


In [16]:
# Number of features
num_features = len(train_f_cols)
print(num_features)

31


# 3229537 Nodes, 31 Features

# Trying to create data object without NetworkX

In [33]:
# Gets [num_nodes, num_features] to create tensor
def node_features(df, features_cols):
    node_features_list = []
    node_features = []
    for i, row in df.iterrows():
        row_nodes = []
        for col in features_cols:
            feature = df.at[i, col]
            if isinstance(feature, list):
                node_features.extend(feature)
                if len(row_nodes) < len(feature):
                    row_nodes.extend([[] for _ in range(len(feature) - len(row_nodes))])
                for j, val in enumerate(feature):
                    row_nodes[j].append(val)
            else:
                node_features.append(feature)
                for node in row_nodes:
                        node.append(feature)
        node_features_list.extend(row_nodes)
    return node_features_list

def create_data_object(df, features_cols):
    data_list = []

    for i in range(len(df)):
        node_df = df.iloc[[i]]  # Select a single row as a dataframe
        
        features = node_features(node_df, features_cols)
        num_nodes = len(features)

        G = nx.complete_graph(num_nodes)
        edge_index = torch.tensor(list(G.edges)).t().contiguous()

        x = torch.tensor(features, dtype=torch.float)
        y = torch.ones(num_nodes, dtype=torch.long)
        train_mask = torch.ones(num_nodes, dtype=torch.bool)
        test_mask = torch.ones(num_nodes, dtype=torch.bool)

        data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, test_mask=test_mask)
        data_list.append(data)
        
    combined_data = T.data.Batch.from_data_list(data_list)
    return combined_data

In [34]:
data_list = create_data_object(df, train_f_cols)

In [35]:
data_list

DataBatch(x=[3229537, 31], edge_index=[2, 27797017], y=[3229537], train_mask=[3229537], test_mask=[3229537], batch=[3229537], ptr=[200001])

In [37]:
train_data = data_list

In [38]:
print("Number of nodes:", train_data.num_nodes)
print("Node features tensor shape:", train_data.x.shape)
print("First node feature:", train_data.x[0])
print("Labels shape:", train_data.y.shape)
print("Train mask shape:", train_data.train_mask.shape)
print("Test mask shape:", train_data.test_mask.shape)

Number of nodes: 3229537
Node features tensor shape: torch.Size([3229537, 31])
First node feature: tensor([ 6.6412e+03,  2.2647e+02,  2.2000e+01,  7.7613e+01,  2.1275e+02,
         6.6373e+03,  4.0713e+00,  1.2210e+00,  0.0000e+00, -1.0000e+02,
         0.0000e+00, -1.0000e+02, -1.0000e+03, -1.0000e+03, -1.0000e+03,
        -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03,
        -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03,
         6.9344e+03,  1.0406e+00,  9.2143e-01,  1.5359e+01,  4.3268e+00,
         1.7482e+04])
Labels shape: torch.Size([3229537])
Train mask shape: torch.Size([3229537])
Test mask shape: torch.Size([3229537])


In [41]:
test_data = create_data_object(df2, test_f_cols)

In [42]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=True)



# Training Model

In [44]:
# Define GNN model
class GNN(torch.nn.Module):
    def __init__(self):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(train_data.num_node_features, 64)
        self.conv2 = GCNConv(64, 2)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        return F.log_softmax(x, dim=1)

model = GNN()

In [49]:
def train(model, train_loader, optimizer, criterion):
    model.train()
    for data in train_loader:
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
    return loss.item()

# Evaluation loop
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            out = model(data.x, data.edge_index)
            _, pred = out.max(dim=1)
            correct += pred[data.test_mask].eq(data.y[data.test_mask]).sum().item()
            total += data.test_mask.sum().item()
    accuracy = correct / total
    return accuracy

# Main training and evaluation loop
for epoch in range(1):
    train_loss = train(model, train_loader, optimizer, criterion)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}')

test_accuracy = test(model, test_loader)
print(f'Test Accuracy: {test_accuracy}')

TypeError: forward() takes 2 positional arguments but 3 were given

# ROC Curve

In [None]:
output = model(test_graph)
_, pred = output.max(dim=1)
true_labels = test_graph.y.numpy().flatten()
y_score = output[:, 1].detach().numpy()

In [None]:
fpr, tpr, thresholds = roc_curve(true_labels, y_score)
auc = auc(fpr, tpr)

In [None]:
back_rej = 1 - fpr
sig_eff = 1 - tpr

print(f'Background Rejection: {back_rej}')
print(f'Signal Efficiency: {sig_eff}')
print(f'AUC: {auc}')

In [None]:
plt.plot(back_rej, sig_eff, color='purple', label='ROC Curve')
plt.plot([0, 1], [1, 0], color='grey', linestyle='--', label='baseline')
plt.plot([], [], ' ', label=f'AUC: {auc}')
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.ylabel('Background Rejection')
plt.xlabel('Signal Efficiency' )
plt.title('Receiver Operating Characteristics (ROC)')
plt.legend(loc='lower left')
plt.show()