In [186]:
import pandas as pd
import networkx as nx
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import SAGEConv
from itertools import product

In [None]:
# Load data
signal_df = pd.read_csv('Dijet_bb_pt10_15_dw.csv')
background_df = pd.read_csv('Dijet_qq_pt10_15_dw.csv')

In [145]:
# Separate Jet 0 and Jet 1 data & combine signal/ background
s0 = signal_df[signal_df.columns[signal_df.columns.str.contains("Jet0")]]
b0 = background_df[background_df.columns[background_df.columns.str.contains("Jet0")]]
train_df = pd.concat([s0, b0])

s1 = signal_df[signal_df.columns[signal_df.columns.str.contains("Jet1")]]
b1 = background_df[background_df.columns[background_df.columns.str.contains("Jet1")]]
test_df = pd.concat([s0, b0])

In [236]:
def fully_connected_graph1(df):
    G = nx.Graph()

    # Add all columns as nodes
    nodes = list(df.columns)
    
    for node in nodes:
        G.add_node(node)
    
    # Connect all nodes
    for i, node1 in enumerate(nodes):
        for node2 in nodes[i+1:]:
            G.add_edge(node1, node2)
    
    # Add node features
    for node in G.nodes(): 
        node_features = df[df['Jet0_FD_OWNPV'] == node].iloc[:, 2:].values # need to update
        if len(node_features) > 0:
            G.nodes[node]['x'] = torch.tensor(node_features[0], dtype=torch.float)
        else:
            G.nodes[node]['x'] = torch.tensor([0.0] * (df.shape[1] - 2), dtype=torch.float)
    
    # Convert to PyTorch Geometric Data object
    data = from_networkx(G)
    return data

signal_graph = fully_connected_graph1(signal_df)
background_graph = fully_connected_graph1(background_df)

In [240]:

def nodes_without_edges(df, target_column):
    G = nx.Graph()

    # Add all columns as nodes
    num_nodes = len(list(df.columns))
    nodes = list(df.columns)
    for node in nodes:
        G.add_node(node)

    # Get unique values from target column
    for i in range len(list(df.columns)):
        target_column = i
        unique_nodes = target_column.unique()

    # Add unique values as nodes
    for node in unique_nodes:
        G.add_node(node)
    
    # Add node features based on target_column
    for node in G.nodes(): 
        # Filter rows where target_column matches node
        node_features = df[df[target_column] == node].iloc[:, 1:].values  # Assuming the first column is an identifier
        if len(node_features) > 0:
            G.nodes[node]['x'] = torch.tensor(node_features[0], dtype=torch.float)
        else:
            G.nodes[node]['x'] = torch.tensor([0.0] * (df.shape[1] - 1), dtype=torch.float)
    
    # Convert to PyTorch Geometric Data object
    data = from_networkx(G)
    return data

signal_graph = nodes_without_edges(signal_df)
background_graph = nodes_without_edges(background_df)

SyntaxError: invalid syntax (546081587.py, line 11)

In [222]:
# Fully Connected Edges
def fully_connected_graph2(df):
    G = nx.Graph()

    # Add all columns as nodes
    num_nodes = len(list(df.columns))
    nodes = list(df.columns)
    for node in nodes:
        G.add_node(node)
    
    # Connect all nodes
    for i, node1 in enumerate(nodes):
        for node2 in nodes[i+1:]:
            G.add_edge(node1, node2)
    
    # Add node features
    for node in G.nodes(): 
        node_features = df[node].iloc[:, 2:].values
        if len(node_features) > 0:
            G.nodes[node]['x'] = torch.tensor(node_features[0], dtype=torch.float)
        else:
            G.nodes[node]['x'] = torch.tensor([0.0] * (df.shape[1] - 2), dtype=torch.float)
    
    # Convert to PyTorch Geometric Data object
    data = from_networkx(G)

    return data

signal_graph = fully_connected_graph2(signal_df)
background_graph = fully_connected_graph2(background_df)

In [229]:
# Combine signal and background graphs
def create_combined_dataset(signal_graph, background_graph):
    signal_graph.y = torch.ones(signal_graph.num_nodes, dtype=torch.long)
    background_graph.y = torch.zeros(background_graph.num_nodes, dtype=torch.long)
    signal_graph.train_mask = torch.ones(signal_graph.num_nodes, dtype=torch.bool)
    background_graph.train_mask = torch.ones(background_graph.num_nodes, dtype=torch.bool)
    signal_graph.test_mask = torch.ones(signal_graph.num_nodes, dtype=torch.bool)
    background_graph.test_mask = torch.ones(background_graph.num_nodes, dtype=torch.bool)
    combined_graph = Data(
        x=torch.cat([signal_graph.x, background_graph.x], dim=0),
        edge_index=torch.cat([signal_graph.edge_index, background_graph.edge_index + signal_graph.num_nodes], dim=1),
        y=torch.cat([signal_graph.y, background_graph.y], dim=0),
        train_mask=torch.cat([signal_graph.train_mask, background_graph.train_mask], dim=0),
        test_mask=torch.cat([signal_graph.test_mask, background_graph.test_mask], dim=0),
    )
    return combined_graph

combined_graph = create_combined_dataset(signal_graph, background_graph)

In [232]:
combined_graph # x = [num_nodes, num_features, edge_index = [2, num_edges]]

Data(x=[770, 383], edge_index=[2, 295680], y=[770], train_mask=[770], test_mask=[770])

In [230]:
# Define GNN model
class GNN(torch.nn.Module):
    def __init__(self):
        super(GNN, self).__init__()
        self.conv1 = SAGEConv(in_channels=signal_graph.num_node_features, out_channels=16)
        self.conv2 = SAGEConv(in_channels=16, out_channels=2)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

model = GNN()

In [231]:
# Training parameters
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

for epoch in range(10):
    loss = train(model, combined_graph, optimizer, criterion)
    print(f'Epoch {epoch}, Loss: {loss}')

# Evaluation
def test(model, data):
    model.eval()
    _, pred = model(data).max(dim=1)
    correct = pred[data.test_mask].eq(data.y[data.test_mask]).sum().item()
    acc = correct / data.test_mask.sum().item()
    return acc

accuracy = test(model, combined_graph)
print(f'Accuracy: {accuracy}')

Epoch 0, Loss: 0.6968805193901062
Epoch 1, Loss: 0.6947128772735596
Epoch 2, Loss: 0.6934343576431274
Epoch 3, Loss: 0.6931832432746887
Epoch 4, Loss: 0.6937320232391357
Epoch 5, Loss: 0.6941283941268921
Epoch 6, Loss: 0.6940276622772217
Epoch 7, Loss: 0.6936753988265991
Epoch 8, Loss: 0.6933432817459106
Epoch 9, Loss: 0.6931694746017456
Accuracy: 0.5
