In [6]:
import pandas as pd
import ast
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import dense_to_sparse
from torch_geometric.nn import GCNConv
import networkx as nx
import csv

In [7]:
# Load data
signal_df = pd.read_csv('Dijet_bb_pt10_15_dw.csv')
background_df = pd.read_csv('Dijet_qq_pt10_15_dw.csv')

In [8]:
# Distinguish signal and background
signal_df['IsB']=1
background_df['IsB']=0

In [9]:
# Separate Jet 0 and Jet 1 data
sig_jet0 = signal_df[signal_df.columns[signal_df.columns.str.contains("Jet0|IsB")]]
back_jet0 = background_df[background_df.columns[background_df.columns.str.contains("Jet0|IsB")]]

sig_jet1 = signal_df[signal_df.columns[signal_df.columns.str.contains("Jet1|IsB")]]
back_jet1 = background_df[background_df.columns[background_df.columns.str.contains("Jet1|IsB")]]

# Combine signal and background
train_df = pd.concat([sig_jet0, back_jet0])
test_df = pd.concat([sig_jet1, back_jet1])

In [256]:
d_cols = list(df[df.columns[df.columns.str.contains("_Daughters")]])

In [13]:
# Create new csv and drop the second list

def process_cell(cell_value):
    if isinstance(cell_value, str) and cell_value.startswith('['):
        cell_value = cell_value[1:]
        cell_value = cell_value.split(']')[0]
        return [float(num) for num in cell_value.strip('[]').split(',') if num]
        
    return cell_value

def rewrite_data(df):
    return df.applymap(process_cell)

In [16]:
train_data = rewrite_data(train_df)
train_data.to_csv('train_data.csv', index=False)
training_df = pd.read_csv('train_data.csv')

  return df.applymap(process_cell)


In [17]:
training_df.Jet0_Daughters_ID[0]

'[22.0, 211.0, 211.0, 22.0, -211.0, -11.0, -211.0, -22.0, 211.0, 211.0, -211.0]'

In [18]:
test_data = rewrite_data(test_df)
test_data.to_csv('test_data.csv', index=False)
testing_df = pd.read_csv('test_data.csv')

  return df.applymap(process_cell)


'[211.0, 211.0, 22.0, -211.0, 11.0, -11.0, -211.0, -211.0, -211.0, -211.0, 211.0, 321.0, 3122.0, 211.0, 211.0, 22.0, 22.0, -211.0]'

In [24]:
testing_df.Jet1_Daughters_ID[0]

'[22.0, 211.0, 211.0, 22.0, -211.0, -11.0, -211.0, -22.0, 211.0, 211.0, -211.0]'

xxx


In [309]:
testing_df = pd.read_csv('test_data.csv')

In [311]:
testing_df.Jet1_Daughters_ID[0]

'[211.0, 211.0, 22.0, -211.0, 11.0, -11.0, -211.0, -211.0, -211.0, -211.0, 211.0, 321.0, 3122.0, 211.0, 211.0, 22.0, 22.0, -211.0]'

In [8]:
testing_df.Jet1_Eta

0         4.240293
1         2.785512
2         2.248249
3         2.785512
4         2.248249
            ...   
199995    2.786849
199996    2.884563
199997    4.425471
199998    3.335576
199999    3.335576
Name: Jet1_Eta, Length: 200000, dtype: float64

In [303]:
# Duplicate Jet PT and Eta elements as a list the same length as the number of daughters in each row

def duplicate(df, cols, n_col):
    def duplicate_value(row, col, n_col):
        value = row[col]
        if isinstance(value, float):
            num_daughters = row[n_col]
            return [value] * num_daughters
        return value

    for col in cols:
        df[col] = df.apply(lambda row: duplicate_value(row, col, n_col), axis=1)
    return df

In [320]:
duplicate(training_df, ['Jet0_PT', 'Jet0_Eta'], 'Jet0_nDaughters');

In [321]:
training_df.Jet0_Eta

0         [4.326752994360513, 4.326752994360513, 4.32675...
1         [4.326752994360513, 4.326752994360513, 4.32675...
2         [4.326752994360513, 4.326752994360513, 4.32675...
3         [4.240292525550871, 4.240292525550871, 4.24029...
4         [4.240292525550871, 4.240292525550871, 4.24029...
                                ...                        
199995    [2.191868323943797, 2.191868323943797, 2.19186...
199996    [2.4029287573481444, 2.4029287573481444, 2.402...
199997    [2.277577843908911, 2.277577843908911, 2.27757...
199998    [4.425471023039266, 4.425471023039266, 4.42547...
199999    [2.277577843908911, 2.277577843908911, 2.27757...
Name: Jet0_Eta, Length: 200000, dtype: object

In [10]:
duplicate(testing_df, ['Jet1_PT', 'Jet1_Eta'], 'Jet1_nDaughters');

In [11]:
testing_df.Jet1_Eta

0         [4.240292525550871, 4.240292525550871, 4.24029...
1         [2.785512362437907, 2.785512362437907, 2.78551...
2         [2.2482488092613195, 2.2482488092613195, 2.248...
3         [2.785512362437907, 2.785512362437907, 2.78551...
4         [2.2482488092613195, 2.2482488092613195, 2.248...
                                ...                        
199995    [2.7868489899936093, 2.7868489899936093, 2.786...
199996    [2.8845629378907884, 2.8845629378907884, 2.884...
199997    [4.425471023039266, 4.425471023039266, 4.42547...
199998    [3.33557565661344, 3.33557565661344, 3.3355756...
199999    [3.33557565661344, 3.33557565661344, 3.3355756...
Name: Jet1_Eta, Length: 200000, dtype: object

In [333]:
# Daughters columns, not including number of daughters
daughters = testing_df[testing_df.columns[testing_df.columns.str.contains("_Daughters")]]
d_cols = list(testing_df[testing_df.columns[testing_df.columns.str.contains("_Daughters")]])

In [335]:
d_cols

['Jet1_Daughters_E',
 'Jet1_Daughters_pT',
 'Jet1_Daughters_ID',
 'Jet1_Daughters_pX',
 'Jet1_Daughters_pY',
 'Jet1_Daughters_pZ',
 'Jet1_Daughters_Eta',
 'Jet1_Daughters_Phi',
 'Jet1_Daughters_Q',
 'Jet1_Daughters_IP',
 'Jet1_Daughters_IPCHI2',
 'Jet1_Daughters_IPraw',
 'Jet1_Daughters_NNe',
 'Jet1_Daughters_NNk',
 'Jet1_Daughters_NNp',
 'Jet1_Daughters_NNpi',
 'Jet1_Daughters_NNmu',
 'Jet1_Daughters_Chi2',
 'Jet1_Daughters_QoverP',
 'Jet1_Daughters_trackX',
 'Jet1_Daughters_trackY',
 'Jet1_Daughters_trackZ',
 'Jet1_Daughters_trackVX',
 'Jet1_Daughters_trackVY',
 'Jet1_Daughters_trackVZ',
 'Jet1_Daughters_CaloNeutralEcal',
 'Jet1_Daughters_CaloNeutralHcal2Ecal',
 'Jet1_Daughters_CaloNeutralE49',
 'Jet1_Daughters_CaloNeutralPrs']

In [331]:
# Define features
features_cols = d_cols + ['Jet1_PT', 'Jet1_Eta']

In [165]:
# Get nodes from features cols after duplicating data, and convert to list of floats from string

def get_nodes(df, features_cols, nd_col):
    def nodes(row, col, nd_col):
        value = row[col]
        if isinstance(value, str):
            value = value.strip('[]').split(',')
        return value

    for col in features_cols:
        df[col] = df.apply(lambda row: nodes(row, col, nd_col), axis=1)
    return df

In [None]:
extracted_nodes = get_nodes(testing_df, features_cols, 'Jet1_nDaughters')

In [323]:
len(training_df.Jet0_Eta[0])

11

In [324]:
training_df.Jet0_nDaughters[0]

11

In [330]:
ast.literal_eval(training_df.Jet0_Daughters_ID[0])

['22.0',
 ' 211.0',
 ' 211.0',
 ' 22.0',
 ' -211.0',
 ' -11.0',
 ' -211.0',
 ' -22.0',
 ' 211.0',
 ' 211.0',
 ' -211.0']

In [97]:
# Define features
features_cols = [daughters, 'Jet0_PT', 'Jet0_Eta']

def convert_to_float_list(s):
    return [float(x) for x in s.split()]

def get_nodes(df, features_cols, nd_col):
    def nodes(row, col, nd_col):
        num_d = row[nd_col]
        value = row[col]
        if col in daughters:
            value = convert_to_float_list(value)
        return value * num_d

    for col in features_cols:
        df[col] = df.apply(lambda row: nodes(row, col, nd_col), axis=1)
    return df

extracted_nodes = get_nodes(training_df, features_cols, 'Jet0_nDaughters')

TypeError: Indexing a Series with DataFrame is not supported, use the appropriate DataFrame column

In [133]:
def load_data(df):
    data_list=[]
    
    for i, row in df.iterrows():
        num_nodes = len(nodes_list)
        features = nodes_list.values
        num_features = len(features)
        data.x = torch.Tensor([num_nodes, num_features])
        adj = torch.ones((num_nodes, num_nodes))
        edge_index = dense_to_sparse(adj)[0]
        y = torch.ones(num_nodes, dtype=torch.long)
        train_mask = torch.ones(num_nodes, dtype=torch.bool)
        test_mask = torch.ones(num_nodes, dtype=torch.bool)
        data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, test_mask=test_mask, num_features=num_features, num_nodes=num_nodes)
        data_list.append(data)
    return data_list

In [134]:
train_data = load_data(training_df)
test_data = load_data(testing_df)

AttributeError: 'list' object has no attribute 'values'

In [96]:
def load_data(df, features_cols):
    data_list = []
    
    for i, row in df.iterrows():
        all_features = []

        for col in features_cols:
            feature_values_str = row[col]
            feature_values = eval(feature_values_str) 
            
            features = [float(value) for value in feature_values]
            all_features.extend(features) 
            
        x = torch.tensor(all_features, dtype=torch.float32)
        features = torch.tensor(all_features, dtype=torch.int64)
        
        num_features = len(features_cols)
        num_nodes = len(all_features) // num_features
        
        adj = torch.ones((num_nodes, num_nodes))
        edge_index = dense_to_sparse(adj)[0]
        
        y = torch.ones(num_nodes, dtype=torch.long)
        train_mask = torch.ones(num_nodes, dtype=torch.bool)
        test_mask = torch.ones(num_nodes, dtype=torch.bool)
        
        data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, test_mask=test_mask, num_features=num_features, num_nodes=num_nodes)
        data.num_features = num_features
        data.num_nodes = num_nodes
        data_list.append(data)
    
    return data_list

In [97]:
train_data = load_data(training_df, ['Jet0_Daughters_Eta', 'Jet0_Daughters_Phi', 'Jet0_Daughters_pT'])
test_data = load_data(testing_df, ['Jet1_Daughters_Eta', 'Jet1_Daughters_Phi', 'Jet1_Daughters_pT'])

KeyboardInterrupt: 

In [98]:
def graph(df, features_cols):
    G = nx.Graph()

    for i, row in df.iterrows():
        nodes = []
        for col in features_cols:
            nodes.extend(ast.literal_eval(row[col]))
        nodes = list(set(nodes))
        for node in nodes:
            G.add_node(node)
    
    node_features_list = []

    for node in G.nodes():
        node_features = []
        for col in features_cols:
            feature_value = df.loc[df[node_col].apply(lambda x: node in ast.literal_eval(x)), col].values[0]
            node_features.append(feature_value)
        node_features_list.append(np.array(node_features).astype(np.float32))

    node_features_np = np.array(node_features_list)
    node_features_tensor = torch.tensor(node_features_np)

    for i, node in enumerate(G.nodes()): 
        G.nodes[node]['x'] = node_features_tensor[i]
    
    data = from_networkx(G)

    data.y = torch.ones(data.num_nodes, dtype=torch.long)
    data.train_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.ones(data.num_nodes, dtype=torch.bool)

    return data

In [99]:
train_data = graph(training_df, ['Jet0_Daughters_Eta', 'Jet0_Daughters_Phi', 'Jet0_Daughters_pT'])
test_data = graph(testing_df, ['Jet1_Daughters_Eta', 'Jet1_Daughters_Phi', 'Jet1_Daughters_pT'])

KeyboardInterrupt: 

In [58]:
train_data

[Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),
 Data(x=[0], edge_index=[2, 1], y=1, train_mask=True, test_mask=True, num_features=0, num_nodes=1),


In [136]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=True)



In [None]:
# Define GNN model
class GNN(torch.nn.Module):
    def __init__(self):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(3, 64)
        self.conv2 = GCNConv(64, 2)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

model = GNN()

In [None]:
# Training parameters
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

for epoch in range(1):
    loss = train(model, train_data, optimizer, criterion)
    print(f'Epoch {epoch+1}, Loss: {loss}')

# Evaluation
def test(model, data):
    model.eval()
    _, pred = model(data).max(dim=1)
    correct = pred[data.test_mask].eq(data.y[data.test_mask]).sum().item()
    acc = correct / data.test_mask.sum().item()
    return acc

accuracy = test(model, test_data)
print(f'Accuracy: {accuracy}')

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got -2)