# No Edges

In [1]:
import matplotlib as plt
import pandas as pd
import networkx as nx

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import from_networkx
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import SAGEConv

from itertools import product

import h5py

# For ROC Curve
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
# Load data
signal_df = pd.read_csv('Dijet_bb_pt10_15_dw.csv')
background_df = pd.read_csv('Dijet_qq_pt10_15_dw.csv')

In [3]:
# Separate Jet 0 and Jet 1 data & combine signal/ background
sig_jet0 = signal_df[signal_df.columns[signal_df.columns.str.contains("Jet0")]]
back_jet0 = background_df[background_df.columns[background_df.columns.str.contains("Jet0")]]
train_df = pd.concat([sig_jet0, back_jet0])

sig_jet1 = signal_df[signal_df.columns[signal_df.columns.str.contains("Jet1")]]
back_jet1 = background_df[background_df.columns[background_df.columns.str.contains("Jet1")]]
test_df = pd.concat([sig_jet1, back_jet1])

In [4]:
signal_df.to_hdf('data.h5', key='df', mode='w', format='table')

In [None]:
pd.read_hdf('data.h5', 's')

In [148]:
hdf = h5py.File('hdf.hdf5', 'w')

In [150]:
hdf.create_dataset('Jet', data=[[],[]])


<HDF5 dataset "Jet": shape (2, 0), type "<f8">

In [151]:
hdf.create_group('Daughters')

<HDF5 group "/Daughters" (0 members)>

In [155]:
group = hdf['Daughters']
group.create_dataset("Daughter1", data=[[], []])

<HDF5 dataset "Daughter1": shape (2, 0), type "<f8">

In [21]:
pd.read_hdf('data.h5')

Unnamed: 0.1,Unnamed: 0,Jet0_ENDVERTEX_X,Jet0_ENDVERTEX_Y,Jet0_ENDVERTEX_Z,Jet0_ENDVERTEX_XERR,Jet0_ENDVERTEX_YERR,Jet0_ENDVERTEX_ZERR,Jet0_ENDVERTEX_CHI2,Jet0_ENDVERTEX_NDOF,Jet0_OWNPV_X,...,nITClusters,nTTClusters,nOTClusters,nSPDHits,nMuonCoordsS0,nMuonCoordsS1,nMuonCoordsS2,nMuonCoordsS3,nMuonCoordsS4,nMuonTracks
0,0,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.805862,...,766,1016,5474,346,394,268,14,5,27,8
1,1,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.805862,...,766,1016,5474,346,394,268,14,5,27,8
2,2,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.805862,...,766,1016,5474,346,394,268,14,5,27,8
3,3,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.866242,...,766,1016,5474,346,394,268,14,5,27,8
4,4,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.866242,...,766,1016,5474,346,394,268,14,5,27,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,0.8785,-0.2188,-49.0015,0.0079,0.0079,0.0377,43.331200,115,0.878667,...,293,449,3906,196,204,76,3,4,4,0
99996,99996,0.8205,-0.2224,-1.1421,0.0301,0.0256,0.2178,3.674259,11,0.758976,...,1005,1263,6488,374,483,214,54,19,5,3
99997,99997,0.8411,-0.1376,-18.3968,0.0143,0.0141,0.0655,12.950173,27,0.848029,...,765,1038,4653,322,350,233,21,5,4,9
99998,99998,0.8411,-0.1376,-18.3968,0.0143,0.0141,0.0655,12.950173,27,0.847748,...,765,1038,4653,322,350,233,21,5,4,9


In [4]:
background_df.to_hdf('data.h5', 'data', append=True)

  background_df.to_hdf('data.h5', 'data', append=True)


In [221]:
print(pd.read_hdf('signal.h5', 'data'))

       Unnamed: 0  Jet0_ENDVERTEX_X  Jet0_ENDVERTEX_Y  Jet0_ENDVERTEX_Z  \
0               0            0.8049           -0.1442            9.6248   
1               1            0.8049           -0.1442            9.6248   
2               2            0.8049           -0.1442            9.6248   
3               3            0.8049           -0.1442            9.6248   
4               4            0.8049           -0.1442            9.6248   
...           ...               ...               ...               ...   
99995       99995            0.8104           -0.1898           39.3648   
99996       99996            0.8104           -0.1898           39.3648   
99997       99997            0.8104           -0.1898           39.3648   
99998       99998            0.8372           -0.1919          -36.7766   
99999       99999            0.8372           -0.1919          -36.7766   

       Jet0_ENDVERTEX_XERR  Jet0_ENDVERTEX_YERR  Jet0_ENDVERTEX_ZERR  \
0                   0.0167 

In [102]:
nodes = ['Jet0_MM', 'Jet0_PX', 'Jet0_PY', 'Jet0_PZ', 'Jet0_M', 'Jet0_PT', 'Jet0_Eta', 'Jet0_Phi', 'Jet0_NTrk']
node_features = train_df.drop(nodes, axis=1)

In [103]:
node_features

Unnamed: 0,Jet0_ENDVERTEX_X,Jet0_ENDVERTEX_Y,Jet0_ENDVERTEX_Z,Jet0_ENDVERTEX_XERR,Jet0_ENDVERTEX_YERR,Jet0_ENDVERTEX_ZERR,Jet0_ENDVERTEX_CHI2,Jet0_ENDVERTEX_NDOF,Jet0_OWNPV_X,Jet0_OWNPV_Y,...,Jet0_Hlt1Global_TOS,Jet0_Hlt1Phys_Dec,Jet0_Hlt1Phys_TIS,Jet0_Hlt1Phys_TOS,Jet0_Hlt2Global_Dec,Jet0_Hlt2Global_TIS,Jet0_Hlt2Global_TOS,Jet0_Hlt2Phys_Dec,Jet0_Hlt2Phys_TIS,Jet0_Hlt2Phys_TOS
0,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.805862,-0.145482,...,True,True,True,True,True,True,True,True,True,True
1,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.805862,-0.145482,...,True,True,True,True,True,True,True,True,True,True
2,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.805862,-0.145482,...,True,True,True,True,True,True,True,True,True,True
3,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.866242,-0.195181,...,True,True,True,True,True,True,True,True,True,True
4,0.8049,-0.1442,9.6248,0.0167,0.0164,0.1036,11.296007,29,0.866242,-0.195181,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.8785,-0.2188,-49.0015,0.0079,0.0079,0.0377,43.331200,115,0.878667,-0.219578,...,True,True,True,True,True,True,True,True,True,True
99996,0.8205,-0.2224,-1.1421,0.0301,0.0256,0.2178,3.674259,11,0.758976,-0.186395,...,False,False,False,False,True,False,False,True,False,False
99997,0.8411,-0.1376,-18.3968,0.0143,0.0141,0.0655,12.950173,27,0.848029,-0.184660,...,True,True,True,True,True,True,True,True,True,True
99998,0.8411,-0.1376,-18.3968,0.0143,0.0141,0.0655,12.950173,27,0.847748,-0.185888,...,True,True,True,True,True,True,True,True,True,False


In [131]:
# Create Graph with No Edges from DataFrame

def no_edges_graph(df):
    G = nx.Graph()

    #nodes = ['Jet0_MM', 'Jet0_PX', 'Jet0_PY', 'Jet0_PZ', 'Jet0_M', 'Jet0_PT', 'Jet0_Eta', 'Jet0_Phi', 'Jet0_NTrk']
    num_nodes = len(nodes)
    features = df.drop(nodes, axis=1)

    for node in nodes:
        G.add_node(node)
    
    for node in G.nodes(): 
        #node_features = df[features].values
        node_features = df[df['Jet0_Eta'] == node].iloc[:, 2:].values
        if len(node_features) > 0:
            G.nodes[node]['x'] = torch.tensor(node_features[0], dtype=torch.float)

        else:
            G.nodes[node]['x'] = torch.tensor([0.0] * (df.shape[1] - 1), dtype=torch.float)
    
    data = from_networkx(G)
    data.y = torch.ones(data.num_nodes, dtype=torch.long)
    data.train_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    combined_graph = Data(
        x=data.x,
        edge_index=data.edge_index,
        y=data.y,
        train_mask=data.train_mask,
        test_mask=data.test_mask)
    return data

graph = no_edges_graph(train_df)

In [132]:
graph

Data(x=[9, 169], edge_index=[2, 0], y=[9], train_mask=[9], test_mask=[9])

In [133]:
# Define GNN model
class GNN(torch.nn.Module):
    def __init__(self):
        super(GNN, self).__init__()
        self.conv1 = SAGEConv(in_channels=graph.num_node_features, out_channels=16)
        self.conv2 = SAGEConv(in_channels=16, out_channels=2)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

model = GNN()

In [137]:
# Training parameters
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

for epoch in range(10):
    loss = train(model, graph, optimizer, criterion)
    print(f'Epoch {epoch}, Loss: {loss}')

# Evaluation
def test(model, data):
    model.eval()
    _, pred = model(data).max(dim=1)
    correct = pred[data.test_mask].eq(data.y[data.test_mask]).sum().item()
    acc = correct / data.test_mask.sum().item()
    return acc

accuracy = test(model, graph)
print(f'Accuracy: {accuracy}')

Epoch 0, Loss: 0.045586321502923965
Epoch 1, Loss: 0.039124730974435806
Epoch 2, Loss: 0.0334593690931797
Epoch 3, Loss: 0.028532352298498154
Epoch 4, Loss: 0.024279635399580002
Epoch 5, Loss: 0.020633922889828682
Epoch 6, Loss: 0.01752733252942562
Epoch 7, Loss: 0.014893529936671257
Epoch 8, Loss: 0.012669667601585388
Epoch 9, Loss: 0.010798000730574131
Accuracy: 1.0


# ROC Curve

In [208]:
ground_truth = graph
predictions = test(model, graph)

In [209]:
confusion_matrix = metrics.confusion_matrix(ground_truth, predictions)

InvalidParameterError: The 'y_pred' parameter of confusion_matrix must be an array-like. Got 1.0 instead.