In [1]:
# Import necessary libraries

import pandas as pd
import ast
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import dense_to_sparse
from torch_geometric.nn import GCNConv
import networkx as nx

In [2]:
# Load the training and testing tables from the CSV exports whose daughter
# columns no longer carry a second list.

training_df, testing_df = (
    pd.read_csv(csv_name) for csv_name in ('train_data.csv', 'test_data.csv')
)

In [3]:
# Peek at Jet0_Eta while it still holds one scalar per row (before duplicate() is applied)

training_df['Jet0_Eta']

0         4.326753
1         4.326753
2         4.326753
3         4.240293
4         4.240293
            ...   
199995    2.191868
199996    2.402929
199997    2.277578
199998    4.425471
199999    2.277578
Name: Jet0_Eta, Length: 200000, dtype: float64

In [4]:
# Duplicate Jet PT and Eta elements as a list the same length as the number of daughters in each row

def duplicate(df, cols, n_col):
    """Expand scalar float cells into per-daughter lists, in place.

    For every column named in ``cols``, each cell that is a ``float`` is
    replaced by a list holding that value repeated ``df[n_col]`` times for
    the same row. Cells that are not floats (for example cells that were
    already expanded into lists) are left untouched, so calling this twice
    on the same frame is a no-op the second time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The listed columns are overwritten in place.
    cols : iterable of str
        Names of the columns whose float cells should be repeated.
    n_col : str
        Name of the column holding the repeat count for each row. Each count
        must be convertible with ``int()``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    def duplicate_value(value, count):
        if isinstance(value, float):
            # int() guards against a count that arrives as a float
            # (e.g. 11.0), which list repetition would reject.
            return [value] * int(count)
        return value

    for col in cols:
        # Pair the value column with the count column directly instead of
        # using df.apply(axis=1): row-wise apply builds a Series per row and,
        # when every column in the frame is numeric, upcasts the count to
        # float, which makes ``[value] * count`` raise TypeError.
        expanded = [
            duplicate_value(value, count)
            for value, count in zip(df[col], df[n_col])
        ]
        # Wrapping in a Series on df's own index keeps the result one cell
        # per row, whatever the index labels are.
        df[col] = pd.Series(expanded, index=df.index)
    return df

In [5]:
# Expand the jet-level PT/Eta scalars in place; the trailing ';' hides the returned frame.
# NOTE(review): training expands the Jet0 columns while testing expands Jet1 — confirm this is intended.
duplicate(df=training_df, cols=['Jet0_PT', 'Jet0_Eta'], n_col='Jet0_nDaughters');
duplicate(df=testing_df, cols=['Jet1_PT', 'Jet1_Eta'], n_col='Jet1_nDaughters');

In [28]:
# First entry of the first row's Jet0_Eta list (the column holds lists after duplicate())
training_df['Jet0_Eta'][0][0]

4.326752994360513

In [7]:
# Jet0_Daughters_ID cells are still raw strings at this point, so [0][0] is the first character
training_df['Jet0_Daughters_ID'][0][0]

'['

In [25]:
# Manual parse of the first row: drop the brackets, split on commas, take the first token
training_df['Jet0_Daughters_ID'][0].strip('[]').split(',')[0]

'22.0'

In [38]:
# Converts (already duplicated and dropped second array) string into list of floats

def convert_to_lists(df, d_cols):
    """Parse stringified flat lists such as ``'[22.0, 211.0]'`` into lists of floats.

    Every string cell in the columns named by ``d_cols`` is replaced, in
    place, by a list of floats. Cells that are not strings (for example
    cells that were already parsed) are left unchanged, so the function can
    be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The listed columns are overwritten in place.
    d_cols : iterable of str
        Names of the columns holding the stringified lists.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If a non-blank token cannot be converted with ``float()``; this
        includes nested lists, which are not supported.
    """
    def parse_cell(cell):
        if not isinstance(cell, str):
            return cell
        # Trim outer whitespace first so the brackets are really at the ends.
        inner = cell.strip().strip('[]')
        # Skip blank tokens: an empty list '[]' (or a trailing comma) would
        # otherwise produce '' and make float() raise ValueError.
        return [float(token) for token in inner.split(',') if token.strip()]

    for col in d_cols:
        df[col] = df[col].apply(parse_cell)
    return df

In [30]:
# Daughter-level columns: every column whose name contains "_Daughters"
train_d_cols = [name for name in training_df.columns if "_Daughters" in name]
test_d_cols = [name for name in testing_df.columns if "_Daughters" in name]

In [39]:
# Parse the stringified daughter columns into lists of floats so they can be indexed

convert_to_lists(df=training_df, d_cols=train_d_cols);
convert_to_lists(df=testing_df, d_cols=test_d_cols);

In [44]:
# Daughter count recorded for the first training row
training_df['Jet0_nDaughters'][0]

11

In [46]:
# Length of the first row's parsed daughter list (compare with Jet0_nDaughters[0])
len(training_df['Jet0_Daughters_ID'][0])

11

In [42]:
# Second daughter ID of the first testing row, now a float rather than a string fragment
testing_df['Jet1_Daughters_ID'][0][1]

211.0