The data contains two lists in each cell under the daughter kinematics. This notebook removes the second list and saves it as a new csv file.

In [2]:
import pandas as pd
import ast
import numpy as np
import torch

In [6]:
# Load data
orig_sig_df = pd.read_csv('Dijet_bb_pt10_15_dw.csv')
orig_back_df = pd.read_csv('Dijet_qq_pt10_15_dw.csv')

In [3]:
# Load data
signal_df = pd.read_csv('Dijet_bb_pt20_50_dw.csv')
background_df = pd.read_csv('Dijet_qq_pt20_50_dw.csv')

In [4]:
signal_df = signal_df.head(100000)
background_df = background_df.head(100000)

In [5]:
# Distinguish signal and background
signal_df['IsB'] = 1
background_df['IsB'] = 0

# Separate Jet 0 and Jet 1 data
sig_jet0 = signal_df[signal_df.columns[signal_df.columns.str.contains("Jet0|IsB")]]
back_jet0 = background_df[background_df.columns[background_df.columns.str.contains("Jet0|IsB")]]

sig_jet1 = signal_df[signal_df.columns[signal_df.columns.str.contains("Jet1|IsB")]]
back_jet1 = background_df[background_df.columns[background_df.columns.str.contains("Jet1|IsB")]]

# Combine signal and background
train_df = pd.concat([sig_jet0, back_jet0])
test_df = pd.concat([sig_jet1, back_jet1])

In [10]:
train_df.Jet0_Daughters_ID[0].astype

<bound method NDFrame.astype of 0    [22.0, 22.0, -211.0, 22.0, 211.0, -211.0, 22.0...
0    [22.0, -211.0, 22.0, -211.0, 22.0, 22.0, 211.0...
Name: Jet0_Daughters_ID, dtype: object>

In [11]:
d_cols = list(train_df[train_df.columns[train_df.columns.str.contains("_Daughters")]])

In [12]:
# Create new csv and drop the second list

def process_cell(cell_value):
    if isinstance(cell_value, str) and cell_value.startswith('['):
        cell_value = cell_value[1:]
        cell_value = cell_value.split(']')[0]
        return [float(num) for num in cell_value.strip('[]').split(',') if num]
        
    return cell_value

def rewrite_data(df):
    return df.applymap(process_cell)

In [13]:
train_df.applymap(process_cell)

Unnamed: 0,Jet0_ENDVERTEX_X,Jet0_ENDVERTEX_Y,Jet0_ENDVERTEX_Z,Jet0_ENDVERTEX_XERR,Jet0_ENDVERTEX_YERR,Jet0_ENDVERTEX_ZERR,Jet0_ENDVERTEX_CHI2,Jet0_ENDVERTEX_NDOF,Jet0_OWNPV_X,Jet0_OWNPV_Y,...,Jet0_Hlt1Phys_Dec,Jet0_Hlt1Phys_TIS,Jet0_Hlt1Phys_TOS,Jet0_Hlt2Global_Dec,Jet0_Hlt2Global_TIS,Jet0_Hlt2Global_TOS,Jet0_Hlt2Phys_Dec,Jet0_Hlt2Phys_TIS,Jet0_Hlt2Phys_TOS,IsB
0,0.8738,-0.1754,14.7203,0.0093,0.0093,0.0573,49.857929,107,0.766100,-0.200700,...,True,True,True,True,True,True,True,True,True,1
1,0.7844,-0.1852,-61.7426,0.0074,0.0073,0.0379,68.600578,167,0.783929,-0.185155,...,True,True,True,True,True,True,True,True,True,1
2,0.8658,-0.1414,-9.7498,0.0101,0.0101,0.0570,21.601925,57,0.862175,-0.136493,...,True,True,True,True,True,True,True,True,False,1
3,0.8803,-0.1820,28.3628,0.0094,0.0093,0.0503,40.823803,91,0.881767,-0.183634,...,True,True,True,True,True,True,True,True,True,1
4,0.8163,-0.0868,8.4282,0.0286,0.0275,0.3193,7.705523,13,0.799059,-0.161079,...,False,False,False,True,False,False,True,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.8171,-0.1728,-21.6793,0.0165,0.0167,0.0812,12.615396,27,0.902898,-0.175871,...,True,True,True,True,True,True,True,True,True,0
99996,0.8826,-0.1965,-9.6131,0.0141,0.0131,0.0533,18.875439,39,0.850605,-0.183178,...,True,True,True,True,True,True,True,True,False,0
99997,0.8162,-0.1947,-45.3649,0.0090,0.0089,0.0454,45.366772,101,0.816201,-0.193130,...,True,True,True,True,True,True,True,True,False,0
99998,0.8839,-0.2102,-11.6960,0.0097,0.0097,0.0475,41.310898,85,0.883863,-0.211633,...,True,True,True,True,True,True,True,True,False,0


In [15]:
train_df.Jet0_Daughters_ID[0][0]

0    [22.0, 22.0, -211.0, 22.0, 211.0, -211.0, 22.0...
0    [22.0, -211.0, 22.0, -211.0, 22.0, 22.0, 211.0...
Name: Jet0_Daughters_ID, dtype: object

In [16]:
train_data = rewrite_data(train_df)
train_data.to_csv('new_train_data.csv', index=False)
training_df = pd.read_csv('new_train_data.csv')

In [19]:
training_df.Jet0_Daughters_ID[0] # returns a string of one list of values, later to be transformed into a list of floats

'[22.0, 22.0, -211.0, 22.0, 211.0, -211.0, 22.0, 211.0, 22.0, 22.0, -211.0, 22.0, 211.0, 13.0]'

In [20]:
test_data = rewrite_data(test_df)
test_data.to_csv('new_test_data.csv', index=False)
testing_df = pd.read_csv('new_test_data.csv')

In [21]:
testing_df.Jet1_Daughters_ID[0]

'[22.0, -211.0, 22.0, -22.0, -22.0, 22.0, 22.0, 22.0, -22.0, 22.0, 22.0, 22.0, -13.0, -22.0, 22.0, 11.0, -22.0]'