In [1]:
import argparse
import json
import os
import pathlib
import pickle as pkl
import shutil
import sys
import time

import awkward as ak
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import uproot
import glob

import sklearn
from sklearn import model_selection

In [2]:
# List the contents of the TaggerInput_Small2 directory.
# NOTE(review): the processing and counting cells read from ../datafiles/TaggerInput_Small/
# (no trailing "2") — confirm which directory is the intended one.
! ls ../datafiles/TaggerInput_Small2/

[34mTaggerInputMar26_2017[m[m [34mTaggerInput_2018[m[m


# Postprocessing

In [3]:
def rename_vars(df):
    """Return a new DataFrame with two legacy column names replaced.

    ``mt_lep_met`` is renamed to ``lep_met_mt`` and ``lep_dR_fj`` to
    ``lep_fj_dr``. Columns that are not present are ignored, and every other
    column is passed through unchanged. The input frame is not modified.
    """
    legacy_names = ("mt_lep_met", "lep_dR_fj")
    current_names = ("lep_met_mt", "lep_fj_dr")
    column_map = dict(zip(legacy_names, current_names))
    return df.rename(columns=column_map)

def postprocess(df, sample):
    """Add the production-mode labels and apply the column renames.

    The ``fj_isVBF`` and ``fj_isggF`` labels were not written by the
    processor, so they are derived here from the sample name:

    * name contains ``"HToWW"`` and ``"VBF"``         -> ``fj_isVBF = 1``, ``fj_isggF = 0``
    * name contains ``"GluGluHToWW"`` (and no "VBF") -> ``fj_isVBF = 0``, ``fj_isggF = 1``
    * anything else                                   -> both labels are 0

    Both columns are always present in the result, so every sample ends up
    with the same schema (previously an ``HToWW`` sample that was neither VBF
    nor GluGlu received no label columns at all).

    Parameters
    ----------
    df : pandas.DataFrame
        Events of one sample. It is not modified; a new frame is returned.
    sample : str
        Sample name, used only for the substring checks above.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the two label columns added and ``rename_vars`` applied.
    """
    is_hww = "HToWW" in sample
    is_vbf = is_hww and "VBF" in sample
    # "VBF" takes precedence over "GluGluHToWW", matching the original if/elif order.
    is_ggf = is_hww and not is_vbf and "GluGluHToWW" in sample

    # assign() builds a new frame instead of writing into the caller's object.
    labelled = df.assign(fj_isggF=int(is_ggf), fj_isVBF=int(is_vbf))
    return rename_vars(labelled)

In [5]:
# events will follow a 90/10 train/test split (the processing cell holds out 10% of the events for testing)
# samples_num = {
# #     "GluGluHToWW_Pt-200ToInf_M-125": 100_000_000_000_000,    # basically all
#     "GluGluHToWW_Pt-200ToInf_M-125": 10,    # basically all

#     "QCD_Pt_170to300": 10,
#     "QCD_Pt_300to470": 10,
#     "QCD_Pt_470to600": 10,
#     "QCD_Pt_600to800": 10,
    
#     "TTToSemiLeptonic": 10,
    
#     "WJetsToLNu_HT-200To400": 10,
#     "WJetsToLNu_HT-400To600": 10,
#     "WJetsToLNu_HT-600To800": 10,
# }

# # v35_10*
# samples_num = {
#     "GluGluHToWW_Pt-200ToInf_M-125": 100_000,    # basically all
# #     "GluGluHToWW_Pt-200ToInf_M-125": 10,    # basically all

#     "QCD_Pt_170to300": 2_500,
#     "QCD_Pt_300to470": 2_500,
#     "QCD_Pt_470to600": 2_500,
#     "QCD_Pt_600to800": 2_500,
    
#     "TTToSemiLeptonic": 100_000,
    
#     "WJetsToLNu_HT-200To400": 30_000,
#     "WJetsToLNu_HT-400To600": 40_000,
#     "WJetsToLNu_HT-600To800": 30_000,
# }


# # v35_11*
# samples_num = {
#     "GluGluHToWW_Pt-200ToInf_M-125": 100_000,    # basically all
# #     "GluGluHToWW_Pt-200ToInf_M-125": 10,    # basically all

#     "QCD_Pt_170to300": 10_000,
#     "QCD_Pt_300to470": 10_000,
#     "QCD_Pt_470to600": 10_000,
#     "QCD_Pt_600to800": 10_000,
    
#     "TTToSemiLeptonic": 200_000,
    
#     "WJetsToLNu_HT-200To400": 60_000,
#     "WJetsToLNu_HT-400To600": 80_000,
#     "WJetsToLNu_HT-600To800": 60_000,
# }

# v35_12*
# Number of events requested per sample, keyed by sample name.
samples_num = {
    "GluGluHToWW_Pt-200ToInf_M-125": 200_000,  # basically all
    # every QCD pT bin gets the same budget
    **dict.fromkeys(
        ("QCD_Pt_170to300", "QCD_Pt_300to470", "QCD_Pt_470to600", "QCD_Pt_600to800"),
        50_000,
    ),
    "TTToSemiLeptonic": 300_000,
    "WJetsToLNu_HT-200To400": 100_000,
    "WJetsToLNu_HT-400To600": 200_000,
    "WJetsToLNu_HT-600To800": 100_000,
}

# # v35_13*
# samples_num = {
#     "GluGluHToWW_Pt-200ToInf_M-125": 100_000,    # basically all
# #     "GluGluHToWW_Pt-200ToInf_M-125": 10,    # basically all

#     "QCD_Pt_170to300": 100_000,
#     "QCD_Pt_300to470": 100_000,
#     "QCD_Pt_470to600": 100_000,
#     "QCD_Pt_600to800": 100_000,
    
#     "TTToSemiLeptonic": 500_000,
    
#     "WJetsToLNu_HT-200To400": 200_000,
#     "WJetsToLNu_HT-400To600": 400_000,
#     "WJetsToLNu_HT-600To800": 200_000,
# }

# # v35_14*
# samples_num = {
#     "GluGluHToWW_Pt-200ToInf_M-125": 100_000,    # basically all
# #     "GluGluHToWW_Pt-200ToInf_M-125": 10,    # basically all

#     "QCD_Pt_170to300": 200_000,
#     "QCD_Pt_300to470": 200_000,
#     "QCD_Pt_470to600": 200_000,
#     "QCD_Pt_600to800": 100_000,
    
#     "TTToSemiLeptonic": 800_000,
    
#     "WJetsToLNu_HT-200To400": 200_000,
#     "WJetsToLNu_HT-400To600": 600_000,
#     "WJetsToLNu_HT-600To800": 200_000,
# }

In [6]:
# Build the train/test ROOT ntuples for every sample found under OUTPATH.
#
# For each <OUTPATH>/<TaggerInput dir>/<sample>/outfiles/*.parquet set this cell:
#   1. loads the parquet files and runs postprocess() (labels + column renames),
#   2. applies the config-v35 selection,
#   3. keeps the first samples_num[sample] selected events,
#   4. splits them into train/test and writes <outfiles>/{train,test}/out.root.
OUTPATH = "../datafiles/TaggerInput_Small/"

# Fraction of the kept events held out for testing.
TEST_FRACTION = 0.1
# Fixed seed so that re-running the cell reproduces the same train/test split.
SPLIT_SEED = 42

for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
    if ".DS_Store" in dir_TaggerInput:
        continue

    for sample in os.listdir(f"{OUTPATH}/{dir_TaggerInput}/"):
        if "DS_Store" in sample:
            continue

        # Check the event budget before the expensive parquet read, rather than
        # failing with a KeyError afterwards.
        if sample not in samples_num:
            print(f"Skipping {sample}: no entry in samples_num")
            continue

        print(f"Processing {sample}")

        outdir = f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles"

        # glob order is filesystem dependent; sort so that head() below always
        # picks the same events.
        parquet_files = sorted(glob.glob(f"{outdir}/*.parquet"))
        if not parquet_files:
            print(f"Skipping {sample}: no parquet files in {outdir}")
            continue

        df = pd.read_parquet(parquet_files)
        df = postprocess(df, sample)

        # apply selection in config v35
        msk1 = (df["lep_fj_dr"] > 0.03) & (df["lep_fj_dr"] < 0.8)
        msk_H = (
            (df["fj_genRes_mass"] == 125)
            & (df["fj_isHVV_Matched"] == 1)
            & (df["fj_lepinprongs"] == 1)
            & (df["fj_nquarks"] == 2)
        )
        msk_noH = df["fj_genRes_mass"] != 125

        df = df[msk1 & (msk_H | msk_noH)]

        # select the number of events specified
        df = df.head(samples_num[sample])

        # train_test_split raises if either side of the split would be empty.
        if len(df) < 2:
            print(f"Skipping {sample}: only {len(df)} event(s) pass the selection")
            continue

        df_train, df_test = sklearn.model_selection.train_test_split(
            df, test_size=TEST_FRACTION, random_state=SPLIT_SEED
        )

        for split_name, df_split in (("train", df_train), ("test", df_test)):
            split_dir = f"{outdir}/{split_name}"
            # os.makedirs avoids passing an interpolated path through the shell.
            os.makedirs(split_dir, exist_ok=True)
            with uproot.recreate(f"{split_dir}/out.root", compression=uproot.LZ4(4)) as rfile:
                # orient="list" gives {column: [values...]}; the extra index=True
                # argument was redundant and is not accepted by pandas < 2.0.
                rfile["Events"] = ak.Array(df_split.to_dict(orient="list"))

        print("--------------------------")

Processing TTToSemiLeptonic


NameError: name 'postprocess' is not defined

# Check Number of events in ntuples

In [98]:
# Count the entries of the "Events" tree in the train/test ROOT files of each
# sample, summed over every TaggerInput directory under OUTPATH.
samples = [
    "GluGluHToWW_Pt-200ToInf_M-125",
    "TTToSemiLeptonic",
    "WJetsToLNu_HT-200To400",
    "WJetsToLNu_HT-400To600",
    "WJetsToLNu_HT-600To800",
    "QCD_Pt_170to300",
    "QCD_Pt_300to470",
    "QCD_Pt_470to600",
    "QCD_Pt_600to800",
]

OUTPATH = "../datafiles/TaggerInput_Small/"

num_events = {}


numtrain_total, numtest_total = 0, 0
for sample in samples:

    for split in ("train", "test"):
        num = 0
        for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
            if "DS_Store" in dir_TaggerInput:
                continue

            rootfile = f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/{split}/out.root"
            try:
                # the context manager closes the file once the count is read
                with uproot.open(rootfile) as rfile:
                    num += rfile["Events"].num_entries
            except FileNotFoundError:
                # the sample simply does not exist in this directory
                continue
            except Exception as err:
                # stay best-effort, but do not hide unreadable/corrupt files
                print(f"Could not read {rootfile}: {err!r}")
                continue
        print(sample, split, num)

        # Add the per-sample count once, after all directories were visited.
        # Adding the running `num` inside the directory loop over-counts as
        # soon as more than one directory contains the sample.
        if split == "train":
            numtrain_total += num
        else:
            numtest_total += num

    print("--------------------------")

print("Total train", numtrain_total)
print("Total test", numtest_total)

GluGluHToWW_Pt-200ToInf_M-125 train 90000
GluGluHToWW_Pt-200ToInf_M-125 test 10000
--------------------------
TTToSemiLeptonic train 270000
TTToSemiLeptonic test 30000
--------------------------
WJetsToLNu_HT-200To400 train 90000
WJetsToLNu_HT-200To400 test 10000
--------------------------
WJetsToLNu_HT-400To600 train 180000
WJetsToLNu_HT-400To600 test 20000
--------------------------
WJetsToLNu_HT-600To800 train 90000
WJetsToLNu_HT-600To800 test 10000
--------------------------
QCD_Pt_170to300 train 45000
QCD_Pt_170to300 test 5000
--------------------------
QCD_Pt_300to470 train 45000
QCD_Pt_300to470 test 5000
--------------------------
QCD_Pt_470to600 train 45000
QCD_Pt_470to600 test 5000
--------------------------
QCD_Pt_600to800 train 45000
QCD_Pt_600to800 test 5000
--------------------------
Total train 900000
Total test 100000


In [14]:
# Scratch check: the decimal literal 0.000001 is displayed as 1e-06.
0.000001

1e-06

In [15]:
# Scratch check: the scientific-notation literal 1e-6 is displayed as 1e-06.
1e-6

1e-06

In [8]:
# Count the entries of the "Events" tree in the train/test ROOT files of the
# samples enabled below, summed over every TaggerInput directory under OUTPATH.
samples = [
#     "GluGluHToWW_Pt-200ToInf_M-125",  
    "TTToSemiLeptonic",
#     "WJetsToLNu_HT-200To400",
#     "WJetsToLNu_HT-400To600",
#     "WJetsToLNu_HT-600To800",
#     "QCD_Pt_170to300",
#     "QCD_Pt_300to470",
#     "QCD_Pt_470to600",
#     "QCD_Pt_600to800",
]

OUTPATH = "../datafiles/TaggerInput_Small/"

num_events = {}


numtrain_total, numtest_total = 0, 0
for sample in samples:

    for split in ("train", "test"):
        num = 0
        for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
            if "DS_Store" in dir_TaggerInput:
                continue

            rootfile = f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/{split}/out.root"
            try:
                # the context manager closes the file once the count is read
                with uproot.open(rootfile) as rfile:
                    num += rfile["Events"].num_entries
            except FileNotFoundError:
                # the sample simply does not exist in this directory
                continue
            except Exception as err:
                # stay best-effort, but do not hide unreadable/corrupt files
                # (a bare `except` would report them as a silent count of 0)
                print(f"Could not read {rootfile}: {err!r}")
                continue
        print(sample, split, num)

        # Add the per-sample count once, after all directories were visited.
        # Adding the running `num` inside the directory loop over-counts as
        # soon as more than one directory contains the sample.
        if split == "train":
            numtrain_total += num
        else:
            numtest_total += num

    print("--------------------------")

print("Total train", numtrain_total)
print("Total test", numtest_total)

TTToSemiLeptonic train 0
TTToSemiLeptonic test 30000
--------------------------
Total train 0
Total test 30000
