# Exploring econdata 

In [1]:
import uproot
import awkward as ak
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

import matplotlib.pylab as pylab
params = {'legend.fontsize': 'medium',
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'large',
         'ytick.labelsize':'medium'}
pylab.rcParams.update(params)

## 1. Load data set from ntuple

In [2]:
fname = "../data/econdata_Dec22_100evt.root"
ev_dict = uproot.open(fname)["FloatingpointAutoEncoderStrideDummyHistomaxGenmatchGenclustersntuple/HGCalTriggerNtuple"]

In [3]:
#ev_dict.show()

In [4]:
arrays_toread = [
    "econ_index","econ_data",
    "econ_subdet","econ_zside","econ_layer","econ_waferu","econ_waferv","econ_wafertype",
    "tc_simenergy",
    "tc_subdet","tc_zside","tc_layer","tc_waferu","tc_waferv","tc_wafertype",
    "gen_pt","gen_energy","gen_eta","gen_phi",
    "genpart_pt","genpart_energy","econ_id"
]
events = ev_dict.arrays(arrays_toread)

#Separate the data sets
econ = ak.zip({
    "index": events['econ_index'],
    "id":events['econ_id'],
    "data": events["econ_data"],
    "subdet": events["econ_subdet"],
    "zside": events["econ_zside"],
    "layer": events["econ_layer"],
    "waferu": events["econ_waferu"],
    "waferv": events["econ_waferv"],
})

tc = ak.zip({
    "simenergy": events["tc_simenergy"],
    "subdet": events["tc_subdet"],
    "zside": events["tc_zside"],
    "layer": events["tc_layer"],
    "waferu": events["tc_waferu"],
    "waferv": events["tc_waferv"],
})

gen = ak.zip({
    "pt": events["gen_pt"],
    "energy": events["gen_energy"],
    "eta": events["gen_eta"],
    "phi": events["gen_phi"],
})

In [5]:
# find wafers that we want to save
# the problem is that the number of wafers from trigger cells: trigger cells/48 
# is not the same as the number of wafers from econ data: econ_data/16
df_tc = ak.to_pandas(tc)
df_econ = ak.to_pandas(econ)
df_gen = ak.to_pandas(gen)

In [6]:
df_simtotal = df_tc.groupby(['entry','subdet','zside','layer','waferu','waferv'])["simenergy"].sum()

#with pd.option_context('display.max_rows', None,
#                       'display.max_columns', None,
#                       'display.precision', 3,
#                       ):
#    print(df_simtotal)
#print(df_simtotal)

## 2. Process the econ data set

In [7]:
#Prepare df_econ
df_econ.reset_index(inplace=True)
df_econ.set_index(['entry','subdet','zside','layer','waferu','waferv'],inplace=True)
df_econ['simenergy'] = df_simtotal
df_econ.drop(columns='subentry',inplace=True)

#filter out zero simenergy
df_econ_wsimenergy = df_econ[df_econ.simenergy > 0]
df_econ_wsimenergy['layer'] = df_econ_wsimenergy.index.get_level_values('layer') 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [8]:
def prepare_data(df_econ_wsimenergy):
    """
    Take in econ data frame and select out the data for the \
    top 50 most common ids
    """
    
    #Another dataframe to perform counting 
    df_mask = df_econ_wsimenergy[['layer','id','index']].droplevel(3)
    df_mask = df_mask.groupby(['layer','id']).count()
    df_mask['layer'] = df_mask.index.get_level_values('layer')
    df_mask['id'] = df_mask.index.get_level_values('id')
    
    #Count
    df_mask['count'] = df_mask['index']/16
    df_mask = df_mask.drop(['index'], axis = 1)
    
    #Select the ids
    id_list = df_mask.sort_values(['count'], ascending = False).iloc[:50]['id'].tolist()
    
    #return the new dataframe with only the selected ids
    return df_econ_wsimenergy[df_econ_wsimenergy['id'].isin(id_list)]

In [9]:
b = prepare_data(df_econ_wsimenergy)

In [11]:
b

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,index,id,data,simenergy,layer
entry,subdet,zside,layer,waferu,waferv,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,1,29,1,3,0,3020185856,256,2.053700,29
0,2,1,29,1,3,1,3020185856,0,2.053700,29
0,2,1,29,1,3,2,3020185856,0,2.053700,29
0,2,1,29,1,3,3,3020185856,320,2.053700,29
0,2,1,29,1,3,4,3020185856,384,2.053700,29
...,...,...,...,...,...,...,...,...,...,...
99,1,1,19,1,2,11,2991341824,384,0.486294,19
99,1,1,19,1,2,12,2991341824,192,0.486294,19
99,1,1,19,1,2,13,2991341824,192,0.486294,19
99,1,1,19,1,2,14,2991341824,320,0.486294,19


In [10]:
df_econ_wsimenergy

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,index,id,data,simenergy,layer
entry,subdet,zside,layer,waferu,waferv,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,1,30,1,4,0,3020456192,320,0.639610,30
0,2,1,30,1,4,1,3020456192,0,0.639610,30
0,2,1,30,1,4,2,3020456192,0,0.639610,30
0,2,1,30,1,4,3,3020456192,320,0.639610,30
0,2,1,30,1,4,4,3020456192,384,0.639610,30
...,...,...,...,...,...,...,...,...,...,...
99,1,1,21,4,3,11,2991875072,64,0.055536,21
99,1,1,21,4,3,12,2991875072,448,0.055536,21
99,1,1,21,4,3,13,2991875072,448,0.055536,21
99,1,1,21,4,3,14,2991875072,448,0.055536,21


In [None]:
#TODO:
# 1. Reformat the dataframe a bit so it's more structured and ready for training
# 2. Train test split
# 3. Plug it in a simple DNN to see what happens.