In [33]:
import uproot
import pandas
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint, CSVLogger
import gc
from sklearn.model_selection import train_test_split
import keras

In [2]:
# Branch-name configuration for the ROOT tree read further down.
# Of these lists, `input_branches` and `truth_branches` are the ones passed to
# ReadBranchesFromFile in the loading cells; `central_tau_id_branches` only
# contributes to `all_branches` in the code visible here.

# NOTE(review): judging by the names these look like outputs of already existing
# tau-ID discriminators kept for reference -- confirm with the tuple producer.
central_tau_id_branches = ['againstElectronMVA6Raw', 'byCombinedIsolationDeltaBetaCorrRaw3Hits',
    'byIsolationMVArun2v1DBoldDMwLTraw', 'byIsolationMVArun2v1DBdR03oldDMwLTraw',
    'byIsolationMVArun2v1DBoldDMwLTraw2016', 'byIsolationMVArun2017v2DBoldDMwLTraw2017',
    'byIsolationMVArun2017v2DBoldDMdR0p3wLTraw2017', 'id_flags']
# Single label branch; its integer codes are turned into 4 indicator columns by
# VectorizeGenMatch.
truth_branches = ['gen_match']
# Input variables of the network. ReadBranchesFromFile fills one column of X per
# entry, in exactly this order, so reordering the list reorders the model inputs.
# (The recorded run below reports 129 branches.)
input_branches = ['pt', 'eta', 'mass', 'decayMode', 'chargedIsoPtSum', 'neutralIsoPtSum', 'neutralIsoPtSumWeight',
                  'photonPtSumOutsideSignalCone', 'puCorrPtSum',
                  'dxy', 'dxy_sig', 'dz', 'ip3d', 'ip3d_sig',
                  'hasSecondaryVertex', 'flightLength_r', 'flightLength_dEta', 'flightLength_dPhi',
                  'flightLength_sig', 'leadChargedHadrCand_pt', 'leadChargedHadrCand_dEta',
                  'leadChargedHadrCand_dPhi', 'leadChargedHadrCand_mass', 'pt_weighted_deta_strip',
                  'pt_weighted_dphi_strip', 'pt_weighted_dr_signal', 'pt_weighted_dr_iso',
                  'leadingTrackNormChi2', 'e_ratio', 'gj_angle_diff', 'n_photons', 'emFraction',
                  'has_gsf_track', 'inside_ecal_crack',
                  'gsf_ele_matched', 'gsf_ele_pt', 'gsf_ele_dEta', 'gsf_ele_dPhi', 'gsf_ele_mass', 'gsf_ele_Ee',
                  'gsf_ele_Egamma', 'gsf_ele_Pin', 'gsf_ele_Pout', 'gsf_ele_EtotOverPin', 'gsf_ele_Eecal',
                  'gsf_ele_dEta_SeedClusterTrackAtCalo', 'gsf_ele_dPhi_SeedClusterTrackAtCalo', 'gsf_ele_mvaIn_sigmaEtaEta',
                  'gsf_ele_mvaIn_hadEnergy',
                  'gsf_ele_mvaIn_deltaEta', 'gsf_ele_Chi2NormGSF', 'gsf_ele_GSFNumHits', 'gsf_ele_GSFTrackResol',
                  'gsf_ele_GSFTracklnPt', 'gsf_ele_Chi2NormKF', 'gsf_ele_KFNumHits',
                  'leadChargedCand_etaAtEcalEntrance', 'leadChargedCand_pt', 'leadChargedHadrCand_HoP',
                  'leadChargedHadrCand_EoP', 'tau_visMass_innerSigCone', 'n_matched_muons', 'muon_pt', 'muon_dEta', 'muon_dPhi',
                  'muon_n_matches_DT_1', 'muon_n_matches_DT_2', 'muon_n_matches_DT_3', 'muon_n_matches_DT_4',
                  'muon_n_matches_CSC_1', 'muon_n_matches_CSC_2', 'muon_n_matches_CSC_3', 'muon_n_matches_CSC_4',
                  'muon_n_hits_DT_2', 'muon_n_hits_DT_3', 'muon_n_hits_DT_4',
                  'muon_n_hits_CSC_2', 'muon_n_hits_CSC_3', 'muon_n_hits_CSC_4',
                  'muon_n_hits_RPC_2', 'muon_n_hits_RPC_3', 'muon_n_hits_RPC_4',
                  'muon_n_stations_with_matches_03', 'muon_n_stations_with_hits_23', 
                  'signalChargedHadrCands_sum_innerSigCone_pt', 'signalChargedHadrCands_sum_innerSigCone_dEta',
                  'signalChargedHadrCands_sum_innerSigCone_dPhi', 'signalChargedHadrCands_sum_innerSigCone_mass',
                  'signalChargedHadrCands_sum_outerSigCone_pt', 'signalChargedHadrCands_sum_outerSigCone_dEta',
                  'signalChargedHadrCands_sum_outerSigCone_dPhi', 'signalChargedHadrCands_sum_outerSigCone_mass',
                  'signalChargedHadrCands_nTotal_innerSigCone', 'signalChargedHadrCands_nTotal_outerSigCone',
                  'signalNeutrHadrCands_sum_innerSigCone_pt', 'signalNeutrHadrCands_sum_innerSigCone_dEta',
                  'signalNeutrHadrCands_sum_innerSigCone_dPhi', 'signalNeutrHadrCands_sum_innerSigCone_mass',
                  'signalNeutrHadrCands_sum_outerSigCone_pt', 'signalNeutrHadrCands_sum_outerSigCone_dEta',
                  'signalNeutrHadrCands_sum_outerSigCone_dPhi', 'signalNeutrHadrCands_sum_outerSigCone_mass',
                  'signalNeutrHadrCands_nTotal_innerSigCone', 'signalNeutrHadrCands_nTotal_outerSigCone',
                  'signalGammaCands_sum_innerSigCone_pt', 'signalGammaCands_sum_innerSigCone_dEta',
                  'signalGammaCands_sum_innerSigCone_dPhi', 'signalGammaCands_sum_innerSigCone_mass',
                  'signalGammaCands_sum_outerSigCone_pt', 'signalGammaCands_sum_outerSigCone_dEta',
                  'signalGammaCands_sum_outerSigCone_dPhi', 'signalGammaCands_sum_outerSigCone_mass',
                  'signalGammaCands_nTotal_innerSigCone', 'signalGammaCands_nTotal_outerSigCone',
                  'isolationChargedHadrCands_sum_pt', 'isolationChargedHadrCands_sum_dEta',
                  'isolationChargedHadrCands_sum_dPhi', 'isolationChargedHadrCands_sum_mass',
                  'isolationChargedHadrCands_nTotal',
                  'isolationNeutrHadrCands_sum_pt', 'isolationNeutrHadrCands_sum_dEta',
                  'isolationNeutrHadrCands_sum_dPhi', 'isolationNeutrHadrCands_sum_mass',
                  'isolationNeutrHadrCands_nTotal',
                  'isolationGammaCands_sum_pt', 'isolationGammaCands_sum_dEta',
                  'isolationGammaCands_sum_dPhi', 'isolationGammaCands_sum_mass',
                  'isolationGammaCands_nTotal',
                 ]
# Every branch name in a single list: truth first, then inputs, then the central IDs.
all_branches = truth_branches + input_branches + central_tau_id_branches
# One suffix per output class.
# NOTE(review): the order presumably mirrors the column order produced by
# VectorizeGenMatch (codes 1/3, 2/4, 5, 6) -- confirm before relying on it.
match_suffixes = [ 'e', 'mu', 'tau', 'jet' ]
# Expanded label names ('gen_match_e', 'gen_match_mu', ...); the length of this
# list is what defines n_outputs for the network.
gen_match_ex_branches = [ 'gen_match_{}'.format(suff) for suff in match_suffixes ]

In [3]:
def ReadBranchesFromFile(file_name, tree_name, branches, dtype, chunk_size = 10):
    """Load a set of branches of a ROOT tree into one dense 2-D numpy array.

    The branches are read `chunk_size` at a time, and the file is re-opened for
    every chunk, so that only one chunk of raw branch arrays is alive next to the
    output buffer at any moment.

    Parameters
    ----------
    file_name : str
        Path of the ROOT file, passed to `uproot.open`.
    tree_name : str
        Key of the tree inside the file.
    branches : list of str
        Branch names to load; column ``i`` of the result holds ``branches[i]``.
    dtype : numpy dtype
        Element type of the result; every branch is cast to it.
    chunk_size : int, optional
        Number of branches read per pass over the file. Must be >= 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(tree.numentries, len(branches))`` with dtype `dtype`.

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1. Without this check a negative value
        would produce no chunks at all and the uninitialised buffer would be
        returned as if it were data.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer, got {!r}".format(chunk_size))

    branch_chunks = [ branches[pos:pos+chunk_size] for pos in range(0, len(branches), chunk_size) ]

    # First pass: only the entry count is needed to size the output buffer.
    with uproot.open(file_name) as file:
        data = np.empty([file[tree_name].numentries, len(branches)], dtype=dtype)
    gc.collect()

    n = 0  # index of the next output column to fill
    for chunk in branch_chunks:
        with uproot.open(file_name) as file:
            arrays = file[tree_name].arrays(chunk)
            for br in chunk:
                # The returned mapping is keyed by the encoded (bytes) branch name.
                data[:, n] = arrays[br.encode()].astype(dtype)
                n += 1
                print("branch '{}' loaded. {}/{}".format(br, n, len(branches)))
            # Drop the last reference to the chunk so the collection below can
            # actually release it before the next chunk is read.
            del arrays
        gc.collect()
    return data

In [4]:
def VectorizeGenMatch(data, dtype):
    """Expand a single column of gen_match codes into four indicator columns.

    `data` must be 2-D with exactly one column, otherwise RuntimeError is raised.
    The result has shape (data.shape[0], 4) and dtype `dtype`:

        column 0 -> code is 1 or 3
        column 1 -> code is 2 or 4
        column 2 -> code is 5
        column 3 -> code is 6

    Rows with any other code are all zeros.
    NOTE(review): the four columns presumably correspond to the
    'e', 'mu', 'tau', 'jet' classes of `match_suffixes` -- confirm.
    """
    if data.shape[1] != 1:
        raise RuntimeError("Invalid input")

    codes = data[:, 0]
    # One boolean mask per output column, in column order.
    column_masks = (
        (codes == 1) | (codes == 3),
        (codes == 2) | (codes == 4),
        codes == 5,
        codes == 6,
    )

    v_data = np.zeros([data.shape[0], 4], dtype=dtype)
    for column, mask in enumerate(column_masks):
        v_data[:, column] = mask.astype(dtype)
    return v_data

In [5]:
# Read every network input branch from the 'taus' tree as float32; column i of X
# corresponds to input_branches[i].
X = ReadBranchesFromFile(
    file_name='../../tuples/TTTo2L2Nu_TuneCP5_PSweights_13TeV-powheg-pythia8/training_5e6/shuffled.root',
    tree_name='taus',
    branches=input_branches,
    dtype=np.float32,
)
X.shape

branch 'pt' loaded. 1/129
branch 'eta' loaded. 2/129
branch 'mass' loaded. 3/129
branch 'decayMode' loaded. 4/129
branch 'chargedIsoPtSum' loaded. 5/129
branch 'neutralIsoPtSum' loaded. 6/129
branch 'neutralIsoPtSumWeight' loaded. 7/129
branch 'photonPtSumOutsideSignalCone' loaded. 8/129
branch 'puCorrPtSum' loaded. 9/129
branch 'dxy' loaded. 10/129
branch 'dxy_sig' loaded. 11/129
branch 'dz' loaded. 12/129
branch 'ip3d' loaded. 13/129
branch 'ip3d_sig' loaded. 14/129
branch 'hasSecondaryVertex' loaded. 15/129
branch 'flightLength_r' loaded. 16/129
branch 'flightLength_dEta' loaded. 17/129
branch 'flightLength_dPhi' loaded. 18/129
branch 'flightLength_sig' loaded. 19/129
branch 'leadChargedHadrCand_pt' loaded. 20/129
branch 'leadChargedHadrCand_dEta' loaded. 21/129
branch 'leadChargedHadrCand_dPhi' loaded. 22/129
branch 'leadChargedHadrCand_mass' loaded. 23/129
branch 'pt_weighted_deta_strip' loaded. 24/129
branch 'pt_weighted_dphi_strip' loaded. 25/129
branch 'pt_weighted_dr_signal' l

(30000000, 129)

In [6]:
# Read the integer gen_match code of every entry, then expand it into the four
# indicator columns used as training targets.
Y_raw = ReadBranchesFromFile(
    file_name='../../tuples/TTTo2L2Nu_TuneCP5_PSweights_13TeV-powheg-pythia8/training_5e6/shuffled.root',
    tree_name='taus',
    branches=truth_branches,
    dtype=int,
)
Y = VectorizeGenMatch(data=Y_raw, dtype=int)
Y.shape

branch 'gen_match' loaded. 1/1


(30000000, 4)

In [7]:
# Network dimensions follow directly from the branch lists: one output per
# expanded gen_match label, one input per input branch.
n_outputs = len(gen_match_ex_branches)
input_shape = (len(input_branches),)

In [8]:
# Run a full garbage-collection pass after the data loading cells; the value
# displayed by the cell is the number of unreachable objects gc.collect() found.
gc.collect()

2671

In [41]:
n_hidden_layers = 10
n_neurons = 1024
model_name = "{}L{}N".format(n_hidden_layers, n_neurons)

# Fully connected classifier built from identical Dense -> BatchNorm -> ReLU ->
# Dropout blocks, followed by a softmax output layer with n_outputs units.
# Note that the input block comes on top of the n_hidden_layers blocks, so the
# network contains n_hidden_layers + 1 such blocks in total.
model = Sequential()
for block_index in range(n_hidden_layers + 1):
    # Only the very first Dense layer declares the shape of the input.
    first_layer_options = {'input_shape': input_shape} if block_index == 0 else {}
    model.add(Dense(n_neurons, kernel_initializer='he_uniform', **first_layer_options))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
model.add(Dense(n_outputs))
model.add(Activation("softmax"))

In [48]:
# Replace whatever `model` currently is with a network restored from disk and
# recompile it with an Adam optimizer at a very small learning rate (1e-7).
# NOTE(review): the file name suggests this is the best-val_acc checkpoint of the
# previous training stage ("s2") -- confirm.
model_name = '10L1024N'
model = load_model('10L1024N_0604_v3_s2_acc.hdf5')
opt = keras.optimizers.Adam(lr=1e-7)
model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

In [27]:
#model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"],)

In [49]:
train_name = '{}_0604_v3_s3'.format(model_name)
# The first 90% of the rows are used for training, the remaining 10% for validation.
n_train = int(X.shape[0] * 0.9)

# Two checkpoints of the full model are kept: the one with the highest
# validation accuracy and the one with the lowest validation loss.
cb_acc = ModelCheckpoint("{}_acc.hdf5".format(train_name), monitor="val_acc", mode="max",
                         save_best_only=True, save_weights_only=False, verbose=1)
cb_loss = ModelCheckpoint("{}_loss.hdf5".format(train_name), monitor="val_loss", mode="min",
                          save_best_only=True, save_weights_only=False, verbose=1)
# Per-epoch metrics are also written to "<train_name>.log".
csv_log = CSVLogger("{}.log".format(train_name))

fit_hist = model.fit(
    X[:n_train], Y[:n_train],
    validation_data=(X[n_train:], Y[n_train:]),
    batch_size=10000,
    epochs=20,
    # All four classes currently carry the same weight.
    class_weight={0: 1, 1: 1, 2: 1, 3: 1 },
    callbacks=[cb_acc, cb_loss, csv_log],
    verbose=1,
)

Train on 27000000 samples, validate on 3000000 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.90957, saving model to 10L1024N_0604_v3_s3_acc.hdf5

Epoch 00001: val_loss improved from inf to 0.24795, saving model to 10L1024N_0604_v3_s3_loss.hdf5
Epoch 2/20

Epoch 00002: val_acc did not improve from 0.90957

Epoch 00002: val_loss improved from 0.24795 to 0.24660, saving model to 10L1024N_0604_v3_s3_loss.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.90957 to 0.91294, saving model to 10L1024N_0604_v3_s3_acc.hdf5

Epoch 00003: val_loss did not improve from 0.24660
Epoch 4/20

Epoch 00004: val_acc did not improve from 0.91294

Epoch 00004: val_loss improved from 0.24660 to 0.24355, saving model to 10L1024N_0604_v3_s3_loss.hdf5
Epoch 5/20

Epoch 00005: val_acc did not improve from 0.91294

Epoch 00005: val_loss did not improve from 0.24355
Epoch 6/20

Epoch 00006: val_acc did not improve from 0.91294

Epoch 00006: val_loss did not improve from 0.24355
Epoch 7/20

Epo

KeyboardInterrupt: 

In [31]:
#model.save("5L1024N_0604_20ep.hdf5")