In [160]:
import sys
sys.path.append("..")

In [161]:
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,8)

In [162]:
import pickle
from sklearn.preprocessing import RobustScaler
from recnn.preprocessing import extract, permute_by_pt

def load_test(filename, pt_range=(250, 300), mass_range=(50, 110)):
    """Load a pickled jet test set, preprocess it and crop it to a pt/mass window.

    Parameters
    ----------
    filename : str
        Path of a pickle file containing an ``(X, y)`` pair: a sequence of
        jets and the matching sequence of labels.
    pt_range : tuple, optional
        ``(low, high)`` bounds on ``jet["pt"]``; both bounds are exclusive.
    mass_range : tuple, optional
        ``(low, high)`` bounds on ``jet["mass"]``; both bounds are exclusive.

    Returns
    -------
    X : list
        Preprocessed jets that fall inside the pt/mass window.
    y : list
        Labels of the jets that were kept, in the same order as ``X``.
    tf : RobustScaler
        Scaler fitted on the stacked ``"content"`` of *all* jets, i.e. before
        the cropping step.
    """
    print("Loading data...")

    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    # The context manager closes the file even if unpickling fails.
    with open(filename, "rb") as fd:
        X, y = pickle.load(fd)

    y = np.array(y)

    print("\tfilename = %s" % filename)
    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))

    # Preprocessing
    print("Preprocessing...")
    X = [extract(permute_by_pt(jet)) for jet in X]
    tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X]))

    for jet in X:
        jet["content"] = tf.transform(jet["content"])

    # Cropping: evaluate the window once and reuse the surviving indices for
    # both the jets and their labels so that the two stay aligned.
    pt_min, pt_max = pt_range
    mass_min, mass_max = mass_range
    keep = [i for i, jet in enumerate(X)
            if pt_min < jet["pt"] < pt_max
            and mass_min < jet["mass"] < mass_max]

    y = [y[i] for i in keep]
    X = [X[i] for i in keep]

    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))

    return X, y, tf

# Test-set file to evaluate on; the Delphes variant is kept commented out.
TEST_SET_PATH = "../data/w-vs-qcd/anti-kt/antikt-test.pickle-py27-kt"
# TEST_SET_PATH = "../data/w-vs-qcd/anti-kt/antikt-delphes-test.pickle-kt"

# `tf` is the scaler fitted on jet contents; the event-loading cell reuses it.
X, y, tf = load_test(TEST_SET_PATH)

Loading data...
	filename = ../data/w-vs-qcd/anti-kt/antikt-test.pickle-py27-kt
	X size = 20000
	y size = 20000
Preprocessing...
	X size = 7690
	y size = 7690


In [163]:
from recnn.recnn import grnn_predict_simple
from recnn.recnn import grnn_transform_simple

import pickle

# Load the pickled model parameters used by grnn_predict_simple /
# grnn_transform_simple below.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# Alternative model file: "../models/delphes/w-delphes-kt-1.pickle"
with open("../models/delphes/w-kt-1.pickle", "rb") as fd:
    # The context manager closes the file even if unpickling raises.
    params = pickle.load(fd)

In [164]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the loaded model on the cropped jets returned by load_test.
jet_scores = grnn_predict_simple(params, X)
roc_auc_score(y, jet_scores)

0.91236886384659421

In [165]:
# Read the event-level test set.  The file stores one pickled (event, label)
# record after another, so it is consumed with repeated pickle.load calls.
#
# NOTE: `tf` must still be the jet-content scaler returned by load_test; the
# "Fixed embedding" cell further down rebinds `tf` to a different scaler, so
# this cell must not be re-run after that one without re-running load_test.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
N_EVENTS = 10000  # number of records read from the file

events = []
ys = []

# Alternative input: "../data/w-vs-qcd/anti-kt/antikt-delphes-event-test.pickle-kt"
with open("../data/w-vs-qcd/anti-kt/antikt-event-test.pickle-kt", "rb") as fd:
    for i in range(N_EVENTS):
        if i % 20 == 0:
            print(i)

        # A dedicated name for the per-event label keeps the global `y`
        # untouched until the final label array is assembled below.
        event, label = pickle.load(fd)

        original_features = []
        jets = []

        # `rank` is the position of the jet within the event; it no longer
        # shadows the outer loop counter `i`.
        for rank, (phi, eta, pt, mass, jet) in enumerate(event):
            if label == 0 and rank == 1:  # remove 2nd hard qcd jets!
                continue

            original_features.append((phi, eta, pt, mass))
            jet = extract(permute_by_pt(jet))
            jet["content"] = tf.transform(jet["content"])
            jets.append(jet)

        events.append((np.array(original_features), jets))
        ys.append(label)

# One label per event, aligned with `events`.
y = np.array(ys)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820
3840
3860
3880
3900
3920
3940
3960
3980
4000
4020
4040
4060
4080
4100
4120
4140
4160
4180
4200
4

In [166]:
# ROC AUC obtained when each event is scored from a single jet, taken at a
# fixed position (rank) in the event.  Events with too few jets are skipped.
for rank in range(10):
    scores = []
    ys = []

    for (_features, event_jets), label in zip(events, y):
        if len(event_jets) <= rank:
            continue

        # Slice instead of index so the model still receives a list of jets.
        scores.append(grnn_predict_simple(params, event_jets[rank:rank + 1])[0])
        ys.append(label)

    print(rank, roc_auc_score(ys, scores))

(0, 0.89633835585353427)
(1, 0.5926277037051082)
(2, 0.59154354366174167)
(3, 0.56017140240685614)
(4, 0.53037420121496814)
(5, 0.51871830074873204)
(6, 0.50968828038753122)
(7, 0.50247924009916967)
(8, 0.5054236602169464)
(9, 0.49864335994573444)


In [167]:
# Event score = sum of the single-jet scores of the jets at positions 0..4.
scores = []

for _features, event_jets in events:
    total = 0.0

    # Accumulate explicitly, left to right, one jet at a time.
    for rank in range(5):
        total += grnn_predict_simple(params, event_jets[rank:rank + 1])[0]

    scores.append(total)

roc_auc_score(y, scores)

0.86977407479096291

---

# Fixed embedding

In [179]:
from sklearn.preprocessing import RobustScaler
# Fit a scaler on the per-jet feature rows of every event.
# NOTE: this rebinds `tf`, which until now was the jet-content scaler returned
# by load_test and used while loading the events.
stacked_features = np.vstack([event_features for event_features, _jets in events])
tf = RobustScaler().fit(stacked_features)

In [186]:
# One row per jet: scaled original features next to the embedding produced by
# the pre-trained model; only the first 5 rows of each event are kept.
# NOTE: this rebinds `X`, previously the jets returned by load_test.
X = np.array([
    np.hstack([tf.transform(features),
               grnn_transform_simple(params, jets)])[:5]
    for features, jets in events
])
print(X.shape)

(10000, 5, 44)


In [188]:
import autograd.numpy as np
from recnn.recnn import glorot_uniform
from recnn.recnn import orthogonal
from recnn.recnn import relu
from recnn.recnn import sigmoid
from recnn.recnn import check_random_state

def vanilla_rnn_init(n_features, n_hidden, random_state=None):
    """Create the parameter dictionary of the vanilla RNN and its classifier.

    Parameters
    ----------
    n_features : int
        Size passed to `glorot_uniform` for the input weights ``W_x``.
    n_hidden : int
        Size of the hidden state; also used for the classifier layers.
    random_state : optional
        Forwarded to `check_random_state` to obtain the generator used for
        every random draw.

    Returns
    -------
    dict
        Keys ``init_h``, ``W_h``, ``W_x``, ``b``, ``W_clf`` and ``b_clf``.
    """
    rng = check_random_state(random_state)

    # Random draws are made in a fixed order (recurrent part first, then the
    # classifier layers) so that a given seed always yields the same values.
    init_h = glorot_uniform(n_hidden, 0, rng)
    W_h = orthogonal((n_hidden, n_hidden), rng)
    W_x = glorot_uniform(n_hidden, n_features, rng)

    clf_hidden_1 = glorot_uniform(n_hidden, n_hidden, rng)
    clf_hidden_2 = glorot_uniform(n_hidden, n_hidden, rng)
    clf_output = glorot_uniform(n_hidden, 0, rng)

    return {"init_h": init_h,
            "W_h": W_h,
            "W_x": W_x,
            "b": np.zeros(n_hidden),
            "W_clf": [clf_hidden_1, clf_hidden_2, clf_output],
            # The output bias starts at one, the hidden biases at zero.
            "b_clf": [np.zeros(n_hidden),
                      np.zeros(n_hidden),
                      np.ones(1)]}

def vanilla_rnn_transform(params, jets):
    """Run the vanilla RNN along axis 1 of `jets` and return the last state.

    `jets` is indexed as ``jets[:, t, :]``, so it is assumed to be a 3-d array
    laid out as (sample, step, feature) -- TODO confirm against callers.
    """
    n_samples = len(jets)

    # Every sample starts from the same learned initial state.
    state = np.tile(params["init_h"], n_samples).reshape(n_samples, -1)

    for step in range(jets.shape[1]):
        inputs = jets[:, step, :]
        from_state = np.dot(params["W_h"], state.T).T
        from_input = np.dot(params["W_x"], inputs.T).T
        state = relu(from_state + from_input + params["b"])

    return state

def vanilla_rnn_predict(params, jets):
    """Score `jets` with the vanilla RNN followed by the 3-layer classifier.

    Returns the flattened output of the final sigmoid layer.
    """
    h = vanilla_rnn_transform(params, jets)

    # Two relu layers ...
    for layer in (0, 1):
        h = relu(np.dot(params["W_clf"][layer], h.T).T + params["b_clf"][layer])

    # ... followed by the sigmoid output layer.
    out = sigmoid(np.dot(params["W_clf"][2], h.T).T + params["b_clf"][2])

    return out.ravel()



def gru_init(n_features, n_hidden, random_state=None):
    """Create the parameter dictionary of the GRU and its classifier.

    Parameters
    ----------
    n_features : int
        Size passed to `glorot_uniform` for the input weights ``W_*x``.
    n_hidden : int
        Size of the hidden state; also used for the classifier layers.
    random_state : optional
        Forwarded to `check_random_state` to obtain the generator used for
        every random draw.

    Returns
    -------
    dict
        Keys ``init_h``; ``W_hh``, ``W_hx``, ``b_h`` (candidate state);
        ``W_zh``, ``W_zx``, ``b_z`` and ``W_rh``, ``W_rx``, ``b_r`` (the two
        gates); ``W_clf`` and ``b_clf`` (classifier).
    """
    rng = check_random_state(random_state)

    # Random draws are made in a fixed order so that a given seed always
    # yields the same values: initial state, candidate, z gate, r gate,
    # classifier.
    init_h = glorot_uniform(n_hidden, 0, rng)

    W_hh = orthogonal((n_hidden, n_hidden), rng)
    W_hx = glorot_uniform(n_hidden, n_features, rng)

    W_zh = orthogonal((n_hidden, n_hidden), rng)
    W_zx = glorot_uniform(n_hidden, n_features, rng)

    W_rh = orthogonal((n_hidden, n_hidden), rng)
    W_rx = glorot_uniform(n_hidden, n_features, rng)

    clf_hidden_1 = glorot_uniform(n_hidden, n_hidden, rng)
    clf_hidden_2 = glorot_uniform(n_hidden, n_hidden, rng)
    clf_output = glorot_uniform(n_hidden, 0, rng)

    return {"init_h": init_h,
            "W_hh": W_hh,
            "W_hx": W_hx,
            "b_h": np.zeros(n_hidden),
            "W_zh": W_zh,
            "W_zx": W_zx,
            "b_z": np.zeros(n_hidden),
            "W_rh": W_rh,
            "W_rx": W_rx,
            "b_r": np.zeros(n_hidden),
            "W_clf": [clf_hidden_1, clf_hidden_2, clf_output],
            # The output bias starts at one, the hidden biases at zero.
            "b_clf": [np.zeros(n_hidden),
                      np.zeros(n_hidden),
                      np.ones(1)]}

def gru_transform(params, jets):
    """Run the GRU along axis 1 of `jets` and return the last hidden state.

    `jets` is indexed as ``jets[:, t, :]``, so it is assumed to be a 3-d array
    laid out as (sample, step, feature) -- TODO confirm against callers.
    """
    def preactivation(w_state, w_input, bias, state_in, inputs):
        # W_state . state + W_input . x + b, for a whole batch of rows.
        return (np.dot(params[w_state], state_in.T).T
                + np.dot(params[w_input], inputs.T).T
                + params[bias])

    n_samples = len(jets)

    # Every sample starts from the same learned initial state.
    state = np.tile(params["init_h"], n_samples).reshape(n_samples, -1)

    for step in range(jets.shape[1]):
        inputs = jets[:, step, :]

        update = sigmoid(preactivation("W_zh", "W_zx", "b_z", state, inputs))
        reset = sigmoid(preactivation("W_rh", "W_rx", "b_r", state, inputs))
        # The candidate state only sees the part of the old state let through
        # by the reset gate.
        candidate = relu(preactivation("W_hh", "W_hx", "b_h", reset * state, inputs))

        # Blend the old state and the candidate according to the update gate.
        state = (1. - update) * state + update * candidate

    return state

def gru_predict(params, jets):
    """Score `jets` with the GRU followed by the 3-layer classifier.

    Returns the flattened output of the final sigmoid layer.
    """
    h = gru_transform(params, jets)

    # Two relu layers ...
    for layer in (0, 1):
        h = relu(np.dot(params["W_clf"][layer], h.T).T + params["b_clf"][layer])

    # ... followed by the sigmoid output layer.
    out = sigmoid(np.dot(params["W_clf"][2], h.T).T + params["b_clf"][2])

    return out.ravel()

In [191]:
import copy

import autograd as ag
from recnn.recnn import log_loss
from recnn.recnn import adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Hold out 2000 events for validation; the split is seeded for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=2000,
                                                      random_state=1)

# Model selection: swap in the two commented lines to train the vanilla RNN.
# init = vanilla_rnn_init
# predict = vanilla_rnn_predict
init = gru_init
predict = gru_predict

# Optimisation hyper-parameters.
n_epochs = 50
batch_size = 64
step_size = 0.0005
decay = 0.95  # step_size is multiplied by this after every epoch

# 44 inputs per step (4 original features + 40 embedding values, matching the
# last axis of X) and 40 hidden units.
trained_params = init(40+4, 40, random_state=0)

# Force true division: under Python 2, `len(X_train) / batch_size` is integer
# division, which makes np.ceil a no-op and silently drops a final partial
# batch from the count.
n_batches = int(np.ceil(len(X_train) / float(batch_size)))

# One-element lists so that `callback` can update them in place.
best_score = [-np.inf]  # yuck, but works
best_params = [trained_params]

def loss(X, y, params):
    """Return the mean log-loss of ``predict(params, X)`` against labels `y`.

    `predict` is the module-level model function selected in this cell.
    """
    return log_loss(y, predict(params, X)).mean()

def objective(params, iteration):
    """Loss on one contiguous mini-batch of the training set.

    The batch position is drawn from a generator seeded with
    ``iteration % n_batches``, so a given iteration number always maps to the
    same window of `batch_size` consecutive training examples.
    """
    seed = iteration % n_batches
    first = check_random_state(seed).randint(len(X_train) - batch_size)
    last = first + batch_size

    return loss(X_train[first:last], y_train[first:last], params)

def callback(params, iteration, gradient):
    """Every 25 iterations, report progress and remember the best parameters.

    "Best" means highest ROC AUC on the validation set; a deep copy of the
    parameters is stored in ``best_params[0]`` and the score in
    ``best_score[0]``.  `gradient` is accepted but not used.
    """
    if iteration % 25 != 0:
        return

    roc_auc = roc_auc_score(y_valid, predict(params, X_valid))

    if roc_auc > best_score[0]:
        best_score[0] = roc_auc
        best_params[0] = copy.deepcopy(params)

    # The training loss is only estimated on the first 5000 training examples.
    train_loss = loss(X_train[:5000], y_train[:5000], params)
    valid_loss = loss(X_valid, y_valid, params)

    report = (iteration, train_loss, valid_loss, roc_auc, best_score[0])
    print(
        "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
        "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % report)

# One adam run of n_batches iterations per epoch, each started from the
# parameters of the previous epoch, with the step size decayed in between.
grad_objective = ag.grad(objective)

for epoch in range(n_epochs):
    print("epoch = %d" % epoch)
    print("step_size = %.4f" % step_size)

    trained_params = adam(grad_objective,
                          trained_params,
                          step_size=step_size,
                          num_iters=n_batches,
                          callback=callback)
    step_size *= decay

epoch = 0
step_size = 0.0005
    0	~loss(train)=1.1048	loss(valid)=1.1612	roc_auc(valid)=0.4704	best_roc_auc(valid)=0.4704
   25	~loss(train)=1.0405	loss(valid)=1.0919	roc_auc(valid)=0.6991	best_roc_auc(valid)=0.6991
   50	~loss(train)=0.6825	loss(valid)=0.6677	roc_auc(valid)=0.7483	best_roc_auc(valid)=0.7483
   75	~loss(train)=0.5881	loss(valid)=0.5851	roc_auc(valid)=0.8392	best_roc_auc(valid)=0.8392
  100	~loss(train)=0.5240	loss(valid)=0.5208	roc_auc(valid)=0.8640	best_roc_auc(valid)=0.8640
epoch = 1
step_size = 0.0005
    0	~loss(train)=0.4730	loss(valid)=0.4775	roc_auc(valid)=0.8706	best_roc_auc(valid)=0.8706
   25	~loss(train)=0.4600	loss(valid)=0.4608	roc_auc(valid)=0.8783	best_roc_auc(valid)=0.8783
   50	~loss(train)=0.4410	loss(valid)=0.4417	roc_auc(valid)=0.8862	best_roc_auc(valid)=0.8862
   75	~loss(train)=0.4326	loss(valid)=0.4356	roc_auc(valid)=0.8870	best_roc_auc(valid)=0.8870
  100	~loss(train)=0.4523	loss(valid)=0.4471	roc_auc(valid)=0.8871	best_roc_auc(valid)=0.8871
ep

KeyboardInterrupt: 

---

# Joint learning of embedding + rnn + classifier

In [192]:
from sklearn.preprocessing import RobustScaler
# Fit a scaler on the per-jet feature rows of every event.
# NOTE: `tf` here is the feature scaler, not the jet-content scaler returned
# by load_test.
stacked_features = np.vstack([event_features for event_features, _jets in events])
tf = RobustScaler().fit(stacked_features)

In [193]:
# Number of leading jets kept per event; full_transform reads this global.
n_jets_per_event = 5

# Each event becomes [scaled feature rows, jets], both cut to the same length.
X = [[tf.transform(features)[:n_jets_per_event], jets[:n_jets_per_event]]
     for features, jets in events]

In [212]:
import autograd.numpy as np

from recnn.recnn import glorot_uniform
from recnn.recnn import orthogonal
from recnn.recnn import relu
from recnn.recnn import sigmoid
from recnn.recnn import check_random_state

from recnn.recnn import grnn_init_simple
from recnn.recnn import grnn_transform_simple

def full_init(n_features_embedding, n_hidden_embedding,
              n_features_rnn, n_hidden_rnn, random_state=None):
    """Create the parameters of the embedding, the GRU and the classifier.

    The embedding parameters come from `grnn_init_simple`; the ``rnn_*`` GRU
    parameters and the ``W_clf`` / ``b_clf`` classifier parameters are then
    merged into the same dictionary.  If `grnn_init_simple` already returns
    ``W_clf`` / ``b_clf`` entries, they are overwritten here.

    Parameters
    ----------
    n_features_embedding, n_hidden_embedding : int
        Forwarded to `grnn_init_simple`.
    n_features_rnn : int
        Size passed to `glorot_uniform` for the GRU input weights.
    n_hidden_rnn : int
        Size of the GRU hidden state; also used for the classifier layers.
    random_state : optional
        Forwarded to `check_random_state`; the resulting generator is shared
        by the embedding and by every draw below.
    """
    rng = check_random_state(random_state)
    params = grnn_init_simple(n_features_embedding,
                              n_hidden_embedding,
                              random_state=rng)

    # Random draws are made in a fixed order so that a given seed always
    # yields the same values: initial state, candidate, z gate, r gate,
    # classifier.
    square = (n_hidden_rnn, n_hidden_rnn,)

    init_h = glorot_uniform(n_hidden_rnn, 0, rng)

    W_hh = orthogonal(square, rng)
    W_hx = glorot_uniform(n_hidden_rnn, n_features_rnn, rng)

    W_zh = orthogonal(square, rng)
    W_zx = glorot_uniform(n_hidden_rnn, n_features_rnn, rng)

    W_rh = orthogonal(square, rng)
    W_rx = glorot_uniform(n_hidden_rnn, n_features_rnn, rng)

    clf_hidden_1 = glorot_uniform(n_hidden_rnn, n_hidden_rnn, rng)
    clf_hidden_2 = glorot_uniform(n_hidden_rnn, n_hidden_rnn, rng)
    clf_output = glorot_uniform(n_hidden_rnn, 0, rng)

    params.update(
        {"rnn_init_h": init_h,
         "rnn_W_hh": W_hh,
         "rnn_W_hx": W_hx,
         "rnn_b_h": np.zeros(n_hidden_rnn),
         "rnn_W_zh": W_zh,
         "rnn_W_zx": W_zx,
         "rnn_b_z": np.zeros(n_hidden_rnn),
         "rnn_W_rh": W_rh,
         "rnn_W_rx": W_rx,
         "rnn_b_r": np.zeros(n_hidden_rnn),
         "W_clf": [clf_hidden_1, clf_hidden_2, clf_output],
         # The output bias starts at one, the hidden biases at zero.
         "b_clf": [np.zeros(n_hidden_rnn),
                   np.zeros(n_hidden_rnn),
                   np.ones(1)]
        })

    return params

def full_transform(params, X):
    """Encode every event with the GRU and return the final hidden states.

    `X` is a sequence of events, each indexed as ``event[0]`` (per-jet feature
    rows) and ``event[1]`` (the jets).  The stacked feature rows are reshaped
    with the module-level `n_jets_per_event`, so every event is expected to
    contribute exactly that many rows.

    NOTE: the jet embedding is currently disabled -- the GRU only sees the
    feature rows, and the collected `jets` are not used.
    """
    n_events = len(X)

    # Convert jets
    features = [event[0] for event in X]
    jets = []
    for event in X:
        jets.extend(event[1])

    # Append original features.  To re-enable the embedding, replace the line
    # below with:
    #     h_jets = np.hstack([np.vstack(features),
    #                         grnn_transform_simple(params, jets)])
    h_jets = np.vstack(features)
    h_jets = h_jets.reshape(n_events, n_jets_per_event, -1)

    def preactivation(w_state, w_input, bias, state_in, inputs):
        # W_state . state + W_input . x + b, for a whole batch of rows.
        return (np.dot(params[w_state], state_in.T).T
                + np.dot(params[w_input], inputs.T).T
                + params[bias])

    # RNN layer: every event starts from the same learned initial state.
    state = np.tile(params["rnn_init_h"], n_events).reshape(n_events, -1)

    for step in range(h_jets.shape[1]):
        inputs = h_jets[:, step, :]

        update = sigmoid(preactivation("rnn_W_zh", "rnn_W_zx", "rnn_b_z", state, inputs))
        reset = sigmoid(preactivation("rnn_W_rh", "rnn_W_rx", "rnn_b_r", state, inputs))
        # The candidate state only sees the part of the old state let through
        # by the reset gate.
        candidate = relu(preactivation("rnn_W_hh", "rnn_W_hx", "rnn_b_h", reset * state, inputs))

        # Blend the old state and the candidate according to the update gate.
        state = (1. - update) * state + update * candidate

    return state

def full_predict(params, X):
    """Score events with `full_transform` followed by the 3-layer classifier.

    Returns the flattened output of the final sigmoid layer.
    """
    h = full_transform(params, X)

    # Two relu layers ...
    for layer in (0, 1):
        h = relu(np.dot(params["W_clf"][layer], h.T).T + params["b_clf"][layer])

    # ... followed by the sigmoid output layer.
    out = sigmoid(np.dot(params["W_clf"][2], h.T).T + params["b_clf"][2])

    return out.ravel()

In [216]:
import copy

import autograd as ag
from recnn.recnn import log_loss
from recnn.recnn import adam, sgd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Hold out 2000 events for validation; the split is seeded for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=2000,
                                                      random_state=1)

init = full_init
predict = full_predict

# Optimisation hyper-parameters.
n_epochs = 50
batch_size = 64
step_size = 0.001
decay = 0.95  # step_size is multiplied by this after every pass of the loop

# Arguments: n_features_embedding, n_hidden_embedding, n_features_rnn,
# n_hidden_rnn.
trained_params = init(7, 10, 4, 5, random_state=123)

# Force true division: under Python 2, `len(X_train) / batch_size` is integer
# division, which makes np.ceil a no-op and silently drops a final partial
# batch from the count.
n_batches = int(np.ceil(len(X_train) / float(batch_size)))

# One-element lists so that `callback` can update them in place.
best_score = [-np.inf]  # yuck, but works
best_params = [trained_params]

def loss(X, y, params):
    """Return the mean log-loss of ``predict(params, X)`` against labels `y`.

    `predict` is the module-level model function selected in this cell.
    """
    return log_loss(y, predict(params, X)).mean()

def objective(params, iteration):
    """Loss on one contiguous mini-batch of the training set.

    The batch position is drawn from a generator seeded with
    ``iteration % n_batches``, so a given iteration number always maps to the
    same window of `batch_size` consecutive training examples.
    """
    seed = iteration % n_batches
    first = check_random_state(seed).randint(len(X_train) - batch_size)
    last = first + batch_size

    return loss(X_train[first:last], y_train[first:last], params)

def callback(params, iteration, gradient):
    """Every 25 iterations, report progress and remember the best parameters.

    "Best" means highest ROC AUC on the validation set; a deep copy of the
    parameters is stored in ``best_params[0]`` and the score in
    ``best_score[0]``.  `gradient` is accepted but not used.
    """
    if iteration % 25 != 0:
        return

    roc_auc = roc_auc_score(y_valid, predict(params, X_valid))

    if roc_auc > best_score[0]:
        best_score[0] = roc_auc
        best_params[0] = copy.deepcopy(params)

    # The training loss is only estimated on the first 5000 training examples.
    train_loss = loss(X_train[:5000], y_train[:5000], params)
    valid_loss = loss(X_valid, y_valid, params)

    report = (iteration, train_loss, valid_loss, roc_auc, best_score[0])
    print(
        "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
        "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % report)

# A single adam run covering all n_epochs * n_batches iterations.  Because the
# loop body executes only once, the step-size decay at the end never
# influences training here.
grad_objective = ag.grad(objective)

for epoch in range(1):
    print("epoch = %d" % epoch)
    print("step_size = %.4f" % step_size)

    trained_params = adam(grad_objective,
                          trained_params,
                          step_size=step_size,
                          num_iters=n_epochs * n_batches,
                          callback=callback)
    step_size *= decay

epoch = 0
step_size = 0.0010
    0	~loss(train)=1.1043	loss(valid)=1.1607	roc_auc(valid)=0.4646	best_roc_auc(valid)=0.4646
   25	~loss(train)=1.0704	loss(valid)=1.1240	roc_auc(valid)=0.5514	best_roc_auc(valid)=0.5514
   50	~loss(train)=0.9748	loss(valid)=1.0198	roc_auc(valid)=0.5412	best_roc_auc(valid)=0.5514
   75	~loss(train)=0.7101	loss(valid)=0.7192	roc_auc(valid)=0.5429	best_roc_auc(valid)=0.5514
  100	~loss(train)=0.6832	loss(valid)=0.6802	roc_auc(valid)=0.6237	best_roc_auc(valid)=0.6237
  125	~loss(train)=0.6617	loss(valid)=0.6640	roc_auc(valid)=0.6418	best_roc_auc(valid)=0.6418
  150	~loss(train)=0.6733	loss(valid)=0.6729	roc_auc(valid)=0.6202	best_roc_auc(valid)=0.6418
  175	~loss(train)=0.6516	loss(valid)=0.6592	roc_auc(valid)=0.6681	best_roc_auc(valid)=0.6681
  200	~loss(train)=0.6472	loss(valid)=0.6453	roc_auc(valid)=0.6716	best_roc_auc(valid)=0.6716
  225	~loss(train)=0.6582	loss(valid)=0.6560	roc_auc(valid)=0.6687	best_roc_auc(valid)=0.6716
  250	~loss(train)=0.6332	loss(

In [215]:
# Side-by-side look at the predictions and labels of the first 10 events.
head_scores = full_predict(trained_params, X[:10])
head_labels = y[:10]
head_scores, head_labels

(array([ 0.27163564,  0.20312512,  0.48590398,  0.74796822,  0.71391466,
         0.72180513,  0.20404719,  0.76180793,  0.71370997,  0.66287883]),
 array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1]))