In [1]:
import numpy as np
import matplotlib.pyplot as plt

import h5py

import uproot
import awkward as ak

# E906 Messy MC Data

Let $\beta_{0}$ and $\beta_{1}$ define two distributions, $p(x|\beta_{0})$ and $p(x|\beta_{1})$, respectively. The likelihood ratio is then defined as:
$$
\mathcal{L}(x| \beta_{0}, \beta_{1}) =  \frac{p(x|\beta_{0})}{p(x|\beta_{1})}
$$

A classifier function $f$, designed to distinguish samples drawn from $p(x|\beta_{0})$ from samples drawn from $p(x|\beta_{1})$, can be used to approximate the likelihood ratio:

$$
\mathcal{L}(x| \beta_{0}, \beta_{1}) =  \frac{f(x, \beta_{0}, \beta_{1})}{1 - f(x, \beta_{0}, \beta_{1})}
$$

Consider the differential cross-section that describes the Drell-Yan (DY) angular distribution:

$$
\frac{d\sigma}{d\Omega} \propto 1 + \lambda \cos^{2}\theta + \mu\sin2\theta\cos\phi + \frac{\nu}{2}\sin^{2}\theta\cos2\phi
$$

Our goal is to extract the DY angular coefficients $\lambda$, $\mu$, and $\nu$ using the likelihood ratio method. We aim to train a classifier (a neural network) capable of distinguishing two samples. We sample $\lambda$, $\mu$, and $\nu$ from uniform ranges: $\lambda$ in $[-1, 1]$, and $\mu$ and $\nu$ in $[-0.5, 0.5]$. From the $\phi$ vs. $\cos\theta$ histograms we extract the bin centers, which are used as inputs to the classifier, and the bin contents, which are used as weights in the loss function. The two classes are defined as follows:

```
H0: (lambda=0, mu=0, nu=0, phi, costheta) with beta = (lambda, mu, nu) --> label = 0
H1: (lambda, mu, nu, phi, costheta) with beta = (lambda, mu, nu) --> label = 1
```

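Note that the class with label 0 has mismatching $\beta$ values: its $(\phi, \cos\theta)$ distribution corresponds to $\lambda = \mu = \nu = 0$, while the $\beta$ attached to it is the sampled $(\lambda, \mu, \nu)$.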

In [2]:
# Open the four objects stored in net.root using uproot's "file:object" path
# syntax. The order of the names on the left matches the order of the tree
# names on the right, so each handle is bound to the `<split>_tree` object.
X0_train, X1_train, X0_test, X1_test = (
    uproot.open(f"net.root:{tree_name}")
    for tree_name in (
        "X0_train_tree",
        "X1_train_tree",
        "X0_test_tree",
        "X1_test_tree",
    )
)

In [3]:
# Copy every branch of the four ROOT trees into a single HDF5 file, laid out
# as "<split>/<branch>" (e.g. "X0_train/thetas").
#
# The mapping keys become the HDF5 group names; the values are the tree
# handles opened in the previous cell.
trees_by_split = {
    "X0_train": X0_train,
    "X1_train": X1_train,
    "X0_test": X0_test,
    "X1_test": X1_test,
}

# Branches exported from each tree, in the order they are written.
branch_names = ("thetas", "X_par", "X_det", "W_par", "W_det", "label")

# The context manager closes net.hdf5 even if reading a branch or converting
# it to NumPy raises, so a failed run does not leave the file open in the
# kernel. Mode "w" truncates any existing net.hdf5.
with h5py.File("net.hdf5", "w") as outputs:
    for split_name, tree in trees_by_split.items():
        for branch_name in branch_names:
            # NOTE(review): to_numpy() presumably requires rectangular
            # (non-jagged) branches -- confirm against the tree schema.
            outputs.create_dataset(
                f"{split_name}/{branch_name}",
                data=tree[branch_name].array().to_numpy(),
            )