<a href="https://colab.research.google.com/github/donalrinho/Bc2JpsiMuNu/blob/main/Bc2JpsiMuNu_RapidSim_LHCb_binned_fit_old_zfit_method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q uproot
!pip install -q tensorflow==2.6.2 #specific versions for compatability with zfit
!pip install -q hist
!pip install -q mplhep
!pip install -q zfit #git+https://github.com/zfit/zfit #development version of zfit needed to get binned fit tools
!pip install -q uncertainties

[K     |████████████████████████████████| 301 kB 7.7 MB/s 
[K     |████████████████████████████████| 458.3 MB 12 kB/s 
[K     |████████████████████████████████| 132 kB 36.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 42.9 MB/s 
[K     |████████████████████████████████| 5.6 MB 13.2 MB/s 
[K     |████████████████████████████████| 462 kB 71.9 MB/s 
[?25h  Building wheel for clang (setup.py) ... [?25l[?25hdone
  Building wheel for wrapt (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.4 MB 15.7 MB/s 
[K     |████████████████████████████████| 11.2 MB 12.1 MB/s 
[K     |████████████████████████████████| 5.8 MB 31.1 MB/s 
[K     |████████████████████████████████| 895 kB 58.2 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which i

In [1]:
import uproot
import numpy as np
import tensorflow as tf
import zfit
import hist
from hist import Hist
import mplhep
import pandas as pd
import pickle
import json
import random
from uncertainties import *



In [2]:
#Load our histogram templates from previous notebook
all_h_norm = {}
hist_path = "/content/drive/MyDrive/Bc2JpsiMuNu_Analysis/pickle"
for i in range(0,6):
  with open(f"{hist_path}/hist_{i}.pkl", "rb") as f:
    all_h_norm[i] = pickle.load(f)
    all_h_norm[i] = all_h_norm[i].values() #convert to numpy arrays

In [3]:
#Load our ROOT file containing the MC we want to fit
drive_dir = "/content/drive/MyDrive/Bc2JpsiMuNu_ROOT_files"
file_path = f"{drive_dir}/Bc2JpsiMuNu_RapidSim_LHCb_Vars_Weights"
print(f"Loading ROOT file {file_path}.root")
tree_name = "DecayTree"
events = uproot.open(f"{file_path}.root:{tree_name}")
events

#Make pandas DataFrame
df = events.arrays(library="pd")

#Downsample to DataFrame to 100k events, which will act as our fit dataset
df_fit = df.sample(n=100000, random_state=42)
n_data = len(df_fit)

Loading ROOT file /content/drive/MyDrive/Bc2JpsiMuNu_ROOT_files/Bc2JpsiMuNu_RapidSim_LHCb_Vars_Weights.root


  out[name] = series[name]


In [4]:
#Define fit variables dict containing useful properties like name, min, max, LaTeX title
vars = {}
vars["x_var"] = {"name": "costheta_Jpsi_reco", "min": -1., "max": 1., "bins": 10, "latex": "$\\cos(\\theta_{J/\\psi})$"}
vars["y_var"] = {"name": "costheta_W_reco", "min": -1., "max": 1., "bins": 10, "latex": "$\\cos(\\theta_{W})$"}
vars["z_var"] = {"name": "chi_reco", "min": -np.pi, "max": np.pi, "bins": 10, "latex": "$\\chi$ [rad]"}

In [5]:
#Get the binning schemes we used to make our templates (we saved them into a JSON file)
json_path = "/content/drive/MyDrive/Bc2JpsiMuNu_Analysis/json"
with open(f"{json_path}/binnings.json") as json_file:
  binnings = json.load(json_file)
binnings

{'x_var': [-1.0,
  -0.79216,
  -0.58822,
  -0.38891,
  -0.19395,
  0.00055617,
  0.19486,
  0.39035,
  0.5888,
  0.79242,
  1.0],
 'y_var': [-1.0,
  -0.46394,
  -0.12545,
  0.13095,
  0.33459,
  0.50304,
  0.64358,
  0.76183,
  0.861,
  0.94157,
  1.0],
 'z_var': [-3.141592653589793,
  -2.42962,
  -1.84089,
  -1.30021,
  -0.71122,
  0.0043844,
  0.71329,
  1.29936,
  1.83985,
  2.42724,
  3.141592653589793]}

In [6]:
#Make a hist of the dataset we will fit, filling it with our MC
data_h = (
    Hist.new
    .Variable(binnings["x_var"], name=vars["x_var"]["name"])
    .Variable(binnings["y_var"], name=vars["y_var"]["name"])
    .Variable(binnings["z_var"], name=vars["z_var"]["name"])
    .Weight()
    )
data_h.fill(df_fit[vars["x_var"]["name"]], 
            df_fit[vars["y_var"]["name"]], 
            df_fit[vars["z_var"]["name"]])
data_h = data_h.values() #convert to numpy array

In [7]:
#Binned maximum likelihood function which we will minimise (in order to maximise the likelihood)
def binned_nll(pdf, data):
      return np.sum(pdf - data + data * np.log((data + 1e-14) / (pdf + 1e-14)))
      # 1e-14 added in case there are empty bins

In [8]:
#Define our fit PDF as a sum of histogram templates, the binned analogue of what we did in our unbinned fit to the true angles
def make_pdf(params, templates):
  
  #Fit parameters, the angular coefficients
  H0_amp = params['H0_amp']
  Hm_amp = params['Hm_amp']

  Hp_phi = params['Hp_phi']
  Hm_phi = params['Hm_phi']

  #Fixed and derived parameters
  H0_phi = 0.
  Hp_amp = float(np.sqrt(1. - H0_amp**2 - Hm_amp**2))

  h_0 = tf.complex(H0_amp*np.cos(H0_phi),H0_amp*np.sin(H0_phi))
  h_p = tf.complex(Hp_amp*np.cos(Hp_phi),Hp_amp*np.sin(Hp_phi))
  h_m = tf.complex(Hm_amp*np.cos(Hm_phi),Hm_amp*np.sin(Hm_phi))
  
  h_0st = tf.math.conj(h_0)
  h_pst = tf.math.conj(h_p)
  h_mst = tf.math.conj(h_m)
  
  HpHmst = h_p*h_mst
  HpH0st = h_p*h_0st
  HmH0st = h_m*h_0st

  pdf = H0_amp**2 * 2 * templates[0]
  pdf += Hp_amp**2 * 0.5 * templates[1]
  pdf += Hm_amp**2 * 0.5 * templates[2]
  pdf += float(tf.math.real(HpH0st)) * templates[3]
  pdf += -float(tf.math.real(HmH0st)) * templates[4]
  pdf += float(tf.math.real(HpHmst)) * templates[5]

  #Normalise the PDF
  pdf = pdf / np.sum(pdf)
  
  return pdf

In [9]:
#Define loss function
def loss(x):
  # by default, x is an OrderedSet of zfit parameters
  # The order of x matches the order of the params dict
  x = np.array(x)

  #print("Value of the parameters", x)

  #Put the fit parameters from x into a dict of variables, which is passed to the PDF maker function, make_pdf()
  pars_dict = {}

  pars_dict["H0_amp"] = x[0]
  pars_dict["Hm_amp"] = x[1]
  pars_dict["Hp_phi"] = x[2]
  pars_dict["Hm_phi"] = x[3]
 
  pdf = make_pdf(pars_dict, all_h_norm) #Pass our dict of params and our dict of histogram templates

  #Scale PDF to match the dataset size
  pdf = pdf * n_data

  #Binned log-likelihood, passing the total PDF and the dataset
  nll = binned_nll(pdf, data_h)

  return nll

In [10]:
#Initial parameter values
init_vals = {"H0_amp": 0.7,
             "Hm_amp": 0.6,
             "Hp_phi": 1.5,
             "Hm_phi": -1.5
}

In [11]:
#Fit parameters dictionary
rand = random.randint(0,100000)
params = {"value": [init_vals["H0_amp"], init_vals["Hm_amp"], init_vals["Hp_phi"], init_vals["Hm_phi"]],
          "lower": [0., 0., -2*np.pi, -2*np.pi],
          "upper": [1., 1., 2*np.pi, 2*np.pi],
          "name": [f"H0_amp_{rand}", f"Hm_amp_{rand}", f"Hp_phi_{rand}", f"Hm_phi_{rand}"]
}

In [12]:
#Test our building of the PDF and that it returns a sensible nll and chi2/dof value
pdf_test = make_pdf(init_vals, all_h_norm)
#Scale to match the yield of the dataset we are fitting
pdf_test = pdf_test * n_data

#Compute NLL of data and test PDF (initialised with values from the unbinned fit)
nll = binned_nll(pdf_test, data_h)
print(f"NLL = {nll}")

#Chi2 sum over all bins
chi2 = np.sum((pdf_test - data_h)**2 / data_h)
#Reduced chi2
chi2_dof = chi2 / data_h.size
print(f"chi2/dof = {chi2_dof}")

NLL = 637.2926914668059
chi2/dof = 1.3073173925894792


In [13]:
#Run fit
loss.errordef = 0.5 # 0.5 for a log-likelihood, 1 for chi2

minimiser = zfit.minimize.Minuit(verbosity=5)
#Since we're using numpy histograms, we need to disable the graph mode of zfit
zfit.run.set_autograd_mode(False)
zfit.run.set_graph_mode(False)

#Pass the dict of parameters with inital values, as defined in defs.py
result = minimiser.minimize(loss, params)
param_errors = result.hesse(method="minuit_hesse")
corr = result.correlation(method="minuit_hesse")
cov = result.covariance(method="minuit_hesse")

print(result.info['original'])

result_params = result.params

for p in result_params:
  print(f"{p.name}: {result_params[p.name]['value']} +/- {result_params[p.name]['minuit_hesse']['error']}")

┌─────────────────────────────────────────────────────────────────────────┐
│                                Migrad                                   │
├──────────────────────────────────┬──────────────────────────────────────┤
│ FCN = 423.9                      │              Nfcn = 100              │
│ EDM = 2.98e-05 (Goal: 0.001)     │                                      │
├──────────────────────────────────┼──────────────────────────────────────┤
│          Valid Minimum           │        No Parameters at limit        │
├──────────────────────────────────┼──────────────────────────────────────┤
│ Below EDM threshold (goal x 10)  │           Below call limit           │
├───────────────┬──────────────────┼───────────┬─────────────┬────────────┤
│  Covariance   │     Hesse ok     │ Accurate  │  Pos. def.  │ Not forced │
└───────────────┴──────────────────┴───────────┴─────────────┴────────────┘
H0_amp_69578: 0.6842905569840235 +/- 0.003584354066448079
Hm_amp_69578: 0.64260355571738

In [14]:
#Put results into a dictionary
results_dict = {}
for p in result_params:
    par_name = p.name
    #Remove the random piece of the name which we added to allow zfit to run many times
    par_name = par_name.replace("_"+str(rand),"")
    results_dict[par_name] = [result_params[p]['value'], param_errors[p]["error"]]
results_dict

{'H0_amp': [0.6842905569840235, 0.003584354066448079],
 'Hm_amp': [0.6426035557173815, 0.0028407413743337675],
 'Hm_phi': [-1.518089147445701, 0.03185314372137788],
 'Hp_phi': [1.5384539453990644, 0.05302641082634226]}

In [15]:
#Calculate the parameter H+ based on the values of H0 and H-, using the Python uncertainties package to propagate uncertainties for us
#Here, results_dict["H0_amp"][0] gets us the value of H0_amp from the fit
#results_dict["H0_amp"][1] gets us the error
#ufloat is an uncertainties object, which has a central value (the first value) and an uncertainty (the second value)
v_H0_amp = ufloat(results_dict["H0_amp"][0], results_dict["H0_amp"][1])
v_Hm_amp = ufloat(results_dict["Hm_amp"][0], results_dict["Hm_amp"][1])

v_H0_amp, v_Hm_amp

(0.6842905569840235+/-0.003584354066448079,
 0.6426035557173815+/-0.0028407413743337675)

In [16]:
#Calculate a new ufloat for Hp_amp, using the formula Hp_amp = sqrt(1 - H0_amp^2 - Hm_amp^2) [since all three squares of the H_amp sum to 1]
v_Hp_amp = (1. - v_H0_amp**2 - v_Hm_amp**2)**(1./2.)
v_Hp_amp

0.3446840637480547+/-0.008870426863269882

In [17]:
#Add the value of Hp_amp into our dictionary (n gives us its nominal value, and s its standard deviation)
results_dict["Hp_amp"] = [v_Hp_amp.n, v_Hp_amp.s]
results_dict

{'H0_amp': [0.6842905569840235, 0.003584354066448079],
 'Hm_amp': [0.6426035557173815, 0.0028407413743337675],
 'Hm_phi': [-1.518089147445701, 0.03185314372137788],
 'Hp_amp': [0.3446840637480547, 0.008870426863269882],
 'Hp_phi': [1.5384539453990644, 0.05302641082634226]}

In [18]:
#Write fit results dictionary to a JSON file, which we can use later in other analyses
ana_dir = "/content/drive/MyDrive/Bc2JpsiMuNu_Analysis"
file_path = f"{ana_dir}/json/Bc2JspiMuNu_RapidSim_binned_fit_results.json"
with open(file_path, 'w') as f:
  json.dump(results_dict, f, sort_keys=True, indent=4)