In [None]:
%load_ext autoreload
%autoreload 2

import os
import uproot
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cbook as cbook
from matplotlib.legend_handler import HandlerLine2D, HandlerTuple
import numpy as np
import pandas as pd
from decimal import Decimal
from scipy.stats import norm
from scipy.optimize import curve_fit
from scipy import stats
import datetime as dt
import scipy.optimize
import landau
from scipy.interpolate import CubicSpline, interp2d
from scipy.integrate import quad
from scipy.special import erf
from numpy import linalg

import importlib
from multiprocessing import Pool

from tqdm.auto import tqdm

# local imports
from lib.constants import *

In [None]:
# --- Run configuration ------------------------------------------------------
# Master switch: True -> simulation sample, False -> detector data.
# It is defined first because the input directory below depends on it.
isMC = True

dosave = True
plt.rcParams.update({'font.size': 14})
# NOTE(review): plotqual is hard-coded to "mc_" regardless of isMC -- confirm
# before producing data plots.
plotqual = "mc_"
savedir = "./plots_11_14_23/%s" % plotqual

plottitle = "Run %i"
tpcnames = ["EE", "EW", "WE", "WW"]

datadir = "/icarus/data/users/gputnam/DMCP2023G/calib-data/"

# Input directories. Previously both lists were assigned one after the other,
# so the first one was always overwritten and isMC = False would still have
# listed the MC directory. The choice is now tied to isMC.
if isMC:
    filedirs = [
        "/pnfs/sbn/persistent/users/gputnam/DMCP2023G/calib-mc/",
    ]
else:
    filedirs = [
        "/pnfs/sbn/persistent/users/gputnam/calib-data/Run1/",
    ]

In [None]:
# Text files with the per-run electron lifetimes and the per-(TPC, y, z) scale
# factors; both live directly under datadir.
lifetime_file = f"{datadir}lifetimes_runArun1run2.txt"
scaleyz_file = f"{datadir}P2_scaleYZ_Run1.txt"

In [None]:
from XRootD import client

def open_hdf(fname):
    """Load the ``"df"`` table of an HDF5 file into a DataFrame.

    Paths that do not start with ``/pnfs`` are read directly with
    ``pd.read_hdf``. Paths that do are rewritten to an XRootD URL, read
    completely into memory, and opened as an in-memory HDF5 image (nothing is
    written to disk; ``"data.h5"`` is only a label for the core driver).

    Parameters
    ----------
    fname : str
        File path; a leading ``/pnfs`` selects the XRootD code path.

    Returns
    -------
    The object stored under the key ``"df"``.

    Raises
    ------
    IOError
        If XRootD reports a failed open or read.
    """
    if not fname.startswith("/pnfs"):
        return pd.read_hdf(fname, key="df")

    # Only the leading mount point is rewritten.
    url = fname.replace("/pnfs", "root://fndca1.fnal.gov:1094/pnfs/fnal.gov/usr", 1)
    with client.File() as f:
        # XRootD calls return (status, response); check the status instead of
        # letting a failed open surface later as an opaque HDF5 error.
        status, _ = f.open(url)
        if not status.ok:
            raise IOError("Could not open %s: %s" % (url, status.message))

        status, image = f.read()
        if not status.ok:
            raise IOError("Could not read %s: %s" % (url, status.message))

        with pd.HDFStore(
                "data.h5",
                mode="r",
                driver="H5FD_CORE",
                driver_core_backing_store=0,
                driver_core_image=image
                ) as store:
            return store["df"]

In [None]:
if not isMC:
    # Collect the calibration dataframes from every input directory, skipping
    # the "anode", "old" and "sce" variants.
    # The original comprehension ended with a second `for filedir in filedirs`,
    # which paired every file name with every directory (wrong paths and
    # duplicates as soon as more than one directory is configured).
    # os.listdir order is arbitrary, so sort to keep the file order (and hence
    # the track numbering derived from it) reproducible.
    files = [
        filedir + f
        for filedir in filedirs
        for f in sorted(os.listdir(filedir))
        if f.endswith(".df") and f.startswith("calib")
        and "anode" not in f and "old" not in f and "sce" not in f
    ]

else:
    files = [filedirs[0] + "calib_mcnuphase2.df"]

files

In [None]:
dfs = [pd.read_hdf(f) for f in files]
# dfs = [open_hdf(f) for f in files]

In [None]:
offset = 0
for i,(fname, df) in enumerate(zip(files, dfs)):
    df["itrk"] = df.index.get_level_values(0) + offset
    if not df.empty:
        offset += df.index.get_level_values(0).max() + 1
    if "anode" in fname:
        df["anode"] = True
    else:
        df["anode"] = False
    dfs[i] = df.join((df.true_rr.groupby(level=0).min() < 10).rename("true_stopping"))
    # dfs[i] = dfs[i][(df.pitch < 0.4)]
    
#     if isMC:
#         dfs[i] = dfs[i][dfs[i].whicht0 == 0]

In [None]:
if not isMC:
    todelete = [c for c in dfs[0].columns if "true_" in c or "trueh_" in c]
    todelete += ["michelE", "closest_tdaughter"]
    for df in dfs:
        for c in todelete:
            if c in df:
                del df[c]

In [None]:
data = pd.concat(dfs, ignore_index=True)

In [None]:
del dfs

In [None]:
data.columns

In [None]:
data[:10000]

In [None]:
data["thit"] = (data.time * tick_period - data.pandora_t0 - tanode*tick_period) / 1000.

In [None]:
is_stopping = True # x_endp_fid_perhit #

In [None]:
data["tpcEE"] = data.tpcE & (data.cryostat == 0)
data["tpcEW"] = ~data.tpcE & (data.cryostat == 0)
data["tpcWE"] = data.tpcE & (data.cryostat == 1)
data["tpcWW"] = ~data.tpcE & (data.cryostat == 1)

In [None]:
data["itpc"] = -1
data.loc[data.tpcEE, "itpc"] = 0
data.loc[data.tpcEW, "itpc"] = 1
data.loc[data.tpcWE, "itpc"] = 2
data.loc[data.tpcWW, "itpc"] = 3

In [None]:
# Electron lifetime per hit. NaN marks hits whose run is not in the table.
data["lifetime"] = np.nan

if not isMC:
    # File layout as read here: one header line, then whitespace-separated
    # rows "run EE EW WE WW".
    with open(lifetime_file) as f:
        next(f)  # skip header
        for line in f:
            # split() without an argument tolerates tabs, repeated spaces and
            # a trailing blank line, all of which made float("") raise before.
            fields = line.split()
            if not fields:
                continue
            dat = list(map(float, fields))
            run = int(dat[0])
            in_run = data.run == run  # computed once per line, reused per TPC
            for i, tpc in enumerate(tpcnames):
                data.loc[in_run & data["tpc" + tpc], "lifetime"] = dat[i + 1]
else:
    # Fixed lifetime for simulation.
    data["lifetime"] = 3e3

# data.lifetime = 3e3

In [None]:
yz_ybin = np.linspace(-180, 130, 32)
yz_ylos = yz_ybin[:-1]
yz_yhis = yz_ybin[1:]
yz_ys = (yz_ylos + yz_yhis) / 2.

yz_zbin = np.linspace(-900, 900, 181)
yz_zlos = yz_zbin[:-1]
yz_zhis = yz_zbin[1:]
yz_zs = (yz_zlos + yz_zhis) / 2.

if isMC:
    yz_ybin = np.linspace(-180, 120, 11)
    yz_ylos = yz_ybin[:-1]
    yz_yhis = yz_ybin[1:]
    yz_ys = (yz_ylos + yz_yhis) / 2.

    yz_zbin = np.linspace(-900, 900, 61)
    yz_zlos = yz_zbin[:-1]
    yz_zhis = yz_zbin[1:]
    yz_zs = (yz_zlos + yz_zhis) / 2.

In [None]:
data["ybin"] = np.searchsorted(yz_ybin, data.p_y.values) - 1

In [None]:
data["zbin"] = np.searchsorted(yz_zbin, data.p_z.values) - 1

In [None]:
# Per-hit Y-Z scale factor. Starts as NaN; for data it is looked up from
# scaleyz_file by (itpc, ybin, zbin), for MC it is set to 1.
data["scale_yz"] = np.nan

if not isMC:
    idx = []
    scales = []

    # File layout as read here: one header line, then tab-separated rows
    # "<tpc name> <ybin> <zbin> <scale>".
    with open(scaleyz_file) as f:
        next(f)
        for line in f:
            dat = line.rstrip("\n").split("\t")
            tpc = dat[0]
            iy = int(dat[1])
            iz = int(dat[2])
            scale = float(dat[3])

            # The TPC name is converted to its position in tpcnames so that it
            # can be matched against data.itpc.
            idx.append((tpcnames.index(tpc), iy, iz))
            scales.append(scale)

    scaledf = pd.DataFrame(scales, index=pd.MultiIndex.from_tuples(idx, names=["itpc", "ybin", "zbin"]), 
                           columns=["scale_yz"])
    # Left merge: hits whose (itpc, ybin, zbin) is not in the table get NaN.
    dtmp = pd.merge(data[["itpc", "ybin", "zbin"]], scaledf, on=["itpc", "ybin", "zbin"], how="left")
    
    # NOTE(review): this assignment aligns on the index, so it assumes the merge
    # result carries the same 0..N-1 index as data (one output row per hit, i.e.
    # no duplicated keys in the scale table) -- confirm.
    data["scale_yz"] = dtmp.scale_yz
    
    del dtmp

else:
    data.scale_yz = 1

In [None]:
data.scale_yz

In [None]:
data.dqdx_nocorr

In [None]:
# Corrected dQ/dx variants:
#   dqdx_normt  -- drift-time (lifetime) correction only
#   dqdx_normyz -- Y-Z scale correction only
#   dqdx_normed -- both corrections
data["dqdx_normt"] = np.exp(data.thit / data.lifetime) * data.dqdx_nocorr
data["dqdx_normyz"] = data.dqdx_nocorr / data.scale_yz
data["dqdx_normed"] = data["dqdx_normt"] / data.scale_yz

In [None]:
def fidYZ(data, iny=20, inz=100, is_mc=None):
    """Return a boolean mask of hits inside the Y-Z fiducial region.

    Parameters
    ----------
    data : DataFrame
        Needs columns ``p_y`` and ``p_z``; for detector data also ``tpcWW``.
    iny, inz : float
        Inset from the y and z limits of the region.
    is_mc : bool, optional
        Whether ``data`` is simulation. Defaults to the notebook-level ``isMC``
        flag, so existing calls behave as before.

    Returns
    -------
    Boolean Series aligned with ``data``.
    """
    if is_mc is None:
        is_mc = isMC

    ymax = 134
    ymin = -180

    zmin = -900
    zmax = 900

    fid = (data.p_y > ymin + iny) & (data.p_y < ymax - iny)\
        & (data.p_z < zmax - inz) & (data.p_z > zmin + inz)

    if not is_mc:
        # Cut out some problem regions in the detector
        fid = fid & (np.abs(data.p_z) > 10)

        # TPC EW -- not that bad, so nothing is removed there.
        # (The old placeholder `bad_tpcEW = False` was negated with `~`, which
        # on a Python bool yields -1 and only worked through int->bool coercion.)
#         bad_tpcEW = data.tpcEW & (data.p_z > 700) & (data.p_y < 0)

        # TPC WW
        bad_tpcWW = data.tpcWW & (data.p_y > 80) & (data.p_z > 0)

        fid = fid & ~bad_tpcWW

    return fid

In [None]:
data["fid"] = fidYZ(data)

In [None]:
data.fid.sum()

In [None]:
# (data.fid & is_stopping & data.true_stopping).sum() / (data.fid & is_stopping).sum()

In [None]:
# bins = np.linspace(0, 2500, 26)

# _ = plt.hist(cm2_dqdx, bins=bins)

In [None]:
rpt = data.groupby("itrk").itrk.count()

In [None]:
# cm2_dqdx_perhit = np.repeat(cm2_dqdx.values, rpt) 

In [None]:
# data["cm2_dqdx"] = cm2_dqdx_perhit

In [None]:
endp_ind = data.groupby("itrk").rr.idxmin()

In [None]:
x_endp = data.p_x.loc[endp_ind]

In [None]:
x_endp_fid = (np.abs(x_endp) > 61.94 + 15) & (np.abs(x_endp) < 358.49 - 15)

In [None]:
x_endp_fid_perhit = np.repeat(x_endp_fid.values, rpt) 

In [None]:
data["x_endp_fid"] = x_endp_fid_perhit

In [None]:
_ = plt.hist(data.p_y)

In [None]:
bins = np.linspace(0, 1000, 21)
_ = plt.hist(data.thit[(data.rr > 80) & data.fid], bins=bins)
plt.xlabel("Hit Drift Time [$\\mu$s]")
plt.ylabel("Entries (R.R. > 80cm)")
plt.tight_layout()

# if dosave: plt.savefig(savedir + "hit_time.pdf")

In [None]:
bins = np.linspace(0, 2, 21)
_ = plt.hist(data.pitch[(data.rr > 80) & data.fid], bins=bins)
plt.xlabel("Hit Pitch [cm]")
plt.ylabel("Entries (R.R. > 80cm)")
plt.tight_layout()

# if dosave: plt.savefig(savedir + "hit_pitch.pdf")

In [None]:
bins = np.linspace(50, 300, 51)
_ = plt.hist(data.rr[data.fid], bins=bins)

plt.xlabel("Hit Residual Range [cm]")
plt.ylabel("Fiducial Entries")
plt.tight_layout()

# if dosave: plt.savefig(savedir + "hit_rr.pdf")

In [None]:
import gc
gc.collect()

In [None]:
categories = [data.tpcEE, data.tpcEW, data.tpcWE, data.tpcWW]
names = ["TPC EE", "TPC EW", "TPC WE", "TPC WW"]

if isMC:
    data["MC"] = True
    categories = [data.MC]
    names = ["MC"]

DQDX_NAME = "dqdx_normed"

In [None]:
bins = np.linspace(0, 4000, 41)
when = (data.pitch > 0.3) & (data.pitch < 0.4) & categories[-1] & is_stopping #& (data.cm2_dqdx > 1200)
varx = data.rr
vary = data.dqdx_normed

_ = plt.hist2d(varx[when], vary[when], bins=[np.linspace(0, 50, 41), bins])
# plt.ylim([500, 1500])

In [None]:
# dQdx binning
bins = np.linspace(200, 4000, 38*4+1)
bin_centers = (bins[1:] + bins[:-1]) / 2.

In [None]:
when = (data.pitch > 0.3) & (data.pitch < 0.4) & (data.rr > 200) & data.fid & data.tpcWW

_ = plt.hist(data.dqdx_nocorr[when], bins=bins, histtype="step", label="No Corr")
# _ = plt.hist(data.dqdx_normyz[when], bins=bins, histtype="step")

_ = plt.hist(data.dqdx_normt[when], bins=bins, histtype="step", label="T Corr")
_ = plt.hist(data.dqdx_normed[when], bins=bins, histtype="step", label="T+Y-Z Corr")

plt.legend()
plt.xlim([200, 1500])

plt.xlabel("dQ/dx [ADDC]")
plt.ylabel("Hits")

plt.title(names[-1].replace("_", " "))

plt.text(0.525, 0.35, "200 < R.R. < 300cm\n0.3 < pitch < 0.4cm", transform=plt.gca().transAxes)

plt.tight_layout()
# if dosave:
#     plt.savefig(savedir + "dqdx_resoluiton.pdf")

In [None]:
DRIFTS = [500, 600, 700, 800, 900]
DRIFTLOS = DRIFTS[:-1]
DRIFTHIS = DRIFTS[1:]

In [None]:
PITCHES = [0.3, 0.4] #, 0.6] #, 0.8]

PITCHLOS = PITCHES[:-1]
PITCHHIS = PITCHES[1:]

In [None]:
def landau_gaus(X, *p):
    """Evaluate ``landau.landau.gauss_landau`` at ``X``.

    ``p`` must unpack to ``(mpv, eta, sigma, A)``. Before the call ``sigma``
    is capped at ``100*eta``.
    """
    mpv, eta, sigma, A = p
    capped_sigma = np.minimum(sigma, 100*eta)
    return landau.landau.gauss_landau(X, mpv, eta, capped_sigma, A)

def langau_chi2(x, y, yerr, popt):
    """Sum of squared pulls of ``y`` against ``landau_gaus(x, *popt)``."""
    pulls = (landau_gaus(x, *popt) - y) / yerr
    return np.sum(pulls**2)

def opt_to_mpv(popt, perr):
    """Convert fitted (location, eta) into an MPV and its uncertainty.

    ``popt[0]`` and ``popt[1]`` are the first two fit parameters and ``perr``
    is their covariance matrix (2-D array). The MPV is
    ``popt[0] - 0.22278*popt[1]``; its error is propagated from the two
    variances and their covariance terms.

    Returns ``(mpv, mpv_err)``.
    """
    k = 0.22278
    var_loc = np.diag(perr)[0]
    var_eta = np.diag(perr)[1]

    mpv = popt[0] - k*popt[1]
    mpv_err = np.sqrt(var_loc + k**2*var_eta - k*(perr[0,1] + perr[1,0]))
    return mpv, mpv_err


In [None]:
data.columns

In [None]:
if not isMC:
    del data["integral"]
    del data["wire"]
    #del data["t0_tpcE"]
    #del data["t0_tpcW"]
    
    del data["width"]
    del data["sumadc"]
    # del data["dqdx_nocorr"]
    del data["mint_tpcE"]
    del data["maxt_tpcE"]
    del data["mint_tpcW"]
    del data["maxt_tpcW"]
    
    del data["dir_y"]
    del data["dir_z"]
    del data["ybin"]
    del data["zbin"]
    del data["run"]
    del data["evt"]
    del data["subrun"]

In [None]:
data.columns

In [None]:
import gc
gc.collect()

In [None]:
data["phi"] = np.arccos(np.abs(data.dir_x))*180/np.pi

In [None]:
data["thxw"] = np.abs(np.arctan(data.dir_x*data.pitch/0.3)*180/np.pi)

In [None]:
_ = plt.hist(data.phi)

In [None]:
# Empty results table: one row per fitted bin. Fixed fit columns first, then
# one boolean column per category, then the stored histogram (bin centres
# "bin<i>" and contents "N<i>").
mpv_df = pd.DataFrame(
    columns=[
        "rr", "rrlo", "rrhi",
        "pitch", "pitchlo", "pitchhi",
        "tdrift", "tlo", "thi",
        "mpv", "mpv_err",
        "loc", "loc_err",
        "eta", "eta_err",
        "sigma", "sigma_err",
        "phi", "philo", "phihi",
        "A", "A_err",
        "N",
        "chi2",
        *(n.replace(" ", "_") for n in names),
        *("bin%i" % i for i in range(len(bin_centers))),
        *("N%i" % i for i in range(len(bin_centers))),
    ]
)

In [None]:
def inner(inp):
    """Fit the dQ/dx distribution of one selection bin with ``landau_gaus``.

    Reads the notebook-level ``data``, ``bins``, ``bin_centers``,
    ``DQDX_NAME``, ``is_stopping`` and ``names``.

    Parameters
    ----------
    inp : tuple
        ``(c, name, idrift, dlo, dhi, ip, pitchlo, pitchhi, irr, rrlo, rrhi)``
        where ``c`` is the boolean category mask, ``name`` its label, and the
        lo/hi pairs are the drift-time, pitch and residual-range bin limits.
        ``idrift``, ``ip`` and ``irr`` are accepted for call compatibility but
        not used.

    Returns
    -------
    dict or None
        ``None`` when the fullest histogram bin has fewer than 50 entries.
        Otherwise a dict of single-element lists (ready for ``pd.DataFrame``)
        with the bin definition, fit results and the histogram itself.
    """
    (c, name, idrift, dlo, dhi, ip, pitchlo, pitchhi, irr, rrlo, rrhi) = inp

    when = (data.pitch < pitchhi) & (data.pitch > pitchlo) &\
        (data.thit < dhi) & (data.thit > dlo) &\
        (data.rr > rrlo) & (data.rr < rrhi) & c &\
        (data.phi > 80) & (data.phi < 85) &\
        (data.thxw > 5) & (data.thxw < 20) &\
        is_stopping & data.fid

    # Guesstimate of the MPV dQ/dx range
    if rrlo < 1.:
        mpv_lo, mpv0, mpv_hi = 1000., 2500., 4000.
    elif rrlo < 5.:
        mpv_lo, mpv0, mpv_hi = 800., 1500., 2000.
    elif rrlo < 10.:
        mpv_lo, mpv0, mpv_hi = 500., 1000., 1500.
    else:
        mpv_lo, mpv0, mpv_hi = 500., 640., 1000.

    N, _ = np.histogram(data[DQDX_NAME][when], bins=bins)

    # Too few entries at the peak to fit.
    peak = np.max(N)
    if peak < 50:
        return None

    # only fit near peak: bins less than 20 bins away from the fullest one
    maxbin = np.argmax(N)
    when_fit = np.abs(np.arange(len(bin_centers)) - maxbin) < 20

    # Poisson errors, floored at 1 so that empty bins do not get zero weight.
    Nerr = np.maximum(np.sqrt(N), 1)

    p0 = [mpv0, 35, 100, peak / landau_gaus(mpv0, mpv0, 35, 100, 1)]
    bounds = ([mpv_lo, 5, 25, peak*40/100.], [mpv_hi, 150., 350., peak*40*100.])

    # curve_fit returns the best-fit values and their covariance matrix.
    popt, pcov = curve_fit(landau_gaus, bin_centers[when_fit], N[when_fit],
                           p0=p0, maxfev=1_000_000, sigma=Nerr[when_fit], bounds=bounds)
    perrs = np.sqrt(np.diag(pcov))

    m, m_err = opt_to_mpv(popt, pcov)

    # NOTE(review): chi2 is summed over ALL bins (not just the fitted window)
    # and divided by (non-empty bins - 2) although the fit has 4 parameters.
    # Kept as-is so stored values stay comparable -- confirm the intent.
    chi2 = langau_chi2(bin_centers, N, Nerr, popt) / (np.sum(N>0) -2)

    phi_sel = data.phi[when]
    phi = np.mean(phi_sel)
    phi_std = np.std(phi_sel)

    row = {
        # Bin centre from the limits actually used. This equals RRs[irr] for
        # the regular scan, but no longer depends on the later-defined global
        # RRs and stays correct for ad-hoc calls with a placeholder irr.
        "rr": (rrlo + rrhi)/2.,
        "rrlo": rrlo,
        "rrhi": rrhi,
        "pitch": (pitchlo + pitchhi)/2.,
        "pitchlo": pitchlo,
        "pitchhi": pitchhi,
        "tdrift": (dlo + dhi) / 2.,
        "tlo": dlo,
        "thi": dhi,
        "mpv": m,
        "mpv_err": m_err,
        "loc": popt[0],
        "loc_err": perrs[0],
        "eta": popt[1],
        "eta_err": perrs[1],
        "sigma": popt[2],
        "sigma_err": perrs[2],
        "phi": phi,
        "philo": phi - phi_std,
        "phihi": phi + phi_std,
        "A": popt[3],
        "A_err": perrs[3],
        "N": when.sum(),
        "chi2": chi2,
    }
    # One boolean flag per category; only the fitted one is True.
    for n in names:
        row[n.replace(" ", "_")] = (n == name)
    # Store the histogram so the fit can be re-plotted later.
    for i, (b, n) in enumerate(zip(bin_centers, N)):
        row["bin%i" % i] = b
        row["N%i" % i] = n

    # Wrap every value in a list so pd.DataFrame(row) builds a one-row frame.
    return {k: [v] for k, v in row.items()}

In [None]:
# Residual Range binning: edges at 0..6 (step 1), 8..18 (step 2),
# 20..155 (step 5) and 160..300 (step 10).
#rrs = np.linspace(50., 200., 31)
#rrs = np.hstack([np.linspace(0, 6, 7), np.linspace(8, 18, 6), np.linspace(20., 200., 37)])
rrs = np.concatenate([
    np.linspace(0, 6, 7),
    np.linspace(8, 18, 6),
    np.linspace(20., 155., 135//5+1),
    np.linspace(160, 300, 15),
])

# Lower edges, upper edges and centres of the bins.
rrlos, rrhis = rrs[:-1], rrs[1:]
RRs = (rrhis + rrlos)/2.

In [None]:
name = names[-1]
category = categories[-1]

# Spot-check a single fit: last category, 800-900 drift bin, 0.3-0.4 pitch,
# 80 < R.R. < 85.
# The residual-range index used to be passed as -1, which labels the row with
# the centre of the LAST rr bin instead of the bin being fitted; look up the
# index that matches the limits instead.
rrlo_check, rrhi_check = 80, 85
irr_check = int(np.argmin(np.abs(rrlos - rrlo_check)))

row = inner((category, name, 1, 800, 900, 0, 0.3, 0.4, irr_check, rrlo_check, rrhi_check))

row_df = pd.concat([mpv_df, pd.DataFrame(row)])

In [None]:
# Plot fits: one figure per row of row_df, showing the stored histogram with
# the fitted curve and a text box of bin limits and fit results.

ifig = 0

for index, row in (row_df.iterrows()):

    plt.figure(ifig)

    # Label of the category whose flag column is set on this row.
    name = [n for n in names if row[n.replace(" ", "_")]][0]

    # Re-draw the stored histogram: bin centres weighted by the bin contents.
    plt.hist(bin_centers, bins=bins, label="Data", weights=[row["N%i" % i] for i in range(len(bin_centers))])

    plt.plot(bin_centers, landau_gaus(bin_centers, row["loc"], row.eta, row.sigma, row.A),
             label="Fit")

    plt.legend(fontsize=14)

    # Every backslash is escaped: the previous literal relied on the invalid
    # escapes "\m", "\e" and "\s", which Python warns about. The rendered text
    # is unchanged.
    fit_label = (
        "%.0f < R.R. < %.0f cm\n"
        "%.0f < $t_\\mathrm{drift}$ < %.0f $\\mu$s\n"
        "%.2f < pitch < %.2f cm\n\n"
        "MPV = %.2f ADDC/cm\n"
        "$\\eta$ = %.2f ADDC/cm\n"
        "$\\sigma$ = %.2f ADDC/cm\n"
        "$\\tilde{\\chi}^2$=%.2f"
    ) % (row.rrlo, row.rrhi, row.tlo, row.thi, row.pitchlo, row.pitchhi,
         row.mpv, row.eta, row.sigma, row.chi2)
    plt.text(0.53, 0.09, fit_label, fontsize=12, transform=plt.gca().transAxes)

    plt.xlabel("dQ/dx [ADDC/cm]")
    plt.ylabel("# Depositions")
    plt.title(name)

    # Wider x range close to the track end.
    if row.rr < 2.:
        plt.xlim([200., 4000.])
    elif row.rr < 5.:
        plt.xlim([200., 3000.])
    elif row.rr < 10.:
        plt.xlim([200., 2500.])
    else:
        plt.xlim([200., 2000.])

    plt.tight_layout()
    #if dosave: plt.savefig((savedir + "dqdx_langau_fit_RR%.0f_%.0f_%s_pitch%.2f_drift%.0f" %
    #                       (row.rrlo, row.rrhi, name, row.pitch, row.tdrift)).replace(".", "_") + ".pdf")

    ifig += 1


In [None]:
# Fit for all the MPV's: every (category, drift bin, pitch bin, rr bin)
# combination is fitted by `inner` in a worker pool. Bins for which `inner`
# returns None (too few entries) are skipped.
fit_rows = []

with Pool(processes=24) as pool:
    with tqdm(total=len(names)*len(DRIFTLOS)*len(PITCHLOS)*len(rrlos)) as pbar:
        for c, name in zip(categories, names):
            for idrift, (dlo, dhi) in enumerate(zip(DRIFTLOS, DRIFTHIS)):
                for ip, (pitch_lo, pitch_hi) in enumerate(zip(PITCHLOS, PITCHHIS)):
                    inp = [(c, name, idrift, dlo, dhi, ip, pitch_lo, pitch_hi, irr, rrlo, rrhi) for
                               irr, (rrlo, rrhi) in enumerate(zip(rrlos, rrhis))]

                    # Results arrive in completion order, not in rr order.
                    for row in pool.imap_unordered(inner, inp):
                        if row:
                            fit_rows.append(pd.DataFrame(row))

                    pbar.update(len(rrlos))

# Append everything in one concat; growing mpv_df inside the loop copied the
# whole table for every fitted bin.
if fit_rows:
    mpv_df = pd.concat([mpv_df] + fit_rows, ignore_index=True)

In [None]:
mpv_df_save = mpv_df.infer_objects()

In [None]:
mpv_df_save.phi

In [None]:
if dosave:
     mpv_df_save.to_hdf(datadir + "mpvs_muon_phi80-85_mcnuphase2.df", key="df")