# Statistics

In [None]:
import numpy as np
from scipy.stats import poisson as pois
from scipy.stats import norm
import matplotlib.pyplot as plt

import nbimporter
import prepData as prep
import fitFunc as fits

## Significance Test

We want to compute the significance of the observed dataset.  
The first steps are rescaling the dataset and fitting the background and signal functions.

We then compute the likelihood ratio of the observed data $q_0^{obs}$ between the signal and null hypothesis:

$$q_0^{obs} = -2 \cdot \log \left( \cfrac{\mathcal{L} \left(Data | 0, \hat{\theta}_0 \right)}{\mathcal{L} \left(Data | \hat{\mu}, \hat{\theta}_{\hat{\mu}} \right)}\right)$$

A hat (^) marks the parameter values that optimize the corresponding fit.

In [None]:
# compute likelihood ratios of two hypotheses
def lh_ratio(y, model_null, model_alt):
    """Return the likelihood-ratio test statistic between two fitted models.

    Every bin of ``y`` is treated as a normal variable centred on the model
    prediction with standard deviation ``sqrt(y)`` (the normal approximation
    used in this notebook in place of the Poisson distribution).  The
    statistic is::

        q = -2 * (log L_null - log L_alt)

    Parameters
    ----------
    y : array-like
        Observed values, one per bin.
    model_null, model_alt : objects with a ``best_fit`` attribute
        Fit results for the null and the alternative hypothesis;
        ``best_fit`` holds the model prediction for every bin of ``y``.

    Returns
    -------
    float
        The value of ``q``.
    """
    # The per-bin uncertainty is shared by both hypotheses: compute it once.
    sigma = np.sqrt(y)

    # np.sum reduces in native code and always collapses to a scalar.
    loglike_null = np.sum(norm.logpdf(x=y, loc=model_null.best_fit, scale=sigma))
    loglike_alt = np.sum(norm.logpdf(x=y, loc=model_alt.best_fit, scale=sigma))

    return -2 * (loglike_null - loglike_alt)

The value of $q_0^{obs}$ has no meaning by itself, so we generate n = 10,000 toy datasets from the expected values given by the background fit and repeat the analysis for every new dataset. Both for the likelihood ratio and for the toy dataset generation, a normal approximation has been used instead of the formal Poisson distribution. 

In [None]:
def gen_toydataset(values, n):
    """Draw ``n`` toy datasets fluctuating around the expected ``values``.

    Each bin is sampled from a normal distribution whose mean is the
    expected value and whose standard deviation is its square root.

    Parameters
    ----------
    values : array-like
        Expected value of every bin.
    n : int
        Number of toy datasets to generate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, len(values))``; one toy dataset per row.
    """
    n_bins = len(values)
    spread = np.sqrt(values)
    return norm.rvs(loc=values, scale=spread, size=(n, n_bins))

The original $q_0^{obs}$ is thus compared with the distribution of $q_0$ obtained from the toy datasets, and the p-value is computed:

$$p_0 = P \left( q_0 \ge q_0^{obs} \right) = \int_{q_0^{obs}}^{+\infty} f(q_0 | 0, \hat{\theta}_0) \,dq_0 $$ 

The significance is expressed as the number of $\sigma$s needed to achieve an equivalent p-value in a standard normal distribution:

$$z = \Phi^{-1} \left(1 - p_0 \right)$$  

In [None]:
def p_value(q, q_obs):
    """Return the empirical p-value of ``q_obs`` against the toy values ``q``.

    The p-value is the fraction of entries of ``q`` that are greater than
    or equal to ``q_obs``.

    Parameters
    ----------
    q : array-like
        Test-statistic values obtained from the toy datasets.
    q_obs : float
        Test-statistic value of the observed data.

    Returns
    -------
    float
        Fraction of toys with ``q >= q_obs``, between 0 and 1.

    Raises
    ------
    ValueError
        If ``q`` is empty, since the fraction is then undefined.
    """
    q = np.asarray(q)
    if q.size == 0:
        raise ValueError("p_value needs at least one toy value in q")
    # Normalise by the number of toys actually supplied rather than by a
    # name that is not defined in this function.
    return np.count_nonzero(q >= q_obs) / q.size

This process is repeated using every possible frequency as $x_0$, the center of the signal function.  
The complete code is:

In [None]:
def sig_test(run, signal, path='db/', n=1000):
    """Compute the significance at every frequency of a run.

    The dataset is loaded with ``prep.load_dataset``, prepared with
    ``prep.prep_data`` and fitted once with the background model.  Each
    frequency is then used in turn as ``x_0``, the center of the signal
    function, and its significance is evaluated with ``significance``.

    Parameters
    ----------
    run
        Run identifier forwarded to ``prep.load_dataset``.
    signal
        Signal description forwarded to ``significance``.
    path : str
        Location of the datasets, forwarded to ``prep.load_dataset``.
    n : int
        Number of toy datasets generated for every frequency.

    Returns
    -------
    numpy.ndarray
        One significance value per entry of the frequency axis.
    """
    data, center, length = prep.load_dataset(run, path)
    freq, fft, weights, ref = prep.prep_data(data, center, length=length)
    res_bkg = fits.fit_bkg(freq, fft, weights, center, ref)

    z = np.zeros(len(freq))
    for i, x_0 in enumerate(freq.values):
        # Argument order follows the signature of significance():
        # (freq, fft, weights, res_bkg, center, ref, x_0, signal, n).
        z[i] = significance(freq, fft, weights, res_bkg, center, ref,
                            x_0, signal, n=n)

    return z

In [None]:
def significance(freq, fft, weights, res_bkg, center, ref, x_0, signal, n=1000, draw=False):
    """Return the significance of a signal centered at ``x_0``.

    The observed likelihood ratio between the background fit ``res_bkg``
    and a signal fit at ``x_0`` is compared with the distribution of the
    same ratio over ``n`` toy datasets generated from the background fit.
    The resulting p-value is converted into a number of standard normal
    sigmas.

    Parameters
    ----------
    freq, fft, weights
        Data forwarded to the fitting routines of ``fits``.
    res_bkg
        Background fit of the observed data; its ``best_fit`` provides the
        expected values from which the toys are drawn.
    center, ref
        Forwarded to ``fits.fit_bkg`` when refitting every toy.
    x_0
        Center of the signal function.
    signal
        Forwarded to ``fits.fit_sig``.
    n : int
        Number of toy datasets.
    draw : bool
        When true, plot the toy distribution together with the observed
        value.

    Returns
    -------
    float
        ``norm.ppf(1 - p0)``, with ``p0`` the p-value of the observed ratio.
    """
    # Observed data: signal fit and likelihood ratio against the background.
    obs_sig = fits.fit_sig(freq, fft, weights, x_0, res_bkg, signal)
    q0_obs = lh_ratio(fft, res_bkg, obs_sig)

    # Toy datasets drawn from the background expectation.
    toys = gen_toydataset(res_bkg.best_fit, n)
    # TODO: this hard-coded normalisation was flagged as NOT UP TO DATE.
    toy_w = toys / np.sqrt(1365500)

    # Repeat the full background + signal analysis on every toy.
    q0 = np.zeros(n)
    for k, (toy, w) in enumerate(zip(toys, toy_w)):
        bkg_fit = fits.fit_bkg(freq, toy, w, center, ref)
        sig_fit = fits.fit_sig(freq, toy, w, x_0, bkg_fit, signal)
        q0[k] = lh_ratio(toy, bkg_fit, sig_fit)

    if draw:
        plot_significance(q0, q0_obs)

    # p-value of the observation, expressed in standard normal sigmas.
    p0 = p_value(q0, q0_obs)
    return norm.ppf(1 - p0)

In [None]:
def plot_significance(q0, q0_obs):
    """Plot the toy distribution of ``q0`` and mark the observed value.

    The toys are shown as a histogram normalised to unit area, with a
    number of bins equal to the integer part of the square root of the
    number of toys.  The observed value is drawn as a dashed vertical
    line reaching the height of the tallest bin.

    Parameters
    ----------
    q0 : array-like
        Test-statistic values of the toy datasets.
    q0_obs : float
        Test-statistic value of the observed data.
    """
    plt.figure(figsize=(15, 10))

    # Square-root rule for the number of bins.
    n_bins = int(np.sqrt(len(q0)))
    heights, _, _ = plt.hist(
        q0, bins=n_bins, density=True,
        facecolor='lightblue', edgecolor='black', label='Toy Experiments',
    )

    # Observed value, drawn up to the tallest bin.
    plt.vlines(q0_obs, 0, max(heights), colors='blue', linestyles='--',
               label='Observed Data')

    plt.legend(loc='upper left')
    plt.xlabel('q0')
    plt.ylabel('PDf')

    plt.show()

## Confidence Intervals

The process is similar to that of the significance test, but with a few key differences.  

In addition to fitting the background, we fit the signal twice: once letting $\mu$ float freely to find $\hat{\mu}$, and once keeping it fixed at a given value.  
The likelihood ratio is thus computed as:

$$q^{obs}(\mu) = -2 \cdot \log \left( \cfrac{\mathcal{L} \left(Data | \mu, \hat{\theta}_{\mu} \right)}{\mathcal{L} \left(Data | \hat{\mu}, \hat{\theta}_{\hat{\mu}} \right)}\right)$$

We then generate two sets of n = 10,000 toy datasets each, one as before from the expected values given by the background fit while the other from the signal fit with fixed $\mu$. We compare the original $q^{obs}(\mu)$ with the distribution of $q(0)$ and $q(\mu)$ from the toy datasets and compute the two probabilities:

$$    p_{\mu} = P \left(q(\mu) \ge q^{obs}(\mu) | \mu s + b \right)$$
$$1 - p_{b}   = P \left(q(\mu) \ge q^{obs}(\mu) | b \right)$$

and take their ratio.  
This process is done scanning different values of $\mu$ and we take as the 95% confidence interval limit the value of $\mu$ so that the ratio is equal to 0.05:

$$CL: \mu \; \big| \; \cfrac{p_{\mu}}{1 - p_{b}} = 0.05$$

This process is repeated using every possible frequency as $x_0$, the center of the signal function.

In [None]:
def CI(freq, fft, weights, res_bkg, center, ref, x_0, signal, n=1000, draw=False):
    """Toy-based test for a signal centered at ``x_0``.

    NOTE(review): as written, this body mirrors ``significance``. It
    compares the background fit with a free signal fit and returns a
    significance ``z``; it does not scan ``mu`` or return an interval
    limit.

    Parameters
    ----------
    freq, fft, weights
        Data forwarded to the fitting routines of ``fits``.
    res_bkg
        Background fit of the observed data; its ``best_fit`` provides the
        expected values from which the toys are drawn.
    center, ref
        Forwarded to ``fits.fit_bkg`` when refitting every toy.
    x_0
        Center of the signal function.
    signal
        Forwarded to ``fits.fit_sig``.
    n : int
        Number of toy datasets.
    draw : bool
        When true, plot the toy distribution together with the observed
        value.

    Returns
    -------
    float
        ``norm.ppf(1 - p)``, with ``p`` the fraction of toys whose
        likelihood ratio is at least the observed one.
    """
    # Observed data: signal fit and likelihood ratio against the background.
    obs_sig = fits.fit_sig(freq, fft, weights, x_0, res_bkg, signal)
    q0_obs = lh_ratio(fft, res_bkg, obs_sig)

    # Toy datasets drawn from the background expectation.
    toys = gen_toydataset(res_bkg.best_fit, n)
    # TODO: this hard-coded normalisation was flagged as NOT UP TO DATE.
    toy_w = toys / np.sqrt(1365500)

    # Repeat the full background + signal analysis on every toy.
    q0 = np.zeros(n)
    for k, (toy, w) in enumerate(zip(toys, toy_w)):
        bkg_fit = fits.fit_bkg(freq, toy, w, center, ref)
        sig_fit = fits.fit_sig(freq, toy, w, x_0, bkg_fit, signal)
        q0[k] = lh_ratio(toy, bkg_fit, sig_fit)

    if draw:
        plot_significance(q0, q0_obs)

    # Fraction of toys at least as extreme as the observation ...
    n_above = sum(q0 >= q0_obs)
    p = n_above / n
    # ... expressed in standard normal sigmas.
    return norm.ppf(1 - p)