# Statistics

In [1]:
import numpy as np
from scipy.stats import poisson as pois
from scipy.stats import norm
import matplotlib.pyplot as plt

import nbimporter
import prepData as prep
import fitFunc as fits

### Likelihood Ratio


In [2]:
# compute likelihood ratios of two hypotheses
def lh_ratio(y, model_null, model_alt):
    """Return the log-likelihood-ratio test statistic between two fits.

    Every entry of ``y`` is scored with a Gaussian density centred on the
    model prediction and with standard deviation ``sqrt(y)``.

    Parameters
    ----------
    y : array-like
        Observed values.
    model_null : object
        Fit result of the null hypothesis; must expose a ``best_fit`` array
        that broadcasts against ``y``.
    model_alt : object
        Fit result of the alternative hypothesis; same requirement.

    Returns
    -------
    float
        ``q = -2 * (logL_null - logL_alt)``. Positive when the alternative
        model has the higher likelihood.
    """
    # Both hypotheses share the same per-entry uncertainty, so compute it once.
    sigma = np.sqrt(y)

    # Reduce along the first axis, which is what the built-in sum() did, but
    # in vectorised form.
    loglike_null = np.sum(
        norm.logpdf(x=y, loc=model_null.best_fit, scale=sigma), axis=0)
    loglike_alt = np.sum(
        norm.logpdf(x=y, loc=model_alt.best_fit, scale=sigma), axis=0)

    # NOTE: a Poisson variant of the two lines above would score
    # pois.logpmf(y.astype(int), model.best_fit.astype(int)) instead.

    return -2 * (loglike_null - loglike_alt)

### Generate Toy Datasets

In [3]:
def gen_toydataset(values, n, random_state=None):
    """Draw ``n`` Gaussian toy datasets around the expected ``values``.

    Each toy entry is sampled from a normal distribution with mean
    ``values[j]`` and standard deviation ``sqrt(values[j])``.

    Parameters
    ----------
    values : array-like
        Expected value per entry; must support ``len()``.
    n : int
        Number of toy datasets to generate.
    random_state : None, int or numpy random generator, optional
        Forwarded to ``norm.rvs``. The default ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state; pass a seed
        to make the toys reproducible.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, len(values))``, one toy dataset per row.
    """
    shape = (n, len(values))

    # NOTE: a Poisson variant would be pois.rvs(mu=values, size=shape).
    return norm.rvs(loc=values, scale=np.sqrt(values), size=shape,
                    random_state=random_state)

## Significance Test

First, we load and prepare the dataset, and fit the background function.  
Next, we compute the significance of the observed dataset. To do so, we fit the signal+background function and compute the likelihood ratio $q_0^{obs}$ between the signal and null hypotheses. Then, we generate n = 10,000 toy datasets from the expected values given by the background-only fit and repeat the analysis for every toy dataset. The original $q_0^{obs}$ is compared with the distribution of $q_0$ obtained from the toy datasets, and the p-value is computed as the fraction of toys with $q_0 \geq q_0^{obs}$. Lastly, the significance is expressed as the number of $\sigma$s needed to achieve an equivalent p-value in a standard normal distribution, $z = \Phi^{-1} \left(1 - p \right)$.  
This process is repeated using every possible frequency as $x_0$, the center of the signal function.

In [4]:
def sig_test(run, signal, path='db/', n=1000):
    """Scan the significance of a signal over every frequency of a run.

    Loads and prepares the dataset of ``run``, fits the background once and
    then evaluates ``significance`` with each frequency in turn as the
    signal centre ``x_0``.

    Parameters
    ----------
    run
        Run identifier, passed to ``prep.load_dataset``.
    signal
        Signal description, passed through to ``significance``.
    path : str, optional
        Dataset location, passed to ``prep.load_dataset``.
    n : int, optional
        Number of toy datasets generated per tested frequency.

    Returns
    -------
    numpy.ndarray
        One significance value per entry of the frequency axis.
    """
    data, center, length = prep.load_dataset(run, path)
    freq, fft, weights, ref = prep.prep_data(data, center, length=length)
    res_bkg = fits.fit_bkg(freq, fft, weights, center, ref)

    z = np.zeros(len(freq))
    for i, x_0 in enumerate(freq.values):
        # Argument order follows significance(freq, fft, weights, res_bkg,
        # center, ref, x_0, signal, n); the previous call shifted every
        # argument after `weights` and never passed `ref`.
        z[i] = significance(freq, fft, weights, res_bkg, center, ref,
                            x_0, signal, n=n)

    return z

In [5]:
def significance(freq, fft, weights, res_bkg, center, ref, x_0, signal, n=1000, draw = False):
    """Estimate the significance of a signal centred at ``x_0`` with toys.

    The observed test statistic is the likelihood ratio between the given
    background fit and a signal fit at ``x_0``. Its null distribution is
    sampled by generating ``n`` toy datasets from ``res_bkg.best_fit`` and
    refitting background and signal on each of them.

    Parameters
    ----------
    freq, fft, weights
        Observed data, forwarded to the ``fits`` routines.
    res_bkg
        Background fit of the observed data; must expose ``best_fit``.
    center, ref
        Forwarded to ``fits.fit_bkg`` when refitting each toy.
    x_0
        Centre of the signal function under test.
    signal
        Signal description, forwarded to ``fits.fit_sig``.
    n : int, optional
        Number of toy datasets.
    draw : bool, optional
        If true, plot the toy distribution together with the observed value.

    Returns
    -------
    float
        ``norm.ppf(1 - p)`` with ``p`` the fraction of toys whose statistic
        is at least the observed one. If no toy reaches the observed value,
        ``p`` is 0 and the result is ``+inf``.
    """
    # Test statistic of the real data.
    observed_fit = fits.fit_sig(freq, fft, weights, x_0, res_bkg, signal)
    q0_obs = lh_ratio(fft, res_bkg, observed_fit)

    # Toy spectra drawn from the background-only expectation.
    toy_spectra = gen_toydataset(res_bkg.best_fit, n)
    # NOTE(review): the constant below was flagged by the original author as
    # "NOT UP TO DATE"; confirm the correct value before relying on it.
    toy_weight_rows = toy_spectra / np.sqrt(1365500)

    # Repeat the full background + signal fit on every toy.
    q0 = np.zeros(n)
    for k, (spectrum, spectrum_weights) in enumerate(zip(toy_spectra, toy_weight_rows)):
        bkg_fit = fits.fit_bkg(freq, spectrum, spectrum_weights, center, ref)
        sig_fit = fits.fit_sig(freq, spectrum, spectrum_weights, x_0, bkg_fit, signal)
        q0[k] = lh_ratio(spectrum, bkg_fit, sig_fit)

    if draw:
        plot_significance(q0, q0_obs)

    # p-value: share of toys at least as extreme as the observation.
    n_exceeding = sum(q0 >= q0_obs)
    p_value = n_exceeding / n

    # Convert to the equivalent number of standard-normal sigmas.
    return norm.ppf(1 - p_value)

In [6]:
def plot_significance(q0, q0_obs):
    """Plot the toy distribution of q0 and mark the observed value.

    Parameters
    ----------
    q0 : array-like
        Test statistic of every toy experiment.
    q0_obs : float
        Test statistic of the observed data, drawn as a dashed vertical line.
    """
    # prepare canvas
    plt.figure(figsize=(15, 10))

    # Square-root rule for the number of bins; at least one bin so that an
    # empty or tiny sample does not request zero bins.
    n_bins = max(1, int(np.sqrt(len(q0))))
    density, _, _ = plt.hist(q0, bins=n_bins, density=True,
                             facecolor='lightblue', edgecolor='black',
                             label='Toy Experiments')

    # The marker spans the full height of the histogram.
    plt.vlines(q0_obs, 0, max(density), colors='blue', linestyles='--',
               label='Observed Data')

    plt.legend(loc='upper left')
    plt.xlabel('q0')
    plt.ylabel('PDF')

    plt.show()