# Fitting Notebook
Here we fit the phi-distributions from the analysis notebook.
- Model selection - Cross validation score is used to demonstrate that fitting with the simple model is probably best.
- Fits of $\phi$ - Two methods of fitting the phi distributions are used: a single fit and the bootstrap replica technique.
- MC study of fitting procedure - This has to wait, but I am going to use MC to throw a distribution in phi that follows the error bars of our measurement.
- Visualization
    - fit phi distributions 
    - bootstrap histograms
    - bootstrap score histograms
    - integrated fit results with errors

In [14]:
import glob 
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import pymc3 as pm 
import sys
import time 

from scipy.optimize import minimize

%matplotlib inline

# Global matplotlib font settings for every figure in this notebook.
plt.rc('font', family='serif', size=18)

### Setup Fitting 
Our model, defined below, can be one of several different functions.  Fitting is done by minimizing the $\chi^2$.  Errors are estimated either from the covariance matrix or with the bootstrap replica method.  Cross validation is defined, and the different models are evaluated.  

In [2]:
def chi2(y_true, y_pred, y_err):
    """Return the chi-square between measured and predicted values.

    Computes ``sum((y_true - y_pred)**2 / y_err**2)``.  The three arguments
    are combined element-wise, so they must be broadcastable against each
    other.  No guard is applied against zero entries in ``y_err``.
    """
    squared_residuals = np.square(y_true - y_pred)
    variances = np.square(y_err)
    return np.sum(squared_residuals / variances)

In [3]:
class Model(object):
    """Base class for fit models: a constant function with one parameter.

    Attributes
    ----------
    n_pars : number of free parameters (1 here).
    pars   : parameter values, stored as an array of shape (n_pars, 1).
    bounds : per-parameter (lower, upper) limits, float32 array of shape
             (n_pars, 2); every parameter is limited to [-1, 1].
    """

    def __init__(self):
        n_pars = 1
        self.n_pars = n_pars
        self.pars = np.zeros((n_pars, 1))
        self.bounds = np.asarray([(-1, 1)], dtype=np.float32)

    def get_initial_parameters(self):
        """Overwrite ``self.pars`` with uniform random values in [-1, 1).

        Despite the name, nothing is returned; the new values are stored on
        the instance with shape (n_pars, 1).
        """
        shape = (self.n_pars, 1)
        self.pars = np.random.uniform(low=-1.0, high=1.0, size=shape)

    def evaluate(self, x):
        """Return the model prediction at ``x``; the base model is the constant 1.0."""
        return 1.0

class SineModel(Model):
    """One-parameter model ``A * sin(phi)`` with ``phi`` given in degrees.

    The parameter bounds are inherited from ``Model`` ([-1, 1]).
    """

    def __init__(self):
        super(SineModel, self).__init__()
        self.n_pars = 1
        self.pars = np.zeros((self.n_pars, 1), dtype=np.float32)

    def evaluate(self, x):
        """Return ``pars[0] * sin(x)``, converting ``x`` from degrees to radians."""
        amplitude = self.pars[0]
        phi_rad = x * np.pi / 180.0
        return amplitude * np.sin(phi_rad)

class FullModel(Model):
    """Three-parameter model with ``phi`` given in degrees:

        A sin(phi) / (1 + B cos(phi) + C cos(2 phi))

    where (A, B, C) = (pars[0], pars[1], pars[2]).  Each parameter is
    bounded to [-1, 1].
    """

    def __init__(self):
        super(FullModel, self).__init__()
        self.n_pars = 3
        self.pars = np.zeros((self.n_pars, 1), dtype=np.float32)
        # One (lower, upper) row per parameter.
        self.bounds = np.array([[-1, 1]] * self.n_pars, dtype=np.float32)

    def evaluate(self, x):
        """Return the model prediction at ``x`` (degrees)."""
        amplitude, cos_coeff, cos2_coeff = self.pars[0], self.pars[1], self.pars[2]
        phi_rad = x * np.pi / 180.0
        numerator = amplitude * np.sin(phi_rad)
        # NOTE(review): nothing keeps the denominator away from zero; the
        # bounds allow |B| + |C| >= 1 -- confirm this is acceptable.
        denominator = 1 + cos_coeff * np.cos(phi_rad) + cos2_coeff * np.cos(2 * phi_rad)
        return numerator / denominator

def update_model(model, pars, x):
    """Store ``pars`` on ``model`` and return the model evaluated at ``x``.

    This mutates ``model``: its ``pars`` attribute is replaced by the given
    object (no copy, no reshaping) before ``model.evaluate(x)`` is called.
    """
    model.pars = pars
    prediction = model.evaluate(x)
    return prediction

In [4]:
# Instantiate the three-parameter FullModel that is passed to the fits
# below, and draw uniform random starting values for its parameters.
model = FullModel()
model.get_initial_parameters()

In [5]:
def single_fit(model, data, use_sys=False):
    """Fit ``model`` to one phi-distribution by minimizing the chi-square.

    Parameters
    ----------
    model : object providing ``get_initial_parameters()``, ``pars``,
        ``bounds`` and ``evaluate(x)`` (e.g. ``FullModel``).  It is mutated:
        on return ``model.pars`` holds the best-fit parameters.
    data : object with attributes ``phi``, ``value`` and ``stat`` (plus
        ``sys_total`` when ``use_sys`` is true), e.g. a DataFrame.
    use_sys : if true, add ``sys_total`` in quadrature to ``stat``.

    Returns
    -------
    (pars, errs) : two 1-D arrays of length ``n_pars``; the best-fit
        parameters and the square roots of the diagonal of the inverse
        Hessian reported by the minimizer.
    """
    # Start every fit from a fresh random point.
    model.get_initial_parameters()

    if use_sys:
        data_errs = np.sqrt(data.stat**2 + data.sys_total**2)
    else:
        data_errs = data.stat

    def objective(pars):
        return chi2(data.value, update_model(model, pars, data.phi), data_errs)

    # minimize() requires a 1-D starting point, while model.pars is stored
    # with shape (n_pars, 1); flatten it before handing it over.
    res = minimize(fun=objective, x0=np.ravel(model.pars), bounds=model.bounds)

    # With bounds and no explicit method, the inverse Hessian comes back as
    # a linear operator rather than a dense array; densify when needed.
    hess_inv = res.hess_inv
    if hasattr(hess_inv, 'todense'):
        hess_inv = hess_inv.todense()

    # NOTE(review): for a chi-square objective the parameter covariance is
    # usually taken as 2 * H^-1; this keeps the original sqrt(diag(H^-1))
    # convention -- confirm which one downstream consumers expect.
    err = np.sqrt(np.diag(np.asarray(hess_inv)))

    # The last point the minimizer evaluated is not necessarily the optimum,
    # so leave the model at the best-fit parameters.
    model.pars = res.x

    return res.x, err

In [29]:
def fit_dataset(data, model, fit_type='single', use_sys=False):
    '''
    Fit every (axis, axis_bin) phi-distribution found in ``data``.

    inputs
    ------

    data: a dataframe which contains the output of the analysis notebook,
        phi-distributions.  The columns read here are ``axis``, ``axis_bin``,
        ``phi``, ``value``, ``stat`` and, when ``use_sys`` is true,
        ``sys_total``.

    model: a model object; ``model.n_pars`` sets how many par_/err_ columns
        are produced.  It is only used for the fit itself when
        ``fit_type == 'single'``.

    fit_type: 'single' (chi-square minimization via ``single_fit``) or
        'pymc3' (sampling of a hard-coded three-parameter model).

    use_sys: if true, systematic errors are added in quadrature to the
        statistical ones.

    returns
    -------

    (result, params): ``result`` is a DataFrame with one row per fit and
        columns axis, axis_bin, par_i, err_i.  ``params`` is a dict of lists
        (axis, axis_bin, value) where ``value`` holds the fitted parameter
        array ('single') or the stacked posterior samples ('pymc3').

    raises
    ------

    ValueError: if ``fit_type`` is not one of the supported names.
    '''

    # Compare strings by value and reject unknown names up front; otherwise
    # no branch below would define the fit output.
    if fit_type not in ('single', 'pymc3'):
        raise ValueError("Unknown fit_type %r; expected 'single' or 'pymc3'." % (fit_type,))

    result = {'axis': [], 'axis_bin': []}
    for p in range(model.n_pars):
        result['par_%d' % p] = []
        result['err_%d' % p] = []

    params = {'axis': [], 'axis_bin': [], 'value': []}

    for axis in np.unique(data['axis']):
        dsub = data[data['axis'] == axis]

        for axis_bin in np.unique(dsub['axis_bin']):
            d = dsub[dsub['axis_bin'] == axis_bin]

            if fit_type == 'single':
                pars, errs = single_fit(model, d, use_sys)
                params['value'].append(pars)

            else:  # fit_type == 'pymc3'
                if use_sys:
                    data_errs = np.sqrt(d.stat**2 + d.sys_total**2)
                else:
                    data_errs = d.stat

                basic_model = pm.Model()

                with basic_model:
                    alpha = pm.Bound(pm.Normal, lower=-1, upper=1)('alpha', mu=0, sd=1)
                    beta  = pm.Bound(pm.Normal, lower=-1, upper=1)( 'beta', mu=0, sd=0.05)
                    gamma = pm.Bound(pm.Normal, lower=-1, upper=1)('gamma', mu=0, sd=0.05)

                    phi_rad = (np.pi/180) * d.phi
                    mu = alpha * np.sin(phi_rad) / (1 + \
                                                    beta * np.cos(phi_rad) + \
                                                    gamma * np.cos(2 * phi_rad)
                                                    )

                    y = pm.Normal('y', mu=mu, sd=data_errs, observed=d.value)
                    trace = pm.sample(1000, tune=500, progressbar=False)

                # Summarize each posterior by its mean and standard deviation.
                # Only three parameters exist here, so model.n_pars must not
                # exceed 3 for the bookkeeping below.
                names = ('alpha', 'beta', 'gamma')
                pars = [np.average(trace[name]) for name in names]
                errs = [np.std(trace[name]) for name in names]
                params['value'].append(np.array([trace[name] for name in names]))

            result['axis'].append(axis)
            result['axis_bin'].append(axis_bin)

            params['axis'].append(axis)
            params['axis_bin'].append(axis_bin)

            for p in range(model.n_pars):
                result['par_%d' % p].append(pars[p])
                result['err_%d' % p].append(errs[p])

    return pd.DataFrame(result), params

### Load Configurations
There are several files with different results for phi-distributions.  

In [30]:
# Collect the phi-distribution CSV files.  glob returns matches in arbitrary
# order, so sort them to keep the processing order reproducible between runs.
database_files = sorted(glob.glob('database/phi/*.csv'))
print('Found %d files in the database.' % len(database_files))

Found 31 files in the database.


In [31]:
def load_database_files(file_list):
    """Read each CSV path in ``file_list`` with pandas.

    Returns a dict that maps every path to the DataFrame loaded from it.
    """
    return {path: pd.read_csv(path) for path in file_list}

In [32]:
# Read every CSV found above into a dict of DataFrames keyed by file path.
dataframe_store = load_database_files(database_files)

In [33]:
def calculate_total_number_of_fits(dataframe_store):
    """Count the distinct (axis, axis_bin) combinations over all dataframes.

    Parameters
    ----------
    dataframe_store : dict mapping a file name to a DataFrame that has
        ``axis`` and ``axis_bin`` columns.

    Returns
    -------
    int : the sum, over every dataframe and every axis in it, of the number
        of unique ``axis_bin`` values for that axis.
    """
    total_fits = 0

    # Only the frames are needed; .values() exists on both Python 2 and 3,
    # unlike .iteritems().
    for dataframe in dataframe_store.values():
        for axis in np.unique(dataframe['axis']):
            in_axis = dataframe[dataframe['axis'] == axis]
            total_fits += len(np.unique(in_axis['axis_bin']))

    return total_fits

In [34]:
# Count the (axis, axis_bin) combinations across all loaded files and
# report the total.
total_fits = calculate_total_number_of_fits(dataframe_store)
print('%d fits will be performed.' % total_fits)

1240 fits will be performed.


In [35]:
def get_output_path(input_path):
    """Return ``input_path`` with every occurrence of 'phi' replaced by 'fit'.

    The substitution is not limited to the directory component: a 'phi'
    inside the file name is replaced as well.
    """
    pieces = input_path.split('phi')
    return 'fit'.join(pieces)

In [37]:
# Fit every loaded file with the single chi-square fit (statistical errors
# only), write each result table to its output path, and keep a one-line
# progress message updated on stdout.
index = 0
n_files = len(dataframe_store)
t_start = time.time()
for index, file_name in enumerate(dataframe_store, start=1):

    fit_df, pars = fit_dataset(dataframe_store[file_name], model, fit_type='single', use_sys=False)

    output_path = get_output_path(file_name)
    fit_df.to_csv(output_path)

    # '\r' returns to the start of the line so each update overwrites the last.
    elapsed_time = time.time() - t_start
    message_string = '\rFile (%d/%d) finished in %f seconds.' % (index, n_files, elapsed_time)
    sys.stdout.write(message_string)
    sys.stdout.flush()

File (31/31) finished in 292.506592 seconds.