# Systematics Notebook
From results of the phi fitting.  Before running this notebook the output from `src/fit/run-fitter.py` is needed.  

In [1]:
import glob 
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import pickle
import os
import sys
import time 

from datetime import datetime

%matplotlib inline

plt.rc('font', family='serif')
plt.rc('font', size=18)
plt.rc('text', usetex = True)

### Load Configurations
There are several files with different results for phi-distributions.  

In [2]:
database_files = glob.glob('database/fit/*.csv')
print('Found %d files in the database.' % len(database_files))

for f in database_files:
    print(f, datetime.utcfromtimestamp(
        os.path.getmtime(f)).strftime('%Y-%m-%d %H:%M:%S'))

Found 47 files in the database.
database/fit/variation_dist_ecv_1.csv 2020-02-14 15:23:01
database/fit/variation_alpha_1.csv 2020-02-14 15:26:58
database/fit/variation_dist_dcr3_1.csv 2020-02-14 15:30:54
database/fit/variation_dist_dcr1_1.csv 2020-02-14 15:34:51
database/fit/variation_alpha_-1.csv 2020-02-14 15:50:08
database/fit/variation_dvz_1.csv 2020-01-30 12:35:26
database/fit/variation_dist_dcr3_-1.csv 2020-02-14 15:54:19
database/fit/sector_4_sin.csv 2020-01-30 12:35:26
database/fit/variation_dist_dcr1_-1.csv 2020-02-14 15:58:24
database/fit/sector_5_sin.csv 2020-01-30 12:35:26
database/fit/variation_dist_ec_edep_1.csv 2020-02-14 16:02:29
database/fit/variation_dist_ecu_1.csv 2020-02-14 16:06:37
database/fit/variation_dist_ecsf_1.csv 2020-02-14 16:10:32
database/fit/variation_dist_ecu_-1.csv 2020-02-14 16:14:28
database/fit/variation_dist_vz_1.csv 2020-02-14 16:18:38
database/fit/variation_dist_ecw_1.csv 2020-02-14 16:22:35
database/fit/variation_dist_cc_-1.csv 2020-02-14 16:26:

In [3]:
def load_database_files(file_list):
    
    dataframe_store = {}
    for f in file_list:
        dataframe_store[f] = pd.read_csv(f)
        
    return dataframe_store

In [4]:
dataframe_store = load_database_files(database_files)

Finally, load the nominal phi distributions.

In [5]:
nominal = pd.read_csv('database/fit/sys.csv')

In [6]:
def fix_bootstrap_entry(x):
    return np.array([float(entry) for entry in x.strip('[').strip(']').split()])

def exclude_outlier_samples(x, op = np.mean):
    return op(x[np.where((x>-0.5)&(x<0.5))[0]])

In [7]:
nominal['bootstraps_0'] = nominal['bootstraps_0'].apply(fix_bootstrap_entry)
nominal['bootstraps_1'] = nominal['bootstraps_1'].apply(fix_bootstrap_entry)
nominal['bootstraps_2'] = nominal['bootstraps_2'].apply(fix_bootstrap_entry)

#nominal['par_0'] = nominal['bootstraps_0'].apply(lambda x: exclude_outlier_samples(x, np.mean))
#nominal['err_0'] = nominal['bootstraps_0'].apply(lambda x: exclude_outlier_samples(x, np.std))
#nominal['par_1'] = nominal['bootstraps_1'].apply(lambda x: exclude_outlier_samples(x, np.mean))
#nominal['err_1'] = nominal['bootstraps_1'].apply(lambda x: exclude_outlier_samples(x, np.std))
#nominal['par_2'] = nominal['bootstraps_2'].apply(lambda x: exclude_outlier_samples(x, np.mean))
#nominal['err_2'] = nominal['bootstraps_2'].apply(lambda x: exclude_outlier_samples(x, np.std))


for index in dataframe_store.keys():
    print('Working on {}'.format(index))
    dataframe_store[index]['bootstraps_0'] = dataframe_store[index]['bootstraps_0'].apply(fix_bootstrap_entry)
    dataframe_store[index]['bootstraps_1'] = dataframe_store[index]['bootstraps_1'].apply(fix_bootstrap_entry)
    dataframe_store[index]['bootstraps_2'] = dataframe_store[index]['bootstraps_2'].apply(fix_bootstrap_entry)
    
    #dataframe_store[index]['par_0'] = dataframe_store[index]['bootstraps_0'].apply(lambda x: exclude_outlier_samples(x, np.mean))
    #dataframe_store[index]['err_0'] = dataframe_store[index]['bootstraps_0'].apply(lambda x: exclude_outlier_samples(x, np.std))
    #dataframe_store[index]['par_1'] = dataframe_store[index]['bootstraps_1'].apply(lambda x: exclude_outlier_samples(x, np.mean))
    #dataframe_store[index]['err_1'] = dataframe_store[index]['bootstraps_1'].apply(lambda x: exclude_outlier_samples(x, np.std))
    #dataframe_store[index]['par_2'] = dataframe_store[index]['bootstraps_2'].apply(lambda x: exclude_outlier_samples(x, np.mean))
    #dataframe_store[index]['err_2'] = dataframe_store[index]['bootstraps_2'].apply(lambda x: exclude_outlier_samples(x, np.std))

Working on database/fit/variation_dist_ecv_1.csv
Working on database/fit/variation_alpha_1.csv
Working on database/fit/variation_dist_dcr3_1.csv
Working on database/fit/variation_dist_dcr1_1.csv
Working on database/fit/variation_alpha_-1.csv
Working on database/fit/variation_dvz_1.csv
Working on database/fit/variation_dist_dcr3_-1.csv
Working on database/fit/sector_4_sin.csv
Working on database/fit/variation_dist_dcr1_-1.csv
Working on database/fit/sector_5_sin.csv
Working on database/fit/variation_dist_ec_edep_1.csv
Working on database/fit/variation_dist_ecu_1.csv
Working on database/fit/variation_dist_ecsf_1.csv
Working on database/fit/variation_dist_ecu_-1.csv
Working on database/fit/variation_dist_vz_1.csv
Working on database/fit/variation_dist_ecw_1.csv
Working on database/fit/variation_dist_cc_-1.csv
Working on database/fit/sector_6_sin.csv
Working on database/fit/variation_dvz_-1.csv
Working on database/fit/variation_dist_ecw_-1.csv
Working on database/fit/sector_3_sin.csv
Worki

### Systematic Uncertainties 

In [8]:
def database_filename_parser(file_name):
    file_name = file_name.split('variation_')[-1].strip('.csv')
    
    tokens = file_name.split('_')
    n_tokens = len(tokens)

    parameter_name = '_'.join(tokens[0:n_tokens-1])
    index = int(tokens[-1])
    
    return index, parameter_name

In [9]:
def build_parameter_variation_dict(path_to_db = 'database/fit/'):

    parameters = {}

    database_files = glob.glob(path_to_db + 'variation*.csv')
    
    for database_file in database_files:
        index, parameter = database_filename_parser(database_file)
        
        if parameter in parameters.keys():
            parameters[parameter][index] = pd.read_csv(database_file)
        else:
            parameters[parameter] = {}
            parameters[parameter][index] = pd.read_csv(database_file)
            
    return parameters

In [10]:
def load_systematic_sources_list(file_name):
    systematic_sources = pickle.load(open(file_name, 'rb'))
    
    reverse_dict = {}

    for key, value in systematic_sources.items():
        reverse_dict[value] = key
    
    return reverse_dict

In [11]:
parameter_variation = build_parameter_variation_dict()

In [12]:
for key, value in parameter_variation.items():
    print(key, value.keys())

dist_ecv dict_keys([1, -1])
alpha dict_keys([1, -1])
dist_dcr3 dict_keys([1, -1])
dist_dcr1 dict_keys([1, -1])
dvz dict_keys([1, -1])
dist_ec_edep dict_keys([1, -1])
dist_ecu dict_keys([1, -1])
dist_ecsf dict_keys([1, -1])
dist_vz dict_keys([1, -1])
dist_ecw dict_keys([1, -1])
dist_cc dict_keys([-1, 0])
p_mes dict_keys([0, 1, -1])
missing_mass dict_keys([-1, 1])


In [13]:
if 'dvz' in parameter_variation:
    del parameter_variation['dvz']
    
if 'missing_mass' in parameter_variation:
    del parameter_variation['missing_mass']

In [14]:
systematic_sources = load_systematic_sources_list('systematic_sources.pkl')

In [15]:
systematic_sources

{'alpha': 'sys_1',
 'dist_cc': 'sys_2',
 'dist_dcr1': 'sys_3',
 'dist_dcr3': 'sys_4',
 'dist_ec_edep': 'sys_5',
 'dist_ecsf': 'sys_6',
 'dist_ecu': 'sys_7',
 'dist_ecv': 'sys_8',
 'dist_ecw': 'sys_9',
 'dist_vz': 'sys_10',
 'p_mes': 'sys_11',
 'beam_pol': 'sys_0'}

In [16]:
list(parameter_variation['alpha'].keys())

[1, -1]

In [17]:
def add_systematics(nominal_fit, parameter_variation, systematic_sources):

    nominal_fit_sys = nominal_fit.copy(deep=True)
    nominal_fit_sys['sys_total_0'] = np.zeros(len(nominal_fit_sys))
    nominal_fit_sys['sys_total_1'] = np.zeros(len(nominal_fit_sys))
    nominal_fit_sys['sys_total_2'] = np.zeros(len(nominal_fit_sys))
    
    for key in parameter_variation.keys():

        min_index = list(parameter_variation[key].keys())[0]    
        max_index = list(parameter_variation[key].keys())[-1]

        merged_data = pd.merge(parameter_variation[key][min_index], 
                 parameter_variation[key][max_index],
                 on = ['axis', 'axis_bin'])

        merged_data[systematic_sources[key] + '_par0'] = np.abs(merged_data.par_0_y - merged_data.par_0_x)
        merged_data[systematic_sources[key] + '_par1'] = np.abs(merged_data.par_1_y - merged_data.par_1_x)
        merged_data[systematic_sources[key] + '_par2'] = np.abs(merged_data.par_2_y - merged_data.par_2_x)

        merge_cols = ['axis', 'axis_bin', systematic_sources[key] + '_par0', 
                     systematic_sources[key] + '_par1', systematic_sources[key] + '_par2']

        nominal_fit_sys = pd.merge(nominal_fit_sys, merged_data[merge_cols], 
                                   on = ['axis', 'axis_bin'])
    
        nominal_fit_sys.sys_total_0 += nominal_fit_sys[systematic_sources[key] + '_par0']**2
        nominal_fit_sys.sys_total_1 += nominal_fit_sys[systematic_sources[key] + '_par1']**2
        nominal_fit_sys.sys_total_2 += nominal_fit_sys[systematic_sources[key] + '_par2']**2


    nominal_fit_sys.sys_total_0 = np.sqrt(nominal_fit_sys.sys_total_0)
    nominal_fit_sys.sys_total_1 = np.sqrt(nominal_fit_sys.sys_total_1)
    nominal_fit_sys.sys_total_2 = np.sqrt(nominal_fit_sys.sys_total_2)

    
    return nominal_fit_sys

In [18]:
nominal_fit_sys = add_systematics(nominal, parameter_variation, systematic_sources)

In [19]:
nominal_fit_sys.to_csv('results/fit/sys.csv', index=False)

In [20]:
nominal_fit_sys.head(24)

Unnamed: 0,axis,axis_bin,axis_min,axis_max,par_0,par_1,par_2,err_0,err_1,err_2,...,sys_10_par2,sys_9_par0,sys_9_par1,sys_9_par2,sys_2_par0,sys_2_par1,sys_2_par2,sys_11_par0,sys_11_par1,sys_11_par2
0,pt,0,0.000488,0.226849,0.024831,0.005096,-0.233622,0.007285,0.418066,0.40611,...,9.4e-05,0.002945,0.120408,0.172116,0.001162,0.037804,0.037137,0.000674,0.10535,0.03619
1,pt,1,0.226849,0.321331,0.038368,-0.158483,-0.067059,0.005945,0.351877,0.473813,...,0.046527,0.001348,0.021594,0.047187,0.000329,0.006758,0.000335,0.000768,0.02335,0.051225
2,pt,2,0.321331,0.409499,0.033198,-0.422545,-0.123146,0.004986,0.401685,0.409003,...,0.116536,0.000165,0.019278,0.012765,3.8e-05,0.030502,0.01322,0.000121,0.027719,0.049559
3,pt,3,0.409499,0.520312,0.038797,-0.081832,-0.322752,0.006659,0.332887,0.359759,...,0.005393,0.000726,0.068119,0.008733,9.8e-05,0.019066,0.008566,0.000583,0.123203,0.087695
4,pt,4,0.520312,1.207643,0.029985,-0.495983,-0.35593,0.006747,0.421319,0.34448,...,0.008321,0.000106,0.003636,0.060704,0.001579,0.038447,0.05259,0.003084,0.024772,0.046046
5,q2,0,1.000001,1.340623,0.024206,-0.614638,-0.167149,0.005353,0.455848,0.380984,...,0.035359,0.001922,0.021099,0.02722,0.002052,0.018747,0.004334,0.000354,0.040864,0.002803
6,q2,1,1.340623,1.567456,0.023968,-0.308385,-0.165597,0.005722,0.382619,0.398399,...,0.026323,0.003558,0.072904,0.079524,0.001469,0.010159,0.005512,0.001732,0.037433,0.052963
7,q2,2,1.567456,1.852686,0.035497,-0.074083,-0.193797,0.005462,0.362526,0.475869,...,0.028401,0.000269,0.020858,0.052037,0.000348,0.018745,0.012686,0.000557,0.075925,0.075859
8,q2,3,1.852686,2.294117,0.036548,-0.330398,-0.16674,0.006268,0.336671,0.392189,...,0.014377,0.000975,0.044468,0.028347,0.000203,0.032915,0.065211,0.000435,0.078824,0.036857
9,q2,4,2.294117,4.740613,0.044855,-0.096793,-0.21558,0.007086,0.365578,0.455082,...,0.091341,0.000264,0.021602,0.065439,0.000861,0.018117,0.047693,0.001305,0.005171,0.025257
