## Initialization Cells

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from scipy.optimize import curve_fit
from scipy.stats import norm
from statsmodels.base.model import GenericLikelihoodModel
import seaborn as sns
# Seaborn look: "muted" colour cycle, seaborn's single-letter colour codes
# and the "ticks" base style.
sns.set_palette("muted")
sns.set_color_codes()
sns.set_style("ticks")
# Overrides applied on top of "ticks": inward-pointing ticks and a faint grid.
sns.set_style({
    "xtick.direction": "in",
    "ytick.direction": "in",
    "axes.grid": "True",
    "grid.color": "0.95",
})

# Matplotlib defaults used by every figure created afterwards.
plt.rcParams.update({
    "figure.figsize": [6, 6],
    "figure.dpi": 100,
    "mathtext.fontset": "dejavusans",
})

In [None]:
import seaborn as sns
# Same seaborn styling as the first initialization cell; repeating it is
# harmless because every call simply re-applies the same settings.
sns.set_palette('muted')
sns.set_color_codes()
sns.set_style('ticks')
sns.set_style({
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'axes.grid': 'True',
    'grid.color': '0.95',
})

# Default figure geometry.
plt.rcParams.update({
    'figure.figsize': [6, 6],
    'figure.dpi': 100,
})
def darken_color(color, p):
    """Return ``color`` with its first three channels multiplied by ``p``.

    Parameters
    ----------
    color : indexable with at least three numeric entries (r, g, b, ...)
    p : numeric scale factor applied to each channel

    Returns
    -------
    tuple
        ``(r * p, g * p, b * p)``; any further entries of ``color`` are dropped.
    """
    r, g, b = color[0], color[1], color[2]
    return (r * p, g * p, b * p)

import matplotlib as mpl
# Candidate colours: the seaborn "muted" palette followed by a near-black entry.
colors = sns.color_palette('muted') + [(.1, .1, .1)]
# Register a darkened copy (factor 0.8) of each colour under a two-letter code
# ('bd', 'gd', ...) in matplotlib's colour converter.
# NOTE(review): zip() stops after the seven codes, so only the first seven
# entries of `colors` are used. If the installed seaborn returns more than six
# colours for 'muted', the near-black entry is never reached and the codes may
# not match the colours their letters suggest -- confirm.
for code, color in zip(['bd','gd','rd','md','yd','cd','kd'], colors):
    rgb = mpl.colors.colorConverter.to_rgb(darken_color(color,0.8))
    mpl.colors.colorConverter.colors[code] = rgb
    # Presumably mirrors the new entry into the converter's lookup cache;
    # verify against the installed matplotlib version.
    mpl.colors.colorConverter.cache[code] = rgb

# Named plot colours as (r, g, b) tuples of floats, written as 8-bit channel
# values divided by 256.
# NOTE(review): 8-bit channels are usually normalised by 255; the divisor 256
# is kept so the colours stay exactly as before.
blue = tuple(channel / 256 for channel in (114, 147, 203))
orange = tuple(channel / 256 for channel in (225, 151, 76))
green = tuple(channel / 256 for channel in (132, 186, 91))
red = tuple(channel / 256 for channel in (211, 94, 96))
grey = tuple(channel / 256 for channel in (128, 133, 133))
violet = tuple(channel / 256 for channel in (144, 103, 167))
brown = tuple(channel / 256 for channel in (171, 104, 87))
yellow = tuple(channel / 256 for channel in (204, 194, 16))

# Font sizes shared by the rc settings below.
SMALL_SIZE = 14
MEDIUM_SIZE = 16
BIGGER_SIZE = 18

# Default text size.
plt.rc('font', size=SMALL_SIZE)
# Axes title and x/y axis labels.
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)
# Tick labels on both axes (plt.rc accepts a tuple of group names).
plt.rc(('xtick', 'ytick'), labelsize=SMALL_SIZE)
# Legend entries.
plt.rc('legend', fontsize=SMALL_SIZE)
# Figure-level title.
plt.rc('figure', titlesize=BIGGER_SIZE)

## Setup for Dataset Reading

In [None]:
from collections import OrderedDict 

# Suffixes of the three columns stored for every variable
# ("<name>_gen", "<name>_fit", "<name>_err").
var_types = ("gen", "fit", "err")

# Short variable name -> LaTeX label, in file column order ("ti" datasets).
var_dict_ti = OrderedDict()
var_dict_ti["ap"] = r"$|A_\parallel|$"
var_dict_ti["apa"] = r"$\arg(A_\parallel)$"
var_dict_ti["a0"] = r"$|A_0|$"
var_dict_ti["a0a"] = r"$\arg(A_0)$"
var_dict_ti["at"] = r"$|A_\perp|$"
var_dict_ti["ata"] = r"$\arg(A_\perp)$"

# The "td" datasets start with the same six variables and add twelve more.
var_dict_td = OrderedDict(var_dict_ti)
var_dict_td.update([
    ("xp", r"$x_\parallel$"),
    ("x0", r"$x_0$"),
    ("xt", r"$x_\perp$"),
    ("yp", r"$y_\parallel$"),
    ("y0", r"$y_0$"),
    ("yt", r"$y_\perp$"),
    ("xbp", r"$\bar x_\parallel$"),
    ("xb0", r"$\bar x_0$"),
    ("xbt", r"$\bar x_\perp$"),
    ("ybp", r"$\bar y_\parallel$"),
    ("yb0", r"$\bar y_0$"),
    ("ybt", r"$\bar y_\perp$"),
])


# Flat column lists: for each variable, its gen/fit/err columns in that order.
var_names_ti = list(var_dict_ti)
vars_ti = [
    f"{var_name}_{var_type}"
    for var_name in var_names_ti
    for var_type in var_types
]

var_names_td = list(var_dict_td)
vars_td = [
    f"{var_name}_{var_type}"
    for var_name in var_names_td
    for var_type in var_types
]

In [None]:
def get_dataframes(dirs, vars):
    """Load all result files of each directory into one DataFrame per directory.

    In every directory, all files whose name ends in a digit are parsed as
    header-less text tables whose fields are separated by ``" || "``,
    ``" | "`` or a single space, and concatenated row-wise.

    Parameters
    ----------
    dirs : iterable of str
        Directories to scan.
    vars : list of str
        Column names assigned to the parsed fields (the name shadows the
        builtin ``vars``; it is kept so existing callers keep working).

    Returns
    -------
    list of pandas.DataFrame
        One concatenated frame per entry of ``dirs``, in the same order.

    Raises
    ------
    ValueError
        If a directory contains no matching file.
    """
    dfs = []
    for directory in dirs:
        # glob returns matches in arbitrary order; sorting makes the row order
        # of the concatenated frame reproducible between runs.
        all_files = sorted(glob.glob(os.path.join(directory, "*[0-9]")))
        print("Num files in '" + str(directory) + "': " + str(len(all_files)))
        if not all_files:
            # pd.concat would fail here anyway, but without naming the directory.
            raise ValueError(
                "No result files matching '*[0-9]' found in '" + str(directory) + "'"
            )
        df_from_each_file = (
            pd.read_csv(
                # Raw string: '\|' is a regex escape, not a Python string escape.
                f, sep=r" \|\| | \| | ", header=None, names=vars, engine="python"
            )
            for f in all_files
        )
        df = pd.concat(df_from_each_file, ignore_index=True)
        dfs.append(df)
    return dfs

## Read-in Datasets

In [None]:
import os
import glob

# Name of the results sub-directory to analyse.
# NOTE: `dir` shadows the Python builtin; the name is kept because later cells
# use it to build their output file names.
dir = 'randomized_all_corr'

# One results directory per channel, plus the combined ("together") fit.
dirs_ti = [
    f'../results/{dir}/{channel}_ti_data_mcbkg'
    for channel in ('Kpi', 'Kpipi0', 'K3pi', 'together')
]

dirs_td = [
    f'../results/{dir}/{channel}_td_data_mcbkg'
    for channel in ('Kpi', 'Kpipi0', 'K3pi', 'together')
]

# One DataFrame per directory, in the same order as the lists above.
dfs_ti = get_dataframes(dirs_ti, vars_ti)
dfs_td = get_dataframes(dirs_td, vars_td)

In [None]:
def _print_stat_table(title, dirs, var_names, dfs, stat):
    """Print ``title`` and one row per variable with ``stat`` of its ``<var>_fit`` column in each frame."""
    print(title)
    for var in var_names:
        column_name = var + "_fit"
        print("{:4}| ".format(var), end='')
        for i in range(len(dirs)):
            # Single-bracket selection yields a Series, so the reduction is a
            # plain scalar. Calling float() on a one-element Series (what the
            # double-bracket form produces) is deprecated in recent pandas.
            value = stat(dfs[i][column_name])
            print("{:+5.4f} | ".format(float(value)), end='')
        print()


def print_means_and_devs(dirs, var_names, dfs):
    """Print tables of the mean and standard deviation of every fitted variable.

    Each table has one row per variable and one column per dataset.

    Parameters
    ----------
    dirs : sequence
        Only its length is used: it sets how many entries of ``dfs`` are shown.
    var_names : iterable of str
        Variable names; the column ``<name>_fit`` is summarised for each.
    dfs : sequence of pandas.DataFrame
        Frames to summarise; ``dfs[i]`` belongs to ``dirs[i]``.

    Returns
    -------
    None
        The tables are written to standard output.
    """
    _print_stat_table("Means", dirs, var_names, dfs, pd.Series.mean)
    print()
    _print_stat_table("Std. deviations", dirs, var_names, dfs, pd.Series.std)

The standard deviations printed below are plain sample estimates; it is better to use the $\sigma$ from a Gaussian fit (see the following sections) instead.

In [None]:
# Print the mean / standard-deviation tables for both groups of datasets
# ("ti" and "td" -- presumably the time-independent and time-dependent fits).
print_means_and_devs(dirs_ti, var_names_ti, dfs_ti)
print_means_and_devs(dirs_td, var_names_td, dfs_td)

## Plots with Gaussians

In [None]:
def fit_and_plot_dataframe(df, var_names, var_titles, path=None):
    """Histogram every ``<var>_fit`` column and overlay a fitted Gaussian.

    The panels are arranged three per row and share their y axis. For each
    variable the column is histogrammed, a normal distribution is fitted with
    ``scipy.stats.norm.fit`` and drawn scaled to the histogram area, and a line
    ``"<var> = <mu> +- <sigma>"`` is printed. The variable ``"a0a"`` gets a
    placeholder panel instead: it is constant, so its single tall bin would
    dominate the shared y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``<name>_fit`` column for every plotted variable.
        The frame is not modified.
    var_names : sequence of str
        Variables to plot, in panel order. Must not be empty.
    var_titles : mapping
        Variable name -> x-axis label.
    path : str, optional
        Output path stem. When given, ``<path>.pdf`` and ``<path>.png`` (the
        figure) and ``<path>.results`` (the printed fit results) are written.

    Raises
    ------
    ValueError
        If ``var_names`` is empty.
    """
    if len(var_names) == 0:
        raise ValueError("var_names must contain at least one variable")

    cols = 3
    # Round up so that a last, partially filled row is still drawn instead of
    # the trailing variables being dropped.
    rows = int(np.ceil(len(var_names) / cols))

    # The size is passed to this figure only; writing it into plt.rcParams
    # would change the default size of every figure created later.
    fig, axs = plt.subplots(nrows=rows, ncols=cols, sharey=True, squeeze=False,
                            figsize=[cols * 3.5, rows * 3.5])
    fig.tight_layout(h_pad=2, w_pad=1)

    # Loop-invariant: darkened first palette colour for the bar outlines.
    edge_color = darken_color(sns.color_palette("muted")[0], 0.8)

    results = ""
    for i, ax in enumerate(axs.flat):
        if i >= len(var_names):
            # Unused panel in the last row.
            ax.set_visible(False)
            continue

        var_name = var_names[i]
        if var_name != "a0a":
            data = df[var_name + "_fit"]
            n, bins, patches = ax.hist(data, bins=10, edgecolor=edge_color)
            # Histogram area, used to scale the unit-area pdf to the counts.
            area = np.sum(np.diff(bins)*n)
            mu, sigma = norm.fit(data)

            # Many equidistant points so that the curve looks smooth.
            x = np.linspace(data.min(), data.max(), 100)
            norm_fitted = norm.pdf(x, mu, sigma)*area
            ax.plot(x, norm_fitted)
            result = f"{var_name:3} = {mu:+.5f} +- {sigma:.5f}"
            print(result)
            results += result + "\n"
        else:
            ax.text(0.5, 0.5, "Empty on purpose", horizontalalignment="center",
                    verticalalignment="center", transform=ax.transAxes)

        ax.set_xlabel(var_titles[var_name])
        ax.set_title("")
        ax.ticklabel_format(useOffset=False)
        ax.locator_params(tight=True, nbins=3)

    if path:
        fig.savefig(path + ".pdf", bbox_inches = 'tight')
        fig.savefig(path + ".png", bbox_inches = 'tight')
        with open(path + ".results", "w") as f:
            f.write(results)

In [None]:
# Panel order for the plots: the A_0 pair first, then the parallel and
# perpendicular pairs; the "td" list appends the remaining td-only variables.
reshuffled_vars_ti = ["a0", "a0a", "ap", "apa", "at", "ata"]
reshuffled_vars_td = [*reshuffled_vars_ti, *var_names_td[6:]]

# Only the last frame of each list (the "together" directory) is plotted.
fit_and_plot_dataframe(dfs_ti[-1], reshuffled_vars_ti, var_dict_ti, f"{dir}_ti")
fit_and_plot_dataframe(dfs_td[-1], reshuffled_vars_td, var_dict_td, f"{dir}_td")

## Gaussian Fits Detailed

This fits the distributions with Gaussians and shows the uncertainties on the $\mu$ and $\sigma$ of each distribution. 

In [None]:
class Gaussian(GenericLikelihoodModel):
    """Maximum-likelihood model of a normal distribution.

    The two fitted parameters are, in order, the location (mean) and the scale
    (standard deviation) of the distribution of ``endog``.
    """

    def __init__(self, endog, exog=None, **kwds):
        """Store the sample ``endog``; ``exog`` is passed through unchanged."""
        super().__init__(endog, exog, **kwds)

    def nloglikeobs(self, params):
        """Return the negative log-likelihood of every observation.

        ``params[0]`` is the location and ``params[1]`` the scale.
        """
        loc = params[0]
        scale = params[1]

        # logpdf is evaluated directly in log space. Taking np.log of pdf()
        # gives -inf once the density underflows to 0 far out in the tails,
        # which makes the summed likelihood infinite.
        return -norm.logpdf(self.endog, loc=loc, scale=scale)

    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        """Fit the model, starting from the sample mean and standard deviation.

        ``start_params`` overrides the default starting point; all other
        arguments are forwarded to ``GenericLikelihoodModel.fit``.
        """
        if start_params is None:
            loc_start = self.endog.mean()
            scale_start = self.endog.std()

            start_params = np.array([loc_start, scale_start])

        return super().fit(start_params=start_params,
                           maxiter=maxiter, maxfun=maxfun, **kwds)

In [None]:
# Fitted-value columns of the "ti" variables, without 'a0a_fit'.
real_vars = [f"{var}_fit" for var in var_names_ti]
real_vars.remove('a0a_fit')

# For every "ti" directory, fit a Gaussian to each column and print
# "(mu +- its uncertainty) +- (sigma +- its uncertainty)".
for i, dir_ti in enumerate(dirs_ti):
    print("Results for dir " + dir_ti)
    for var in real_vars:
        model = Gaussian(dfs_ti[i][[var]])
        results = model.fit(disp=False)
        print(
            f"{var:7}: ({results.params[0]:+.4f} +- {results.bse[0]:.4f})"
            f" +- ({results.params[1]:+.4f} +- {results.bse[1]:.4f})"
        )
    print()

In [None]:
# The old way of doing this follows.
# rows = 0
# if time_dependent:
#     rows = 6
# else:
#     rows = 2

# plt.rcParams["figure.figsize"] = [9, rows * 3]
# for i, dir in enumerate(dirs):
#     column_names = [var + '_fit' for var in var_names]
#     # Without the following line the y-axis range on all plots is (0,100)
#     dfs[i]['a0a_fit'] = dfs[i]['apa_fit']
#     axs = dfs[i].hist(column=column_names, sharey=True, layout=(rows, 3), bins=10)
    
#     print("Plots for dir " + os.path.basename(dir))
#     for ax in axs.flat:
#         ax.set_xlabel(var_dict[ax.title.get_text().split('_')[0]])
#         if ax.title.get_text() == "a0a_fit":
#             ax.clear()
#             ax.set_xlabel(var_dict["a0a"])
#             ax.text(0.5, 0.5, "Empty on purpose", horizontalalignment="center", 
#                     verticalalignment="center", transform=ax.transAxes)
#         ax.set_title("")
        
# #     plt.savefig(os.path.basename(dir) + "_pull_dist.pdf", bbox_inches = 'tight')
#     plt.show()

## Correlations

Plot correlations between results of the systematics (Jim requested this)

In [None]:
def plot_correlation(df, path=None):
    """Draw the correlation matrix of the fitted values as an annotated heat map.

    Columns ending in ``_gen`` or ``_err`` are ignored, as is ``a0a_fit`` when
    present (presumably because it is constant and has no defined
    correlation). Every cell is annotated with its correlation coefficient and
    the axes are labelled with the column names minus their last four
    characters (the ``_fit`` suffix).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``<name>_gen``, ``<name>_fit`` and ``<name>_err`` columns.
        The frame is not modified.
    path : str, optional
        Output path stem; when given, ``<path>.pdf`` and ``<path>.png`` are
        written.
    """
    # Anchored pattern: only the suffix decides, so a variable whose name merely
    # contains "gen" or "err" is not dropped by accident.
    fit_df = df.drop(columns=list(df.filter(regex=r'_(gen|err)$')))
    # Tolerate frames that do not carry the a0a column at all.
    fit_df = fit_df.drop(columns='a0a_fit', errors='ignore')

    # Computed once and reused for both the image and the cell annotations.
    corr = fit_df.corr()

    n_vars = len(fit_df.columns)
    fig = plt.figure(figsize=(5 * n_vars / 5, 4 * n_vars / 5))
    plt.matshow(corr, fignum=fig.number, cmap='coolwarm', vmin=-1, vmax=+1)

    # The matshow axes; fetched before the colorbar adds a second axes.
    ax = fig.axes[0]
    for (i, j), z in np.ndenumerate(corr.to_numpy()):
        ax.text(j, i, '{:0.2f}'.format(z), ha='center', va='center', fontsize=8)

    labels = [col[:-4] for col in fit_df.columns]
    plt.xticks(range(n_vars), labels, fontsize=10, rotation=90)
    plt.yticks(range(n_vars), labels, fontsize=10)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=10)
    plt.grid(False)

    if path:
        # Save this figure explicitly rather than whichever one is current.
        fig.savefig(path + ".pdf", bbox_inches = 'tight')
        fig.savefig(path + ".png", bbox_inches = 'tight')

In [None]:
# Correlation maps for the last frame of each list (the "together" directory).
plot_correlation(dfs_ti[-1], f"{dir}_correlations_ti")
plot_correlation(dfs_td[-1], f"{dir}_correlations_td")