# Evaluating the KLD from posterior samples of cosmological parameters

_Alex I. Malz (GCCL@RUB)_

In [None]:
import gzip
import itertools
import numpy as np
import os
import pandas as pd
import pickle as pkl
import re
from scipy import stats as sps
import sys

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Color map
# Reversed 'plasma' colormap, bound to both `rainbow` and `cm`.
rainbow = cm = plt.get_cmap('plasma_r')
# Log-scaled normalization over [1, 50]; the linear alternative is kept in the trailing comment.
cNorm  = colors.LogNorm(vmin=1, vmax=50) #colors.Normalize(vmin=0, vmax=50)
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=rainbow)
# RGBA colors for the integer levels 1..49.
# cNorm, scalarMap and color_map are reassigned further down, once the set of
# contamination percentages has been read from the metrics table.
color_map = scalarMap.to_rgba(np.arange(1, 50))

In [None]:
import proclam
from proclam.metrics.util import *
from proclam.metrics.util import RateMatrix

## simple demo

We will begin with samples of $(w, \Omega_{m})$ pairs, where one set of samples is defined as the reference sample corresponding to a best-case scenario of a 100% pure SN Ia data set.

In [None]:
# # replace with reading in the data
# def measure(n, w_bar, w_sig, Omm_bar,Omm_sig):
#     "Measurement model, return two coupled measurements."
#     w = np.random.normal(loc=w_bar, scale=w_sig, size=n)
#     Omm = np.random.normal(loc=Omm_bar, scale=Omm_sig, size=n)
#     return w, Omm

def measure(path, cols):
    """
    Reads selected posterior-sample columns from a pickled table.

    Parameters
    ----------
    path: str
        location of a pickle file whose content supports lookup by column name
    cols: list
        keys to extract, in the order in which they should be returned

    Returns
    -------
    samples: list
        one entry per element of `cols`, each being `alldims[col]`

    Notes
    -----
    Unpickling can execute arbitrary code, so only point this at trusted files.
    """
    # a context manager closes the file handle even if unpickling fails
    with open(path, 'rb') as chainfile:
        alldims = pkl.load(chainfile)
    return [alldims[col] for col in cols]

TODO: also show for chain subsamples in loop for evolution (see `/media/RESSPECT/data/PLAsTiCC/SALT2mu_posteriors/static/DDF/train_10/batch_10/UncSampling/chains/chains_loop_99.pkl` and more numbers)

TODO: also flat prior

In [None]:
# '/media/RESSPECT/data/PLAsTiCC/SALT2mu_posteriors/perfect_classifier/chains_plasticc_perfect.pkl'
# '/media/RESSPECT/data/PLAsTiCC/SALT2mu_posteriors/static/DDF/train_10/batch_10/UncSampling/chains/chains_loop_99.pkl'
# '/media/emille/git/COIN/RESSPECT_work/PLAsTiCC/metrics_paper/resspect_metric/posteriors/'
# '/media/RESSPECT/data/PLAsTiCC/for_metrics/posteriors/'
# '/media/emille/data/PLAsTiCC/posteriors/'
# '/media2/RESSPECT2/data/posteriors'
# Root of the posterior chains. It has no trailing slash because the cell that
# builds `postpaths` appends '_<field>/omprior_0.0<prior>/' directly to it.
postpath = '/media2/RESSPECT2/data/posteriors'
# Survey fields to process ('wfd' is currently disabled).
fields = ['ddf']#, 'wfd']
# Prior labels; each becomes the last digit of an 'omprior_0.0<prior>' directory name.
priors = [1, 5]#omprior_0.01_flat

In [None]:
def listdirs(rootdir):
    """
    Finds the chain file of every test case stored under a directory.

    Each immediate subdirectory `<name>` of `rootdir` is a candidate test case;
    it is kept only if it contains `chains_<name>_lowz_withbias.pkl`.

    Parameters
    ----------
    rootdir: str
        directory to scan, with or without a trailing path separator

    Returns
    -------
    outfns: dict
        maps subdirectory name to the path of its chain file
    """
    outfns = {}
    # the context manager releases the directory handle once the scan is done
    with os.scandir(rootdir) as entries:
        for it in entries:
            if it.is_dir():
                # it.name is the bare directory name; slicing it.path by
                # len(rootdir) left a leading separator in the key (and in the
                # expected file name) whenever rootdir had no trailing slash
                fn = it.name
                lookfor = os.path.join(it.path, 'chains_'+fn+'_lowz_withbias.pkl')
                if os.path.exists(lookfor):
                    outfns[fn] = lookfor
    return outfns

In [None]:
# Nested lookups keyed as [field][prior]:
#   postpaths -> directory holding that field/prior's test cases
#   testcases -> {case name: chain file path}, as returned by listdirs
postpaths, testcases = {}, {}
for field in fields:
    postpaths[field], testcases[field] = {}, {}
    for prior in priors:
        # e.g. '<postpath>_ddf/omprior_0.01/'; the trailing slash is required by listdirs' path slicing
        postpaths[field][prior] = postpath+'_'+field+'/omprior_0.0'+str(prior)+'/'
        testcases[field][prior] = listdirs(postpaths[field][prior])

kinda slow

In [None]:
# refpath = testcases['perfect3000']
# [w_ref, Omm_ref] = measure(refpath, ['w', 'om'])

In [None]:
# w_ref, Omm_ref = measure(1000, -1., 0.1, 0.5, 0.1)
# w_comp, Omm_comp = measure(1000, -1.1, 0.2, 0.25, 0.05)

# comppath = testcasespostpath+'fiducial3000/chains_fiducial3000.pkl'
# [w_comp, Omm_comp] = measure(comppath, ['w', 'om'])

In [None]:
# plt.scatter(w_ref, Omm_ref, s=1, alpha=0.2, label='best possible')
# plt.scatter(w_comp, Omm_comp, s=1, alpha=0.2, label='approximation')
# plt.legend(loc='lower left')

[`chippr`](https://github.com/aimalz/chippr/) contains code for calculating the KLD of PDFs evaluated on a grid, so we start by fitting a 2D KDE to the samples.
The PDFs must be $\geq0$ over the entire range of the grid, so we make a grid based on the reference sample's range.

In [None]:
# ngrid_x = 100
# ngrid_y = 100
# xmin = w_ref.min()#-1.2
# xmax = w_ref.max()#-0.8
# ymin = Omm_ref.min()#0.2
# ymax = Omm_ref.max()#0.4

# w_grid, Omm_grid = np.mgrid[xmin:xmax:100*1.j, ymin:ymax:100*1.j]
# w_vec, Omm_vec = w_grid[:, 0], Omm_grid[0, :]
# dw = (xmax - xmin) / (ngrid_x - 1)
# dOmm = (ymax - ymin) / (ngrid_y - 1)
# # use meshgrid instead of mgrid

def make_grid(x, y, x_ngrid=100, y_ngrid=100):
    """
    Builds a regular 2D grid spanning the range of two sets of samples.

    Parameters
    ----------
    x, y: array-like with `.min()` and `.max()`
        samples whose extrema define the grid limits in each dimension
    x_ngrid, y_ngrid: int, optional
        number of grid points along each dimension, endpoints included

    Returns
    -------
    extrema: tuple
        `((x_min, y_min), (x_max, y_max))`
    grids: tuple
        `(x_grid, y_grid)`, each of shape `(x_ngrid, y_ngrid)`
    vecs: tuple
        `(x_vec, y_vec)`, the 1D grid coordinates along each dimension
    steps: tuple
        `(dx, dy)`, the spacing between neighbouring grid points
    """
    lower = (x.min(), y.min())
    upper = (x.max(), y.max())

    # an imaginary "step" makes np.mgrid interpret it as a point count with both endpoints included
    x_grid, y_grid = np.mgrid[lower[0]:upper[0]:x_ngrid*1.j, lower[1]:upper[1]:y_ngrid*1.j]
    vecs = (x_grid[:, 0], y_grid[0, :])
    steps = ((upper[0] - lower[0]) / (x_ngrid - 1),
             (upper[1] - lower[1]) / (y_ngrid - 1))

    return (lower, upper), (x_grid, y_grid), vecs, steps

In [None]:
# ref_extrema, ref_grids, ref_vecs, ref_ds = make_grid(w_ref, Omm_ref)
# (w_vec, Omm_vec) = ref_vecs
# (dw, dOmm) = ref_ds
# ((xmin, ymin), (xmax, ymax)) = ref_extrema
# (w_grid, Omm_grid) = ref_grids

In [None]:
# loop this up later, just one for now
# Only one (field, prior) combination is inspected in this demo.
which_field_prior = ('ddf', 1)
onetest = testcases[which_field_prior[0]][which_field_prior[1]]

# perfcases, randcases, fidcases = {}, {}, {}
# perfposts, randposts, fidposts = {}, {}, {}
# perfgrids = {}
# NOTE(review): datatemplate is not referenced again in the visible notebook.
datatemplate = {'paths': {}, 'posteriors': {}, 'grid_info': {}}
# One container per classifier scenario; only the perfect one also stores grids,
# because the grid is built from the perfect-classifier samples.
allperf, allrand, allfid = {'paths': {}, 'posteriors': {}, 'grid_info': {}}, {'paths': {}, 'posteriors': {}}, {'paths': {}, 'posteriors': {}}
for i in onetest.keys():
    # last four characters of the case name; presumably the sample size
    # (e.g. '3000' in 'perfect3000') — verify against the directory names
    newkey = i[-4:]
    if 'perfect' in i:
        allperf['paths'][newkey] = onetest[i]
        allperf['posteriors'][newkey] = measure(allperf['paths'][newkey], ['w', 'om'])
        allperf['grid_info'][newkey] = make_grid(allperf['posteriors'][newkey][0], allperf['posteriors'][newkey][1])
        # the 'color' entry is only created once at least one matching case has been seen
        allperf['color'] = plt.cm.Blues
    elif 'fiducial' in i:
        allfid['paths'][newkey] = onetest[i]
        allfid['posteriors'][newkey] = measure(allfid['paths'][newkey], ['w', 'om'])
        allfid['color'] = plt.cm.Greens
    elif 'random' in i:
        allrand['paths'][newkey] = onetest[i]
        allrand['posteriors'][newkey] = measure(allrand['paths'][newkey], ['w', 'om'])
        allrand['color'] = plt.cm.Reds

In [None]:
# plt.hist2d(w_ref, Omm_ref, bins=[w_vec, Omm_vec], density=True, cmap=plt.cm.Blues, alpha=0.5)
# plt.hist2d(w_comp, Omm_comp, bins=[w_vec, Omm_vec], density=True, cmap=plt.cm.Reds, alpha=0.5)

NOTE: keep propagating for cosmological contours as a function of number of lightcurves, classifier, field

In [None]:
# One panel per perfect-classifier sample size, showing the binned posterior
# density on the grid built from those same samples.
n_panels = max(len(allperf['posteriors']), 1)
fig, axs = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
for i, n_sn in enumerate(allperf['posteriors'].keys()):
    ((x_min, y_min), (x_max, y_max)), (x_grid, y_grid), (x_vec, y_vec), (dx, dy) = allperf['grid_info'][n_sn]

    # The fiducial and random posteriors are not overlaid: their ranges barely
    # overlap the perfect-sample grid, so binning them on it (with density=True)
    # did not work.
    hist_perf, xgrid, ygrid = np.histogram2d(allperf['posteriors'][n_sn][0], allperf['posteriors'][n_sn][1],
                                             bins=[x_vec, y_vec], density=True)
    # histogram2d puts x along the first axis, while imshow draws the first axis
    # vertically; transpose so that w runs horizontally, as `extent` states
    axs[0][i].imshow(hist_perf.T, origin='lower', extent=[x_min, x_max, y_min, y_max],
                     cmap=allperf['color'], alpha=0.3, aspect='auto')
    axs[0][i].set_xlabel('w')
    axs[0][i].set_ylabel('om')
    axs[0][i].set_title('perfect, n='+str(n_sn))
    


In [None]:
eps = 2. * sys.float_info.min

def safe_log(arr, threshold=eps):
    """
    Takes the natural logarithm of an array that might contain zeros.

    Parameters
    ----------
    arr: ndarray, float
        array of values to be logged; it is left unmodified
    threshold: float, optional
        small, positive value to replace zeros and negative numbers

    Returns
    -------
    logged: ndarray
        logged values, with small value replacing un-loggable values
    """
    # np.maximum allocates a new float array, so the caller's data are never
    # clipped in place, and integer input cannot truncate the threshold to 0
    floored = np.maximum(np.asarray(arr, dtype=float), threshold)
    logged = np.log(floored)
    return logged

def make_kde(Xgrid, Ygrid, Xsamps, Ysamps, to_log=False, save=None):
    """
    Evaluates a Gaussian KDE of 2D samples on a pre-built grid.

    Parameters
    ----------
    Xgrid, Ygrid: ndarray, float
        coordinate grids of identical shape on which to evaluate the KDE
    Xsamps, Ysamps: array-like, float
        paired samples to which the KDE is fit
    to_log: boolean, optional
        if True, return `safe_log` of the evaluated density
    save: optional
        currently unused

    Returns
    -------
    Z: ndarray, float
        KDE (or its log) evaluated on the grid, with the shape of `Xgrid`
    """
    samples = np.vstack([Xsamps, Ysamps])
    grid_points = np.vstack([Xgrid.ravel(), Ygrid.ravel()])
    # bandwidth chosen by Scott's rule
    density = sps.gaussian_kde(samples, bw_method='scott')
    evaluated = density(grid_points).T.reshape(Xgrid.shape)
    return safe_log(evaluated) if to_log else evaluated
    
#     if save is not None:
        
# TODO: normalize up here before log!

In [None]:
# NOTE(review): w_grid, Omm_grid, w_ref and Omm_ref are assigned above this cell
# only in commented-out code (they are set later, inside the per-field loops), so
# this demo cell looks like it raises NameError on a fresh top-to-bottom run — confirm.
kde_ref = make_kde(w_grid, Omm_grid, w_ref, Omm_ref)
# plt.imshow(kde_ref, extent=[xmin, xmax, ymin, ymax], origin='lower', cmap=plt.cm.Blues)

In [None]:
# replace with reading in other sets of posteriors
# NOTE(review): w_comp and Omm_comp are assigned above this cell only in
# commented-out code, so this also depends on state from a later cell — confirm.
kde_comp = make_kde(w_grid, Omm_grid, w_comp, Omm_comp)
# plt.imshow(kde_comp, extent=[xmin, xmax, ymin, ymax], origin='lower', cmap=plt.cm.Reds)

In [None]:
# Overlay the reference (blue) and comparison (red) KDEs on the reference grid's extent.
# NOTE(review): xmin/xmax/ymin/ymax are assigned above only in commented-out code — confirm.
plt.imshow(kde_ref, origin='lower', extent=[xmin, xmax, ymin, ymax], cmap=plt.cm.Blues, alpha=0.5)
plt.imshow(kde_comp, origin='lower', extent=[xmin, xmax, ymin, ymax], cmap=plt.cm.Reds, alpha=0.5)

Now that we have the 2D PDFs, let's define the KLD.

In [None]:
# stolen from chippr
def calculate_kld(lpe, lqe, dx, from_log=False, vb=True):
    """
    Calculates the Kullback-Leibler Divergence between two N-dimensional PDFs
    evaluated on a shared, regular grid (sorry, too lazy to deal with irregular grid)

    Parameters
    ----------
    lpe: numpy.ndarray, float
        probability distribution (or log-probability distribution, if
        `from_log`) evaluated on a grid whose distance from `q` will be
        calculated.
    lqe: numpy.ndarray, float
        probability distribution (or log-probability distribution, if
        `from_log`) evaluated on a grid whose distance to `p` will be
        calculated.
    dx: numpy.ndarray, float
        separation of grid values in each dimension
    from_log: boolean, optional
        if False, lpe, lqe are probability distributions, not log-probability distributions
    vb: boolean, optional
        report on progress to stdout? (currently unused)

    Returns
    -------
    Dpq: float
        the value of the Kullback-Leibler Divergence from `q` to `p`

    Notes
    -----
    The PDFs are normalized as densities (they integrate to 1 over the grid), so
    the Riemann sum for the divergence must carry the grid-cell volume. The
    volume used to be left out, which made the result equal to the KLD divided
    by `prod(dx)` and therefore dependent on the grid; values saved by the older
    version are larger by that factor.
    """
    # volume of one grid cell, i.e. the measure of the Riemann sums below
    cell_volume = np.prod(dx)
    if from_log:
        pe = np.exp(lpe)
        qe = np.exp(lqe)
    else:
        pe = np.asarray(lpe, dtype=float)
        qe = np.asarray(lqe, dtype=float)
    # Normalize the evaluations, so that the integrals can be done
    # (very approximately!) by simple summation:
    pn = pe / (np.sum(pe) * cell_volume)
    qn = qe / (np.sum(qe) * cell_volume)
    # Compute the log of the normalized PDFs; copies are passed in case
    # safe_log clips its argument in place
    logp = safe_log(pn.copy())
    logq = safe_log(qn.copy())
    # Calculate the KLD from q to p as the integral of p * log(p / q)
    Dpq = np.sum(pn * (logp - logq)) * cell_volume
    return Dpq

Now we can evaluate it for our reference sample and a comparison sample.

In [None]:
# KLD of the comparison KDE relative to the reference KDE on the reference grid.
# NOTE(review): dw and dOmm are assigned above this cell only in commented-out code — confirm.
calculate_kld(kde_ref, kde_comp, np.array([dw, dOmm]))

could also do this as a function of chain iteration

## with "real" contaminated samples

In [None]:
# Root directory for the metric tables read and written below.
savedpath = '/media/RESSPECT/data/PLAsTiCC/for_metrics/'
# metpaths[field][prior] -> '<savedpath><field>/metrics/<prior>/'
metpaths = {field: {prior: savedpath+field+'/metrics/'+str(prior)+'/' for prior in priors} for field in fields}
# metpaths = {field: savedpath+'metrics/' for field in ['ddf']}

In [None]:
# Integer class code -> class name for the classes considered in this analysis.
maybe_sn_classes = {90: 'SNIa', 
                    67: 'SNIa-91bg', 
                    52: 'SNIax', 
                    42: 'SNII', 
                    62: 'SNIbc', 
                    95: 'SLSN-I', 
                    88: 'AGN'}
maybe_sn_classes[15] = 'TDE'
maybe_sn_classes[64] = 'KN'

# Matplotlib marker per contaminant, with a separate marker set for each field.
# 'CART' has a marker here but no entry in maybe_sn_classes above.
shapes = {'ddf': {'SNIa-91bg': 'o',
 'SNIax': 's',
 'SNII': 'd',
 'SNIbc': 'X',
 'SLSN-I': 'v',
 'AGN': '^',
 'TDE': '<',
 'KN': '>',
 'CART': 'v'},
          'wfd': {'SNIa-91bg': '.',
 'SNIax': '+',
 'SNII': 'P',
 'SNIbc': 'x',
 'SLSN-I': '1',
 'AGN': '2',
 'TDE': '3',
 'KN': '4',
 'CART': 'v'}}

# Code of the class treated as the positive (target) class.
sel_class = 90

# ia_percents = np.array([50, 68, 75, 90, 95, 98, 99])
# mix_percents = 100 - ia_percents
# Every class except the target is a potential contaminant.
contaminants = maybe_sn_classes.copy()
contaminants.pop(sel_class)

### also some deterministic metrics for comparison

wouldn't need this if I'd been smart enough to put it in `proclam` already. . .

In [None]:
class det_mets(RateMatrix):
    """binary classification metrics

    Adds derived binary-classification metrics as attributes on top of the
    rates (TPR, FPR, FNR, TNR) and counts (TP, FP, FN, TN) of a `RateMatrix`.

    NOTE(review): `__init__` never stores `rates` itself, so this relies on
    `RateMatrix` (presumably a namedtuple) having already populated those
    fields when the instance was created — confirm against `proclam`.
    """
    def __init__(self, **rates):
        """
        Call like `thing = det_mets(**rates._asdict())`
        """
        # order matters: totals feed the derived metrics, which feed the aliases
        self._get_tots()
        self._from_rates()
        self._sn_mets()
        self._translate()
    def _get_tots(self):
        "marginal totals of the confusion matrix"
        self.CP = self.TP + self.FN  # condition positive
        self.CN = self.TN + self.FP  # condition negative
        self.T = self.TP + self.TN   # correctly classified
        self.F = self.FP + self.FN   # incorrectly classified
        self.P = self.TP + self.FP   # predicted positive
        self.N = self.TN + self.FN   # predicted negative
    def _from_rates(self):
        "metrics computed directly from the rates and counts"
        self.PPV = self.TP / (self.TP + self.FP)
        self.NPV = self.TN / (self.TN + self.FN)
        self.PT = (np.sqrt(self.TPR * (1. - self.TNR)) + self.TNR - 1.) / (self.TPR + self.TNR - 1.)
        self.TS = self.TP / (self.TP + self.FN + self.FP)
        self._derived()
    def _derived(self):
        "metrics that combine the totals and the quantities from `_from_rates`"
        self.ACC = (self.TP + self.TN) / (self.CP + self.CN)
        # a stray trailing comma used to turn this into a one-element tuple
        self.BA = (self.TPR + self.TNR) / 2
        self.F1S = 2. * self.PPV * self.TPR / (self.PPV + self.TPR)
        self.MCC = (self.TP * self.TN - self.FP * self.FN) / (np.sqrt(self.P * self.CP * self.CN * self.N))
        self.FM = np.sqrt(self.PPV * self.TPR)
        self.BM = self.TPR + self.TNR - 1.
        self.MK = self.PPV + self.NPV - 1.
    def _translate(self):
        "human-readable aliases for the abbreviated metric names"
        self.positive = self.CP
        self.negative = self.CN
        self.sensitivity = self.TPR
        self.recall = self.TPR
        self.specificity = self.TNR
        self.selectivity = self.TNR
        self.precision = self.PPV
        self.FDR = 1. - self.PPV
        self.FOR = 1. - self.NPV
        self.CSI = self.TS
        self.accuracy = self.ACC
        self.f1_score = self.F1S
        self.informedness = self.BM
        self.deltaP = self.MK
    def _sn_mets(self):
        "metrics conventionally quoted for supernova samples"
        self.get_efficiency()
        self.get_purity()
    def get_efficiency(self):
        "fraction of condition-positive objects that were classified positive"
        self.efficiency = self.TP / self.CP
        return self.efficiency
    def get_purity(self):
        "fraction of predicted-positive objects that are true positives"
        self.purity = self.TP / self.P
        return self.purity
    def get_fom(self, penalty):
        """
        Figure of merit: efficiency times a purity in which false positives
        are weighted by `penalty`.

        Parameters
        ----------
        penalty: float
            multiplicative weight applied to the false positive count

        Returns
        -------
        fom: float
            `pseudo_purity * efficiency`
        """
        self.pseudo_purity = self.TP / (self.TP + penalty * self.FP)
        return self.pseudo_purity * self.efficiency

Just deterministic metrics first

In [None]:
def remap_rate(ratemat):
    """
    Swaps the roles of the positive and negative classes in a rate matrix.

    Parameters
    ----------
    ratemat: object with TPR, FPR, FNR, TNR, TP, FP, FN, TN attributes
        rates and counts for the original labelling

    Returns
    -------
    flipped: proclam.util.RateMatrix
        rates and counts with every positive/negative quantity exchanged
    """
    # new field name -> attribute of `ratemat` that supplies its value
    source_of = {'TPR': 'TNR', 'FPR': 'FNR', 'FNR': 'FPR', 'TNR': 'TPR',
                 'TP': 'TN', 'FP': 'FN', 'FN': 'FP', 'TN': 'TP'}
    flipped_fields = {}
    for target, source in source_of.items():
        flipped_fields[target] = getattr(ratemat, source)
    return proclam.util.RateMatrix(**flipped_fields)

In [None]:
# Build the table of deterministic metrics: one row per test case listed in
# directory.csv, filled in wherever that case's pickled rate matrix exists.
allmets = pd.read_csv(savedpath+'directory.csv')
allmets = allmets.drop_duplicates(ignore_index=True)
for met in ['purity', 'efficiency', 'f1', 'fom1', 'fom3']:
    allmets[met] = None
for ind in allmets.index:
    compfn = allmets.loc[ind, 'inloc']+'.pkl'
    if os.path.exists(compfn):
        with open(compfn, 'rb') as metfile:
            rates = proclam.util.RateMatrix(**pkl.load(metfile))
        # swap the positive/negative roles before deriving the metrics
        rates = remap_rate(rates)
        mets = det_mets(**rates._asdict())
        # assign with a single .loc call; chained `allmets[col].loc[ind] = ...`
        # may write to a temporary copy and leave the table unchanged
        allmets.loc[ind, 'purity'] = mets.purity
        allmets.loc[ind, 'efficiency'] = mets.efficiency
        allmets.loc[ind, 'f1'] = mets.f1_score
        allmets.loc[ind, 'fom1'] = mets.get_fom(1.)
        allmets.loc[ind, 'fom3'] = mets.get_fom(3.)
allmets.to_csv(savedpath+'FOM.csv', index=False)

In [None]:
# sizes = {'ddf': 50, 'wfd': 150}
# all_shapes = {}
# # for i, (k, v) in enumerate(maybe_sn_classes.items()):
# #     shapes[v] = (np.mod(i, 3)+3, int(i / 3), np.mod(i, 4)*45)
# shape_pairs = [('.', 'o'), ('1', 'v'), ('2', '^'), ('3', '<'), ('4', '>'), ('+', 'P'), ('x', 'X'), ('*', 'p')]
# for i, field in enumerate(['ddf', 'wfd']):
#     shapes = {}
#     for j, contaminant in enumerate(allmets['contaminant'].unique()):
#         shapes[contaminant] = shape_pairs[j][i]
#     all_shapes[field] = shapes

# Unique contamination percentages (np.unique returns them sorted) plus bookkeeping
# arrays; note that the name `ind` is also used as a loop variable in other cells.
(uq, ind, inv, cts) = np.unique(allmets['percent'], return_index=True, return_inverse=True, return_counts=True)
# Map each percentage to its rank, which the plotting cells use as an index into color_map.
linear_colors = {uq[i]: i for i in range(len(uq))}
cNorm  = colors.LogNorm(vmin=1, vmax=len(uq)) #colors.Normalize(vmin=0, vmax=50)
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=rainbow)
# NOTE(review): np.arange starts at 0 while the LogNorm above has vmin=1; a log
# normalization cannot map 0 to a regular color, so the lowest percentage may be
# drawn with the colormap's "bad" color — confirm, and consider arange(1, len(uq)+1)
# together with matching colorbar ticks.
color_map = scalarMap.to_rgba(np.arange(0, len(uq)))

# fave_cmap = 'viridis_r'#'plasma_r'#'cool'

In [None]:
# Corner-style grid of scatter plots comparing every pair of deterministic
# metrics, with marker = contaminant (per field) and color = contamination percentage.
allmets = pd.read_csv(savedpath+'FOM.csv')
allmets = allmets[allmets['percent'] > 0]

alldets = ['f1', 'fom1', 'fom3', 'purity', 'efficiency', 'percent']
dim = len(alldets)

pairs = list(itertools.combinations(range(dim), 2))
fig, axs = plt.subplots(dim-1, dim-1, figsize=(5*(dim-1), 5*(dim-1)))
# the otherwise unused lower-left panel hosts the colorbar and the legend
cbar = fig.colorbar(scalarMap, cax=axs[-1][0], ticks=range(len(uq)))
cbar.set_ticklabels(uq)

for c in allmets['contaminant'].unique():
    for f in fields:
        # off-canvas point that only exists to create a legend entry
        axs[-1][0].scatter(-1, -1, marker=shapes[f][c], color='k', label=f+':'+c)
        one_c = allmets[(allmets['contaminant'] == c) & (allmets['field'] == f)]
        plotcolors = [linear_colors[pct] for pct in one_c['percent'].values]
        # plot inside the field loop so that every field is drawn, not just the last one
        for pair in pairs:
            axs[pair[0]][pair[1]-1].scatter(one_c[alldets[pair[0]]], one_c[alldets[pair[1]]],
                                            alpha=0.5, s=100,
                                            marker=shapes[f][c],
                                            color=color_map[plotcolors])

for pair in pairs:
    axs[pair[0]][pair[1]-1].set_xlabel(alldets[pair[0]])
    axs[pair[0]][pair[1]-1].set_ylabel(alldets[pair[1]])
#     axs[pair[0]][pair[1]-1].set_xlim(0., 1.)
#     axs[pair[0]][pair[1]-1].set_ylim(0., 1.)
axs[-1][0].legend()
fig.savefig('draft_dets.png', bbox_inches='tight', pad_inches=0)

evaluate on the grid for the perfect samples as reference

The KDEs are the slow step here. . . don't run this more than once

TODO: save the KDEs, just in case?

In [None]:
# Reference quantities keyed as [field][prior], all derived from the
# 'perfect3000' case of that field/prior:
#   d_ref    -> grid spacing in each dimension
#   grid_ref -> the 2D coordinate grids
#   kde_ref  -> KDE of the reference samples evaluated on those grids
d_ref, grid_ref, kde_ref = {}, {}, {}
for field in fields:
    d_ref[field], grid_ref[field], kde_ref[field] = {}, {}, {}
    for prior in priors:
        perfpath = testcases[field][prior]['perfect3000']
        [w_ref, Omm_ref] = measure(perfpath, ['w', 'om'])
        # the grid spans exactly the range of the reference samples
        ref_extrema, ref_grids, ref_vecs, ref_ds = make_grid(w_ref, Omm_ref)
        (w_vec, Omm_vec) = ref_vecs
        (dw, dOmm) = ref_ds
        ((xmin, ymin), (xmax, ymax)) = ref_extrema
        (w_grid, Omm_grid) = ref_grids
        d_ref[field][prior] = {'w': dw, 'Omm': dOmm}
        grid_ref[field][prior] = {'w': w_grid, 'Omm': Omm_grid}
        kde_ref[field][prior] = make_kde(w_grid, Omm_grid, w_ref, Omm_ref)
#     with open(metpaths[field]+'')

TODO: make plot for fiducial, random, perfect as a function of sample size for WFD and DDF

In [None]:
# fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# for i, field in enumerate(['ddf', 'wfd']):
#     for conditions in ['perfect', 'random', 'fiducial']:
#         postpath = '/media2/RESSPECT2/data/posteriors/'

# refpath = postpath+'perfect/chains_perfect.pkl'
# comppath = postpath+'fiducial/chains_fiducial.pkl'
#         axs[i].imshow

NEXT: save cosmology metrics corresponding to `testcases`

TODO: propagate to WFD!

In [None]:
# Replicate the deterministic-metrics table once per prior and attach, to every
# row, the path of the matching posterior chain file (None when there is none).
eachprior = []
for prior in priors:
    allmets = pd.read_csv(savedpath+'FOM.csv')
    allmets['prior'] = prior
    # look each row up under its OWN field; matching on the name alone would let
    # a case from one field pick up the chain file of another field
    allmets['outfile'] = [testcases[row_field][prior].get(row_name) if row_field in fields else None
                          for row_field, row_name in zip(allmets['field'], allmets['name'])]
    eachprior.append(allmets)
allmets = pd.concat(eachprior, ignore_index=True)
# allmets.to_csv(savedpath+'FOM.csv', index=False)

TODO: try entropy and KLD of posteriors about origin

In [None]:
# for field in fields:
#     for prior in priors:
#         for postname, postpath in testcases[field][prior].items():
#             if postpath not in allmets['outfile']:
#                 print((field, prior, postname))
#                 allmets

In [None]:
# KLD of every available posterior relative to the perfect-classifier reference
# of the same field and prior, evaluated on the reference grid.
allmets['KLD'] = None

for ind in allmets[allmets['outfile'].notnull()].index:
    compfn = allmets.loc[ind, 'outfile']
    if os.path.exists(compfn):
        field = allmets.loc[ind, 'field']
        prior = allmets.loc[ind, 'prior']
        [w_comp, Omm_comp] = measure(compfn, ['w', 'om'])
        kde_comp = make_kde(grid_ref[field][prior]['w'], grid_ref[field][prior]['Omm'], w_comp, Omm_comp)
        # assign with a single .loc call; chained `allmets['KLD'].loc[ind] = ...`
        # may write to a temporary copy and leave the table unchanged
        allmets.loc[ind, 'KLD'] = calculate_kld(kde_ref[field][prior], kde_comp,
                                                np.array([d_ref[field][prior]['w'], d_ref[field][prior]['Omm']]))

allmets.to_csv(savedpath+'KLD.csv', index=False)

### shortcut

1. make lists of all files in `/media/RESSPECT/data/PLAsTiCC/for_metrics/ddf/posteriors/samples_emille` and `/media/RESSPECT/data/PLAsTiCC/for_metrics/wfd/posteriors/samples_emille`
2. loop over them calculating KLD relative to `chains_perfect3000_lowz_withbias.csv.gz`
3. save in flat files

In [None]:
path_pre = '/media/RESSPECT/data/PLAsTiCC/for_metrics/'
path_post = '/posteriors/samples_emille/'
refname = 'chains_perfect3000_lowz_withbias.csv.gz'

for field in ['ddf', 'wfd']:
    fullpath = path_pre+field+path_post
    # make reference sample
    with gzip.open(fullpath+refname) as reffn:
        flatref = pd.read_csv(reffn)
    [w_ref, Omm_ref] = [flatref['w'], flatref['om']]
    ref_extrema, ref_grids, ref_vecs, ref_ds = make_grid(w_ref, Omm_ref)
    (dw, dOmm) = ref_ds
    (w_grid, Omm_grid) = ref_grids
    # distinct names keep the nested d_ref/grid_ref/kde_ref dictionaries built
    # for the per-prior analysis from being overwritten by this cell
    flat_dx = np.array([dw, dOmm])
    flat_kde_ref = make_kde(w_grid, Omm_grid, w_ref, Omm_ref)
    # make comparison samples
    records = []
    with os.scandir(fullpath) as allfn:
        # sorted so that the output row order is reproducible
        for entry in sorted(allfn, key=lambda e: e.name):
            # only gzipped files can be opened below; this also skips klds.csv
            if entry.is_file() and entry.name.endswith('.gz'):
                samppath = fullpath+entry.name
                with gzip.open(samppath) as sampfile:
                    sampdata = pd.read_csv(sampfile)
                kde_comp = make_kde(w_grid, Omm_grid, sampdata['w'], sampdata['om'])
                the_kld = calculate_kld(flat_kde_ref, kde_comp, flat_dx)
                records.append({'path': samppath, 'KLD': the_kld})
    # build the frame once; DataFrame.append was removed in pandas 2.0
    alloutputs = pd.DataFrame(records, columns=['path', 'KLD'])
    alloutputs.to_csv(fullpath+'klds.csv')

In [None]:
# Print the path of every DDF sample whose KLD is null in the table saved by the previous cell.
thing = pd.read_csv(path_pre+'ddf'+path_post+'klds.csv')
for p in thing[thing['KLD'].isnull()]['path']:
    print(p)

run me only once

TODO: also calculate fisher matrix metric below!

In [None]:
# Merge the cosmology-fit summaries of every field into the KLD table.
allmets = pd.read_csv(savedpath+'KLD.csv')
fitmets = ['name', 'wfit_w_lowz', 'wfit_wsig_lowz', 'wfit_om_lowz', 'wfit_omsig_lowz', 
            'stan_w_lowz', 'stan_wsig_lowz', 'stan_om_lowz', 'stan_omsig_lowz']

fit_tables = []
for field in fields:
    # os.path.join is used because postpath has no trailing slash, so plain
    # concatenation would yield '.../posteriorssummary_cases.csv'
    if field == 'wfd':
        comppath = os.path.join(postpath, 'WFD')
    else:
        comppath = postpath
    moremets = pd.read_csv(os.path.join(comppath, 'summary_cases.csv'))
    moremets = moremets.rename(columns={"case": "name"})[fitmets].assign(field=field)
    fit_tables.append(moremets)

# merge once on the stacked table; merging inside the loop would duplicate the
# fit columns with _x/_y suffixes as soon as there is more than one field
if fit_tables:
    allmets = pd.merge(allmets, pd.concat(fit_tables, ignore_index=True), how="outer", on=['field', 'name'])

allmets.to_csv(savedpath+'FIT.csv', index=False)

In [None]:
# Reload the merged table written by the previous cell. The plotting cell below
# replaces `allmets` again by reading KLD.csv.
allmets = pd.read_csv(savedpath+'FIT.csv')

## plotting cosmo/principled vs. traditional/deterministic metrics

~~TODO: actually make these~~
~~- [X] by contaminant (markershape)~~
~~- [X] by contamination rate (continuous colors? or markersize?)~~
~~- [X] by field (markersize? or discrete colors?)~~

TODO:
- [X] hardcode the shapes
- [X] open/closed for field
- [ ] logscale colors
- [X] check WFD directory for posteriors

TODO: add in 1D histograms for metric value for each contaminant, for percent

In [None]:
# KLD against each deterministic metric, with marker = contaminant (per field)
# and color = contamination percentage; the last panel hosts colorbar and legend.
allmets = pd.read_csv(savedpath+'KLD.csv')

dets_to_plot = ['f1']
dim = len(dets_to_plot)

fig, axs = plt.subplots(1, dim+1, figsize=(5*(dim+1), 5))#,
#                         gridspec_kw={'width_ratios': [10]*dim+[1]})
cbar = fig.colorbar(scalarMap, cax=axs[-1], ticks=range(len(uq)))
# label the ticks with the percentages themselves, as in the deterministic-metrics figure
cbar.set_ticklabels(uq)

for i, metric in enumerate(dets_to_plot):
    for field in fields:
        for contaminant in contaminants.values():
            plotmask = allmets[(allmets['field'] == field) & (allmets['contaminant'] == contaminant)]
            plotcolors = [linear_colors[c] for c in plotmask['percent']]
            # index shapes by the loop's own `field`, not by a variable left over from another cell
            axs[i].scatter(plotmask[metric], plotmask['KLD'], alpha=0.75, s=100,
                           marker=shapes[field][contaminant],
                           color=color_map[plotcolors])
    axs[i].semilogy()
    axs[i].set_xlabel(metric)
    axs[i].set_ylabel('KLD')
    axs[i].set_xlim(0.74, 1.01)
for field in fields:
    for contaminant in contaminants.values():
        # off-canvas point that only exists to create a legend entry
        axs[-1].scatter(-1, -1, s=50, marker=shapes[field][contaminant], color='k',
                        label=field+': '+contaminant)
axs[-1].legend(loc='lower left', fontsize='small')
fig.savefig('draft_kld.png', bbox_inches='tight', pad_inches=0)

TODO: thought re: histograms of w\_est, sigma\_w: make two versions, one colored by which contaminant (discrete colors) and one by % contaminant (continuous colors)

these live in postpath = '/media2/RESSPECT2/data/posteriors/' stan\_summary .dat

In [None]:
# Load the per-case fit summary table for inspection.
test = pd.read_csv('/media2/RESSPECT2/data/posteriors/summary_cases.csv')

In [None]:
# Display the column names of the summary table loaded in the previous cell.
test.columns

TODO: plot posteriors for most extreme subsamples