# Emulating realistically bad-for-cosmology SN Ia samples from PLAsTiCC data

_Alex I. Malz (GCCL@RUB)_

In [None]:
import collections
import gzip
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle as pkl

# seed handed to `DataFrame.sample(random_state=...)` when subsampling below,
# so repeated runs draw the same objects
rando = 42

In [None]:
import proclam
from proclam.metrics.util import *
from proclam.metrics.util import RateMatrix

Classes we care about (the code below also adds TDE, code 15, and KN, code 64, which are not listed in this table):

| `true_target`=`type` | `code` |
| -------------------- | ------ |
| 90 | SNIa |
| 67 | SNIa-91bg |
| 52 | SNIax |
| 42 | SNII |
| 62 | SNIbc |
| 95 | SLSN-I |
| 88 | AGN |

In [None]:
# PLAsTiCC class code -> human-readable name; insertion order matters because
# later cells iterate over this mapping
maybe_sn_classes = {
    90: 'SNIa',
    67: 'SNIa-91bg',
    52: 'SNIax',
    42: 'SNII',
    62: 'SNIbc',
    95: 'SLSN-I',
    88: 'AGN',
}
# two further transient classes appended after the core set
maybe_sn_classes.update({15: 'TDE', 64: 'KN'})

## gather all available lightcurves

In [None]:
# directory holding the PLAsTiCC release files read below (machine-specific path)
datapath = '/media/RESSPECT/data/PLAsTiCC/PLAsTiCC_zenodo/'

other than intermediate data products, work in `/media/RESSPECT/data/PLAsTiCC/for_metrics/`

In [None]:
# load the PLAsTiCC test-set metadata table (one row per object)
all_maybe_sn = pd.read_csv(datapath+'plasticc_test_metadata.csv')

In [None]:
# map the PLAsTiCC metadata column names onto the names used in the rest of
# this notebook and in the saved sample files
column_aliases = {"object_id": "id", "true_z": "redshift", "true_target": "code"}
all_maybe_sn = all_maybe_sn.rename(columns=column_aliases)
print(all_maybe_sn.columns)

DDF now, WFD later

In [None]:
# keep only the DDF objects and the three columns needed downstream
in_ddf = all_maybe_sn['ddf_bool'] == 1
true_ddf = all_maybe_sn.loc[in_ddf, ['id', 'redshift', 'code']]
# true_wfd = all_maybe_sn.loc[all_maybe_sn['ddf_bool'] == 0][['id', 'redshift', 'code']]
n_ddf_tot = len(true_ddf)
# number of DDF objects per class code
n_ddf_all = dict(true_ddf.groupby('code')['id'].count())
# ddf_rats = dict(true_ddf.groupby('code').count()['id'] / n_ddf_tot)

In [None]:
# inspect the per-class DDF object counts
n_ddf_all

In [None]:
# inspect the DDF objects whose class code is 95
true_ddf.loc[true_ddf['code'] == 95]

a priori all samples will be 3000 "classified SN Ia"

## subsample the classes to make new samples

In [None]:
# target number of objects classified as the selected class in every sample
n_class_pos = 3000
# class code treated as the positive class throughout
sel_class = 90

### get sample ids matching a confusion matrix

In [None]:
# class ConfMat_borken(object):
#     def __init__(self, cm, indmap):
#         """
#         `axis=0` is predicted classes, `axis=1` is true classes
#         """
#         self.in_cm = cm 
#         self.cm = self.in_cm / np.sum(self.in_cm)
#         self.class_ids = indmap
#         self.norm = np.ones_like(self.class_ids)
#         self._extract_cols_()
#         self._extract_rows_()
# #         self._check_cm_()
#     def _check_cm_(self):
#         # insert some consistency checks here
#         pass
#     def _extract_cols_(self):
#         self.true_cols = {typeid: self.cm[self.class_ids[typeid]] for typeid in self.class_ids.keys()}
#         return self.true_cols
#     def _extract_rows_(self):
#         self.pred_rows = {typeid: self.cm.T[self.class_ids[typeid]] for typeid in self.class_ids.keys()}
#         return self.pred_rows
#     def _proc_norm_(self, norm, n_orig):
#         if type(norm) == int:
#             norm = {typeid: norm for typeid in self.class_ids.keys()}
#         self.norm = np.empty(len(self.class_ids))
#         for classno in self.class_ids.keys():
#             self.norm[self.class_ids[classno]] = norm[classno]
#         self.norm[n_orig == 0] = 0
#         self.norm[n_orig != 0] = self.norm[n_orig != 0] / n_orig[n_orig != 0]
#         return self.norm
#     def rescale_true(self, norm):
#         n_true = np.sum(self.cm, axis=0)
#         self.norm = self._proc_norm_(norm, n_true)
#         self.cm = self.cm * self.norm
#         self._extract_cols_()
#         self._extract_rows_()
# #         print(np.sum(self.cm, axis=0))
#         return# self.cm
#     def rescale_pred(self, norm):
#         n_pred = np.sum(self.cm, axis=1)
#         self.norm = self._proc_norm_(norm, n_pred)
#         self.cm = (self.cm.T * self.norm).T
#         self._extract_cols_()
#         self._extract_rows_()
# #         print(np.sum(self.cm, axis=1))
#         return# self.cm
#     def binarize(self, ref_class):
#         pass
#     def make_rates(self, ref_class=None):
#         pass

# class ConfMat(object):
#     def __init__(self, cm, indmap):
#         """
#         `axis=0` is predicted classes, `axis=1` is true classes
#         """
#         self.in_cm = cm
#         self.class_ids = indmap
#         self.true_cols = {typeid: cm[indmap[typeid]] for typeid in indmap.keys()}
#         self.pred_rows = {typeid: cm.T[indmap[typeid]] for typeid in indmap.keys()}
#         self._check_cm_()
#     def _check_cm_(self):
#         # insert some consistency checks here
#         pass
#     def binarize(self, ref_class):
#         pass
#     def make_rates(self, ref_class=None):
#         pass

To calculate the true/false positive/negative rates along the way to making the subsamples, we need a notion of negatives that would never end up in the cosmology sample.
Let's use the DDF type ratios to figure out how many objects will be classified as negative for our samples of 3000 positive classifications.

In [None]:
# DDF census restricted to the classes of interest
n_ddf_pos = n_ddf_all[sel_class]
n_ddf_glob = sum(n_ddf_all[classid] for classid in maybe_sn_classes)
n_ddf_neg = n_ddf_glob - n_ddf_pos

# scale the DDF class ratios so that the selected class has `n_class_pos` members
n_class_glob = n_class_pos * n_ddf_glob / n_ddf_pos
n_class_all = {}
for classid in maybe_sn_classes:
    n_class_all[classid] = int(round(n_class_glob * n_ddf_all[classid] / n_ddf_glob))
# replace the (float) target total by the integer total actually obtained after rounding
n_class_glob = sum(n_class_all.values())

`n_class_all` contains the number of objects in the true population, and the confusion matrix tells us how many will end up being classified as positive or negative

save outputs as `id,redshift,type,code,orig_sample=test,queryable=True`

In [None]:
# def subsample_cm_borken(cm_obj, samp_key, cat, 
#                  where_to_save=None, rando=rando):#, ntot=n_samp):
#     cm_row = cm_obj.pred_rows[samp_key]#cm.T[cm_indices[samp_key]]
#     print(cm_row)
#     out_ids = pd.DataFrame(columns=cat.columns)
#     for typekey in cm_obj.class_ids.keys():
# #         if not ntot:
#         print(cm_obj.class_ids[typekey])
#         n_to_sample = int(cm_row[cm_obj.class_ids[typekey]])
#         if n_to_sample < len(cat[cat['code'] == typekey]):
#             matches = cat[cat['code'] == typekey].sample(n=n_to_sample, random_state=rando)
#         else:
#             print(n_to_sample)
#         if n_to_sample > 0:
#             out_ids = out_ids.append(matches)
#     out_ids['orig_sample'] = 'test'
#     out_ids['queryable'] = True
#     out_ids['type'] = None
#     if where_to_save:
#         savecols = ['id','redshift','type','code','orig_sample','queryable']
#         out_ids[savecols].to_csv(where_to_save+'.csv', index=False)
#     return(out_ids)

# def subsample_cm(cm_obj, samp_key, cat, 
#                  where_to_save=None, rando=rando):#, ntot=n_samp):
#     cm_row = cm_obj.pred_rows[samp_key]#cm.T[cm_indices[samp_key]]
#     out_ids = pd.DataFrame(columns=cat.columns)
#     for typekey in cm_obj.class_ids.keys():
# #         if not ntot:
#         n_to_sample = int(cm_row[cm_obj.class_ids[typekey]])
#         matches = cat[cat['code'] == typekey].sample(n=n_to_sample, random_state=rando)
#         if len(matches) > 0:
#             out_ids = out_ids.append(matches)
#     out_ids['orig_sample'] = 'test'
#     out_ids['queryable'] = True
#     out_ids['type'] = None
#     if where_to_save:
#         savecols = ['id','redshift','type','code','orig_sample','queryable']
#         out_ids[savecols].to_csv(where_to_save+'.csv', index=False)
#     return(out_ids)

In [None]:
def subsample_cat(cm, cm_indices, pos_key=sel_class,
                  ntot=n_class_all, cat=true_ddf,
                  where_to_save=None, save_neg=True, rando=rando):
    """
    Split a catalog into objects classified as `pos_key` and all the rest

    Parameters
    ----------
    cm: numpy.ndarray, shape (M, M)
        confusion matrix; only the row at `cm_indices[pos_key]` is used, and
        it is read as the make-up, by true class, of the positive sample
    cm_indices: dict
        maps class code to its row/column index in `cm`
    pos_key: int, optional
        class code of the positive class
    ntot: dict, optional
        number of objects to draw from `cat` for each class code
    cat: pandas.DataFrame, optional
        catalog with a 'code' column (plus 'id' and 'redshift' when saving)
    where_to_save: str, optional
        path prefix; when given, the positive sample is written to
        `where_to_save + '.csv'`
    save_neg: bool, optional
        currently unused, kept so existing calls keep working
    rando: int, optional
        seed passed to `DataFrame.sample`

    Returns
    -------
    pos_ids: pandas.DataFrame
        objects classified as `pos_key`
    neg_ids: pandas.DataFrame
        remaining drawn objects

    Notes
    -----
    When `where_to_save` is given, `pos_ids` additionally carries the columns
    'orig_sample', 'queryable' and 'type' that are written to the file.
    """
    cm = np.asarray(cm, dtype=float)
    # Normalize every row by its own sum.  `keepdims=True` is essential:
    # without it the row sums broadcast along the columns, so element [i][j]
    # would be divided by the sum of row j, which is only correct when all
    # row sums happen to be equal.
    pcm = cm / np.sum(cm, axis=1, keepdims=True)
    # want row corresponding to predicted class, scaled to a number of objects
    pos_row = pcm[cm_indices[pos_key]] * ntot[pos_key]
    pos_parts, neg_parts = [], []
    for typeid, ind in cm_indices.items():
        n_pos = int(round(pos_row[ind]))
        matches = cat[cat['code'] == typeid].sample(n=ntot[typeid], random_state=rando)
        # the first `n_pos` draws are "classified positive", the rest negative;
        # if `n_pos` exceeds the number drawn, all of them end up positive
        pos = matches[:n_pos]
        neg = matches[n_pos:]
        if len(pos) > 0:
            pos_parts.append(pos)
        if len(neg) > 0:
            neg_parts.append(neg)
    # `DataFrame.append` no longer exists in pandas >= 2.0; concatenate once instead
    if pos_parts:
        pos_ids = pd.concat(pos_parts)
    else:
        pos_ids = pd.DataFrame(columns=cat.columns)
    if neg_parts:
        neg_ids = pd.concat(neg_parts)
    else:
        neg_ids = pd.DataFrame(columns=cat.columns)
    if where_to_save:
        pos_ids['orig_sample'] = 'test'
        pos_ids['queryable'] = True
        pos_ids['type'] = None
        pos_ids[['id','redshift','type','code','orig_sample','queryable']].to_csv(where_to_save+'.csv', index=False)
    return pos_ids, neg_ids

### realistic classifier

start from fiducial contamination rates from a real (awful) confusion matrix at `/media/RESSPECT/data/PLAsTiCC/for_metrics/confusion_matrices`

These were just the test set lightcurves for classes (67, 88, 42(minus 7?), 90(minus 11?), 52, 62, 64, 95, 15) from ddf-only

figure out classes in confusion matrix by comparing number of ddf test set-only lightcurves

In [None]:
# working directory for the confusion matrices read, and the samples/metrics written, below
savepath = '/media/RESSPECT/data/PLAsTiCC/for_metrics/'

In [None]:
# read the stored confusion matrix from disk
cm_file = savepath+'confusion_matrices/confusion_matrix.npy'
with open(cm_file, 'rb') as confmat:
    cm = np.load(confmat)
# plt.imshow(np.log(fid_cm.in_cm), cmap='viridis_r')
# plt.colorbar()

In [None]:
# order of the classes along the axes of the loaded confusion matrix
cm_classes = [67, 88, 42, 90, 52, 62, 64, 95, 15]
# class code -> position of that class in the confusion matrix
cm_indices = {classid: cm_classes.index(classid) for classid in maybe_sn_classes}
# cm_rat = {}
#     cm_rat[classid] = sum(cm[cm_indices[classid]]) / n_cm

In [None]:
# fid_cm = ConfMat(cm, cm_indices)
# # fid_cm.rescale_true(n_ddf_all)
# fid_cm.rescale_pred(n_class_pos)

fiducial sample corresponding to input confusion matrix

In [None]:
# draw the sample implied by the loaded confusion matrix; the positives are
# written to `fiducial_samp.csv` under `savepath`
# NOTE: `fiducial` is the (pos_ids, neg_ids) tuple returned by subsample_cat
fiducial = subsample_cat(cm, cm_indices, where_to_save=savepath+'fiducial_samp')
# print(len(fiducial))

In [None]:
# read the file just written back in to check its contents
reopen = pd.read_csv(savepath+'fiducial_samp.csv')

reopen

### 100% SNIa sample

In [None]:
# identity confusion matrix: nothing is ever assigned to the wrong class
perf_cm = np.eye(len(cm_indices))
# perf_cm = ConfMat(cm_perfect, cm_indices)
# perf_cm.rescale_true(n_ddf_all)
# perf_cm.rescale_pred(n_class_pos)
# print(perf_cm.cm)
perfect = subsample_cat(perf_cm, cm_indices,
                        where_to_save=savepath+'perfect_samp')

## create new confusion matrices to tune output sample rates

consider `proclam` classifier archetypes for inspiration

In [None]:
# every archetype is a square matrix with one row/column per class
M_classes = len(cm_indices)

# 'Uncertain': all entries equal
cm_uncertain = np.full((M_classes, M_classes), 1.)

# 'Perfect': identity plus a tiny offset
cm_perfect = 1.e-8 + np.eye(M_classes)

# 'Almost': identity on top of a uniform floor of 0.1
cm_almost = np.full((M_classes, M_classes), 0.1) + np.eye(M_classes)

# 'Noisy': identity on top of a uniform floor of 0.5
cm_noisy = np.full((M_classes, M_classes), 0.5) + np.eye(M_classes)

# # 'Tunnel Vision'
# cm = np.ones((M_classes, M_classes))
# cm = cm * np.asarray(0.1)[np.newaxis, np.newaxis]
# cm[:, chosen] = cm[:, chosen] / M_classes
# cm[chosen][chosen] += M_classes

# # 'Cruise Control'
# cm = np.eye(M_classes) + 1.e-8
# cm[:] = cm[chosen]

# # 'Subsuming'
# cm = np.eye(M_classes) + 0.1 * np.ones((M_classes, M_classes))
# cm[chosen] = cm[chosen-1]

# # 'Mutually Subsuming'
# cm = np.eye(M_classes) + 0.1 * np.ones((M_classes, M_classes))
# cm[chosen][chosen+1] = cm[chosen][chosen]
# cm[chosen+1][chosen] = cm[chosen+1][chosen+1]

In [None]:
# 'Mutually Subsuming': start from the 'Almost' archetype, then give the two
# off-diagonal entries linking the selected class and class 62 the same value
# as the corresponding diagonal entries
target, contaminant = cm_indices[sel_class], cm_indices[62]
half_ibc_cm = np.eye(M_classes) + 0.1 * np.ones((M_classes, M_classes))
half_ibc_cm[target, contaminant] = half_ibc_cm[target, target]
half_ibc_cm[contaminant, target] = half_ibc_cm[contaminant, contaminant]
# plt.imshow(half_ibc_cm)
# plt.colorbar()

make new confusion matrices as mixtures of existing ones

In [None]:
def mix_arr(inarrs, weights=None):
    """
    Weighted mixture of several equally-shaped 2D arrays

    Each input array is first normalized so that all of its entries sum to 1;
    the weights are normalized to sum to 1 as well, so the result also sums
    to 1.

    Parameters
    ----------
    inarrs: array-like, shape (narrs, M, M)
        stack of arrays to mix
    weights: array-like, shape (narrs,), optional
        relative weight of each array; defaults to equal weights

    Returns
    -------
    outarr: numpy.ndarray, shape (M, M)
        the mixture

    Raises
    ------
    ValueError
        if `weights` does not provide exactly one entry per input array
    """
    arrs = np.asarray(inarrs, dtype=float)
    narrs = len(arrs)
    if weights is None:
        # one unit weight per input array (np.ones_like((1, narrs)) would
        # always give two weights, whatever `narrs` is)
        weights = np.ones(narrs)
    else:
        weights = np.asarray(weights, dtype=float)
    if weights.shape != (narrs,):
        raise ValueError(f'expected {narrs} weights but got shape {weights.shape}')
    # normalize every array by the sum over all of its entries
    arrs = arrs / np.sum(np.sum(arrs, axis=-1), axis=-1)[:, np.newaxis, np.newaxis]
    normwts = weights / np.sum(weights)
    outarr = np.sum(arrs * normwts[:, np.newaxis, np.newaxis], axis=0)
    return outarr

In [None]:
# mixture of the 'Uncertain' and 'Perfect' archetypes with the default weights
new_cm = mix_arr(np.array([cm_uncertain, cm_perfect]))
# plt.imshow(new_cm)
# plt.colorbar()

## evaluate classification metrics on the subsamples

better to do it along the way to making the subsamples, especially important for non-extreme subsamples filling the space of classification metric values

first get rates using `proclam` functionality

In [None]:
def cat_to_rate(pos_ids, neg_ids, pos_key=sel_class):
    """
    Turn a positive/negative split of a catalog into binary classification rates

    Parameters
    ----------
    pos_ids: pandas.DataFrame
        objects classified as positive; needs a 'code' column
    neg_ids: pandas.DataFrame
        objects classified as negative; needs a 'code' column
    pos_key: int, optional
        class code whose members count as true positives

    Returns
    -------
    rate: RateMatrix
        first entry of every field returned by `cm_to_rate`

    Notes
    -----
    The input frames are left untouched; the helper columns 'classed' and
    'truth' only exist on an internal copy.
    """
    # `assign` returns new frames, so the caller's catalogs do not grow a
    # 'classed' column as a side effect
    whole_samp = pd.concat((pos_ids.assign(classed=True),
                            neg_ids.assign(classed=False)))
    # direct boolean column instead of chained `df['truth'][mask] = ...`
    # assignment, which pandas does not guarantee to write through
    whole_samp['truth'] = whole_samp['code'] == pos_key
    bin_cm = det_to_cm(whole_samp['classed'].to_numpy(), whole_samp['truth'].to_numpy())
    rawrate = cm_to_rate(bin_cm)._asdict()
    # NOTE(review): assumes entry 0 of every rate array is the one relative to
    # `pos_key` -- confirm against the class ordering used by det_to_cm
    rel_to_sel = {key: val[0] for key, val in rawrate.items()}
    rate = RateMatrix(**rel_to_sel)
    return rate

### calculate all the metrics!

and put some version of this into `proclam` at some point

In [None]:
class det_mets(RateMatrix):
    """
    Binary classification metrics derived from a `RateMatrix`

    All metrics are computed once at construction and stored as attributes;
    only `get_fom` takes an argument and is evaluated on demand.
    """
    def __init__(self, **rates):
        """
        Call like `thing = det_mets(**rates._asdict())`

        Parameters
        ----------
        **rates:
            fields of the parent `RateMatrix`; the metrics below read `TPR`,
            `TNR`, `TP`, `FP`, `FN` and `TN`

        Notes
        -----
        `rates` is not touched here: the methods read the fields straight
        from `self`, so the parent class must already have populated them
        (presumably `RateMatrix` is a namedtuple filled in `__new__` -- TODO
        confirm).
        """
        self._get_tots()
        self._from_rates()
        self._sn_mets()
        self._translate()

    def _get_tots(self):
        """Marginal totals of the binary confusion matrix"""
        self.CP = self.TP + self.FN  # condition positive
        self.CN = self.TN + self.FP  # condition negative
        self.T = self.TP + self.TN  # all correct classifications
        self.F = self.FP + self.FN  # all incorrect classifications
        self.P = self.TP + self.FP  # classified positive
        self.N = self.TN + self.FN  # classified negative

    def _from_rates(self):
        """Metrics built directly from the counts and rates, then the derived ones"""
        self.PPV = self.TP / (self.TP + self.FP)
        self.NPV = self.TN / (self.TN + self.FN)
        # prevalence threshold, written in terms of TNR (FPR = 1 - TNR)
        self.PT = (np.sqrt(self.TPR * (1. - self.TNR)) + self.TNR - 1.) / (self.TPR + self.TNR - 1.)
        self.TS = self.TP / (self.TP + self.FN + self.FP)
        self._derived()

    def _derived(self):
        """Metrics that combine the quantities set in `_get_tots` and `_from_rates`"""
        self.ACC = (self.TP + self.TN) / (self.CP + self.CN)
        # balanced accuracy; must be a plain number (a stray trailing comma
        # here would turn it into a 1-tuple)
        self.BA = (self.TPR + self.TNR) / 2
        self.F1S = 2. * self.PPV * self.TPR / (self.PPV + self.TPR)
        self.MCC = (self.TP * self.TN - self.FP * self.FN) / (np.sqrt(self.P * self.CP * self.CN * self.N))
        self.FM = np.sqrt(self.PPV * self.TPR)
        self.BM = self.TPR + self.TNR - 1.
        self.MK = self.PPV + self.NPV - 1.

    def _translate(self):
        """Expose the metrics under their common alternative names"""
        self.positive = self.CP
        self.negative = self.CN
        self.sensitivity = self.TPR
        self.recall = self.TPR
        self.specificity = self.TNR
        self.selectivity = self.TNR
        self.precision = self.PPV
        self.FDR = 1. - self.PPV
        self.FOR = 1. - self.NPV
        self.CSI = self.TS
        self.accuracy = self.ACC
        self.f1_score = self.F1S
        self.informedness = self.BM
        self.deltaP = self.MK

    def _sn_mets(self):
        """Compute the two sample-selection metrics, efficiency and purity"""
        self.get_efficiency()
        self.get_purity()

    def get_efficiency(self):
        """Store and return `TP / CP`"""
        self.efficiency = self.TP / self.CP
        return self.efficiency

    def get_purity(self):
        """Store and return `TP / P`"""
        self.purity = self.TP / self.P
        return self.purity

    def get_fom(self, penalty):
        """
        Figure of merit: efficiency times a purity with up-weighted false positives

        Parameters
        ----------
        penalty: float
            factor multiplying `FP` in the denominator of the pseudo-purity

        Returns
        -------
        fom: float
            `efficiency * TP / (TP + penalty * FP)`; the second factor is
            also stored as `self.pseudo_purity`
        """
        self.pseudo_purity = self.TP / (self.TP + penalty * self.FP)
        return self.pseudo_purity * self.efficiency

demonstrate on the archetypes

In [None]:
# Evaluate the metrics for three archetypal classifiers.  The loop variable is
# deliberately not called `cm`, so that the confusion matrix loaded from disk
# is not overwritten when this cell runs.
for archetype_cm in [cm_perfect, cm_almost, cm_noisy]:
    pos, neg = subsample_cat(archetype_cm, cm_indices)
    rates = cat_to_rate(pos, neg)
    mets = det_mets(**rates._asdict())
    print(f'purity:{mets.purity}, efficiency:{mets.efficiency}, fom1:{mets.get_fom(1.)}, fom3:{mets.get_fom(3.)}')

## next, make samples corresponding to metric values

In [None]:
# reminder of the class code -> name mapping
maybe_sn_classes

original plan was to have these samples:
- 100% Ia
- Ia/Ibc
- - 50/50
- - 75/25
- - 90/10
- - 95/5
- - 98/2
- Ia/II
- Ia/91bg
- Ia/Iax
- AGN
- TDE 
- KN

In [None]:
# output locations for the mixed samples and their metrics
samppath = savepath+'samples/'
metpath = savepath+'metrics/'
# percentage of the selected class in each two-class mixture, and the
# complementary percentage taken up by the contaminant
ia_percents = np.array([50, 68, 75, 90, 95, 98, 99])
mix_percents = 100 - ia_percents
# every class except the selected one can act as the contaminant
contaminants = {code: name for code, name in maybe_sn_classes.items() if code != sel_class}

assume symmetry in 2-class mix

In [None]:
# binary_ia_mets = {}
# For every contaminant class and every mixing ratio, build a symmetric
# two-class confusion matrix, draw the matching sample (saved as csv), and
# pickle the resulting classification rates.
for key, val in contaminants.items():
    # two-class problem: the selected class at index 0, the contaminant at index 1
    subset_indices = {sel_class: 0, key: 1}
    for ia_perc, perc in zip(ia_percents, mix_percents):
        sampfn = samppath+str(ia_perc)+str(maybe_sn_classes[sel_class])+str(perc)+val
        print(sampfn)
        # named `mix_cm` rather than `cm` so the confusion matrix loaded from
        # disk is not overwritten
        mix_cm = np.array([[ia_perc, perc], [perc, ia_perc]])
        pos, neg = subsample_cat(mix_cm, subset_indices, where_to_save=sampfn)
        rates = cat_to_rate(pos, neg)
        metfn = metpath+f'{ia_perc}_{sel_class}_{perc}_{key}'
        with open(metfn+'.pkl', 'wb') as metfile:
            pkl.dump(rates._asdict(), metfile)