# Emulating realistically bad-for-cosmology SN Ia samples from PLAsTiCC data

_Alex I. Malz (GCCL@RUB)_

In [None]:
import gzip
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle as pkl

# Fixed seed; used below as the default `random_state` when subsampling objects.
rando = 42

In [None]:
import proclam
from proclam.metrics.util import *

classes we care about

| `true_target`=`type` | `code` |
| -------------------- | ------ |
| 90 | SNIa |
| 67 | SNIa-91bg |
| 52 | SNIax |
| 42 | SNII |
| 62 | SNIbc |
| 95 | SLSN-I |
| 88 | AGN |

In [None]:
# Candidate ("maybe SN") classes, keyed by their integer PLAsTiCC code and
# mapped to a short human-readable name. Insertion order is kept as-is
# because later cells iterate over this dict.
maybe_sn_classes = dict([
    (90, 'SNIa'),
    (67, 'SNIa-91bg'),
    (52, 'SNIax'),
    (42, 'SNII'),
    (62, 'SNIbc'),
    (95, 'SLSN-I'),
    (88, 'AGN'),
    (64, 'KN'),
    (15, 'TDE'),
])

## gather all available lightcurves

In [None]:
# Directory of the PLAsTiCC Zenodo release (the test metadata CSV lives here).
datapath = '/media/RESSPECT/data/PLAsTiCC/PLAsTiCC_zenodo/'

other than intermediate data products, work in `/media/RESSPECT/data/PLAsTiCC/for_metrics/`

In [None]:
# Load the PLAsTiCC test-set metadata table. The location is built from
# `datapath` (defined in the previous cell) instead of repeating the
# absolute directory, so the data root only has to be changed in one place.
all_maybe_sn = pd.read_csv(datapath + 'plasticc_test_metadata.csv')
print(len(all_maybe_sn))

In [None]:
# Switch from the PLAsTiCC column names to the shorter ones used in the rest
# of this notebook (`id`, `redshift`, `code`).
all_maybe_sn = all_maybe_sn.rename(
    columns=dict(object_id="id", true_z="redshift", true_target="code")
)
print(all_maybe_sn.columns)

In [None]:
# all_maybe_sn['orig_sample'] = 'test'
# all_maybe_sn['queryable'] = True
# all_maybe_sn['type'] = None

DDF now, WFD later

In [None]:
# Keep only the rows flagged `ddf_bool == 1` and the three columns needed
# downstream.
true_ddf = all_maybe_sn.loc[all_maybe_sn['ddf_bool'] == 1, ['id', 'redshift', 'code']]
# true_wfd = all_maybe_sn.loc[all_maybe_sn['ddf_bool'] == 0][['id', 'redshift', 'code']]
# Total number of selected objects, and the number of ids per class code.
n_ddf_tot = len(true_ddf)
n_ddf_all = dict(true_ddf.groupby('code')['id'].count())
# ddf_rats = dict(true_ddf.groupby('code').count()['id'] / n_ddf_tot)

a priori all samples will be 3000 "classified SN Ia"

## subsample the classes to make new samples

In [None]:
# Target size of each sample of objects classified as the selected class.
n_class_pos = 3000
# Class code treated as the "positive" class; 90 is 'SNIa' in `maybe_sn_classes`.
sel_class = 90

### as in proclam, based on a confusion matrix

start from fiducial contamination rates from a real (awful) confusion matrix at `/media/RESSPECT/data/PLAsTiCC/for_metrics/confusion_matrices`

figure out classes in confusion matrix by comparing number of ddf test set-only lightcurves

In [None]:
# `savepath` is otherwise only assigned in a later cell, so running the
# notebook top-to-bottom would raise NameError here. It is defined here with
# the same value so that this cell is self-sufficient.
savepath = '/media/RESSPECT/data/PLAsTiCC/for_metrics/'

# Load the fiducial confusion matrix from disk.
with open(savepath + 'confusion_matrices/confusion_matrix.npy', 'rb') as confmat:
    cm = np.load(confmat)
# Sum over every cell of the matrix.
n_cm = np.sum(cm)
# Show the matrix on a log scale; any zero cell becomes -inf under np.log.
plt.imshow(np.log(cm), cmap='viridis_r')
plt.colorbar()

These were just the test set lightcurves for classes (67, 88, 42(minus 7?), 90(minus 11?), 52, 62, 64, 95, 15) from ddf-only

In [None]:
# Class codes in the order assumed for the confusion-matrix axes.
cm_classes = [67, 88, 42, 90, 52, 62, 64, 95, 15]
# Position of each candidate class along the confusion-matrix axes.
cm_indices = {classid: cm_classes.index(classid) for classid in maybe_sn_classes}
# cm_rat = {classid: sum(cm[cm_indices[classid]]) / n_cm for classid in maybe_sn_classes}

### get sample ids matching a confusion matrix

original plan was to have these samples:
- 100% Ia
- Ia/Ibc
  - 50/50
  - 75/25
  - 90/10
  - 95/5
  - 98/2
- Ia/II
- Ia/91bg
- Ia/Iax
- AGN
- TDE 
- KN

In [None]:
# Working directory for this notebook's inputs/outputs (confusion matrices are
# read from a subdirectory of it; sample CSVs may be written under it).
savepath = '/media/RESSPECT/data/PLAsTiCC/for_metrics/'

save outputs as `id,redshift,type,code,orig_sample=test,queryable=True`

In [None]:
def gen_samp_ids(cm, cm_indices, samp_key, ntot=None, where_to_save=None, rando=rando):
    """Draw object ids whose class mix follows one column of a confusion matrix.

    For every class in ``cm_indices`` a number of rows is drawn at random
    (without replacement) from the module-level ``all_maybe_sn`` table, where
    that number is read from the ``samp_key`` column of ``cm``.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix; presumably rows are true classes and columns are
        predicted classes -- TODO confirm against how the matrix was built.
    cm_indices : dict
        Maps a class code to its position along the axes of ``cm``.
    samp_key : hashable
        Class code whose column of ``cm`` defines the sample composition.
    ntot : int or None, optional
        If falsy (default), the entries of the selected column are used
        directly as per-class counts. If given, the column is rescaled so the
        per-class counts sum to approximately ``ntot`` (each count is rounded
        independently, so the total may be off by a few objects).
    where_to_save : str or None, optional
        If given, the sample is written to ``where_to_save + '.csv'``.
    rando : int, optional
        ``random_state`` passed to ``DataFrame.sample`` for every class.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of ``all_maybe_sn`` with the extra columns
        ``orig_sample``, ``queryable`` and ``type``.

    Raises
    ------
    ValueError
        If ``ntot`` is given but the selected column sums to zero, or (from
        pandas) if more objects of a class are requested than are available.
    """
    # NOTE: the default used to be an undefined name (`n_samp`), which made
    # the `def` itself fail; `None` selects the only code path that worked.
    cm_row = cm.T[cm_indices[samp_key]]

    # Work out the factor that turns column entries into per-class counts.
    if ntot:
        col_total = float(sum(cm_row[idx] for idx in cm_indices.values()))
        if col_total <= 0:
            raise ValueError(
                'cannot rescale to ntot: confusion-matrix column for class '
                '{} sums to zero'.format(samp_key))
        scale = ntot / col_total
    else:
        scale = None

    sampled = []
    for typeid, idx in cm_indices.items():
        if scale is None:
            n_to_sample = int(cm_row[idx])
        else:
            n_to_sample = int(round(cm_row[idx] * scale))
        if n_to_sample == 0:
            continue
        pool = all_maybe_sn[all_maybe_sn['code'] == typeid]
        sampled.append(pool.sample(n=n_to_sample, random_state=rando))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if sampled:
        out_ids = pd.concat(sampled)
    else:
        out_ids = pd.DataFrame(columns=all_maybe_sn.columns)

    out_ids['orig_sample'] = 'test'
    out_ids['queryable'] = True
    out_ids['type'] = None
    if where_to_save:
        out_ids[['id', 'redshift', 'type', 'code', 'orig_sample', 'queryable']].to_csv(
            where_to_save + '.csv', index=False)
    return out_ids

### 100% SNIa sample

In [None]:
# Zero out every off-diagonal cell of the confusion matrix, keeping only the
# diagonal entries, then draw the sample for the selected class from it.
cm_perfect = np.identity(len(cm_indices)) * np.diag(cm)
perfect = gen_samp_ids(cm_perfect, cm_indices, sel_class)  # , where_to_save=savepath+'perfect_samp')

### "realistic" sample

fiducial sample corresponding to input confusion matrix

In [None]:
# Draw the sample for the selected class straight from the unmodified
# confusion matrix; saving to disk is currently disabled (commented out).
fiducial = gen_samp_ids(cm, cm_indices, sel_class)#, where_to_save=savepath+'fiducial_samp')
# print(len(fiducial))

## evaluate classification metrics on the subsamples

do it along the way to making the subsamples, especially important for non-extreme subsamples filling the space of classification metric values

Need to normalize ratios of classes to know how many are in the potential population

In [None]:
# Per-class DDF counts restricted to the candidate classes. The counts are
# stored in `n_ddf_all`; the name `n_ddf` used below was never defined.
# Classes absent from the DDF table count as zero.
n_ddf = {classid: n_ddf_all.get(classid, 0) for classid in maybe_sn_classes}

# Number of DDF objects of the selected class, of all candidate classes, and
# of all candidate classes other than the selected one.
n_ddf_pos = n_ddf[sel_class]
n_ddf_glob = sum(n_ddf.values())
n_ddf_neg = n_ddf_glob - n_ddf_pos

# Scale the population so that the selected class contributes `n_class_pos`
# objects while the class ratios stay those of the DDF table.
n_class_glob = n_class_pos * n_ddf_glob / n_ddf_pos
# NOTE(review): despite the name, this dict also holds an entry for
# `sel_class` itself -- confirm whether that is intended.
n_class_neg = {classid: math.ceil(n_class_glob * n_ddf[classid] / n_ddf_glob) for classid in maybe_sn_classes}

now try to get rates for the samples using proclam functionality

In [None]:
def deterministic_metrics(truth, classified_pos, type_key=sel_class):
    """Unfinished: relate a classified-positive sample to the truth table.

    As written the function computes two intermediate tables and then falls
    off the end, so it returns ``None``; ``type_key`` is not used yet.

    Parameters
    ----------
    truth : presumably a DataFrame with at least 'id' and 'code' columns
        (both are read below) -- TODO confirm.
    classified_pos : presumably a DataFrame with an 'id' column -- TODO confirm.
    type_key : class code of interest; defaults to ``sel_class``.
    """
    # Count of non-null ids per class code in the truth table.
    true_rates = truth.groupby('code').count()['id']
    # NOTE(review): `right_index=True` and `right_on='object_id'` are
    # contradictory join keys for the right-hand table, and 'object_id' is
    # renamed to 'id' earlier in this notebook -- confirm the intended key
    # (the commented-out `on='id'` looks like the likely candidate).
    tp_plus_fp = pd.merge(classified_pos, truth, right_index=True, #on='id',
                left_on='id', right_on='object_id')[['id', 'code']]
    
    # NOTE(review): bare name with no call, so this line has no effect;
    # presumably `cm_to_rate` comes from `proclam.metrics.util` -- TODO finish.
    cm_to_rate

## next, perturb randomly using a vaguely `proclam`-like approach

idea: mixture model of confusion matrices

maybe consider `proclam` classifier archetypes for inspiration

In [None]:
# # 'Uncertain'
# cm = np.ones((M_classes, M_classes))

# # 'Perfect'
# cm = np.eye(M_classes) + 1.e-8

# # 'Almost'
# cm = np.eye(M_classes) + 0.1 * np.ones((M_classes, M_classes))

# # 'Noisy'
# cm = np.eye(M_classes) + 0.5 * np.ones((M_classes, M_classes))

# # 'Tunnel Vision'
# cm = np.ones((M_classes, M_classes))
# cm = cm * np.asarray(0.1)[np.newaxis, np.newaxis]
# cm[:, chosen] = cm[:, chosen] / M_classes
# cm[chosen][chosen] += M_classes

# # 'Cruise Control'
# cm = np.eye(M_classes) + 1.e-8
# cm[:] = cm[chosen]

# # 'Subsuming'
# cm = np.eye(M_classes) + 0.1 * np.ones((M_classes, M_classes))
# cm[chosen] = cm[chosen-1]

# # 'Mutually Subsuming'
# cm = np.eye(M_classes) + 0.1 * np.ones((M_classes, M_classes))
# cm[chosen][chosen+1] = cm[chosen][chosen]
# cm[chosen+1][chosen] = cm[chosen+1][chosen+1]

In [None]:
def perturb_subsamp(cm, cm_indices, samp_key, epsilon):
    """Placeholder for perturbing a confusion-matrix-based subsample.

    The original cell stopped at an ``if`` with no body, which is a syntax
    error. The skeleton is kept and made valid, and the function raises so
    that callers cannot mistake the stub for a working implementation.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix.
    cm_indices : dict
        Maps a class code to its position along the axes of ``cm``.
    samp_key : hashable
        Class code whose column of ``cm`` is selected.
    epsilon : unspecified
        Perturbation parameter; its meaning is not defined yet -- TODO.

    Raises
    ------
    NotImplementedError
        Always, until the perturbation step is written.
    """
    cm_row = cm.T[cm_indices[samp_key]]
    for typeid in cm_indices.keys():
        if typeid != samp_key:
            # TODO: perturb the entry of `cm_row` for `typeid` using `epsilon`.
            pass
    raise NotImplementedError('perturb_subsamp is not implemented yet')

## next, make samples corresponding to metric values

start with purity and efficiency for binary classification situation

In [None]:
# tot_in_test = np.sum(cm, axis=1)
# print(tot_in_test)

# tp_in_test = np.diag(cm)
# tpr_in_test = tp_in_test / tot_in_test
# # print((tp_in_test, tpr_in_test))
# efficiency = tpr_in_test
# print(efficiency)

# contamination_raw = np.sum(cm, axis=0) - tp_in_test
# contamination_rate = contamination_raw / tot_in_test
# # print((contamination_raw, contamination_rate))
# purity = tp_in_test / (tp_in_test + contamination_raw)
# print(purity)