In [None]:
import numpy as np
import pandas as pd

import proclam
from proclam import *

import matplotlib as mpl
# Select the 'Agg' backend before pyplot is imported below.
# NOTE(review): the `%matplotlib inline` magic that follows the pyplot import
# presumably replaces this backend in a live notebook session -- confirm which
# of the two is intended.
mpl.use('Agg')
# Text rendering: no external LaTeX, serif fonts throughout.
mpl.rcParams['text.usetex'] = False
mpl.rcParams['mathtext.rm'] = 'serif'
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.serif'] = 'Times New Roman'
# Default font sizes for axes titles and axis labels.
mpl.rcParams['axes.titlesize'] = 16
mpl.rcParams['axes.labelsize'] = 14
# Defaults applied by savefig: resolution, file format and tight bounding box.
# The plotting helpers in this notebook pass filenames ending in '.png'.
mpl.rcParams['savefig.dpi'] = 250
mpl.rcParams['savefig.format'] = 'pdf'
mpl.rcParams['savefig.bbox'] = 'tight'
import matplotlib.pyplot as plt
%matplotlib inline

<!-- ![](./header.png) -->
<img src="./header.png" width="100%">

# Selection of a performance metric for PLAsTiCC

*Alex Malz (NYU)*, *Tarek Alam (UCL)*, *Anita Bahmanyar (U. Toronto)*, *Rahul Biswas (U. Stockholm)*, *Renee Hlozek (U. Toronto)*, *Rafael Martinez-Galarza (Harvard)*, *Gautham Narayan (STScI)*

We describe and illustrate the process by which a global performance metric was chosen for the Photometric LSST Astronomical Time-series Classification Challenge (PLAsTiCC), a Kaggle competition aiming to identify promising transient and variable classifiers for LSST by involving the broader community outside astronomy.

Introduction
============

The metric described in this note is intended for the first version of the Kaggle competition; an early classification challenge and the identification of class-specific metrics for different science goals are planned for the future. The metric must satisfy the following criteria:

* The metric must return a single scalar value.
* The metric must be well-defined for non-binary classes.
* The metric must balance diverse science use cases in the presence of heavily nonuniform class prevalence.
* The metric must respect the information content of probabilistic classifications.
* The metric must be able to evaluate deterministic classifications.
* The metric must be interpretable, meaning it gives a more optimal value for "good" mock classifiers and a less optimal value for mock classifiers plagued by anticipated systematic errors; in other words, it must pass basic tests of intuition.
* The metric must be reliable, giving consistent results for different instantiations of the same test case.

Data
====



In [None]:
def plot_cm(probs, truth, name, loc=''):
    """Plot the confusion matrix of a set of classifications and save it.

    Parameters
    ----------
    probs:
        Classification probabilities, forwarded unchanged to
        `proclam.metrics.util.prob_to_cm`.
    truth:
        True classes, forwarded unchanged to
        `proclam.metrics.util.prob_to_cm`.
    name: str
        Figure title; also used as the stem of the output filename.
    loc: str, optional
        Prefix prepended to the output filename (default '').

    Notes
    -----
    The figure is written to `loc + name + '_cm.png'` and then closed;
    nothing is returned.
    """
    confusion = proclam.metrics.util.prob_to_cm(probs, truth)
    # The transposed matrix is drawn on a fixed [0, 1] colour scale.
    plt.matshow(confusion.T, vmin=0., vmax=1.)
    # Tick labels are disabled until a list of class names is available:
    #     plt.xticks(range(max(truth)+1), names)
    #     plt.yticks(range(max(truth)+1), names)
    plt.xlabel('predicted class')
    plt.ylabel('true class')
    plt.colorbar()
    plt.title(name)
    outfile = loc + name + '_cm.png'
    plt.savefig(outfile)
    plt.close()
    
def plot_cm_from_cm(cm, text):
    """Draw a confusion matrix that has already been computed.

    Parameters
    ----------
    cm:
        Two-dimensional confusion matrix; its transpose is what gets drawn.
    text: str
        Title of the figure.

    Notes
    -----
    NOTE(review): the figure is closed without being saved or shown, so this
    function currently leaves no visible output -- confirm whether it should
    save the figure the way `plot_cm` does.
    """
    plt.matshow(cm.T)
    # Call plt.title rather than assigning to it: an assignment replaces the
    # pyplot function with a string, so no title is set and every later
    # plt.title(...) call in the session fails.
    plt.title(text)
    plt.xlabel('predicted class')
    plt.ylabel('true class')
    plt.close()

In [None]:
def make_class_pairs(data_info_dict):
    """Pair each classification file with its truth-table file.

    Parameters
    ----------
    data_info_dict: dict
        Must provide the keys 'classifications' and 'truth_tables', two
        iterables that are paired element by element.

    Returns
    -------
    list of tuple
        `(classification, truth_table)` pairs. A list is returned instead of
        a bare `zip` object so the pairs can be iterated more than once (for
        example when a cell is re-run); an exhausted `zip` iterator would
        silently yield nothing the second time.
    """
    return list(zip(data_info_dict['classifications'], data_info_dict['truth_tables']))

def make_file_locs(data_info_dict, base_dir=None):
    """Fill in the directory and file paths of a dataset description.

    The dictionary is updated in place and also returned, so callers may
    either keep using their own reference or rebind the return value.

    Parameters
    ----------
    data_info_dict: dict
        Must provide 'label' (str) and 'names' (list of str). The keys
        'dirname', 'classifications' and 'truth_tables' are added or
        overwritten.
    base_dir: str, optional
        Directory prefix under which the dataset directory lives. When left
        as None, the notebook-level `dirname` variable is used, which is the
        only behaviour the function had before this parameter existed.

    Returns
    -------
    dict
        The same `data_info_dict` object, after the update.
    """
    names = data_info_dict['names']
    if base_dir is None:
        # Resolved at call time, so `dirname` only has to exist by the time
        # this function is actually invoked without an explicit base_dir.
        base_dir = dirname
    data_info_dict['dirname'] = base_dir + data_info_dict['label'] + '/'
    data_info_dict['classifications'] = ['%s/predicted_prob_%s.csv'%(name, name) for name in names]
    data_info_dict['truth_tables'] = ['%s/truth_table_%s.csv'%(name, name) for name in names]
    print(data_info_dict)
    return data_info_dict

In [None]:
def process_strings(dataset, cc):
    """Return the directory of a dataset and a title for one of its classifiers.

    Parameters
    ----------
    dataset: dict
        Must provide the keys 'dirname', 'label' and 'names'.
    cc: int
        Index into `dataset['names']` selecting the classifier.

    Returns
    -------
    tuple
        `(dirname, title)`, where title is the dataset label and the
        classifier name joined by a single space.
    """
    directory = dataset['dirname']
    label, classifier = dataset['label'], dataset['names'][cc]
    return directory, label + ' ' + classifier

def just_read_class_pairs(pair, dataset, cc):
    """Read one (classification, truth-table) pair of whitespace-delimited files.

    Parameters
    ----------
    pair: sequence
        `pair[0]` is the classification-probability file and `pair[1]` the
        truth-table file, both relative to `dataset['dirname']`.
    dataset: dict
        Must provide the key 'dirname', the prefix prepended to both names.
    cc: int
        Index of the classifier within the dataset. It is not needed to read
        the files and is kept only so the signature matches
        `read_class_pairs`.

    Returns
    -------
    pmat: numpy.ndarray
        Values of the classification file, one row per line of the file.
    tvec: numpy.ndarray
        Column index of every truth-table entry equal to 1.
        NOTE(review): this has one entry per row only if every row of the
        truth table contains exactly one 1 -- confirm for new input files.
    """
    loc = dataset['dirname']
    clfile, truthfile = pair[0], pair[1]
    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True option.
    pmat = pd.read_csv(loc + clfile, sep=r'\s+').values
    truth_values = pd.read_csv(loc + truthfile, sep=r'\s+').values
    tvec = np.where(truth_values == 1)[1]
    return pmat, tvec
    
def read_class_pairs(pair, dataset, cc):
    """Read one classification/truth pair and save its confusion-matrix plot.

    Parameters
    ----------
    pair: sequence
        Classification file and truth-table file, as accepted by
        `just_read_class_pairs`.
    dataset: dict
        Dataset description; must provide 'dirname', 'label' and 'names'.
    cc: int
        Index of the classifier within `dataset['names']`.

    Returns
    -------
    tuple
        The `(pmat, tvec)` pair returned by `just_read_class_pairs`.
    """
    directory, figure_title = process_strings(dataset, cc)
    probabilities, true_classes = just_read_class_pairs(pair, dataset, cc)
    # The plot goes into the classifier's own sub-directory of the dataset.
    figure_dir = directory + dataset['names'][cc] + '/'
    plot_cm(probabilities, true_classes, figure_title, loc=figure_dir)
    return probabilities, true_classes

In [None]:
# Attach the file locations and the (classification, truth-table) file pairs
# to every dataset description. make_file_locs updates each dict in place, so
# the module-level `mystery`, `snphotcc` and `plasticc` dicts gain the new keys.
# NOTE(review): `mystery`, `snphotcc`, `plasticc` and the `dirname` global read
# by make_file_locs are only defined in later cells, so this cell fails on a
# fresh "Restart & Run All" unless those cells have been run first.
for dataset in [mystery, snphotcc, plasticc]:
    dataset = make_file_locs(dataset)
    dataset['class_pairs'] = make_class_pairs(dataset)

## Mock classifier systematics

* idealized: highly accurate on all classes
* guessing: random classifications across all classes
* tunnel vision: classifies one class well and others randomly
* cruise control: classifies all objects as a single class
* subsumed: consistently misclassifies one class as one other class

*show confusion matrices*

In [None]:
# Number of classes used for the mock classifiers and the simulated truth.
M_classes = 13
# Description of the mock dataset; classifier names are filled in as the
# individual test cases are built.
plasticc = {'label': 'ProClaM', 'names': []}

In [None]:
# Idealized test case: an identity confusion matrix plus a small amount of
# uniform noise in every entry.
testname = 'Idealized'
cm = np.eye(M_classes) + 0.2 * np.random.uniform(size=(M_classes, M_classes))
# Normalise every row to sum to 1. keepdims=True keeps the row sums as a
# column vector; a flat vector of row sums would broadcast across columns
# and divide column j by the sum of row j instead.
cm = cm / np.sum(cm, axis=1, keepdims=True)
plot_cm_from_cm(cm, testname)
plasticc['names'].append(testname)

In [None]:

# Complete list of mock test cases. This assignment replaces whatever names
# were appended to the list by earlier cells.
plasticc['names'] = [
    'Idealized',
    'Guess',
    'Tunnel',
    'Broadbrush',
    'Cruise',
    'SubsumedTo',
    'SubsumedFrom',
]

## Real classification results

* SNPhotCC \[from Michelle?\]
* \[Ashish's data?\]
* \[Renee's data?\]

*show confusion matrices*

In [None]:
# Every classifier name is a feature-extraction prefix followed by an
# algorithm suffix; prefixes vary slowest.
prefixes = ['Templates', 'Wavelets']
suffixes = ['BoostForest', 'KNN', 'NB', 'NeuralNetwork', 'SVM']
snphotcc = {
    'label': 'SNPhotCC',
    'names': [prefix + suffix for prefix in prefixes for suffix in suffixes],
}

In [None]:
# Dataset description for the classification results labelled 'Unknown'.
mystery = {
    'label': 'Unknown',
    'names': ['RandomForest', 'KNeighbors', 'MLPNeuralNet'],
}

Methods (Metrics)
======

We considered two metrics of classification probabilities, the Brier score and the log-loss, each of which is interpretable and avoids reducing probabilities to point estimates.

The Brier score is defined as
\begin{eqnarray*}
B &=& \sum_{m=1}^{M}\frac{w_{m}}{N_{m}}\sum_{n=1}^{N_{m}}\left((1-p_{n}(m | m))^{2}+\sum_{m'\neq m}^{M}(p_{n}(m' | m))^{2}\right)
\end{eqnarray*}

The log-loss is defined as
\begin{eqnarray*}
L &=& -\sum_{m=1}^{M}\frac{w_{m}}{N_{m}}\sum_{n=1}^{N_{m}}\ln[p_{n}(m | m)]
\end{eqnarray*}

Both metrics are averaged over the objects within each class and then combined as a weighted average across classes, with weights $w_{m}$.

In [None]:
# Root directory prepended to every dataset directory.
dirname = 'examples/'
# Names of the metric classes looked up on proclam.metrics.
metricslist = ['Brier', 'LogLoss']
# Marker shape and colour used for each metric, indexed like metricslist.
markerlist = ['o', 's', '*']
colors = ['b', 'r']

Results
=======

*one plot per set of "true" classes: classifiers on x axis, metrics on y axes*

In [None]:
def make_patch_spines_invisible(ax):
    """Switch the frame of `ax` on while hiding its patch and all of its spines.

    Callers re-enable the individual spines they want to keep visible.
    """
    ax.set_frame_on(True)
    ax.patch.set_visible(False)
    for spine in ax.spines.values():
        spine.set_visible(False)
        
def per_metric_helper(ax, n, data, metric_names, codes, shapes, colors):
    """Scatter the values of one metric onto its own y-axis.

    Parameters
    ----------
    ax:
        Base axes of the figure. The first metric is drawn on it directly;
        every later metric is drawn on a new `ax.twinx()` axes.
    n: int
        Index of the metric to draw.
    data:
        Indexable by `n`; `data[n]` holds one value per classifier.
    metric_names, shapes, colors:
        Sequences indexed by `n`, giving the label, marker and colour.
    codes:
        Classifier names; only their number is used here.

    Returns
    -------
    tuple
        `(ax, ax_n, handle)`: the base axes, the axes drawn on, and the
        object returned by `scatter`.
    """
    plot_n = n + 1
    base_x = np.arange(len(codes))
    # Horizontal shift of 0.1 per metric, relative to the second metric.
    shift = 0.1 * (plot_n - 2)
    if plot_n > 1:
        ax_n, rot_ang, label_space = ax.twinx(), 270, 15.
    else:
        ax_n, rot_ang, label_space = ax, 90, 0.
    if plot_n > 2:
        # From the third metric on, the right spine moves outwards in steps
        # of 0.1 (axes coordinates) and is the only spine left visible.
        ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (plot_n-1)))
        make_patch_spines_invisible(ax_n)
        ax_n.spines["right"].set_visible(True)
    values = data[n]
    positions = base_x + shift * np.ones_like(values)
    handle = ax_n.scatter(positions, values, marker=shapes[n], s=10,
                          color=colors[n], label=metric_names[n])
    ax_n.set_ylabel(metric_names[n], rotation=rot_ang, fontsize=14,
                    labelpad=label_space)
#     ax_n.set_ylim(0.9 * min(data[n]), 1.1 * max(data[n]))
    return (ax, ax_n, handle)

def metric_plot(dataset, metric_names, shapes, colors):
    """Plot every metric against the classifiers of one dataset and save it.

    Parameters
    ----------
    dataset: dict
        Must provide 'names' (classifier names), 'results' (indexable by
        metric, one value per classifier), 'label' (figure title) and
        'dirname' (output directory prefix).
    metric_names: sequence of str
        One entry per metric; each metric gets its own y-axis.
    shapes, colors: sequences
        Marker and colour for each metric, indexed like `metric_names`.

    Notes
    -----
    The figure is written to `dirname + label + '_results.png'` and left
    open; nothing is returned.
    """
    codes = dataset['names']
    data = dataset['results']
    title = dataset['label']
    fileloc = dataset['dirname']+dataset['label']+'_results.png'
    xs = np.arange(len(codes))
    fig, ax = plt.subplots()
    fig.subplots_adjust(right=1.)
    handles = []
    top_ax = ax
    for n in range(len(metric_names)):
        (ax, top_ax, handle) = per_metric_helper(ax, n, data, metric_names, codes, shapes, colors)
        handles.append(handle)
    # The x-axis decorations are set on the base axes explicitly. pyplot
    # calls would target the most recently created twinx() axes, whose
    # x-axis is hidden, so an x-label set there is never drawn.
    ax.set_xticks(xs)
    ax.set_xticklabels(codes, rotation=90)
    ax.set_xlabel('Classifiers', fontsize=14)
    # The legend stays on the last axes created so it is drawn on top of
    # the markers of every metric.
    top_ax.legend(handles, metric_names)
    fig.suptitle(title)
    fig.savefig(fileloc)
    return

## Mock classifier systematics

In [None]:
# Number of mock objects whose true class is simulated.
N_objects = 10000
# Draw the true classes with proclam's LogUnbalanced simulator.
generator = proclam.simulators.LogUnbalanced()
truth = generator.simulate(M_classes, N_objects)

In [None]:
# Histogram of the simulated true classes with one bin centred on each class.
classes = np.unique(truth)
# Bin width is the smallest gap between distinct class labels; with a single
# class np.diff is empty, so fall back to a width of 1.
d = np.diff(classes).min() if classes.size > 1 else 1
left_of_first_bin = truth.min() - float(d)/2
right_of_last_bin = truth.max() + float(d)/2
# Only one histogram is drawn: a second, default-binned plt.hist on the same
# axes would overlay bars whose bins do not line up with the classes.
plt.hist(truth, np.arange(left_of_first_bin, right_of_last_bin + d, d), log=True)
# plt.xticks(range(max(truth)+1), names)
plt.ylabel('counts')
plt.xlabel('class')

In [None]:
# Evaluate every metric on every mock classifier, store the results on the
# dataset description and plot them.
for dataset in [plasticc]:
    # Start from NaN rather than uninitialised memory: an entry that is never
    # filled (for instance when 'class_pairs' yields fewer pairs than there
    # are names) then shows up as a gap instead of an arbitrary number.
    data = np.full((len(metricslist), len(dataset['names'])), np.nan)
    for cc, pair in enumerate(dataset['class_pairs']):
        probm, truthv = read_class_pairs(pair, dataset, cc)
        for count, metric in enumerate(metricslist):
            # The metric class is looked up by name on proclam.metrics.
            D = getattr(proclam.metrics, metric)()
            data[count, cc] = D.evaluate(probm, truthv)
    dataset['results'] = data
    metric_plot(dataset, metricslist, markerlist, colors)

## Real classification results

In [None]:
# Evaluate every metric on every real classifier, store the results on the
# dataset description and plot them.
for dataset in [mystery, snphotcc]:
    # Start from NaN rather than uninitialised memory: an entry that is never
    # filled (for instance when 'class_pairs' yields fewer pairs than there
    # are names) then shows up as a gap instead of an arbitrary number.
    data = np.full((len(metricslist), len(dataset['names'])), np.nan)
    for cc, pair in enumerate(dataset['class_pairs']):
        probm, truthv = read_class_pairs(pair, dataset, cc)
        for count, metric in enumerate(metricslist):
            # The metric class is looked up by name on proclam.metrics.
            D = getattr(proclam.metrics, metric)()
            data[count, cc] = D.evaluate(probm, truthv)
    dataset['results'] = data
    metric_plot(dataset, metricslist, markerlist, colors)

Conclusions
===========

Write about your conclusions here. You have drawn some, right?

In [None]:
# Cells with a tag of "hideme" will not appear in the HTML resulting from:
# jupyter nbconvert desc_note/main.ipynb --TagRemovePreprocessor.remove_cell_tags='["hideme"]'
# jupyter nbconvert desc_note/main.ipynb --TagRemovePreprocessor.remove_input_tags='["hidein"]'
