### Simple notebook that calculates the model performance metrics
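The three metrics calculated are:
* Accuracy,
* Jensen-Shannon divergence (reported as the Jensen-Shannon *distance*, i.e. the square root of the divergence, which is what `scipy.spatial.distance.jensenshannon` returns), and
* Kullback–Leibler divergence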

In [1]:
import os

import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy

### Helper functions

In [2]:
# Note: This is adapted from the ChaosNLI github repository:
# https://github.com/easonnie/ChaosNLI/blob/master/distnli/src/evaluation/tools.py#L24
def model_label_dist(logits_list):
    """Turn the raw scores of one example into a probability distribution.

    A softmax is applied over all entries of ``logits_list``. Probabilities
    whose magnitude is below 1e-15 are raised to 1e-15 and the vector is
    renormalised, so that a later KL divergence never sees an exact zero.

    Parameters
    ----------
    logits_list : array-like of float
        Un-normalised scores for a single example.

    Returns
    -------
    numpy.ndarray
        Probabilities summing to 1.

    Raises
    ------
    ValueError
        If ``logits_list`` is empty.
    AssertionError
        If the result does not sum to 1 (for instance when the input
        contains NaN).
    """
    logits = np.asarray(logits_list, dtype=float)
    if logits.size == 0:
        raise ValueError('logits_list must contain at least one score')

    # Softmax is unchanged by subtracting a constant from every logit; shifting
    # by the maximum keeps np.exp from overflowing to inf on large scores,
    # which would otherwise yield inf / inf = NaN.
    exp_shifted = np.exp(logits - np.max(logits))
    prob = exp_shifted / np.sum(exp_shifted)

    # numerical stability for KL
    prob[np.abs(prob) < 1e-15] = 1e-15

    # normalize
    prob = prob / np.sum(prob)
    assert np.isclose(np.sum(prob), 1)

    return prob


def generate_metrics_dict(df):
    """Aggregate the per-example metrics of ``df`` into a summary dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``label_g``, ``predicted_label``, ``jsd``
        and ``kld``.

    Returns
    -------
    dict
        ``'acc'`` is the fraction of rows where ``label_g`` equals
        ``predicted_label``; ``'jsd'`` and ``'kld'`` are the column means.
        For a frame with no rows all three values are NaN.
    """
    n_examples = len(df)

    if n_examples == 0:
        # An empty subset (e.g. an ambiguity bucket with no examples) should
        # not abort the whole report with ZeroDivisionError; report NaN, which
        # is also what .mean() yields for the other two metrics.
        accuracy = float('nan')
    else:
        accuracy = len(df.query('label_g == predicted_label')) / n_examples

    return {'acc': accuracy,
            'jsd': df.jsd.mean(),
            'kld': df.kld.mean()}


def generate_all_metrics_dict(df):
    """Build the metric summary for ``df`` overall and per ambiguity level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amb_level`` column (0, 1 or 2) plus whatever columns
        ``generate_metrics_dict`` needs.

    Returns
    -------
    dict
        Keys ``'all'``, ``'no-ambiguity'`` (``amb_level == 0``),
        ``'medium-ambiguity'`` (``amb_level == 1``) and ``'high-ambiguity'``
        (``amb_level == 2``), each mapping to the result of
        ``generate_metrics_dict`` on that subset.
    """
    # Use the frame that was passed in, not a notebook-level global, so the
    # result does not depend on which cell happened to run last.
    metrics = {'all': generate_metrics_dict(df)}

    for subset_name, level in (('no-ambiguity', 0),
                               ('medium-ambiguity', 1),
                               ('high-ambiguity', 2)):
        metrics[subset_name] = generate_metrics_dict(df.query(f'amb_level == {level}'))

    return metrics


def calculate_example_metrics(df):
    """Attach per-example metric columns to ``df`` and return it.

    The frame is modified in place; the returned object is the same frame.
    Columns added, in order:

    * ``predicted_probs`` - ``model_label_dist`` applied to ``predicted_scores``
    * ``jsd`` - ``scipy.spatial.distance.jensenshannon`` between ``label`` and
      ``predicted_probs`` (scipy returns the Jensen-Shannon distance, i.e. the
      square root of the divergence)
    * ``kld`` - ``scipy.stats.entropy(label, predicted_probs)``, the KL
      divergence of the prediction from the label distribution
    * ``amb_level`` - ``get_ambiguity_level`` of the row
    """
    def _row_probs(row):
        return model_label_dist(row.predicted_scores)

    def _row_jsd(row):
        return jensenshannon(row.label, row.predicted_probs)

    def _row_kld(row):
        return entropy(row.label, row.predicted_probs)

    # predicted_probs has to exist before the two divergence columns read it.
    df['predicted_probs'] = df.apply(_row_probs, axis=1)
    df['jsd'] = df.apply(_row_jsd, axis=1)
    df['kld'] = df.apply(_row_kld, axis=1)
    df['amb_level'] = df.apply(get_ambiguity_level, axis=1)

    return df


def get_ambiguity_level(x):
    """Bucket an example by the size of the largest entry of ``x.label``.

    The largest entry is multiplied by 5 (presumably the number of annotator
    votes per example - confirm against the dataset) and compared against
    half-way thresholds:

    * above 4.5 -> 0
    * above 3.5 (but not above 4.5) -> 1
    * otherwise -> 2
    """
    # Comparing against x.5 rather than whole numbers leaves room for
    # floating-point imprecision in the scaled value.
    top_scaled = max(x.label) * 5

    if top_scaled > 4.5:
        return 0
    return 1 if top_scaled > 3.5 else 2

In [3]:
# Root directory that holds the evaluation outputs read by the cells below.
# Set the FP_OUTPUT_PATH environment variable to point at your own copy; the
# fallback is the machine-specific location the notebook was originally run
# from, so existing runs keep working unchanged.
BASE_OUTPUT_PATH = os.environ.get(
    'FP_OUTPUT_PATH',
    '/Users/richardross/workspace/msds/dsc-395t-nlp-final-project/fp-dataset-artifacts/output_new',
)

### Evaluate the baseline performance
This model has been trained on the training dataset using gold labels, fine tuned on the dev dataset using gold labels, and evaluated on the test dataset using gold labels.

In [4]:
# Read this run's predictions (one JSON record per line), add the per-example
# metric columns, then display the aggregate report as the cell output.
eval_df = calculate_example_metrics(
    pd.read_json(
        f'{BASE_OUTPUT_PATH}/trained_model_snli_baseline/eval_from_train_dev_baseline_on_test_gold_labels/eval_predictions.jsonl',
        lines=True,
    )
)
generate_all_metrics_dict(eval_df)

{'all': {'acc': 0.8748982084690554,
  'jsd': 0.2003502430164696,
  'kld': 0.8838935015687398},
 'no-ambiguity': {'acc': 0.9397097134350577,
  'jsd': 0.07225231323663986,
  'kld': 0.28704277771898407},
 'medium-ambiguity': {'acc': 0.8676419366074538,
  'jsd': 0.31293194168363136,
  'kld': 1.2630094438199184},
 'high-ambiguity': {'acc': 0.6675110829639012,
  'jsd': 0.43162080524787133,
  'kld': 2.2259035837507875}}

### Evaluate the baseline performance
This model has been trained on the training dataset using gold labels, fine tuned on the dev dataset using gold labels, and evaluated on the test dataset using the probability distributions.

In [5]:
# Read this run's predictions (one JSON record per line), add the per-example
# metric columns, then display the aggregate report as the cell output.
eval_df = calculate_example_metrics(
    pd.read_json(
        f'{BASE_OUTPUT_PATH}/trained_model_snli_baseline/eval_from_train_dev_baseline_on_test_probs/eval_predictions.jsonl',
        lines=True,
    )
)
generate_all_metrics_dict(eval_df)

{'all': {'acc': 0.8748982084690554,
  'jsd': 0.2003502430164696,
  'kld': 0.8838935015687398},
 'no-ambiguity': {'acc': 0.9397097134350577,
  'jsd': 0.07225231323663986,
  'kld': 0.28704277771898407},
 'medium-ambiguity': {'acc': 0.8676419366074538,
  'jsd': 0.31293194168363136,
  'kld': 1.2630094438199184},
 'high-ambiguity': {'acc': 0.6675110829639012,
  'jsd': 0.43162080524787133,
  'kld': 2.2259035837507875}}

### Evaluate the baseline performance
This model has been trained on the training dataset using gold labels and evaluated on the test dataset using the probability distributions (no fine-tuning).

In [6]:
# Read this run's predictions (one JSON record per line), add the per-example
# metric columns, then display the aggregate report as the cell output.
eval_df = calculate_example_metrics(
    pd.read_json(
        f'{BASE_OUTPUT_PATH}/trained_model_snli_baseline/eval_from_train_on_test_probs/eval_predictions.jsonl',
        lines=True,
    )
)
generate_all_metrics_dict(eval_df)

{'all': {'acc': 0.8903705211726385,
  'jsd': 0.1938702996261007,
  'kld': 0.3650240844393278},
 'no-ambiguity': {'acc': 0.9534797171566803,
  'jsd': 0.12507782638436302,
  'kld': 0.13604502757204934},
 'medium-ambiguity': {'acc': 0.8909787530477186,
  'jsd': 0.24889274810316228,
  'kld': 0.5160997803094306},
 'high-ambiguity': {'acc': 0.6744775174160861,
  'jsd': 0.327955987798016,
  'kld': 0.8696441786519243}}

### Evaluate the probability distribution performance
This model has been trained on the training dataset using gold labels, fine tuned on the dev dataset using probability distributions, and evaluated on the test dataset using the probability distributions.

In [7]:
# Read this run's predictions (one JSON record per line), add the per-example
# metric columns, then display the aggregate report as the cell output.
eval_df = calculate_example_metrics(
    pd.read_json(
        f'{BASE_OUTPUT_PATH}/trained_model_snli_baseline/eval_from_train_dev_on_test_probs/eval_predictions.jsonl',
        lines=True,
    )
)
generate_all_metrics_dict(eval_df)

{'all': {'acc': 0.8905741042345277,
  'jsd': 0.21167255341506863,
  'kld': 0.2573941631956475},
 'no-ambiguity': {'acc': 0.9542240416821735,
  'jsd': 0.18507676339716447,
  'kld': 0.1685656369707211},
 'medium-ambiguity': {'acc': 0.8861024033437827,
  'jsd': 0.2219503197507553,
  'kld': 0.2965972979670458},
 'high-ambiguity': {'acc': 0.682077264091197,
  'jsd': 0.28350175443245973,
  'kld': 0.4884342518619364}}