In [7]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy

In [27]:
# Root directory of the model outputs analyzed in this notebook; the
# prediction files loaded later are addressed relative to this path.
# NOTE(review): the directory name suggests the SNLI baseline model -- confirm.
ROOT_OUTPUT_PATH = './output_final/trained_model_snli_baseline'

In [122]:
# Note: This comes from the ChaosNLI github repository:
# https://github.com/easonnie/ChaosNLI/blob/master/distnli/src/evaluation/tools.py#L24
def model_label_dist(logits_list):
    """Convert a 1-D sequence of logits into a probability distribution.

    Applies a softmax over all entries, floors every probability at 1e-15 so
    that a later KL-divergence computation never divides by zero, and then
    renormalizes so the result sums to one.

    Args:
        logits_list: non-empty 1-D sequence (list or array) of real-valued
            logits.

    Returns:
        numpy.ndarray of float64 probabilities with the same length as the
        input; every entry is strictly positive and the entries sum to 1.

    Raises:
        ValueError: if ``logits_list`` is empty.
    """
    logits = np.asarray(logits_list, dtype=np.float64)
    if logits.size == 0:
        raise ValueError('logits_list must contain at least one logit')

    # Softmax is invariant to a constant shift; subtracting the maximum keeps
    # np.exp from overflowing to inf (which would turn the ratio into NaN)
    # when the logits are large.
    exp_shifted = np.exp(logits - np.max(logits))
    prob = exp_shifted / np.sum(exp_shifted)

    # numerical stability for KL: floor vanishing probabilities
    prob = np.where(np.abs(prob) < 1e-15, 1e-15, prob)

    # normalize again, since flooring may have pushed the total above one
    prob = prob / np.sum(prob)
    assert np.isclose(np.sum(prob), 1)

    return prob


def generate_metrics_dict(df):
    """Summarize accuracy and mean divergences for a frame of predictions.

    Args:
        df: DataFrame with the columns ``label_g`` and ``predicted_label``
            (compared for equality to compute accuracy) plus the numeric
            per-example columns ``jsd`` and ``kld``.

    Returns:
        dict with keys ``'acc'`` (fraction of rows where ``label_g`` equals
        ``predicted_label``), ``'jsd'`` and ``'kld'`` (column means). For an
        empty frame all three values are NaN.
    """
    n_examples = len(df)

    if n_examples == 0:
        # An empty subset (e.g. an ambiguity level with no examples) has no
        # defined accuracy; report NaN, as .mean() does for the other metrics,
        # instead of raising ZeroDivisionError.
        accuracy = float('nan')
    else:
        accuracy = len(df.query('label_g == predicted_label')) / n_examples

    return {'acc': accuracy,
            'jsd': df['jsd'].mean(),
            'kld': df['kld'].mean()}


def generate_all_metrics_dict(df):
    """Compute the metric summary overall and for each ambiguity level.

    Args:
        df: DataFrame accepted by ``generate_metrics_dict`` that also has an
            ``amb_level`` column taking the values 0, 1 and 2.

    Returns:
        dict mapping ``'all'``, ``'no-ambiguity'``, ``'medium-ambiguity'`` and
        ``'high-ambiguity'`` (in that order) to the dict produced by
        ``generate_metrics_dict`` for the matching rows.
    """
    # (result key, row filter) for each ambiguity bucket, in output order.
    subset_filters = (
        ('no-ambiguity', 'amb_level == 0'),
        ('medium-ambiguity', 'amb_level == 1'),
        ('high-ambiguity', 'amb_level == 2'),
    )

    metrics = {'all': generate_metrics_dict(df)}
    for subset_name, condition in subset_filters:
        metrics[subset_name] = generate_metrics_dict(df.query(condition))

    return metrics


def calculate_example_metrics(df, use_probs = False):
    """Add per-example divergence and ambiguity columns to ``df``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``label`` column holding the reference label
            distribution of each example, and either ``predicted_scores``
            (when ``use_probs`` is False) or ``predicted_probs`` (when
            ``use_probs`` is True).
        use_probs: when False, ``predicted_probs`` is (re)computed from
            ``predicted_scores`` via ``model_label_dist``; when True the
            existing ``predicted_probs`` column is used as is.

    Returns:
        The same DataFrame with the columns ``jsd``, ``kld`` and
        ``amb_level`` added (plus ``predicted_probs`` if it was computed).
    """
    def _probs_from_scores(row):
        return model_label_dist(row.predicted_scores)

    def _js_distance(row):
        # scipy's jensenshannon returns the Jensen-Shannon *distance*,
        # i.e. the square root of the divergence.
        return jensenshannon(row.label, row.predicted_probs)

    def _kl_divergence(row):
        # entropy(pk, qk) is the KL divergence of the predicted distribution
        # from the reference label distribution.
        return entropy(row.label, row.predicted_probs)

    if not use_probs:
        df['predicted_probs'] = df.apply(_probs_from_scores, axis=1)

    df['jsd'] = df.apply(_js_distance, axis=1)
    df['kld'] = df.apply(_kl_divergence, axis=1)
    df['amb_level'] = df.apply(get_ambiguity_level, axis=1)

    return df


def get_ambiguity_level(x):
    """Bucket an example by how concentrated its label distribution is.

    Args:
        x: row-like object whose ``label`` attribute is an iterable of label
            probabilities.

    Returns:
        0 when the largest probability times five exceeds 4.5, 1 when it
        exceeds 3.5, and 2 otherwise.
    """
    # Five times the top probability approximates the vote count of the
    # majority label (presumably five annotators per example -- confirm).
    # The half-step thresholds absorb floating-point noise such as
    # 0.6000000000000001 rather than comparing against exact integers.
    top_votes = 5 * max(x.label)

    if top_votes > 4.5:
        return 0
    if top_votes > 3.5:
        return 1
    return 2

### Analyze the baseline predictions

In [123]:
# Load the baseline evaluation predictions (JSON Lines), add the per-example
# metrics, and display the summary broken down by ambiguity level.
eval_predictions_path = f'{ROOT_OUTPUT_PATH}/trained_train_gold_dev_gold_evaled_test_probs/eval_predictions.jsonl'
eval_df = calculate_example_metrics(pd.read_json(eval_predictions_path, lines=True))
generate_all_metrics_dict(eval_df)

{'all': {'acc': 0.8890472312703583,
  'jsd': 0.19018686160110934,
  'kld': 0.42401248721189044},
 'no-ambiguity': {'acc': 0.9544101228135468,
  'jsd': 0.10453239072285646,
  'kld': 0.135539159308617},
 'medium-ambiguity': {'acc': 0.8847091605712295,
  'jsd': 0.2620504238347566,
  'kld': 0.6118892730848189},
 'high-ambiguity': {'acc': 0.6744775174160861,
  'jsd': 0.35103983140917144,
  'kld': 1.0642033750592708}}

From this, we can see that accuracy drops dramatically as the ambiguity level increases, while both divergence measures (JSD and KLD) rise.

{'all': {'acc': 0.8905741042345277,
  'jsd': 0.21167255341506863,
  'kld': 0.2573941631956475},
 'no-ambiguity': {'acc': 0.9542240416821735,
  'jsd': 0.18507676339716447,
  'kld': 0.1685656369707211},
 'medium-ambiguity': {'acc': 0.8861024033437827,
  'jsd': 0.2219503197507553,
  'kld': 0.2965972979670458},
 'high-ambiguity': {'acc': 0.682077264091197,
  'jsd': 0.28350175443245973,
  'kld': 0.4884342518619364}}

### Generate a dataset with a uniform probability distribution

In [127]:
# Replace every prediction with the uniform distribution over the three
# labels, then recompute the per-example metrics from those probabilities.
uniform_probs = [1/3, 1/3, 1/3]
unif_df = calculate_example_metrics(
    eval_df.assign(predicted_probs=[uniform_probs] * len(eval_df)),
    use_probs=True,
)
generate_all_metrics_dict(unif_df)

{'all': {'acc': 0.8890472312703583,
  'jsd': 0.48318590731310385,
  'kld': 0.8341626268207454},
 'no-ambiguity': {'acc': 0.9544101228135468,
  'jsd': 0.5641427870206321,
  'kld': 1.0986122886681098},
 'medium-ambiguity': {'acc': 0.8847091605712295,
  'jsd': 0.41644967095404145,
  'kld': 0.5981451496428996},
 'high-ambiguity': {'acc': 0.6744775174160861,
  'jsd': 0.32899810683090697,
  'kld': 0.3632656630524493}}

### Let's take a look at highly ambiguous data (i.e., amb_level == 2) where the model's Jensen-Shannon divergence is greater than that of the uniform-distribution baseline

In [173]:
# Align model and uniform-baseline rows by index; columns coming from the
# uniform frame get the '_unif' suffix.
merge_df = eval_df.merge(
    unif_df,
    left_index=True,
    right_index=True,
    suffixes=('', '_unif'),
)

In [174]:
# Keep only the columns needed to compare the model against the uniform baseline.
comparison_columns = [
    'gold_label', 'premise', 'hypothesis', 'label', 'label_g',
    'predicted_label', 'predicted_probs', 'jsd', 'amb_level',
    'predicted_probs_unif', 'jsd_unif',
]
merge_df = merge_df[comparison_columns]

In [175]:
# Highly ambiguous examples whose 'jsd' exceeds the uniform baseline's 'jsd_unif'.
amb_and_bad_df = (
    merge_df
    .query('amb_level == 2 and jsd > jsd_unif')
    [['label_g', 'predicted_label', 'label', 'predicted_probs', 'jsd']]
)

In [177]:
# Show the first five rows with wide cells so the list-valued columns stay readable.
amb_and_bad_df.head(5).style.set_properties(width='500px')

Unnamed: 0,label_g,predicted_label,label,predicted_probs,jsd
4,0,0,"[0.6000000000000001, 0.2, 0.2]",[0.86094322 0.1373969 0.00165987],0.282339
12,0,1,"[0.6000000000000001, 0.4, 0.0]",[0.05213389 0.92668411 0.021182 ],0.442991
20,1,1,"[0.2, 0.6000000000000001, 0.2]",[0.36759535 0.62382939 0.00857526],0.258996
23,1,2,"[0.0, 0.6000000000000001, 0.4]",[4.55578389e-04 2.47427617e-02 9.74801660e-01],0.476
65,0,0,"[0.6000000000000001, 0.2, 0.2]",[0.94647857 0.04715054 0.0063709 ],0.319863
