In [34]:
import glob
import json
import numpy as np
import random
from src.umls import Umls
# Module-level Umls instance, constructed with the argument 'umls/processed'
# (presumably a directory of preprocessed UMLS data -- confirm against src.umls).
# get_results() reads this global to call umls.similarity().
umls = Umls('umls/processed')

In [20]:
def get_results(file):
    """Load per-mention scores from a results JSON file.

    The file is parsed as JSON and its 'result' entry is iterated as a list
    of mentions. Three parallel lists (one item per mention) are returned:

        acc1        'label' of the first candidate
        acc5        largest 'label' among the first five candidates
        similarity  umls.similarity(golden_cui, cui of the first candidate),
                    computed with the module-level `umls` object
    """
    with open(file) as handle:
        mentions = json.load(handle)['result']

    # Similarity between the gold CUI and the top-ranked predicted CUI.
    similarity = [
        umls.similarity(entry['golden_cui'], entry['candidates'][0]['cui'])
        for entry in mentions
    ]

    # Label of the top-ranked candidate only.
    acc1 = []
    for entry in mentions:
        acc1.append(entry['candidates'][0]['label'])

    # Best label found anywhere in the top five candidates.
    acc5 = []
    for entry in mentions:
        top_five = entry['candidates'][:5]
        acc5.append(max(cand['label'] for cand in top_five))

    return acc1, acc5, similarity

def get_cuis(file):
    """Return (gold, preds) CUI lists read from a results JSON file.

    `gold` holds each mention's 'golden_cui'; `preds` holds the 'cui' of each
    mention's first candidate. Both lists follow the order of the file's
    'result' entry.
    """
    with open(file) as handle:
        mentions = json.load(handle)['result']

    gold = []
    for entry in mentions:
        gold.append(entry['golden_cui'])

    preds = []
    for entry in mentions:
        preds.append(entry['candidates'][0]['cui'])

    return gold, preds

# Baseline run; the comparison loops in later cells test every other results
# file against these baseline_* lists.
baseline_file = 'results/bb_binary_ce.json'
# Per-mention score lists for the baseline (see get_results).
baseline_acc1, baseline_acc5, baseline_similarity = get_results(baseline_file)
# Gold CUIs and top-candidate CUIs for the baseline (see get_cuis).
# NOTE(review): these two names are not read by any other cell visible here.
baseline_gold, baseline_preds = get_cuis(baseline_file)

In [106]:
def approximate_randomization(out_A, out_B, R, seed=None):
    """
    Approximate randomization for assessing statistical significance between NLP results.
    This function takes in scores computed for each prediction by systems A and B.
    It iterates R times; every iteration starts again from the original paired outputs
    and swaps each pair independently with 50% probability to generate random outputs.
    The p-value is determined by the proportion of iterations in which randomness
    generated an absolute difference in mean score at least as large as the actual
    absolute difference between the systems' respective outputs.

    NOTE: This method is intended for assessing statistical difference between lists
    of scores which have already been scored individually (i.e. accuracy, similarity).
    Approximate randomization is also appropriate for more complex metrics like precision
    and F1, but the method would need to be adapted to pass in lists of non-scored outputs
    (i.e. CUIs) and score the performance of A, B after each iteration.

    Parameters:
        out_A   list: output scores from system/configuration A
        out_B   list: output scores from system/configuration B
        R       int: number of iterations
        seed    int or None: optional seed for a private random generator, making the
                returned p-value reproducible. With the default (None) the global
                np.random state is used, as in earlier versions of this function.

    Returns:
        float: the p-value (r+1)/(R+1), where r counts the iterations whose shuffled
        difference was >= the observed difference.

    Raises:
        AssertionError: if out_A and out_B have different lengths.
        ValueError: if the inputs are empty (the means would be NaN and the resulting
        p-value of 1/(R+1) would look spuriously significant).
    """
    # Private float copies. Casting both sides to float keeps mixed int/float inputs
    # from being truncated when values are moved between the two arrays.
    out_A = np.array(out_A, dtype=float)
    out_B = np.array(out_B, dtype=float)
    assert(len(out_A)==len(out_B))
    if len(out_A) == 0:
        raise ValueError("approximate_randomization needs at least one paired score")

    # Fall back to the global numpy generator unless a seed was requested.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Test statistic: absolute difference in scores
    t = abs(out_A.mean()-out_B.mean())
    r = 0
    for _ in range(R):
        # Randomly swap paired outputs 50% of the time
        swap_ix = rng.choice(a=[False, True], size=len(out_A), p=[0.5, 0.5])

        # Build the shuffled pair from fresh copies so the originals are never
        # modified and swaps cannot accumulate from one iteration to the next.
        X = out_A.copy()
        Y = out_B.copy()
        X[swap_ix] = out_B[swap_ix]
        Y[swap_ix] = out_A[swap_ix]

        if abs(X.mean()-Y.mean()) >= t:
            # Count times randomness produces a difference at least as large as observed
            r += 1

    # Calculate p-value (the +1 terms keep p strictly above zero)
    p = (r+1)/(R+1)
    return p
# Quick smoke run with only 5 iterations; since p = (r+1)/(R+1), the smallest
# p-value this call can return is 1/6.
# NOTE(review): `acc1` is not assigned in any cell above this one -- it is set inside
# the comparison loops in later cells, so this call relies on leftover kernel state
# and fails with NameError on a fresh kernel.
approximate_randomization(baseline_acc1, acc1, R=5)

0.16666666666666666

In [105]:
# Compare every 'results/*unsupervised.json' run against the baseline lists,
# using R randomization iterations per metric.
# NOTE(review): `sig` is defined in a later cell of this notebook (the one that
# loops over 'results/*ce.json'); on a fresh kernel that cell must run first,
# otherwise the print calls below raise NameError.
R = 50*1000
for file in glob.glob('results/*unsupervised.json'):
    print(file)
    acc1, acc5, similarity = get_results(file)
    # One test per metric, each pairing the baseline list with this file's list.
    acc1_p = approximate_randomization(baseline_acc1, acc1, R)
    acc5_p = approximate_randomization(baseline_acc5, acc5, R)
    sim_p = approximate_randomization(baseline_similarity, similarity, R)
    # Columns: metric name, significance stars from sig(), raw p-value.
    print(f"acc@1 \t{sig(acc1_p)}\tp={acc1_p}")
    print(f"acc@5 \t{sig(acc5_p)}\tp={acc5_p}")
    print(f"sim \t{sig(sim_p)}\tp={sim_p}")
    print("-"*99)

results\bb_unsupervised.json
acc@1 	***	p=1.999960000799984e-05
acc@5 	***	p=1.999960000799984e-05
sim 	***	p=1.999960000799984e-05
---------------------------------------------------------------------------------------------------
results\cb_unsupervised.json
acc@1 	***	p=1.999960000799984e-05
acc@5 	***	p=1.999960000799984e-05
sim 	***	p=1.999960000799984e-05
---------------------------------------------------------------------------------------------------


In [103]:
def sig(p):
    """Return the significance marker for p-value `p`.

    '***' for p < .001, '**' for p < .01, '*' for p < .05, and an empty
    string otherwise.
    """
    # Thresholds are checked from strictest to loosest; the first match wins.
    for cutoff, stars in ((.001, '***'), (.01, '**'), (.05, '*')):
        if p < cutoff:
            return stars
    return ''

# Compare every 'results/*ce.json' run against the baseline lists, using R
# randomization iterations per metric.
# The glob pattern also matches the baseline file itself ('results/bb_binary_ce.json'),
# which is why that file is reported with p=1.0 for all three metrics.
R = 50*1000
for file in glob.glob('results/*ce.json'):
    print(file)
    acc1, acc5, similarity = get_results(file)
    # One test per metric, each pairing the baseline list with this file's list.
    acc1_p = approximate_randomization(baseline_acc1, acc1, R)
    acc5_p = approximate_randomization(baseline_acc5, acc5, R)
    sim_p = approximate_randomization(baseline_similarity, similarity, R)
    # Columns: metric name, significance stars from sig(), raw p-value.
    print(f"acc@1 \t{sig(acc1_p)}\tp={acc1_p}")
    print(f"acc@5 \t{sig(acc5_p)}\tp={acc5_p}")
    print(f"sim \t{sig(sim_p)}\tp={sim_p}")
    print("-"*99)


results\bb_binary_ce.json
acc@1 		p=1.0
acc@5 		p=1.0
sim 		p=1.0
---------------------------------------------------------------------------------------------------
results\bb_binary_sce.json
acc@1 		p=0.25387492250154997
acc@5 		p=0.07223855522889543
sim 		p=0.36029279414411713
---------------------------------------------------------------------------------------------------
results\bb_linear_ce.json
acc@1 	**	p=0.0027599448011039777
acc@5 	*	p=0.01633967320653587
sim 		p=0.10477790444191117
---------------------------------------------------------------------------------------------------
results\bb_linear_sce.json
acc@1 	***	p=0.0002799944001119978
acc@5 		p=0.06767864642707146
sim 	**	p=0.006079878402431951
---------------------------------------------------------------------------------------------------
results\bb_log_ce.json
acc@1 		p=0.8383832323353533
acc@5 		p=0.4468310633787324
sim 	*	p=0.047819043619127616
------------------------------------------------------------------