In [1]:
from typing import List
from random import choices
import numpy as np
import json
import pandas as pd

In [2]:

def bootstrap(results: List[int], B: int = 10000, confidence_level: float = 0.95) -> "tuple[float, float, np.ndarray]":
    """
    Bootstrap a confidence interval for the mean of ``results``.

    ``results`` is resampled with replacement ``B`` times (each resample has
    the same length as ``results``); the mean of every resample is recorded
    and the interval is read off the percentiles of those means.

    Parameters
    ----------
    results : sequence of numbers, e.g. 0/1 outcomes. Must not be empty.
    B : number of bootstrap resamples. Must be at least 1.
    confidence_level : two-sided confidence level as a fraction in [0, 1]
        (0.95 -> 2.5th and 97.5th percentiles).

    Returns
    -------
    (e_bar, median, percentiles)
        e_bar       -- average of the lower and upper distances from the
                       median, i.e. a symmetric error-bar half-width
        median      -- 50th percentile of the resampled means
        percentiles -- array ``[lower, median, upper]``

    Raises
    ------
    ValueError
        If ``results`` is empty, ``B < 1`` or ``confidence_level`` lies
        outside [0, 1].

    Notes
    -----
    Sampling uses ``random.choices``; call ``random.seed(...)`` beforehand
    for reproducible numbers.
    """
    if not 0 <= confidence_level <= 1:
        raise ValueError(
            f"confidence_level must be a fraction in [0, 1], got {confidence_level!r}"
        )
    if B < 1:
        raise ValueError(f"B must be at least 1, got {B!r}")

    # Materialise once so any iterable/array input can be resampled by index.
    data = list(results)
    n = len(data)
    if n == 0:
        # Without this guard every resample mean would be 0/0 -> NaN.
        raise ValueError("results must contain at least one observation")

    # Percentile ranks (0-100) that cut off equal tails on both sides.
    tail = (1 - confidence_level) / 2
    lower_sig = 100 * tail
    upper_sig = 100 * (1 - tail)

    # Bootstrap resampling loop: one mean per resample.
    means = [np.sum(choices(data, k=n)) / n for _ in range(B)]

    percentiles = np.percentile(means, [lower_sig, 50, upper_sig])
    lower, median, upper = percentiles

    e_bar = ((median - lower) + (upper - median)) / 2
    return e_bar, median, percentiles

In [3]:
# Load the logged outputs. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default text encoding.
with open('bert_logged_cka_outputs_15_03_2023_05_59_55.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# The first entry of data['model_name'] is the key used for lookups below.
model_name = data['model_name'][0]

model_name

'distilbert-base-uncased'

In [13]:
# Split fact indices by their 'p_true > p_false' flag and keep the details
# (score difference plus stem / fact / counterfact) only for the facts whose
# flag is not "True".
false_facts_itrs, true_facts_itrs = [], []
false_facts_list = []
false_facts = {'differences': [], 'facts': []}

for itr, fact in enumerate(data["score_dict_full"][model_name.lower()]):
    # The flag is compared against the string "True", not a boolean.
    if fact['p_true > p_false'] == "True":
        true_facts_itrs.append(itr)
        continue

    false_facts_itrs.append(itr)
    false_facts_list.append([fact['stem'], fact['fact'], fact['counterfact']])
    false_facts['differences'].append(fact['p_true - p_false'])
    false_facts['facts'].append([fact['stem'], fact['fact'], fact['counterfact']])

In [14]:
# Encode every fact as a binary outcome the bootstrap can average:
# 0 for each index in false_facts_itrs, 1 for each index in true_facts_itrs.

results_false = [0 for _ in false_facts_itrs]
results_true = [1 for _ in true_facts_itrs]

In [15]:
results = [*results_false, *results_true]  # all 0s first, then all 1s

In [16]:
len(results)  # NOTE(review): annotated as "should be 72370", but the recorded output is 21919 — confirm the expected count

21919

In [17]:
# Sanity check before bootstrapping: the plain fraction of 1s in `results`
# (a value between 0 and 1, not a percentage) to compare with the bootstrap median.

np.mean(results)

0.789087093389297

In [18]:
# Returns (e_bar, median, percentiles) for the bootstrapped mean of `results`.
# No random seed is set in the visible cells, so exact figures may vary between runs.
bootstrap(results)


(0.005383457274510672,
 0.789178338427848,
 array([0.78379488, 0.78917834, 0.7945618 ]))

In [20]:
# Rank the facts collected in `false_facts` by p_true - p_false, ascending,
# and preview the first 20 rows.
# (To export the same table, append .to_csv('test.csv') to the chain.)
(
    pd.DataFrame.from_dict(false_facts)
    .sort_values(by='differences')
    .head(20)
)

Unnamed: 0,differences,facts
3678,-0.84688,[The official religion of Al-Masudi is [MASK]....
1192,-0.842735,"[Kabul Shahi follows the religion of [MASK]., ..."
1820,-0.761229,"[The twin city of Ankara is [MASK]., Miami, An..."
3565,-0.718189,"[The official religion of Rashi is [MASK]., Ju..."
109,-0.656442,"[Florence is a twin city of [MASK]., Athens, F..."
2399,-0.652641,[The language of Radio France Internationale w...
4352,-0.618134,"[Susquehanna River is located in [MASK]., Mary..."
670,-0.593739,"[Prague is a twin city of [MASK]., Kyoto, Prague]"
917,-0.573486,"[Skanderbeg follows the religion of [MASK]., C..."
2200,-0.540588,"[Arab is follower of [MASK]., Christianity, Is..."
