# Dev contrastive knowledge assessment (CKA) notebook

* See [Capstone Repo](https://github.com/daniel-furman/Capstone) for more details
---

## Dependencies

In [1]:
!git clone https://github.com/daniel-furman/Capstone.git

Cloning into 'Capstone'...
remote: Enumerating objects: 174, done.[K
remote: Counting objects: 100% (174/174), done.[K
remote: Compressing objects: 100% (115/115), done.[K
remote: Total 174 (delta 59), reused 148 (delta 34), pack-reused 0[K
Receiving objects: 100% (174/174), 21.05 MiB | 17.80 MiB/s, done.
Resolving deltas: 100% (59/59), done.


In [2]:
!pip install -r /content/Capstone/src/cka/scripts/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.26.1
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece==0.1.97
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.16.0
  Downloading accelerate-0.16.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.37.0
  Downloading bitsandbytes-0.37.0-py3-none-any.whl (76.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.3/76.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[

## Imports

In [3]:
import os
import numpy as np

import torch
from torch.nn.functional import softmax

from transformers import (
    set_seed,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    T5Tokenizer,
    T5ForConditionalGeneration,
)

In [4]:
# Device that the probe functions and helpers below move tensors (and the BERT-family
# model) onto. Hard-coded to CUDA, so this notebook presumably needs a GPU runtime.
device = torch.device("cuda")

## Dev functions for new models

In [5]:
def probe_flan(model, tokenizer, target_id, context, verbose=False):
    """Score `target_id` as the first token filling the sentinel in `context`.

    The context is encoded, the decoder is primed with a fixed two-token prefix,
    and the softmax over the logits at the last decoder position is read out.

    Args:
        model: seq2seq LM callable as
            ``model(input_ids=..., decoder_input_ids=..., return_dict=True)``
            whose output can be indexed with ``["logits"]``.
        tokenizer: tokenizer matching `model`.
        target_id: scalar tensor holding the vocabulary id to score
            (read through ``.item()``).
        context: prompt string; the caller in this notebook appends
            ``" <extra_id_0>."`` to it before calling.
        verbose: when True, print the encoded context, the decoded target and
            the model's most probable token.

    Returns:
        The probability (numpy scalar) the model assigns to `target_id`.
    """
    # Place the inputs next to the model's weights when the model exposes its
    # device; otherwise fall back to the notebook-level `device`.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device

    # tokenize context
    tokenized_context = tokenizer(
        context,
        padding="longest",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).to(target_device)

    input_ids = tokenized_context["input_ids"]

    # Decoder prefix: 32099 is the id that decodes to <extra_id_0> for the
    # tokenizer used in this notebook; 0 is presumably T5's decoder start (pad)
    # id -- TODO confirm against model.config.decoder_start_token_id.
    # Built on the same device as the encoder inputs rather than a hard-coded
    # "cuda:0", so both tensors always live together.
    decoder_input_ids = torch.tensor([[0, 32099]], device=input_ids.device)

    # use model to solicit a prediction; no gradients are needed for probing,
    # and hidden states are not requested because only the logits are read
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            return_dict=True,
        )

    # We have batch size of 1, so grab that, then take the logits at the last
    # decoder position (the slot right after the sentinel token)
    logits = outputs["logits"][0, -1]

    # convert our prediction scores to a probability distribution with softmax
    probs = softmax(logits, dim=-1)

    probs = probs.detach().cpu().numpy()

    if verbose:
        print(f"\tcontext... {context}")
        print(f'\ttokenized_context ids... {tokenized_context["input_ids"]}')
        print(
            f'\tdecoded tokenized_context... {tokenizer.decode(tokenized_context["input_ids"][0])}'
        )
        print(f"\tdecoded target id... {tokenizer.decode([target_id.item()])}")
        print(
            f"\tmost probable prediction id decoded... {tokenizer.decode([np.argmax(probs)])}"
        )

    return probs[target_id.item()]


def probe_gpt2(model, tokenizer, target_id, context, verbose=False):
    """Score `target_id` as the next token after `context` for a causal LM.

    Args:
        model: causal LM callable as ``model(input_ids=..., return_dict=True)``
            whose output can be indexed with ``["logits"]``.
        tokenizer: tokenizer matching `model`.
        target_id: scalar tensor holding the vocabulary id to score
            (read through ``.item()``).
        context: prompt string; the token right after it is the one scored.
        verbose: when True, print the encoded context, the decoded target and
            the model's most probable next token.

    Returns:
        The probability (numpy scalar) assigned to `target_id`, or None when
        `target_id` is not a valid index into the model's output distribution.
    """
    # Place the inputs next to the model's weights when the model exposes its
    # device; otherwise fall back to the notebook-level `device`.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device

    # tokenize context
    tokenized_context = tokenizer(
        context,
        return_tensors="pt",
    ).to(target_device)

    input_ids = tokenized_context["input_ids"]

    # plain Python int so the bounds check and the lookup need no tensor ops
    target_index = int(target_id.item())

    # use model to solicit a prediction; no gradients are needed for probing,
    # and hidden states are not requested because only the logits are read
    with torch.no_grad():
        outputs = model(input_ids=input_ids, return_dict=True)

    # logits at the last position: one score per entry of the model's vocabulary
    logits = outputs["logits"][0, -1]
    # convert our prediction scores to a probability distribution with softmax
    probs = softmax(logits, dim=-1)

    # keep this as a numpy array; a Python list of vocab-size floats would only
    # be converted straight back by the numpy calls below
    probs = probs.detach().cpu().numpy()

    if verbose:
        print(f"\tcontext... {context}")
        print(f'\ttokenized_context ids... {tokenized_context["input_ids"]}')
        print(
            f'\tdecoded tokenized_context... {tokenizer.decode(tokenized_context["input_ids"][0])}'
        )
        print(f"\tdecoded target id... {tokenizer.decode([target_id.item()])}")
        print(
            f"\tmost probable prediction id decoded... {tokenizer.decode([np.argmax(probs)])}"
        )

    # A valid id satisfies 0 <= id < len(probs). Negative ids are rejected too,
    # since numpy would otherwise silently wrap them around to the end.
    if not 0 <= target_index < len(probs):
        print("target index not in model vocabulary scope; returning None")
        return None

    # return the likelihood that our stipulated target would follow the context,
    # according to the model
    return probs[target_index]


def probe_bert(model, tokenizer, target_id, context, verbose=False):
    """Score `target_id` as the filler of the single mask token in `context`.

    Args:
        model: masked LM callable as ``model(**encoded)`` whose output exposes
            a ``.logits`` attribute.
        tokenizer: tokenizer matching `model`; its ``mask_token_id`` is used to
            locate the blank.
        target_id: scalar tensor holding the vocabulary id to score
            (read through ``.item()``).
        context: prompt string containing exactly one mask token.
        verbose: when True, print the encoded context, the mask position, the
            decoded target and the model's most probable filler.

    Returns:
        The probability (numpy scalar) the model assigns to `target_id` at the
        masked position.

    Raises:
        ValueError: if the encoded context does not contain exactly one mask
            token (none present, lost to truncation, or several present).
    """
    # tokenize context
    tokenized_context = tokenizer(
        context,
        padding="longest",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )

    # column indices of every mask token in the (single-row) batch
    mask_token_index = torch.where(
        tokenized_context["input_ids"] == tokenizer.mask_token_id
    )[1]

    # With zero masks there is nothing to score; with several, the lookup
    # below would hand back a whole row of probabilities instead of a scalar.
    mask_count = mask_token_index.numel()
    if mask_count != 1:
        raise ValueError(
            f"expected exactly one mask token in context, found {mask_count}: {context!r}"
        )

    # Place the inputs next to the model's weights when the model exposes its
    # device; otherwise fall back to the notebook-level `device`.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device

    # use model to solicit a prediction; no gradients are needed for probing
    with torch.no_grad():
        logits = model(**tokenized_context.to(target_device)).logits
    mask_token_logits = logits[0, mask_token_index, :]

    # Convert our prediction scores to a probability distribution with softmax;
    # exactly one mask position is present, so take that single row
    probs = softmax(mask_token_logits, dim=-1)[0]

    probs = probs.detach().cpu().numpy()

    if verbose:
        print(f"\tcontext... {context}")
        print(f'\ttokenized_context ids... {tokenized_context["input_ids"]}')
        print(
            f'\tdecoded tokenize_context... {tokenizer.decode(tokenized_context["input_ids"][0])}'
        )
        print(f"\tmask token id... {tokenizer.mask_token_id}")
        print(f"\tmask token index in context... {mask_token_index}")
        print(f"\tdecoded target id... {tokenizer.decode([target_id.item()])}")
        print(
            f"\tmost probable prediction id decoded... {tokenizer.decode([np.argmax(probs)])}"
        )

    return probs[target_id.item()]


In [6]:
# first, write helper to pull a pretrained LM and tokenizer off the shelf
def get_model_and_tokenizer(model_name):
    """Load the pretrained tokenizer and model for `model_name`.

    The family is chosen by substring match on the lower-cased name:

    * "flan", "t5" or "ul2" -> T5Tokenizer + T5ForConditionalGeneration (8-bit)
    * "gpt"                 -> AutoTokenizer + AutoModelForCausalLM (8-bit)
    * "bert" (which also matches "roberta")
                            -> AutoTokenizer + AutoModelForMaskedLM in float16,
                               moved to the notebook-level `device`

    "t5" and "ul2" are accepted alongside "flan" because compare_models routes
    names containing them to the flan probe.
    NOTE(review): non-flan "ul2" checkpoints are assumed to load with
    T5Tokenizer -- confirm before relying on them.

    Args:
        model_name: model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        ValueError: if `model_name` matches none of the supported families.
    """
    lowered = model_name.lower()

    if any(tag in lowered for tag in ("flan", "t5", "ul2")):
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(
            model_name, load_in_8bit=True, device_map="auto"
        )
        return tokenizer, model

    if "gpt" in lowered:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, load_in_8bit=True, device_map="auto"
        )
        return tokenizer, model

    if "bert" in lowered:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForMaskedLM.from_pretrained(
            model_name, torch_dtype=torch.float16
        ).to(device)
        return tokenizer, model

    # Falling through used to return None, which only surfaced later as an
    # opaque tuple-unpacking error in the caller.
    raise ValueError(
        f"unsupported model name {model_name!r}: expected it to contain one of "
        "'flan', 't5', 'ul2', 'gpt' or 'bert'"
    )


# next, write a helper to pull a probe function for the given LM
def get_probe_function(prefix):
    """Look up the probe function for a model family.

    Returns the first of probe_flan, probe_gpt2 and probe_bert (checked in
    that order) whose function name contains the lower-cased `prefix`, or
    None when no name matches.
    """
    candidates = (probe_flan, probe_gpt2, probe_bert)
    return next(
        (probe for probe in candidates if prefix.lower() in probe.__name__),
        None,
    )


# lastly, write a wrapper function to compare models
# Text appended to a context so that the family's blank token sits where the
# answer goes. Families missing from this table ("gpt") are scored on the
# token that follows the bare context instead.
_CKA_CONTEXT_SUFFIX = {
    "flan": " <extra_id_0>.",
    "roberta": " <mask>.",
    "bert": " [MASK].",
}


def _cka_family(model_name):
    """Return the family prefix ("flan", "gpt", "roberta", "bert") for a model name, or None."""
    lowered = model_name.lower()
    if "flan" in lowered or "t5" in lowered or "ul2" in lowered:
        return "flan"
    if "gpt" in lowered:
        return "gpt"
    # "roberta" has to be tested before "bert" because it contains it
    if "roberta" in lowered:
        return "roberta"
    if "bert" in lowered:
        return "bert"
    return None


def _cka_target_id(tokenizer, prefix, entity):
    """Return the vocab id (scalar tensor) of the first token of `entity` for the given family."""
    if prefix == "gpt":
        # leading space so the entity is encoded as a word following the context
        return tokenizer.encode(" " + entity, return_tensors="pt").to(device)[0][0]

    text = " " + entity if prefix == "roberta" else entity
    encoded = tokenizer.encode(
        text,
        padding="longest",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # Only the very first token of the entity is scored, even when the entity
    # is split into several sub-word tokens (e.g. Tokyo).
    # flan reads position 0; roberta/bert read position 1, presumably because
    # their encodings start with a special token -- TODO confirm.
    return encoded[0][0] if prefix == "flan" else encoded[0][1]


def compare_models(model_name_list, input_pairings, verbose):
    """
    Model-wise comparison helper function

    we should be able to do the following:
      * input a set of models we want to evaluate
      * input an expression of interest
      * input a 'true' next-token alongside a false
      * and get an output report that contains..
        * the 'result' ie is true > false
        * the probabilities of both of those values
      * running this method over a large set of positive/negative pairings should result in a large pool of information that can be used to compare model-families
      * we can also look at the relative 'certainty' across different models (at least in orders of magnitude)

    Args:
        model_name_list: iterable of model names; each must contain one of
            "flan", "t5", "ul2", "gpt", "roberta" or "bert".
        input_pairings: dict mapping a context string to a list of entities.
            The first entity is the true completion, every following entity is
            a false one; at least one of each is required.
        verbose: forwarded to the probe functions to print per-call details.

    Returns:
        ``(score_dict_full, score_dict_succinct)``. Both map the lower-cased
        model name to a list with one ``{context: {...}}`` entry per pairing,
        where ``context`` includes any mask suffix that was appended. The full
        entry holds "p_true", "p_false" (mean over the false entities),
        "p_true - p_false" and "p_true > p_false"; the succinct entry holds
        only "p_true > p_false".

    Raises:
        ValueError: if a pairing has fewer than two entities, a model name
            matches no supported family, no model/probe could be obtained for
            it, or a probe function returns None for an entity.
    """

    # Validate the pairings before any model is loaded: with a single entity
    # the mean over false entities would divide by zero, and with none the
    # recorded scores would be meaningless.
    for context, entities in input_pairings.items():
        if len(entities) < 2:
            raise ValueError(
                f"context {context!r} needs one true and at least one false entity, "
                f"got {len(entities)}"
            )

    score_dict_full = {}
    score_dict_succinct = {}

    for model_name in model_name_list:
        print(f"CKA for {model_name}")

        # resolve the family first so an unsupported name fails before the
        # (expensive) model load rather than after it
        prefix = _cka_family(model_name)
        if prefix is None:
            raise ValueError(
                f"unsupported model name {model_name!r}: expected it to contain one of "
                "'flan', 't5', 'ul2', 'gpt', 'roberta' or 'bert'"
            )

        # roberta shares the bert probe; every other family has its own
        probe_func = get_probe_function("bert" if prefix == "roberta" else prefix)
        if probe_func is None:
            raise ValueError(f"no probe function found for model {model_name!r}")

        print("Loading  model...")

        # get proper model and tokenizer
        loaded = get_model_and_tokenizer(model_name)
        if loaded is None:
            raise ValueError(f"no model/tokenizer could be loaded for {model_name!r}")
        tokenizer, model = loaded

        try:
            print("Running comparisons...")
            model_key = model_name.lower()

            # iterate over context/entity pairings
            for context, entities in input_pairings.items():
                context += _CKA_CONTEXT_SUFFIX.get(prefix, "")

                p_true = 0.0
                p_false = 0.0

                for position, entity in enumerate(entities):
                    # first find target vocab id, then call the probe function
                    target_id = _cka_target_id(tokenizer, prefix, entity)
                    model_prob = probe_func(
                        model, tokenizer, target_id, context, verbose
                    )

                    if model_prob is None:
                        raise ValueError(
                            f"{model_name!r} could not score entity {entity!r} "
                            f"for context {context!r}"
                        )

                    # slot 0 is the true entity; the rest are accumulated
                    if position == 0:
                        p_true = model_prob
                    else:
                        p_false += model_prob

                # mean probability over the false entities
                p_false /= len(entities) - 1

                is_correct = p_true > p_false

                score_dict_full.setdefault(model_key, []).append(
                    {
                        context: {
                            "p_true": p_true,
                            "p_false": p_false,
                            "p_true - p_false": p_true - p_false,
                            "p_true > p_false": is_correct,
                        }
                    }
                )
                score_dict_succinct.setdefault(model_key, []).append(
                    {
                        context: {
                            "p_true > p_false": is_correct,
                        }
                    }
                )

            print("Done\n")
        finally:
            # release the model even when a probe fails, so the next model
            # (or a re-run of the cell) does not start with this one still loaded
            del tokenizer
            del model
            torch.cuda.empty_cache()

    return score_dict_full, score_dict_succinct


## Test the functions

In [7]:
def main(config):
    """Seed the RNGs and run the model comparison described by `config`.

    `config` is a dict read through three keys: "models" (model names),
    "input_information" (context -> entities pairings) and "verbosity"
    (forwarded as the verbose flag). Returns the (full, succinct) score
    dictionaries produced by compare_models.
    """
    set_seed(42)

    model_names = config["models"]
    pairings = config["input_information"]
    verbose = config["verbosity"]

    full_scores, succinct_scores = compare_models(model_names, pairings, verbose)
    return full_scores, succinct_scores

### RoBERTa

In [8]:
# Run configuration consumed by main():
#   "models"            - model names to evaluate
#   "input_information" - context -> entities; the first entity is treated as the
#                         true completion, the remaining ones as false completions
#   "verbosity"         - forwarded to the probe functions to print per-call details
config = {
    "models": [
        "distilroberta-base",  # 82M params
    ],
    "input_information": {
        "The 2020 Olympics were held in": ["Tokyo", "London"],
        "Operation Overlord took place in": ["Normandy", "Manila"],
        "Steve Jobs is the founder of": ["Apple", "Microsoft"],
    },
    "verbosity": True
}


In [9]:
score_dict_full, score_dict_succinct = main(config)

CKA for distilroberta-base
Loading  model...


Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/331M [00:00<?, ?B/s]

Running comparisons...
	context... The 2020 Olympics were held in <mask>.
	tokenized_context ids... tensor([[    0,   133,  2760,  4365,    58,   547,    11, 50264,     4,     2]],
       device='cuda:0')
	decoded tokenize_context... <s>The 2020 Olympics were held in<mask>.</s>
	mask token id... 50264
	mask token index in context... tensor([7])
	decoded target id...  Tokyo
	most probable prediction id decoded...  Tokyo
	context... The 2020 Olympics were held in <mask>.
	tokenized_context ids... tensor([[    0,   133,  2760,  4365,    58,   547,    11, 50264,     4,     2]],
       device='cuda:0')
	decoded tokenize_context... <s>The 2020 Olympics were held in<mask>.</s>
	mask token id... 50264
	mask token index in context... tensor([7])
	decoded target id...  London
	most probable prediction id decoded...  Tokyo
	context... Operation Overlord took place in <mask>.
	tokenized_context ids... tensor([[    0, 35360,  2306, 30669,   362,   317,    11, 50264,     4,     2]],
       device='c

In [10]:
score_dict_full

{'distilroberta-base': [{'The 2020 Olympics were held in <mask>.': {'p_true': 0.6587,
    'p_false': 0.0084228515625,
    'p_true - p_false': 0.6502685546875,
    'p_true > p_false': True}},
  {'Operation Overlord took place in <mask>.': {'p_true': 0.0015335,
    'p_false': 0.00012105703353881836,
    'p_true - p_false': 0.0014124512672424316,
    'p_true > p_false': True}},
  {'Steve Jobs is the founder of <mask>.': {'p_true': 0.5645,
    'p_false': 0.076416015625,
    'p_true - p_false': 0.488037109375,
    'p_true > p_false': True}}]}

In [11]:
score_dict_succinct

{'distilroberta-base': [{'The 2020 Olympics were held in <mask>.': {'p_true > p_false': True}},
  {'Operation Overlord took place in <mask>.': {'p_true > p_false': True}},
  {'Steve Jobs is the founder of <mask>.': {'p_true > p_false': True}}]}

### BERT

In [12]:
# Run configuration consumed by main():
#   "models"            - model names to evaluate
#   "input_information" - context -> entities; the first entity is treated as the
#                         true completion, the remaining ones as false completions
#   "verbosity"         - forwarded to the probe functions to print per-call details
config = {
    "models": [
        "bert-base-uncased", # 110M params
    ],
    "input_information": {
        "The 2020 Olympics were held in": ["Tokyo", "London"],
        "Operation Overlord took place in": ["Normandy", "Manila"],
        "Steve Jobs is the founder of": ["Apple", "Microsoft"],
    },
    "verbosity": True
}


In [13]:
score_dict_full, score_dict_succinct = main(config)

CKA for bert-base-uncased
Loading  model...


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running comparisons...
	context... The 2020 Olympics were held in [MASK].
	tokenized_context ids... tensor([[  101,  1996, 12609,  3783,  2020,  2218,  1999,   103,  1012,   102]],
       device='cuda:0')
	decoded tokenize_context... [CLS] the 2020 olympics were held in [MASK]. [SEP]
	mask token id... 103
	mask token index in context... tensor([7])
	decoded target id... tokyo
	most probable prediction id decoded... tokyo
	context... The 2020 Olympics were held in [MASK].
	tokenized_context ids... tensor([[  101,  1996, 12609,  3783,  2020,  2218,  1999,   103,  1012,   102]],
       device='cuda:0')
	decoded tokenize_context... [CLS] the 2020 olympics were held in [MASK]. [SEP]
	mask token id... 103
	mask token index in context... tensor([7])
	decoded target id... london
	most probable prediction id decoded... tokyo
	context... Operation Overlord took place in [MASK].
	tokenized_context ids... tensor([[  101,  3169,  2058, 19980,  2165,  2173,  1999,   103,  1012,   102]],
       devic

In [14]:
score_dict_full

{'bert-base-uncased': [{'The 2020 Olympics were held in [MASK].': {'p_true': 0.7124,
    'p_false': 0.0040130615234375,
    'p_true - p_false': 0.7083892822265625,
    'p_true > p_false': True}},
  {'Operation Overlord took place in [MASK].': {'p_true': 0.01952,
    'p_false': 1.3828277587890625e-05,
    'p_true - p_false': 0.01950216293334961,
    'p_true > p_false': True}},
  {'Steve Jobs is the founder of [MASK].': {'p_true': 0.2837,
    'p_false': 0.10113525390625,
    'p_true - p_false': 0.18255615234375,
    'p_true > p_false': True}}]}

In [15]:
score_dict_succinct

{'bert-base-uncased': [{'The 2020 Olympics were held in [MASK].': {'p_true > p_false': True}},
  {'Operation Overlord took place in [MASK].': {'p_true > p_false': True}},
  {'Steve Jobs is the founder of [MASK].': {'p_true > p_false': True}}]}

### GPT-2

In [16]:
# Run configuration consumed by main():
#   "models"            - model names to evaluate
#   "input_information" - context -> entities; the first entity is treated as the
#                         true completion, the remaining ones as false completions
#   "verbosity"         - forwarded to the probe functions to print per-call details
config = {
    "models": [
        "distilgpt2",  # 82M params
    ],
    "input_information": {
        "The 2020 Olympics were held in": ["Tokyo", "London"],
        "Operation Overlord took place in": ["Normandy", "Manila"],
        "Steve Jobs is the founder of": ["Apple", "Microsoft"],
    },
    "verbosity": True

}


In [17]:
score_dict_full, score_dict_succinct = main(config)

CKA for distilgpt2
Loading  model...


Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/353M [00:00<?, ?B/s]


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Running comparisons...
	context... The 2020 Olympics were held in
	tokenized_context ids... tensor([[  464, 12131, 14935,   547,  2714,   287]], device='cuda:0')
	decoded tokenized_context... The 2020 Olympics were held in
	decoded target id...  Tokyo
	most probable prediction id decoded...  the
	context... The 2020 Olympics were held in
	tokenized_context ids... tensor([[  464, 12131, 14935,   547,  2714,   287]], device='cuda:0')
	decoded tokenized_context... The 2020 Olympics were held in
	decoded target id...  London
	most probable prediction id decoded...  the
	context... Operation Overlord took place in
	tokenized_context ids... tensor([[32180,  3827, 10572,  1718,  1295,   287]], device='cuda:0')
	decoded tokenized_context... Operation Overlord took place in
	decoded target id...  Normandy
	most probable prediction id decoded...  the
	context... Operation Overlord took place in
	tokenized_context ids... tensor([[32180,  3827, 10572,  1718,  1295,   287]], device='cuda:0')
	decod

In [18]:
score_dict_full

{'distilgpt2': [{'The 2020 Olympics were held in': {'p_true': 0.01068,
    'p_false': 0.0478515625,
    'p_true - p_false': -0.03717041015625,
    'p_true > p_false': False}},
  {'Operation Overlord took place in': {'p_true': 0.0007377,
    'p_false': 0.00019848346710205078,
    'p_true - p_false': 0.0005391836166381836,
    'p_true > p_false': True}},
  {'Steve Jobs is the founder of': {'p_true': 0.014046,
    'p_false': 0.01201629638671875,
    'p_true - p_false': 0.0020294189453125,
    'p_true > p_false': True}}]}

In [19]:
score_dict_succinct

{'distilgpt2': [{'The 2020 Olympics were held in': {'p_true > p_false': False}},
  {'Operation Overlord took place in': {'p_true > p_false': True}},
  {'Steve Jobs is the founder of': {'p_true > p_false': True}}]}

### Google Flan-T5

In [20]:
# Run configuration consumed by main():
#   "models"            - model names to evaluate
#   "input_information" - context -> entities; the first entity is treated as the
#                         true completion, the remaining ones as false completions
#   "verbosity"         - forwarded to the probe functions to print per-call details
config = {
    "models": [
        "google/flan-t5-small",  # 80M params
    ],
    "input_information": {
        "The 2020 Olympics were held in": ["Tokyo", "London"],
        "Operation Overlord took place in": ["Normandy", "Manila"],
        "Steve Jobs is the founder of": ["Apple", "Microsoft"],
    },
    "verbosity": True

}


In [21]:
score_dict_full, score_dict_succinct = main(config)

CKA for google/flan-t5-small
Loading  model...


Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Running comparisons...
	context... The 2020 Olympics were held in <extra_id_0>.
	tokenized_context ids... tensor([[   37,  6503, 17793,   130,  1213,    16, 32099,     3,     5,     1]],
       device='cuda:0')
	decoded tokenized_context... The 2020 Olympics were held in<extra_id_0>.</s>
	decoded target id... Tokyo
	most probable prediction id decoded... a
	context... The 2020 Olympics were held in <extra_id_0>.
	tokenized_context ids... tensor([[   37,  6503, 17793,   130,  1213,    16, 32099,     3,     5,     1]],
       device='cuda:0')
	decoded tokenized_context... The 2020 Olympics were held in<extra_id_0>.</s>
	decoded target id... London
	most probable prediction id decoded... a
	context... Operation Overlord took place in <extra_id_0>.
	tokenized_context ids... tensor([[ 6411,  1575,  2035,   322,    26,   808,   286,    16, 32099,     3,
             5,     1]], device='cuda:0')
	decoded tokenized_context... Operation Overlord took place in<extra_id_0>.</s>
	decoded target id

In [22]:
score_dict_full

{'google/flan-t5-small': [{'The 2020 Olympics were held in <extra_id_0>.': {'p_true': 3.46e-06,
    'p_false': 2.9742717742919922e-05,
    'p_true - p_false': -2.6285648345947266e-05,
    'p_true > p_false': False}},
  {'Operation Overlord took place in <extra_id_0>.': {'p_true': 2.06e-05,
    'p_false': 1.0728836059570312e-06,
    'p_true - p_false': 1.9550323486328125e-05,
    'p_true > p_false': True}},
  {'Steve Jobs is the founder of <extra_id_0>.': {'p_true': 0.0005684,
    'p_false': 1.8715858459472656e-05,
    'p_true - p_false': 0.0005496740341186523,
    'p_true > p_false': True}}]}

In [23]:
score_dict_succinct

{'google/flan-t5-small': [{'The 2020 Olympics were held in <extra_id_0>.': {'p_true > p_false': False}},
  {'Operation Overlord took place in <extra_id_0>.': {'p_true > p_false': True}},
  {'Steve Jobs is the founder of <extra_id_0>.': {'p_true > p_false': True}}]}