In [3]:
import jsonlines
import re
import spacy

from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

In [4]:
def read_jsonl(path: str) -> list:

    samples = []
    with jsonlines.open(path) as input:
        for line in input:
            samples.append(line)

    return samples

## Functions

In [5]:
IMPORTANT_ENT_TYPE = set(['ORG', 'GPE', 'PERSON', 'WORK_OF_ART', 'PRODUCT', 'EVENT'])
REMOVE_ENT_TYPE = set(['ORDINAL', 'CARDINAL']) 

def obtain_important_ne(gen, include_capitalized_words_as_ents=True):
    important_words = []

    doc = nlp(gen)

    # print("GEN: ", gen)
    # print([(token.text, token.pos_, token.tag_, token.dep_) for token in doc if token.pos_ in ['NOUN', 'PRON', 'PROPN']])
    # print("\n")

    ents = [(ent.text, ent.label_) for ent in doc.ents]

    if include_capitalized_words_as_ents and len(ents) == 0:
        capitalized_words = re.findall('(?<!^)([A-Z][a-z]+)', gen)
        
        if len(capitalized_words) > 0:
            capitalized_words = [(word, 'CAPITALIZED') for word in capitalized_words if word.lower() not in stop_words]
            ents.extend(capitalized_words)

    important_words.extend([ent for ent in ents if ent[1] in IMPORTANT_ENT_TYPE])
    remaining_ne_all = [ent for ent in ents if ent[1] not in IMPORTANT_ENT_TYPE]

    # filter out some ne
    remaining_ne = []
    for ent in remaining_ne_all:
        if ent[1] in REMOVE_ENT_TYPE:
            continue
        if ent[1] == 'DATE' and ("year" in ent[0] or "day" in ent[0]): #not bool(re.search(r'\d', ent[0])):
            # if "DATE" entity contains NO number at all (e.g., ``the year''), meaningless
            continue
        remaining_ne.append(ent)

    gens_with_ne = {
                        "gen": gen,
                        "important_ne": important_words,
                        "unimportant_ne": remaining_ne,
                        "subject": set([token.text for token in doc if token.dep_ in ['nsubj', 'nsubjpass']]),
                        # "all_analysis": [(token.text, token.pos_, token.tag_, token.dep_) for token in doc]
                    }

    return gens_with_ne 

In [4]:
def ner_metric(named_entities, prompt_wiki_candidates):
    
    wiki_text = " ".join(prompt_wiki_candidates).lower()

    # TODO improve the NE match here
    # hanlde DATE, TIME, etc better! appears a lot but handled poorly

    existing_correct_ne = []
    for ent in named_entities:
        ent_text = ent[0].lower()
        if 'the ' in ent_text:
            ent_text = ent_text.replace('the ', "")

        if ent_text in wiki_text:
            existing_correct_ne.append(ent)
        elif any([bool(word in wiki_text) for word in ent_text.split(" ") if ent[1] == 'PERSON']):
            # handle shorter forms of same NE: Exists "Marcus Morgan Bentley", but NE is "Marcus Bentley" or "Bentley"
            existing_correct_ne.append(ent)
        elif ent[1] == 'DATE':
            date_str = re.sub(r"[,.;@#?!&$]+\ *", " ", ent_text)
            date_str = date_str.replace("st", "")
            date_str = date_str.replace("nd", "")
            date_str = date_str.replace("th", "")
            date_str = date_str.replace("of", "")
            date_tokens = date_str.split(" ")

            if all([bool(token in wiki_text) for token in date_tokens]):
                existing_correct_ne.append(ent)
        


    correct_ratio = len(existing_correct_ne)/ len(named_entities)

    return correct_ratio

## Experiments

In [7]:
# Read in Generations and ground truth articles
label_objects = read_jsonl("test.jsonl")
label_objects.extend(read_jsonl("test_labels2.jsonl")) # test.jsonl

label_objects = read_jsonl("joined_labels.jsonl")
print(len(label_objects))

#ground_truths = read_jsonl("articles.jsonl")

#len(label_objects)

150


In [26]:
gen_1 = label_objects[1] # = Escape from Planet Earth
gen_2 = label_objects[2] # = Istanbul
g_truth1 = None
g_truth2 = None

for article in ground_truths:
    if article["title"] == "Escape from Planet Earth":
        g_truth1 = article
    
    if article["title"] == "Istanbul":
        g_truth2 = article



In [28]:
gen_2

{'id': 352,
 'fever_id': '3391396',
 'wiki_ne': 'Istanbul',
 'prompt': "Istanbul's population has increased since the 1950s.",
 'model_name': 'gpt-2_greedy',
 'text': "Istanbul's population has increased since the 1950s. The city's population is now estimated at around 10 million.\n\nThe city's population has increased since the 1950s. The city's population is now estimated at around 10 million.\n\nThe Turkish government has been trying to reduce the number of foreigners living in the country for years.\n\nThe government has been trying to reduce the number of foreigners living in the country for years.\n\nThe Turkish government has been trying to reduce the number of foreigners living in the country for years.\n\nThe Turkish government has been trying to reduce the number of foreigners living in the country for years.\n\nThe Turkish government has been trying to reduce the number of foreigners living in the country for years",
 'single_sentences': [["Istanbul's population has increase

In [24]:
# Get the named entities from the generation
gen_nes1 = obtain_important_ne(gen_1["text"])

NE_to_check = gen_nes1["important_ne"] #+ gen_nes["unimportant_ne"]
NE_to_check

[('David Fincher', 'PERSON'),
 ('Fincher', 'PERSON'),
 ('Jonathan Nolan', 'PERSON')]

In [32]:
# Get the named entities from the generation
gen_nes1 = obtain_important_ne(gen_2["text"])
NE_to_check = gen_nes1["important_ne"]
NE_to_check



[('Istanbul', 'GPE')]

In [36]:
print(ner_metric(NE_to_check, g_truth1["text"]))
print(ner_metric(obtain_important_ne(gen_2["text"])["important_ne"], g_truth2["text"]))

0.0
1.0


In [None]:
# Calc with labels


In [74]:
print(calc_metrics_per_generation(gen_1))
print(calc_metrics_per_generation(gen_2))

Named Entities from Generation: [('David Fincher', 'PERSON'), ('Fincher', 'PERSON'), ('Jonathan Nolan', 'PERSON'), ('the United States', 'GPE'), ('the United Kingdom', 'GPE')]
Labeled as hallucinated: ['David Fincher', 'Jonathan Nolan']
Exact Match: David Fincher
Partial Match: Fincher
Exact Match: Jonathan Nolan
(0.6, (0.25, 0.5, 0.0))
Named Entities from Generation: [('Istanbul', 'GPE')]
Labeled as hallucinated: []
(0.0, (0.2222222222222222, 0.0, 0.5555555555555556))


In [8]:
def calc_metrics_per_generation(generation: dict):
    
    imp_text = construct_text_from_sentences(generation)
    
    
    # Calc NE Error
    named_entities = obtain_important_ne(imp_text) # generation["text"]
    named_entities = named_entities["important_ne"]

    hallu_ne_text = " ".join(generation['HALLU_NE'])

    print(f"Named Entities from Generation: {named_entities}")
    print(f"Labeled as hallucinated: {generation['HALLU_NE']}")

    num_partial_matches = 0
    
    for named_entity in named_entities:
        named_entity = named_entity[0]
        # Check for an exact match
        if named_entity in generation['HALLU_NE']:
            print(f"Exact Match: {named_entity}")
            continue

        # If not exact match check if partial match
        elif any(subword in hallu_ne_text for subword in named_entity.split(" ")):
            num_partial_matches += 1
            print(f"Partial Match: {named_entity}")
            
    if len(named_entities) != 0:
        correct_ratio = (len(named_entities) - (len(generation["HALLU_NE"]) + num_partial_matches)) / len(named_entities)

        num_correct_named_entities = (len(named_entities) - (len(generation["HALLU_NE"]) + num_partial_matches))

    else:
        correct_ratio = 1

        num_correct_named_entities = 0
    
    hallu_ne_ratio = 1 - correct_ratio
    
    # Calculate Entailment Ratio

    off_topic = 0
    no_fact = 0

    entail = 0

    for sent in generation["single_sentences"]:
        # sent is tuple of the from (sentence, label)
        if sent[1] == "true":
            entail += 1
        
        if sent[1] == "off_topic" or sent[1] == "not_enough_evidence" or sent[1] == "no_fact":
            off_topic += 1

        if sent[1] == "false":
            no_fact += 1

    
    true_ratio = entail / len(generation["single_sentences"])
    false_ratio = no_fact / len(generation["single_sentences"])
    off_topic_ratio = off_topic / len(generation["single_sentences"])

    

    #return hallu_ne_ratio, (true_ratio, false_ratio, off_topic_ratio)

    return SingleFactualResult(id=generation["id"], wiki_article=generation["wiki_ne"], entail_ratio=true_ratio, hallu_ner=hallu_ne_ratio,
                         false_ratio=false_ratio, off_topic_ratio=off_topic_ratio, num_entail=entail, num_false=no_fact, num_off_topic=off_topic,
                         num_correct_mentioned_ne=num_correct_named_entities, num_total_mentioned_ne=len(named_entities), 
                         num_sentences=len(generation["single_sentences"]))
    
    
        

In [30]:
list_results = []
for label_object in label_objects:
    print("---------------------------------------------------------------")
    result = calc_metrics_per_generation(label_object)
    list_results.append(result)
    print(result)

print(len(list_results))

---------------------------------------------------------------
Named Entities from Generation: []
Labeled as hallucinated: []
SingleFactualResult(entail_ratio=0.14285714285714285, false_ratio=0.0, off_topic_ratio=0.8571428571428571, hallu_ner=0, num_entail=1, num_false=0, num_off_topic=6, num_correct_mentioned_ne=0, num_total_mentioned_ne=0, num_sentences=7, id=493, wiki_article='100 Greatest of All Time')
---------------------------------------------------------------
Named Entities from Generation: [('Drake', 'ORG'), ('The Black Keys', 'PERSON'), ('Drake', 'ORG'), ('The Black Keys', 'PERSON'), ('Drake', 'ORG'), ('The Black Keys', 'PERSON'), ('Drake', 'ORG'), ('The Black Keys', 'PERSON'), ('Drake', 'ORG'), ('The Black Keys', 'PERSON'), ('Drake', 'ORG'), ('The Black Keys', 'PERSON'), ('Drake', 'ORG'), ('The Black Keys', 'PERSON')]
Labeled as hallucinated: ['The Black Keys']
Exact Match: The Black Keys
Exact Match: The Black Keys
Exact Match: The Black Keys
Exact Match: The Black Keys


In [31]:
all_results = calc_metrics_all(list_results)

Hallu-ner 0.384928716904277 Entail Ratio: 0.27136150234741785 False Ratio: 0.4647887323943662 Off Topic Ratio: 0.26291079812206575


In [32]:
all_results

FactualResult(entail_ratio=0.27136150234741785, false_ratio=0.4647887323943662, off_topic_ratio=0.26291079812206575, hallu_ner=0.384928716904277, num_entail=289, num_false=495, num_off_topic=280, num_correct_mentioned_ne=604, num_total_mentioned_ne=982, num_sentences=1065)

In [10]:
def construct_text_from_sentences(generation: dict):
    result_text = ""
    sentences = generation["single_sentences"]

    for sentence in sentences:

        if sentence[1] == "true" or sentence[1] == "false" or sentence[1] == "not_enough_evidence":
            result_text = result_text + sentence[0] + " "

    return result_text

In [11]:
def calc_metrics_all(result_objects: list):
    final_result = FactualResult()

    for result_obj in result_objects:
        # set all absolute values
        final_result.num_entail += result_obj.num_entail
        final_result.num_false += result_obj.num_false
        final_result.num_off_topic += result_obj.num_off_topic

        final_result.num_correct_mentioned_ne += result_obj.num_correct_mentioned_ne
        final_result.num_total_mentioned_ne += result_obj.num_total_mentioned_ne

        final_result.num_sentences += result_obj.num_sentences

    # Call Method to calculate metrics
    final_result.calculate_ratios()

    print(f"Hallu-ner {final_result.hallu_ner} Entail Ratio: {final_result.entail_ratio} False Ratio: {final_result.false_ratio} Off Topic Ratio: {final_result.off_topic_ratio}")

    return final_result







In [9]:
new_samples = read_jsonl("./test_labels2.jsonl")

In [10]:
articles = []

for line in new_samples: #samples
    articles.append(line["wiki_ne"])

articles

['100 Greatest of All Time',
 'Drake (musician)',
 'Mrs Henderson Presents',
 'Life Is Peachy',
 'Davis Guggenheim',
 'Chris Evans (actor)',
 'Liana Liberato',
 'Victor Hugo',
 'Game of Thrones (season 3)',
 'The Raven (2012 film)',
 'So You Think You Can Dance (American TV series)',
 'Alan Shepard',
 'Internet access',
 'A. P. J. Abdul Kalam',
 'Diana, Princess of Wales',
 'Inside Llewyn Davis',
 'Rob McElhenney',
 'The Book of Mormon (musical)',
 'The Block (album)',
 'Pearl (Steven Universe)',
 'Anneliese van der Pol',
 'First Motion Picture Unit',
 'Alcoholic drink',
 'Barry Van Dyke',
 'Tom DeLonge']

In [12]:
from dataclasses import dataclass

@dataclass
class FactualResult:
    
    entail_ratio: float = None
    false_ratio: float = None
    off_topic_ratio: float = None

    hallu_ner: float = None

    num_entail: int = 0
    num_false: int = 0
    num_off_topic: int = 0

    num_correct_mentioned_ne: int = 0
    num_total_mentioned_ne: int = 0

    num_sentences: int = 0

    
    def calculate_ratios(self):

        self.entail_ratio = self.num_entail / self.num_sentences
        self.false_ratio = self.num_false / self.num_sentences
        self.off_topic_ratio = self.num_off_topic / self.num_sentences

        self.hallu_ner = 1 - (self.num_correct_mentioned_ne / self.num_total_mentioned_ne)

@dataclass
class SingleFactualResult(FactualResult):

    id: int = None
    wiki_article: str = None
    

In [3]:
dkd = FactualResult()
dkd.num_entail += 5
dkd.num_sentences = 10

In [7]:
dkd.__dict__

{'entail_ratio': None,
 'false_ratio': None,
 'off_topic_ratio': None,
 'hallu_ner': None,
 'num_entail': 5,
 'num_false': 0,
 'num_off_topic': 0,
 'num_correct_mentioned_ne': 0,
 'num_total_mentioned_ne': 0,
 'num_sentences': 10}

In [141]:
dkd.calculate_ratios()

lustig


In [142]:
dkd

FactualResult(entail_ratio=0.5, false_ratio=None, off_topic_ratio=None, hallu_ner=None, num_entail=5, num_false=0, num_off_topic=0, num_correct_mentioned_ne=0, num_total_mentioned_ne=0, num_sentences=10)

# Getting the Prompts to the Generation Samples

In [6]:
# prompts = read_jsonl("/Users/davidblumenthal/Documents/Master_Thesis/Evaluation/Improve_FactualityPrompt/FactualityPrompt/prompts/fever_factual_final.jsonl")
# samples = read_jsonl("/Users/davidblumenthal/Documents/Master_Thesis/Evaluation/Improve_FactualityPrompt/joined_labels.jsonl")

prompts = read_jsonl("/Users/davidblumenthal/Documents/Master_Thesis/Evaluation/Improve_FactualityPrompt/FactualityPrompt/prompts/fever_nonfactual_final.jsonl")
samples = read_jsonl("/Users/davidblumenthal/Documents/Master_Thesis/Evaluation/Improve_FactualityPrompt/gpt-neo/gen_outputs/labeled_samples/labeled_non_factual_test-set.jsonl")


In [11]:
def write_jsonl(path, liste):
    
    with jsonlines.open(path, "w") as writer:
        for sample in liste:
            writer.write(sample)


In [7]:
matching_prompts = []
for sample in samples:
    for prompt in prompts:
        if sample["prompt"].strip() == prompt["prompt"].strip():
            matching_prompts.append(prompt)
            break

In [8]:
len(matching_prompts)

25

In [9]:
count = 0
for sample, prompt in zip(samples, matching_prompts):
    count += 1
    if sample["prompt"].strip() != prompt["prompt"].strip():
        print(count)

In [12]:
write_jsonl("./test-set_prompts.jsonl", matching_prompts)

In [4]:
factual = read_jsonl("/Users/davidblumenthal/Documents/Master_Thesis/Evaluation/Improve_FactualityPrompt/FactualityPrompt/prompts/fever_factual_final.jsonl")
non_factual = read_jsonl("/Users/davidblumenthal/Documents/Master_Thesis/Evaluation/Improve_FactualityPrompt/FactualityPrompt/prompts/fever_nonfactual_final.jsonl")

In [9]:
result = []

for sample_f, sample_nf in zip(factual, non_factual):

    for ent in sample_f["evidence_info"]:
        result.append(ent[0])

    for ent in sample_nf["evidence_info"]:
            result.append(ent[0])

In [10]:
print(len(result))

result = list(set(result))
print(len(result))

18044
4173


In [14]:
for idx, ent in enumerate(result):
    result[idx] = {"entity": ent}

In [15]:
write_jsonl("./fever_entities.jsonl", result)

In [15]:
labeled_samples = read_jsonl("./test-set_prompts.jsonl")

result = []

for sample in labeled_samples: # labeled_samples
    result.append(sample["evidence_info"][0][0])

# Crawl Wikipedia for Evidence

In [16]:
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)


#print(p_wiki.text)

In [26]:
contents = []

for article_title in result:

    page = wiki_wiki.page(article_title)
    content = page.text

    content = content.strip()

    contents.append({"title": article_title, "text": content})


In [27]:
for idx, article in enumerate(contents):

    text = article["text"].replace("\n", " ")

    obj = obtain_important_ne(text)

    nes = [ne[0] for ne in obj["important_ne"]]

    nes = list(set(nes))

    contents[idx] = {"title": article["title"], "important_ne": nes}



In [28]:
write_jsonl("./FactualityPrompt/crawl_wikipedia/additional_ne_test-set.jsonl", contents)

In [44]:
for what in prompts.items():
    print(what)

AttributeError: 'list' object has no attribute 'items'