In [1]:
import re
from tqdm import tqdm

import wikipediaapi
import jsonlines
import spacy

from copy import deepcopy

# Load the small English spaCy pipeline exactly once; the NER helpers below
# all reuse this module-level object.
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


## Utils

In [2]:
def read_jsonl(path: str) -> list:
    """Load every record of a JSON Lines file into a list.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        The decoded records, in file order.
    """
    with jsonlines.open(path) as reader:
        return list(reader)

In [3]:
def write_jsonl(path, liste):
    """Write each item of ``liste`` as one line of a JSON Lines file.

    Args:
        path: Destination file; it is opened in ``"w"`` mode.
        liste: Iterable of JSON-serialisable records, written in order.
    """
    with jsonlines.open(path, mode="w") as writer:
        writer.write_all(liste)

In [4]:
# Entity labels that count as "important" named entities (GPE was added to this set).
IMPORTANT_ENT_TYPE = {"ORG", "GPE", "PERSON", "WORK_OF_ART", "PRODUCT", "EVENT"}
# Entity labels that are always discarded.
REMOVE_ENT_TYPE = {"ORDINAL", "CARDINAL"}

def obtain_important_ne(gen, include_capitalized_words_as_ents=True):
    """Split the named entities found in ``gen`` into important and unimportant ones.

    Args:
        gen: Text to analyse with the module-level spaCy pipeline ``nlp``.
        include_capitalized_words_as_ents: When spaCy finds no entity at all,
            fall back to capitalized words that are not stop words. They are
            labelled ``"CAPITALIZED"`` and therefore end up in
            ``"unimportant_ne"``.

    Returns:
        A dict with the keys
        ``"gen"``: the input text,
        ``"important_ne"``: ``(text, label)`` pairs whose label is in
        ``IMPORTANT_ENT_TYPE``,
        ``"unimportant_ne"``: the remaining pairs after filtering,
        ``"subject"``: a ``set`` of token texts whose dependency label is
        ``nsubj`` or ``nsubjpass``.
    """
    doc = nlp(gen)
    ents = [(ent.text, ent.label_) for ent in doc.ents]

    if include_capitalized_words_as_ents and not ents:
        # No `stop_words` name is defined anywhere in this notebook, so this
        # path used to fail with NameError. Use the stop-word set shipped with
        # the loaded spaCy language instead.
        stop_words = nlp.Defaults.stop_words
        # The look-behind skips a capitalized word at the very start of the text.
        ents.extend(
            (word, "CAPITALIZED")
            for word in re.findall(r"(?<!^)([A-Z][a-z]+)", gen)
            if word.lower() not in stop_words
        )

    important_words = [ent for ent in ents if ent[1] in IMPORTANT_ENT_TYPE]

    remaining_ne = []
    for text, label in ents:
        if label in IMPORTANT_ENT_TYPE or label in REMOVE_ENT_TYPE:
            continue
        # Drop DATE entities whose text contains "year" or "day" (e.g.
        # ``the year''). This is a plain substring test, so it also matches
        # texts such as "Monday" or "today".
        if label == "DATE" and ("year" in text or "day" in text):
            continue
        remaining_ne.append((text, label))

    return {
        "gen": gen,
        "important_ne": important_words,
        "unimportant_ne": remaining_ne,
        "subject": {
            token.text for token in doc if token.dep_ in ("nsubj", "nsubjpass")
        },
    }

## Extracting

In [6]:
# Directory holding the FEVER prompt files. The location is machine-specific:
# change this single constant when running the notebook elsewhere.
PROMPT_DIR = "/Users/davidblumenthal/Documents/Master_Thesis/Evaluation/Improve_FactualityPrompt/FactualityPrompt/prompts"

prompts_fact = read_jsonl(PROMPT_DIR + "/fever_factual_final.jsonl")
prompts_non_fact = read_jsonl(PROMPT_DIR + "/fever_nonfactual_final.jsonl")

In [13]:
# For every prompt, keep only the first field of each evidence entry, e.g.
# [["The Host (2013 film)", "34465253"], ["Saoirse Ronan", "11061022"]]
#   -> ["The Host (2013 film)", "Saoirse Ronan"]
# The two prompt sets are handled independently rather than zip()-ed together:
# zip() stops at the shorter list and would silently drop samples if the two
# files ever differed in length.
ent_factual = [
    [evidence[0] for evidence in sample["evidence_info"]]
    for sample in prompts_fact
]
ent_non_factual = [
    [evidence[0] for evidence in sample["evidence_info"]]
    for sample in prompts_non_fact
]

In [9]:

# Shared English-Wikipedia client used by the crawling cells below.
# NOTE(review): recent Wikipedia-API releases reportedly require a `user_agent`
# argument - confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia(
    language="en", extract_format=wikipediaapi.ExtractFormat.WIKI
)

In [37]:
def extract_content(entity_list: list) -> list:
    """Fetch the page text for every entity of every sample.

    Args:
        entity_list: One list of page titles per sample. Each title is looked
            up through the module-level ``wiki_wiki`` client.

    Returns:
        One dict per sample, in input order, with the keys
        ``"entity_prompt"``: the first title of the sample, or ``None`` when
        the sample has no titles,
        ``"articles"``: a list of ``{"title": ..., "text": ...}`` dicts, one
        per title.
    """
    overall_content = []
    for sample in entity_list:
        articles = [
            {"title": entity, "text": wiki_wiki.page(entity).text}
            for entity in sample
        ]
        # A sample without any title used to abort the whole crawl with
        # IndexError on `[0]`; record it with an empty prompt instead.
        entity_prompt = articles[0]["title"] if articles else None
        overall_content.append({"entity_prompt": entity_prompt, "articles": articles})

    return overall_content



In [38]:
# Crawl the articles for the factual prompts: extract_content performs one
# wiki_wiki.page() lookup per evidence title.
factual_contents = extract_content(ent_factual)

In [44]:
# Crawl the articles for the non-factual prompts: extract_content performs one
# wiki_wiki.page() lookup per evidence title.
non_factual_contents = extract_content(ent_non_factual)

In [46]:
# Persist both crawls so later cells can reload them instead of re-crawling.
for crawl_path, crawl_records in (
    ("./factual_wiki_crawl.jsonl", factual_contents),
    ("./nonfactual_wiki_crawl.jsonl", non_factual_contents),
):
    write_jsonl(path=crawl_path, liste=crawl_records)

In [45]:
# Number of crawled samples per prompt set, one count per line.
print(len(factual_contents), len(non_factual_contents), sep="\n")

8000
8000


In [None]:
# NOTE(review): `contents` is not defined in any visible cell and this cell has
# no execution count, so it appears to be a leftover draft of extract_nes -
# confirm before running or delete.
# Replaces each article in `contents` with its title plus its de-duplicated
# important named-entity texts.
for idx, article in enumerate(contents):

    obj = obtain_important_ne(article["text"])

    # Keep only the entity text of each (text, label) pair.
    nes = [ne[0] for ne in obj["important_ne"]]

    # De-duplicate; the resulting order is arbitrary.
    nes = list(set(nes))

    # Overwrites the entry in place, dropping the article text.
    contents[idx] = {"title": article["title"], "important_ne": nes}

In [7]:
def extract_nes(samples: list, just_first=False):
    """Attach the important named entities of its articles to every sample.

    Args:
        samples: Dicts shaped like
            ``{"entity_prompt": ..., "articles": [{"title": ..., "text": ...}]}``.
            The input is not modified; a deep copy is annotated instead.
        just_first: Currently unused; kept so existing callers keep working.

    Returns:
        A deep copy of ``samples`` in which every dict has an additional
        ``"important_ne"`` key: the de-duplicated entity texts (in arbitrary
        order) collected over all of the sample's articles.
    """
    samples_copy = deepcopy(samples)

    for sample in tqdm(samples_copy):
        unique_nes = set()
        for article in sample["articles"]:
            # Trim the text and put it on a single line before running NER.
            text = article["text"].strip().replace("\n", " ")

            ne_obj = obtain_important_ne(text)
            # Keep only the entity text of each (text, label) pair.
            unique_nes.update(ne[0] for ne in ne_obj["important_ne"])

        sample["important_ne"] = list(unique_nes)

    return samples_copy


In [58]:
# Named-entity extraction over the factual crawl. The recorded progress bar
# shows about 2.5 hours for 8000 samples, so avoid re-running unnecessarily.
add_nes_fact = extract_nes(factual_contents)

100%|██████████| 8000/8000 [2:33:06<00:00,  1.15s/it]  


In [59]:
# Persist the factual samples together with their "important_ne" annotation.
write_jsonl(path="./factual_wiki_crawl_imp_ne.jsonl", liste=add_nes_fact)

In [9]:
# Reload the non-factual crawl from disk (written by an earlier cell) rather
# than relying on the in-memory crawl result.
non_factual_contents = read_jsonl("./nonfactual_wiki_crawl.jsonl")
# Named-entity extraction; the recorded progress bar shows about 2 hours for
# 8000 samples.
add_nes_nonfact = extract_nes(non_factual_contents)
# Persist the non-factual samples together with their "important_ne" annotation.
write_jsonl(path="./nonfactual_wiki_crawl_imp_ne.jsonl", liste=add_nes_nonfact)

100%|██████████| 8000/8000 [2:08:58<00:00,  1.03it/s]  
