In [1]:
import requests
from urllib.parse import quote

def check_wikipedia_page(page_name, timeout=10):
    """Check whether an English Wikipedia page answers with HTTP 200.

    The page name is percent-encoded, appended to
    ``https://en.wikipedia.org/wiki/`` and fetched with a GET request.

    Args:
        page_name (str): Title of the page to look up.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests.get`` may block indefinitely.

    Returns:
        None. A confirmation message is printed when the status code is 200.

    Raises:
        ValueError: If the response status code is anything other than 200.

    Request-level failures (``requests.exceptions.RequestException``, which
    includes timeouts) are caught and reported with ``print``.
    """
    # Encode the page name so it is safe to embed in the URL path
    url = f"https://en.wikipedia.org/wiki/{quote(page_name)}"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return

    # Any non-200 status is reported to the caller as a missing page
    if response.status_code != 200:
        raise ValueError(f"The Wikipedia page '{page_name}' does not exist.")
    print(f"The Wikipedia page '{page_name}' exists.")

In [2]:
import json
import random

def read_file_lines(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    The file is decoded as UTF-8, the same encoding ``write_list_to_file``
    uses when writing, so the result does not depend on the platform default.

    Args:
        filename (str): Path of the file to read.

    Returns:
        list[str]: One entry per line. An empty list is returned (and a
        message printed) when the file does not exist.
    """
    try:
        with open(filename, "r", encoding="utf-8") as file:
            # str.strip() removes the trailing newline as well as any other
            # leading/trailing whitespace on the line
            return [line.strip() for line in file]
    except FileNotFoundError:
        print("File not found.")
        return []


def write_list_to_file(file_path, text_list):
    """Write every element of ``text_list`` to ``file_path``, one per line.

    Each element is converted with ``str()`` and followed by a newline. The
    file is opened in write mode with UTF-8 encoding, so any existing content
    is replaced.

    Args:
        file_path (str): Destination file.
        text_list (iterable): Items to write.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        out.writelines(str(entry) + "\n" for entry in text_list)


def take_random_elements(input_list, num_elements):
    """Pick ``num_elements`` distinct items from ``input_list`` at random.

    Args:
        input_list (list): Population to sample from.
        num_elements (int): How many items to draw.

    Returns:
        list: The sampled items, or an empty list (after printing an error)
        when more items are requested than the list holds.
    """
    if num_elements <= len(input_list):
        return random.sample(input_list, num_elements)

    print("Error: Number of elements to take exceeds the length of the list.")
    return []


def write_jsonl(list_of_dicts, filename):
    """Serialise each item of ``list_of_dicts`` as one JSON line in ``filename``.

    The file is opened in write mode, so previous content is replaced. A
    success message is printed once everything has been written. Any
    exception raised along the way is caught and printed instead of
    propagating.

    Args:
        list_of_dicts (iterable): Objects to serialise with ``json.dumps``.
        filename (str): Destination file.
    """
    try:
        with open(filename, "w") as sink:
            # The generator is consumed lazily, one write per record
            sink.writelines(json.dumps(record) + "\n" for record in list_of_dicts)
        print(f"Data written to {filename} successfully.")
    except Exception as e:
        print(f"Error occurred while writing to {filename}: {e}")


def append_dict_to_jsonl(file_path, dictionary):
    """
    Add one dictionary to the end of a .jsonl file as a single JSON line.

    The file is opened in append mode with UTF-8 encoding, so it is created
    when it does not exist yet. Any exception raised while opening,
    serialising or writing is caught and printed rather than propagated.

    Args:
        file_path (str): The path to the .jsonl file.
        dictionary (dict): The dictionary to be appended as a JSON object.
    """
    try:
        with open(file_path, "a", encoding="utf-8") as sink:
            sink.write(f"{json.dumps(dictionary)}\n")
    except Exception as e:
        print(f"An error occurred: {e}")

def read_jsonl_file(file_path):
    """Load a .jsonl file and strip the bulky fields from every record.

    Each non-blank line is parsed as a JSON object. The "input", "output"
    and "annotations" keys are removed when present; all other keys are kept.
    Records that lack one of those keys are accepted rather than raising
    ``KeyError``, and blank lines are skipped.

    Args:
        file_path (str): Path of the .jsonl file to read.

    Returns:
        list[dict]: One trimmed dictionary per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dropped_keys = ("input", "output", "annotations")
    data_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            data_dict = json.loads(line)
            for key in dropped_keys:
                data_dict.pop(key, None)
            data_list.append(data_dict)
    return data_list

In [3]:
import concurrent.futures

from factscore.atomic_facts import AtomicFactGenerator

# Module-level AtomicFactGenerator instance; process_data_single calls
# generator.run() on it for every data point it annotates.
# NOTE(review): the two positional arguments look like a key-file path
# ("api.key") and a demonstrations directory ("demos") — confirm against the
# AtomicFactGenerator signature, which is not visible in this notebook.
generator = AtomicFactGenerator(
    "api.key", "demos", gpt3_cache_file="gpt_cache.pkl"
)

def process_data_single(dp):
    """Attach atomic-fact annotations to one data point unless already done.

    A data point counts as already processed when a dict built from its
    "topic" and "num_response" values is found in the module-level
    ``annotated_bio`` collection. Otherwise ``generator.run`` is called on
    ``dp["output"]`` and its first return value is turned into annotations.

    Args:
        dp (dict): Must provide the "topic", "num_response" and "output" keys.

    Returns:
        dict | None: The same ``dp`` object, mutated in place with a new
        "annotations" list, or None when the data point was skipped.
    """
    identity = {"topic": dp["topic"], "num_response": dp["num_response"]}
    if identity in annotated_bio:
        return None

    atomic_facts, _ = generator.run(dp["output"])
    # Each entry is indexed as (text, iterable of fact strings)
    dp["annotations"] = [
        {
            "text": entry[0],
            "model-atomic-facts": [{"text": fact} for fact in entry[1]],
        }
        for entry in atomic_facts
    ]
    return dp


def process_data_multi(data, output_path="Llama-1-7B-facts.jsonl"):
    """Annotate a batch of data points in parallel and append the results.

    Every item of ``data`` is submitted to a thread pool running
    ``process_data_single``. Results are collected in submission order and
    each non-None result is appended to ``output_path`` from the calling
    thread via ``append_dict_to_jsonl``.

    Args:
        data (iterable): Data points to process.
        output_path (str): .jsonl file that receives the annotated data
            points. Defaults to the file name this notebook has always used.

    Raises:
        Exception: Whatever ``process_data_single`` raised for a data point
            is re-raised by ``future.result()``.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_data_single, d) for d in data]

        # result() blocks until that future has finished, so no separate
        # wait() call is needed; iterating the list keeps the input order.
        for future in futures:
            annotated_dp = future.result()
            if annotated_dp is not None:
                append_dict_to_jsonl(output_path, annotated_dp)

[nltk_data] Downloading package punkt to /home/haznitrama/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
from tqdm import tqdm

# Records already written by an earlier run; process_data_single looks them up
# in annotated_bio and skips them. The facts file is only created once the
# first result is appended, so on a first run it does not exist yet.
try:
    annotated_bio = read_jsonl_file("Llama-1-7B-facts.jsonl")
except FileNotFoundError:
    annotated_bio = []

# Number of data points handed to process_data_multi at a time
BATCH_SIZE = 2

with open("Llama-1-7B.jsonl", encoding="utf-8") as f:
    batch_data = []
    for line in tqdm(f):
        if not line.strip():
            # Skip blank lines instead of failing in json.loads
            continue
        dp = json.loads(line)
        batch_data.append(dp)
        if len(batch_data) == BATCH_SIZE:
            process_data_multi(batch_data)
            batch_data = []
    # Flush the final, possibly smaller, batch
    if batch_data:
        process_data_multi(batch_data)

3550it [05:34, 10.61it/s]  
