In [None]:
import json
import random

def read_file_lines(filename):
    """Read a text file and return its lines as a list of strings.

    Each line has leading and trailing whitespace removed (this includes
    the trailing newline). Lines that are empty after stripping are kept
    as empty strings.

    Args:
        filename: Path of the file to read.

    Returns:
        The list of stripped lines, or an empty list when the file does
        not exist (a message is printed in that case).
    """
    try:
        with open(filename, "r") as handle:
            # Iterating the handle yields the same lines as readlines().
            return [raw_line.strip() for raw_line in handle]
    except FileNotFoundError:
        print("File not found.")
    return []

def write_jsonl(list_of_dicts, filename):
    """Write an iterable of dictionaries to a file in JSON Lines format.

    Every item is serialised with ``json.dumps`` and written on its own
    line. The target file is opened in write mode, so existing content is
    replaced.

    Args:
        list_of_dicts: Items to serialise, one per output line.
        filename: Path of the file to write.

    Returns:
        None. A success message is printed when everything was written;
        any exception is caught and reported with a printed message
        instead of being re-raised.
    """
    try:
        with open(filename, "w") as out_file:
            # One JSON document per line, each terminated by a newline.
            out_file.writelines(
                json.dumps(record) + "\n" for record in list_of_dicts
            )
        print(f"Data written to {filename} successfully.")
    except Exception as err:
        print(f"Error occurred while writing to {filename}: {err}")

In [None]:
def common_elements(list1, list2):
    """Return the elements of ``list1`` that also occur in ``list2``.

    The order of ``list1`` is preserved and duplicates in ``list1`` are
    kept, so the result can be longer than the number of distinct shared
    values.

    Args:
        list1: Sequence whose elements are tested and returned.
        list2: Sequence that defines which elements count as "common".

    Returns:
        A new list with every element of ``list1`` that is in ``list2``.
    """
    # Membership tests against a set are O(1) on average, which avoids
    # rescanning list2 for every element of list1.
    try:
        lookup = set(list2)
    except TypeError:
        # list2 holds unhashable values; fall back to scanning the list.
        lookup = list2

    try:
        return [element for element in list1 if element in lookup]
    except TypeError:
        # An unhashable element of list1 cannot be tested against a set;
        # redo the comparison with plain equality scanning.
        return [element for element in list1 if element in list2]

def append_unique_elements(list_a, list_b, n, rng=None):
    """Extend ``list_a`` in place with up to ``n`` new elements from ``list_b``.

    Candidates are the distinct elements of ``list_b`` that are not already
    in ``list_a``. When there are more candidates than requested, ``n`` of
    them are chosen at random; otherwise all candidates are appended in the
    order they first appear in ``list_b``.

    Args:
        list_a: List to extend. It is modified in place.
        list_b: Source of candidate elements (elements must be hashable).
        n: Maximum number of elements to append. Zero or a negative value
            appends nothing.
        rng: Optional object providing ``sample(population, k)``, for
            example a ``random.Random`` instance. Defaults to the
            ``random`` module's shared generator.

    Returns:
        ``list_a`` itself, after the new elements have been appended.
    """
    # random.sample raises ValueError for a negative size, so treat
    # "nothing left to add" as a no-op instead.
    if n <= 0:
        return list_a

    existing = set(list_a)
    # dict.fromkeys removes duplicates while keeping list_b's order. Set
    # iteration order for strings can differ between interpreter runs,
    # which would make even a seeded draw unrepeatable.
    candidates = [item for item in dict.fromkeys(list_b) if item not in existing]

    if len(candidates) > n:
        sampler = random if rng is None else rng
        candidates = sampler.sample(candidates, n)

    list_a.extend(candidates)
    return list_a

# Number of names the final selection should contain.
TARGET_ENTITY_COUNT = 355

# read_file_lines returns an empty list (after printing a message) when a
# file is missing, so both names are always lists.
named_entities = read_file_lines("./names_clean.txt")
chosen_entities = read_file_lines("./chosen_names.txt")

# Previously chosen names that are also present in the cleaned name list.
overlap = common_elements(chosen_entities, named_entities)

# Top up the selection with random names from the cleaned list. The count is
# clamped at zero because random.sample rejects a negative sample size, which
# would happen if the overlap already exceeded the target.
missing_count = max(0, TARGET_ENTITY_COUNT - len(overlap))
new_chosen_names = append_unique_elements(overlap, named_entities, missing_count)

In [None]:
def filter_dicts_by_topics(file_path, topic_list):
    """Load a JSON Lines file and keep the records whose topic is wanted.

    Each non-blank line is parsed with ``json.loads`` and the record is
    kept when the value under its ``'topic'`` key is contained in
    ``topic_list``. Records without a ``'topic'`` key are compared using
    ``None``.

    Args:
        file_path: Path of the JSON Lines file to read.
        topic_list: Collection of topic values to keep.

    Returns:
        A list of the parsed records that matched, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Build the lookup once so each line costs an O(1) membership test
    # instead of a scan over the whole topic list.
    try:
        wanted = set(topic_list)
    except TypeError:
        # Unhashable topic values: keep using the collection as given.
        wanted = topic_list

    filtered_dicts = []
    with open(file_path, 'r') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) are not JSON
            # documents and would make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            topic = data.get('topic')
            try:
                matched = topic in wanted
            except TypeError:
                # The record's topic is unhashable (a JSON array/object);
                # compare by equality against the original collection.
                matched = topic in topic_list
            if matched:
                filtered_dicts.append(data)
    return filtered_dicts

# Filter each source file down to the chosen names and write the result to
# its "new-" counterpart. The files are handled in the listed order, so
# filtered_data ends up holding the factscore records.
for _source_path, _output_path in (
    ("./data/Llama-1-7B-facts.jsonl", "./data/new-Llama-1-7B-facts.jsonl"),
    ("./data/Llama-1-7B-factscore.jsonl", "./data/new-Llama-1-7B-factscore.jsonl"),
):
    filtered_data = filter_dicts_by_topics(_source_path, new_chosen_names)
    write_jsonl(filtered_data, _output_path)