In [64]:
#!/usr/bin/env python3

"""
- Script name: annotation_converter
- Author: Dan Bright, cosmoid@tuta.io
- Description: A script to convert formatting of annotations from spaCy to GPT
"""

import os, json
from dotenv import load_dotenv

# Let python-dotenv populate the environment (typically from a local .env file)
# before the key is looked up.
load_dotenv()

# None when the variable is not defined anywhere.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [66]:
def read_json_files(directory_path: str) -> list[dict]:
    """Read every ``.json`` file in a directory and return the parsed contents.

    Files are visited in sorted filename order so the returned list is
    reproducible across runs and platforms (``os.listdir`` makes no ordering
    guarantee). Files are decoded as UTF-8 rather than the platform-dependent
    default encoding.

    Args:
        directory_path: Directory to scan. Sub-directories are not searched.

    Returns:
        One parsed JSON value per ``.json`` file, ordered by filename. An
        empty list when the directory holds no ``.json`` files.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    annotations: list[dict] = []
    # sorted() pins the order; os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            annotations.append(json.load(file))
    return annotations

In [67]:
def convert_to_gpt(spacy_annotations: list[dict]) -> list[dict]:
    """Turn spaCy-style annotation documents into GPT prompt/completion records.

    Each document is read through its ``"classes"`` list and its
    ``"annotations"`` list. Every non-empty annotation is indexed as
    ``[text, {"entities": [[start, end, label], ...]}]`` and yields one record:

    * ``"prompt"``: the text followed by the prompt separator;
    * ``"completion"``: one ``label:['tok1','tok2']`` line per class, in the
      order given by ``"classes"``, with ``['NULL']`` for a class that has no
      entity in the text, wrapped in a leading space and a trailing ``" END"``.

    Empty or falsy annotations are skipped.
    """
    prompt_end = "\n\n###\n\n"
    completion_end = "END"
    line_end = "\n"
    missing = "NULL"

    records: list[dict] = []
    for document in spacy_annotations:
        labels: list = document["classes"]
        for annotation in document["annotations"]:
            if not annotation:
                continue
            text: str = annotation[0]
            spans: list[list] = annotation[1]["entities"]

            lines: list[str] = []
            for label in labels:
                # Quoted surface strings of every span tagged with this label.
                quoted = [
                    f"'{text[span[0] : span[1]]}'"
                    for span in spans
                    if label == span[2]
                ]
                inner = ",".join(quoted) if quoted else f"'{missing}'"
                lines.append(f"{label}:[{inner}]{line_end}")
            body = "".join(lines)

            records.append(
                {
                    "prompt": f"{text}{prompt_end}",
                    "completion": f" {body} {completion_end}",
                }
            )
    return records

In [68]:
def write_to_file(annotations: list[dict], tmp_file: str, output_file: str) -> None:
    """Write annotations to ``output_file`` in JSON Lines format.

    Each annotation is serialised with ``json.dump`` onto its own line. The
    target file is created or truncated.

    Args:
        annotations: Records to serialise, one per output line.
        tmp_file: Currently unused; kept so existing callers keep working.
        output_file: Path of the ``.jsonl`` file to write.

    Raises:
        OSError: If ``output_file`` cannot be opened for writing.
        TypeError: If a record is not JSON serialisable.
    """
    # The earlier debug print of the whole dataset is gone: it flooded the
    # cell output with every record on each run.
    # Explicit UTF-8 keeps the file independent of the platform default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        for record in annotations:
            json.dump(record, f)
            f.write("\n")

In [None]:
# Where the generated files go, relative to the notebook's working directory.
output_file = "../annotations.jsonl"
tmp_file = "../tmp.jsonl"

# Load the spaCy formatted annotation documents.
spacy_annotations = read_json_files(directory_path="../../data/sample/train/json/")

# Convert the spaCy formatted annotations into GPT prompt/completion records.
gpt_annotations = convert_to_gpt(spacy_annotations=spacy_annotations)

# Persist the GPT formatted annotations, one JSON object per line.
write_to_file(
    annotations=gpt_annotations,
    tmp_file=tmp_file,
    output_file=output_file,
)

In [None]:
# run openAI annotations preparation script (optional)
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
!openai tools fine_tunes.prepare_data -f annotations.jsonl -q