In [2]:
#!/usr/bin/env python3

"""
- Script name: annotation_converter
- Author: Dan Bright, cosmoid@tuta.io
- Description: A script to convert formatting of annotations from spaCy to GPT
- Version 1.2
"""

import os, json
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before it is read below.
load_dotenv()

# OpenAI API key taken from the environment; None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
def read_json_files(directory_path: str) -> list[dict]:
    """Read all JSON files in the given directory and return their contents
    as a list of python dictionaries.

    Files are visited in sorted filename order so the returned list (and any
    dataset built from it) is identical on every run and platform;
    ``os.listdir`` on its own yields entries in arbitrary order.

    Args:
        directory_path: Directory to scan. Only entries whose name ends in
            ``.json`` are read; everything else is ignored.

    Returns:
        The parsed content of each JSON file, one list item per file, ordered
        by filename.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist.
        json.JSONDecodeError: If a ``.json`` file does not hold valid JSON.
    """
    annotations: list[dict] = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(directory_path, filename)
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default text encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            annotations.append(json.load(file))
    return annotations

In [4]:
def convert_to_gpt(
    spacy_annotations: list[dict],
    *,
    prompt_separator: str = "\n\n###\n\n",
    completion_separator: str = "\n\nEND\n\n",
    ent_class_separator: str = "\n",
    no_entity_token: str = "NULL",
) -> list[dict]:
    """Convert spaCy formatted annotations to GPT formatted.

    Each input document must provide ``doc["classes"]`` (the entity labels to
    report, in output order) and ``doc["annotations"]``, a sequence of
    ``[text, {"entities": [[start, end, label], ...]}]`` items. Falsy items in
    ``doc["annotations"]`` are skipped.

    For every annotated text one record is produced:

    * ``"prompt"``: the text followed by ``prompt_separator``.
    * ``"completion"``: a leading space, then one ``LABEL:['tok1','tok2']``
      entry per class (each terminated by ``ent_class_separator``), then
      ``completion_separator``. A class with no matching entity is emitted as
      ``LABEL:['<no_entity_token>']``.

    Entities whose label is not listed in ``doc["classes"]`` are ignored.
    Token text is inserted verbatim between single quotes (no escaping).

    Args:
        spacy_annotations: Documents in the structure described above.
        prompt_separator: Suffix appended to every prompt.
        completion_separator: Suffix appended to every completion.
        ent_class_separator: Terminator written after each class entry.
        no_entity_token: Placeholder used for classes without entities.

    Returns:
        A list of ``{"prompt": ..., "completion": ...}`` dictionaries, in
        input order.

    Raises:
        KeyError: If a document lacks ``"classes"`` / ``"annotations"`` or an
            annotation lacks ``"entities"``.
    """
    gpt_annotations: list[dict] = []
    for doc in spacy_annotations:
        entity_classes: list = doc["classes"]
        for annotations in doc["annotations"]:
            if not annotations:
                continue
            prompt: str = annotations[0]
            entities: list[list] = annotations[1]["entities"]
            completion: str = ""
            for cls in entity_classes:
                # One pass per class: collect the text span of every entity
                # carrying this label, preserving the order in `entities`.
                tokens: list[str] = [
                    prompt[ent[0] : ent[1]] for ent in entities if cls == ent[2]
                ]
                if tokens:
                    tokens_str: str = ",".join(f"'{token}'" for token in tokens)
                else:
                    # No entity of this class in the text.
                    tokens_str = f"'{no_entity_token}'"
                completion += f"{cls}:[{tokens_str}]{ent_class_separator}"
            gpt_annotations.append(
                {
                    "prompt": f"{prompt}{prompt_separator}",
                    "completion": f" {completion}{completion_separator}",
                }
            )
    return gpt_annotations

In [5]:
def write_to_file(annotations: list[dict], tmp_file: str, output_file: str) -> None:
    """Write annotations to file in JSON Lines format (one object per line).

    Args:
        annotations: Records to serialise; each is written with ``json.dump``
            followed by a newline.
        tmp_file: Not used by this function. Kept in the signature so existing
            positional callers continue to work.
        output_file: Destination path. An existing file is overwritten;
            missing parent directories are created.
    """
    # Create the destination directory up front so a fresh checkout does not
    # fail with FileNotFoundError. dirname is "" for a bare filename.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        for a in annotations:
            json.dump(a, f)
            f.write("\n")

In [7]:
# Paths: spaCy-format JSON input directory, scratch file handed to
# write_to_file, and the GPT-format JSONL destination.
input_dir = "../../data/sample/train/json/"
tmp_file = "../../data/tmp/tmp.jsonl"
output_file = "../../data/sample/train/training_data_input/gpt/annotations.jsonl"

# Pipeline: load the spaCy annotations, convert them to GPT prompt/completion
# records, then persist the result.
json_spacy_format = read_json_files(input_dir)
gpt_annotations = convert_to_gpt(json_spacy_format)
write_to_file(gpt_annotations, tmp_file, output_file)

[{'prompt': 'PRELIM INFO FROM FAA OPS: OKLAHOMA CITY, OK / UAS INCIDENT / 1609C / ZFW ADVISED PIPER PA46 REPORTED A UAS 1,000 FEET BELOW ACFT ALTITUDE OF 11,000 FEET 35 E OKLAHOMA CITY, OK. UNKN IF LEO NOTIFIED.\n\n###\n\n', 'completion': " ICDT_DATE:['NULL']\nICDT_TIME:['1609C']\nICDT_LOC:['35 E OKLAHOMA CITY, OK.']\nUAS_COLOR:['NULL']\nUAS_HEADING:['NULL']\nUAS_SIZE:['NULL']\nUAS_REL_ALT:['1,000 FEET BELOW']\nUAS_ACT_ALT:['NULL']\nAC_ALT:['11,000 FEET']\nAC_TYPE:['PIPER PA46']\nFT_NAME:['NULL']\nFT_OPTOR:['NULL']\nFT_ROUTE:['NULL']\nAC_HEADING:['NULL']\nUAS_REL_POS:['NULL']\nUAS_ACT_SPEED:['NULL']\nUAS_REL_SPEED:['NULL']\nAC_ACT_SPEED:['NULL']\nAC_REL_SPEED:['NULL']\nREP_CITY:['OKLAHOMA CITY']\nREP_STATE:['OK']\nATC_ID:['ZFW']\nUAS_DESC:['NULL']\n END"}, {'prompt': 'PRELIM INFO FROM FAA OPS: SANTA ANA, CA / UAS INCIDENT / 1020P / UNITED 533, A320, REPORTED A FLYING OBJECT AT 8,000 FEET THAT APPEARED TO BE A UAS IN RELATIVELY CLOSE PROXIMITY. NO LEOS NOTIFIED. \n\n###\n\n', 'completio

In [9]:
# run openAI annotations preparation script (optional)
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
!openai tools fine_tunes.prepare_data -f ../../data/sample/train/training_data_input/gpt/annotations.jsonl -q

Analyzing...

- Your file contains 55 prompt-completion pairs. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examples
- More than a third of your `prompt` column/key is uppercase. Uppercase prompts tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details
- More than a third of your `completion` column/key is uppercase. Uppercase completions tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details
- All prompts end with suffix `\n\n###\n\n`
- All completions start with prefix ` ICDT_DATE:['`. Most of the ti