In [64]:
#!/usr/bin/env python3

"""
- Script name: scu.projectDrones.annotationConverter
- Author: Dan Bright, cosmoid@tuta.io
- License: Free & Open Source, GPLv3.
- Description: A script to convert formatting of annotations from spaCy to GPT
"""

import os, json, io
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [66]:
def read_json_files(directory_path: str) -> list[dict]:
    """Read all JSON files in the given directory and return their contents
    as a list of python dictionaries."""
    annotations: list[dict] = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".json"):
            with open(os.path.join(directory_path, filename), "r") as file:
                annotations.append(json.load(file))
    return annotations

In [67]:
def convert_to_gpt(spacy_annotations: list[dict]) -> list[dict]:
    gpt_annotations: list[dict] = []
    prompt_separator: str = "\n\n###\n\n"
    completion_separator: str = "END"
    ent_class_separator: str = "\n"
    no_entity_token: str = "NULL"
    for doc in spacy_annotations:
        entity_classes: list = doc["classes"]
        for annotations in doc["annotations"]:
            if annotations:
                prompt: str = annotations[0]
                entities: list[list] = annotations[1]["entities"]
                completion: str = ""
                for cls in entity_classes:
                    if cls not in [e[2] for e in entities]:
                        completion += (
                            f"{cls}:['{no_entity_token}']{ent_class_separator}"
                        )
                        pass
                    else:
                        tokens: list[str] = []
                        for ent in entities:
                            if cls == ent[2]:
                                tokens.append(prompt[ent[0] : ent[1]])
                        tokens_str: str = ",".join([f"'{token}'" for token in tokens])
                        cls_completion: str = (
                            f"{cls}:[{tokens_str}]{ent_class_separator}"
                        )
                        completion += cls_completion
                gpt_annotations.append(
                    {
                        "prompt": f"{prompt}{prompt_separator}",
                        "completion": f" {completion} {completion_separator}",
                    }
                )
    return gpt_annotations

In [68]:
def write_to_file(annotations: list[dict], tmp_file: str, output_file: str) -> None:
    print(annotations)
    with open(output_file, "w") as f:
        for a in annotations:
            json.dump(a, f)
            f.write("\n")

In [69]:
output_file = "./annotations.jsonl"
tmp_file = "./tmp.jsonl"
spacy_annotations = read_json_files("../data/sample/train/json/")
gpt_annotations = convert_to_gpt(spacy_annotations)

write_to_file(gpt_annotations, tmp_file, output_file)

[{'prompt': 'PRELIM INFO FROM FAA OPS: SAVANNAH, GA / UAS INCIDENT / 1133E / E-ROC ADVISED  PIPER P28A REPORTED A BLACK QUAD COPTER UAS WHILE S BOUND AT 1,200 FEET 4 NW SAV. NO EVASIVE ACTION TAKEN. LEO NOTIFICATION NOT REPORTED. \n\n###\n\n', 'completion': " ATC_CITY:['SAVANNAH']\nATC_STATE:['GA']\nICDT_DATE:['NULL']\nICDT_TIME:['1133E']\nICDT_LOC:['4 NW SAV.']\nUAS_COLOR:['BLACK']\nUAS_HEADING:['NULL']\nUAS_SIZE:['NULL']\nUAS_REL_ALT:['NULL']\nUAS_ACT_ALT:['NULL']\nAC_ALT:['1,200 FEET']\nAC_TYPE:['PIPER P28A']\nAC_HEADING:['S BOUND']\nFT_ID:['NULL']\nFT_OPTOR:['NULL']\nUAS_TYPE:['QUAD COPTER']\n END"}, {'prompt': 'UAS MOR Alert for SAV\n\n###\n\n', 'completion': " ATC_CITY:['NULL']\nATC_STATE:['NULL']\nICDT_DATE:['NULL']\nICDT_TIME:['NULL']\nICDT_LOC:['NULL']\nUAS_COLOR:['NULL']\nUAS_HEADING:['NULL']\nUAS_SIZE:['NULL']\nUAS_REL_ALT:['NULL']\nUAS_ACT_ALT:['NULL']\nAC_ALT:['NULL']\nAC_TYPE:['NULL']\nAC_HEADING:['NULL']\nFT_ID:['NULL']\nFT_OPTOR:['NULL']\nUAS_TYPE:['NULL']\n END"}, {'pr

In [None]:
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
!openai tools fine_tunes.prepare_data -f annotations.jsonl -q