# In [59]:
#!/usr/bin/env python3

"""
- Script name: run_ner_model.
- Author: Dan Bright, cosmoid@tuta.io.
- Description: Script to run NER model on raw data.
- Version: 0.1.
"""

# Install required packages if not already present on system
#!pip install matplotlib spacy numpy pandas spacy_stanza spacy-transformers openai python-dotenv openpyxl

# declare imports
import spacy, openai, os, re, html, json, ast
import pandas as pd
from spacy import language
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

# Out: True

# In [60]:
class CleanData:
    """
    Class that cleans and prepares raw text records ahead of NER.

    Calling the class does NOT yield a `CleanData` instance: `__new__` runs
    the enabled filters and returns the cleaned records directly.

    Consumes:
        - input_data: list[tuple] = list of input data to clean, in form [(record_id:int, text:str)]
        - separate_slashes: bool = whether to separate slashes by a space [True|False]
        - remove_linebreaks: bool = whether to remove linebreaks & join by a space [True|False]
        - remove_non_alphanum: bool = whether to remove all non-alphanumeric characters [True|False]
        - ensure_encoding: bool = ensure all characters are correctly encoded (True|False)
    Produces:
        - list of cleaned data, in form [(record_id:int, text:str)]
    Notes:
        - Filters run in a fixed order: entity decoding, slash separation,
          linebreak removal, non-alphanumeric removal.
    """

    def __new__(
        cls,
        input_data: "list[tuple] | None" = None,  # Default to no records
        separate_slashes: bool = True,  # Default to True
        remove_linebreaks: bool = True,  # Default to True
        remove_non_alphanum: bool = True,  # Default to True
        ensure_encoding: bool = True,  # Default to True
    ) -> list[tuple]:
        obj = super().__new__(cls)
        return obj._run_filters(
            docs=[] if input_data is None else input_data,
            separate_slashes=separate_slashes,
            remove_linebreaks=remove_linebreaks,
            remove_non_alphanum=remove_non_alphanum,
            ensure_encoding=ensure_encoding,
        )

    def _run_filters(
        self,
        docs,
        separate_slashes,
        remove_linebreaks,
        remove_non_alphanum,
        ensure_encoding,
    ) -> list[tuple]:
        """
        Method to iterate the data & run filters.
        Returns: list of cleaned data in form [(record_id:int, text:str)]
        """
        filtered_docs: list[tuple] = []
        for record in docs:
            record_txt: str = record[1]
            # Entities must be decoded BEFORE any character stripping: the
            # non-alphanumeric filter removes `&`, `#` and `;`, after which
            # `&amp;` has already become `amp` and can no longer be decoded.
            if ensure_encoding:
                record_txt = self._ensure_encoding(record_txt)
            if separate_slashes:
                record_txt = self._separate_slashes(record_txt)
            if remove_linebreaks:
                record_txt = self._remove_linebreaks(record_txt)
            if remove_non_alphanum:
                record_txt = self._remove_non_alphanum(record_txt)
            filtered_docs.append((record[0], record_txt))
        return filtered_docs

    @staticmethod
    def _ensure_encoding(text: str) -> str:
        """
        Method to ensure characters are encoded correctly
        (i.e., HTML entities are converted to the characters they represent).
        """
        return html.unescape(text)

    @staticmethod
    def _separate_slashes(text: str) -> str:
        """
        Method to surround slashes with whitespace.
        Only slashes that have no whitespace on EITHER side are padded
        (e.g. `a/b` -> `a / b`); a slash that already touches whitespace
        on one side is left unchanged.
        """
        return re.sub(r"(?<!\s)/(?!\s)", " / ", text)

    @staticmethod
    def _remove_linebreaks(text: str) -> str:
        """
        Method to remove line breaks, joining the lines with a single space.
        """
        return " ".join(text.splitlines())

    @staticmethod
    def _remove_non_alphanum(text: str) -> str:
        """
        Method to remove all non-alphanumeric characters, except:
          - whitespaces
          - dots
          - forward slashes
        Runs of whitespace left behind are collapsed to a single space.
        """
        return re.sub(r"\s+", " ", re.sub(r"[^\w\s\.\/]+", "", text))

# In [71]:
class RunNERModel:
    """
    Class that runs a NER model over a batch of documents.

    The model is run from `__init__`; call `get_results()` afterwards to
    obtain the formatted output.

    Consumes:
        - jupyter: bool = whether script is being run as a Jupyter notebook (stored, not otherwise used here).
        - model_uri: str = URI of model (local path or name on remote API)
        - model_type: str = string representing model type (from: [GPT, SPACY]), case-insensitive
        - docs: list[tuple] = list of documents to run NER on, in form [(record_id:int, text:str)]
        - openai_key: str = OpenAI API key
        - gpt_prompt_sep: str = GPT prompt separator token (if any)
        - gpt_comp_sep: str = GPT completion separator token (if any)
        - output_record_id_name: str = Key / column name to assign to record ID
    Produces:
        - list of python dictionaries containing NER results, in form
          [{"RECORD_ID": 1, "REP_CITY": ["New York"]}]
    Notes:
        - Any `model_type` other than GPT or SPACY runs nothing and leaves the results empty.
        - The SPACY path currently only loads the model: documents are not
          processed and no results are produced for it.
    """

    def __init__(
        self,
        jupyter: bool = True,  # default True
        model_uri: str = "",  # no default
        model_type: str = "",  # no default
        docs: "list[tuple] | None" = None,  # default: no documents
        openai_key: str = "",  # no default
        gpt_prompt_sep: str = "",  # no default
        gpt_comp_sep: str = "",  # no default
        output_record_id_name: str = "",  # no default
    ) -> None:
        # define variables
        self._jupyter: bool = jupyter
        self._model_type: str = model_type.upper()
        self._model_uri: str = model_uri
        self._spacy_nlp: "language.Language | None" = None  # loaded spaCy pipeline
        # a fresh list per instance, so instances never share a default
        self._docs: list[tuple] = [] if docs is None else docs
        self._openai_key: str = openai_key
        self._gpt_prompt_sep: str = gpt_prompt_sep
        self._gpt_comp_sep: str = gpt_comp_sep
        self._gpt_results: list[tuple] = []  # in form [(record_num, str)]
        self._spacy_results: list[tuple] = []  # not populated yet
        self._results_formatted: list[dict] = []  # in form [{id_name: record_num, label: value}]
        self._output_record_id_name: str = output_record_id_name
        # run methods [note: do not change running order]
        if self._model_type == "GPT":
            self._run_gpt()
            self._format_gpt_results()
        elif self._model_type == "SPACY":
            self._run_spacy()
            self._format_spacy_results()

    def get_results(self) -> list[dict]:
        # return the formatted results, one dictionary per document
        return self._results_formatted

    def _run_spacy(self) -> None:
        # method to load the spaCy model (documents are not processed yet)
        self._spacy_nlp = spacy.load(self._model_uri)

    def _run_gpt(self) -> None:
        # method to run GPT: one completion request per document, keeping
        # the text of the first choice alongside the record ID
        openai.api_key = self._openai_key
        for doc in self._docs:
            self._gpt_results.append(
                (
                    doc[0],
                    openai.Completion.create(
                        model=self._model_uri,
                        prompt=f"{doc[1]}{self._gpt_prompt_sep}",
                        max_tokens=1500,
                        temperature=0.2,
                        top_p=0.1,
                        frequency_penalty=0,
                        presence_penalty=0,
                        stop=[self._gpt_comp_sep],
                    )["choices"][0]["text"],
                )
            )

    def _format_gpt_results(self) -> None:
        # method to format GPT results ahead of export.
        # Each completion is read line by line as `KEY: <python literal>`;
        # lines without a colon are ignored.
        for record_id, completion in self._gpt_results:
            formatted: dict = {self._output_record_id_name: record_id}
            for line in completion.splitlines():
                if ":" not in line:
                    continue
                key, value = line.split(":", 1)
                # strip first: leading whitespace in the literal is only
                # tolerated by ast.literal_eval from Python 3.10 onwards
                value = value.strip()
                try:
                    formatted[key.strip()] = ast.literal_eval(value)
                except (ValueError, SyntaxError):
                    # the model emitted something that is not a Python
                    # literal: keep the raw text rather than losing the
                    # whole batch to one malformed line
                    formatted[key.strip()] = value
            self._results_formatted.append(formatted)

    def _format_spacy_results(self) -> None:
        # method to format spaCy results ahead of export (not implemented yet)
        pass

# In [62]:
class ExportResults:
    """
    Class that exports the results as JSON and XLSX files

    Consumes:
        - output_record_id_name: str = Key / column name to assign to record ID
        - output_xlsx_sort_col: str =  name of col to sort output XLSX by (string)
        - output_json_path: str = path to output JSON file
        - output_xlsx_path: str = path to output XLSX file
        - output_xlsx_melt: bool = reshape output XLSX using Pandas to yield RECORD_ID, NAME, VALUE
        - results_formatted: list[dict] = list of results
    Produces:
        - JSON formatted file containing tokens (strings) that were extracted.
        - EXCEL formatted file containing tokens (strings) that were extracted.
    Notes:
        - Nothing is written on construction; call `write_json_output()` and / or
          `write_xlsx_output()`.
        - Sorting by `output_xlsx_sort_col` is only applied when the output is melted.
    """

    def __init__(
        self,
        output_record_id_name: str = "RECORD_ID",  # default RECORD_ID
        output_xlsx_sort_col: str = "RECORD_ID",  # default RECORD_ID
        output_json_path: str = "",  # no default
        output_xlsx_path: str = "",  # no default
        output_xlsx_melt: bool = False,  # default False
        results_formatted: "list[dict] | None" = None,  # default empty list
    ) -> None:
        # define variables
        self._output_record_id_name: str = output_record_id_name
        self._output_xlsx_sort_col: str = output_xlsx_sort_col
        self._output_json_path: str = output_json_path
        self._output_xlsx_path: str = output_xlsx_path
        self._output_xlsx_melt: bool = output_xlsx_melt
        # a fresh list per instance: this attribute is public, so a shared
        # default list would leak results between instances
        self.results_formatted: list[dict] = (
            [] if results_formatted is None else results_formatted
        )

    def write_json_output(self) -> None:
        # writes formatted output to JSON file
        with open(self._output_json_path, "w", encoding="utf-8") as fp:
            json.dump(self.results_formatted, fp)

    def write_xlsx_output(self) -> None:
        # writes formatted output to EXCEL file
        self._build_frame().to_excel(self._output_xlsx_path)

    def _build_frame(self) -> pd.DataFrame:
        # builds the (optionally melted & sorted) table that is written to EXCEL
        from io import StringIO

        # read_json expects a path or file-like object; passing the JSON text
        # itself is deprecated, so it is wrapped in a buffer
        df = pd.read_json(StringIO(json.dumps(self.results_formatted)))
        # with no results there is no record ID column to melt on
        if self._output_xlsx_melt and not df.empty:
            # value_vars is left unset so that every column other than the
            # record ID is melted, including keys that only occur in later
            # records
            df = pd.melt(
                df,
                id_vars=[self._output_record_id_name],
                var_name="NAME",
                value_name="VALUE",
            )
            df = df.dropna(subset=["VALUE"]).sort_values(self._output_xlsx_sort_col)
        return df

# In [63]:
class ExecuteNER:
    """
    Class that runs the whole NER pipeline: read documents, clean them,
    split them into batches, run the model on each batch & export the results.

    The complete pipeline (including any file export) runs from `__init__`;
    call `get_results()` afterwards to obtain the collected results.

    Consumes:
        - jupyter: bool = whether script is being run as a Jupyter notebook.
        - doc_dir: str = path to directory containing input documents (must exist).
        - model_uri: str = URI of model (local path or name on remote API).
        - model_type: str = string representing model type (from: [GPT, SPACY]).
        - openai_key: str = OpenAI API key.
        - gpt_prompt_sep: str = GPT prompt separator token (if any).
        - gpt_comp_sep: str = GPT completion separator token (if any).
        - output_record_id_name: str = Key / column name to assign to record ID.
        - output_xlsx_sort_col: str =  name of col to sort output XLSX by (string).
        - output_json_path: str = path to output JSON file.
        - output_xlsx_path: str = path to output XLSX file.
        - output_xlsx_melt: bool = reshape output XLSX using Pandas to yield RECORD_ID, NAME, VALUE.
        - export_json: bool = export as JSON?
        - export_xlsx: bool = export as xlsx?
        - batch_size: int = processing batch size (must be 1 or greater).
    Produces:
        - JSON formatted file containing tokens (strings) that were extracted.
        - EXCEL formatted file containing tokens (strings) that were extracted.
    Notes:
        - Record number is derived from the filenames of the consumed text files, which
          MUST be named according to the convention of `record_n.txt`, where n is record number.
        - A batch that raises is reported on stdout and skipped; the remaining
          batches are still processed.
    """

    def __init__(
        self,
        jupyter,
        doc_dir,
        model_uri,
        model_type,
        openai_key,
        output_json_path,
        output_xlsx_path,
        output_xlsx_melt,
        output_xlsx_sort_col,
        output_record_id_name,
        export_json,
        export_xlsx,
        batch_size,
        gpt_prompt_sep,
        gpt_comp_sep,
    ) -> None:
        # define variables
        self._output_json_path = output_json_path
        self._output_xlsx_path = output_xlsx_path
        self._output_xlsx_melt = output_xlsx_melt
        self._output_xlsx_sort_col = output_xlsx_sort_col
        self._output_record_id_name = output_record_id_name
        self._export_json = export_json
        self._export_xlsx = export_xlsx
        self._batch_size = batch_size
        self._jupyter = jupyter
        self._model_uri = model_uri
        self._model_type = model_type
        self._openai_key = openai_key
        self._gpt_prompt_sep = gpt_prompt_sep
        self._gpt_comp_sep = gpt_comp_sep
        self._all_results: list[dict] = []  # results from all batches
        # strict=True: raises immediately if the directory does not exist
        self._doc_dir: Path = Path(doc_dir).resolve(strict=True)
        self._docs: list[tuple] = []  # in form [(record_id, text)]
        self._doc_batches: list[list] = []
        # run methods [note: do not change running order]
        self._get_docs()
        self._prepare_data()
        self._batchify()
        self._run_model()
        self._export_results()

    def get_results(self) -> list[dict]:
        # return the results collected from all batches
        return self._all_results

    def _get_docs(self) -> None:
        # read every `*.txt` file in the document directory, taking the
        # record ID from the first run of digits in the filename
        for filepath in self._doc_dir.glob("*.txt"):
            record_num = re.search(r"\d+", filepath.stem)
            if record_num is None:
                raise ValueError(
                    f"Cannot derive a record number from '{filepath.name}': "
                    "files must be named `record_n.txt`"
                )
            with open(filepath, "r") as file:
                self._docs.append((int(record_num.group()), file.read()))
        # sort once, after all files are read, by record ID
        self._docs.sort(key=lambda doc: doc[0])

    def _prepare_data(self) -> None:
        # clean the documents with the default set of filters
        self._docs = CleanData(self._docs)

    def _batchify(self) -> None:
        # split the documents into consecutive batches of `batch_size`
        if self._batch_size < 1:
            # a negative step would otherwise silently produce no batches
            raise ValueError(
                f"batch_size must be 1 or greater, got {self._batch_size}"
            )
        for i in range(0, len(self._docs), self._batch_size):
            self._doc_batches.append(self._docs[i : i + self._batch_size])

    def _run_model(self) -> None:
        # run the model on each batch, collecting the results
        for idx, batch in enumerate(self._doc_batches):
            try:
                self._all_results += RunNERModel(
                    jupyter=self._jupyter,
                    model_uri=self._model_uri,
                    model_type=self._model_type,
                    docs=batch,
                    openai_key=self._openai_key,
                    gpt_prompt_sep=self._gpt_prompt_sep,
                    gpt_comp_sep=self._gpt_comp_sep,
                    output_record_id_name=self._output_record_id_name,
                ).get_results()
            except Exception as e:
                # deliberately broad: one failed batch must not stop the rest
                print(f"An error occurred with batch {idx}: {str(e)}")

    def _export_results(self) -> None:
        # write the collected results to the requested output files
        export = ExportResults(
            # must match the key used when the results were built, otherwise
            # the melted XLSX export looks for the wrong ID column
            output_record_id_name=self._output_record_id_name,
            output_xlsx_sort_col=self._output_xlsx_sort_col,
            output_json_path=self._output_json_path,
            output_xlsx_path=self._output_xlsx_path,
            output_xlsx_melt=self._output_xlsx_melt,
            results_formatted=self._all_results,
        )
        if self._export_json:
            export.write_json_output()
        if self._export_xlsx:
            export.write_xlsx_output()

# In [76]:
# run the script

# script parameters
# OpenAI API key, read from the environment (None when the variable is unset)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Running on Jupyter notebook? (True|False)
JUPYTER: bool = True
# URI of model (local path or name on remote API) (string)
MODEL_URI: str = "davinci:ft-uplandsdynamic-2023-07-05-14-10-39"
# Model type, from ["GPT", "SPACY"] (string)
MODEL_TYPE: str = "GPT"
# Directory containing formatted txt files (`docs`) for NER (string)
DOC_DIR: str = "../../data/sample/test/txt/tiny_test"
# GPT prompt separator token (if any) (string)
GPT_PROMPT_SEP: str = "\n\n###\n\n"
# GPT completion separator token (if any) (string)
GPT_COMPLETION_SEP: str = "\n\nEND\n\n"
# name of col to sort output XLSX by (string)
OUTPUT_XLSX_SORT_COL: str = "RECORD_ID"
# name to assign to record ID key/column in output XLSX / JSON (if any) (string)
OUTPUT_RECORD_ID_NAME: str = "RECORD_ID"
# write results output to JSON file? (True|False)
EXPORT_JSON: bool = True
# write results output to XLSX file? (True|False)
EXPORT_XLSX: bool = True
# Path > output JSON
OUTPUT_JSON_PATH: str = "../../data/output/ner/test_result_gpt.json"
# Path > output XLSX
OUTPUT_XLSX_PATH: str = "../../data/output/ner/test_result_gpt.xlsx"
# reshape output XLSX using Pandas to yield RECORD_ID, NAME, VALUE (True|False)
OUTPUT_XLSX_MELT: bool = True
# size of batches to send for NER
BATCH_SIZE: int = 5

if __name__ == "__main__":
    # fail fast: without a key every GPT request would fail, batch by batch
    if MODEL_TYPE.upper() == "GPT" and not OPENAI_API_KEY:
        raise RuntimeError(
            "OPENAI_API_KEY is not set: export it or add it to the .env file"
        )

    results = ExecuteNER(
        doc_dir=DOC_DIR,
        output_record_id_name=OUTPUT_RECORD_ID_NAME,
        output_xlsx_sort_col=OUTPUT_XLSX_SORT_COL,
        output_json_path=OUTPUT_JSON_PATH,
        output_xlsx_path=OUTPUT_XLSX_PATH,
        output_xlsx_melt=OUTPUT_XLSX_MELT,
        export_json=EXPORT_JSON,
        export_xlsx=EXPORT_XLSX,
        batch_size=BATCH_SIZE,
        jupyter=JUPYTER,
        model_uri=MODEL_URI,
        model_type=MODEL_TYPE,
        openai_key=OPENAI_API_KEY,
        gpt_prompt_sep=GPT_PROMPT_SEP,
        gpt_comp_sep=GPT_COMPLETION_SEP,
    ).get_results()

    print(results)