In [None]:
#!/usr/bin/env python3

"""
- Script name: prepare_annotation_data
- Author: Dan Bright, cosmoid@tuta.io
- Description: A script to prepare a random sample of a spreadsheet column for annotation.
"""

import pandas as pd
import re
from pathlib import Path

In [None]:
class PrepareReportData:
    """
    Class to prepare a random sample of a given column of spreadsheet data for annotation.

    All work is done on construction: the spreadsheet is read, a random sample of
    rows is drawn, the text processing steps are applied and the requested outputs
    are written.

    Note: To perform extra processing, simply add new methods following the pattern of
    _separate_slashes(), and add the method to __init__().

    Takes:
        - data_file: str = Path of the input spreadsheet (must exist)
        - used_data_file: str = Path of the spreadsheet to which the sampled rows are written
        - output_data_dir: str = Path of the output directory for the text files
        - sample_size: int = Size of the random sample (number of spreadsheet rows)
        - target_header: str = Spreadsheet header of the column from which to extract text
        - index_column_label: str = Spreadsheet header of the column used to name each record
        - required_columns: dict[str, str] = Columns to keep, as {original_name: output_name}
        - write_xlsx: bool = Whether to write the sampled rows to used_data_file
        - write_txt_files: bool = Whether to write one text file per sampled row
    Provides:
        - Method to ensure all slashes in a text string are surrounded by whitespace
        - One text file (.txt) per randomly selected spreadsheet row, containing the
          appropriately formatted text extracted from the specified spreadsheet column,
          saved in the output directory.
        - df_sample: the sampled rows as a DataFrame.
    Returns:
        - None.
    """

    def __init__(
        self,
        data_file: str,
        used_data_file: str,
        output_data_dir: str,
        sample_size: int,
        target_header: str,
        index_column_label: str,
        required_columns: dict[str, str],
        write_xlsx: bool,
        write_txt_files: bool,
    ) -> None:
        # strict=True: fail immediately if the input spreadsheet is missing.
        self._data_file: Path = Path(data_file).resolve(strict=True)
        self._used_data_file: str = used_data_file
        # strict=False: the output directory may not exist yet; it is created on write.
        self._output_data_dir: Path = Path(output_data_dir).resolve(strict=False)
        self._sample_size: int = sample_size
        self._target_header: str = target_header
        self._required_columns: dict[str, str] = required_columns
        self._index_column_label: str = index_column_label
        self._random_sample: list[tuple] = []
        self._df_selected: pd.DataFrame = pd.DataFrame()
        self._df_sample: pd.DataFrame = pd.DataFrame()
        # Public handle on the sample; re-pointed once the sample has been drawn.
        self.df_sample: pd.DataFrame = self._df_sample
        self._get_data()
        self._get_random_sample()
        # ADD PROCESSING METHODS _BELOW_ THIS LINE
        self._separate_slashes()
        # _DO_NOT_ ADD ANYTHING _BELOW_ THIS LINE
        if write_xlsx:
            self._write_to_xlsx()
        if write_txt_files:
            self._to_txt_files()

    def _get_data(self) -> None:
        """
        Method to read the required columns of the spreadsheet, keeping complete rows only.

        Rows holding a missing or infinite value in any required column are discarded.
        """
        df: pd.DataFrame = pd.read_excel(self._data_file)
        selected: pd.DataFrame = df[list(self._required_columns)]
        # Flag infinities explicitly rather than through the global (and deprecated)
        # `use_inf_as_na` option, so no process-wide pandas state is modified.
        invalid = selected.isna() | selected.isin([float("inf"), float("-inf")])
        self._df_selected = selected.loc[~invalid.any(axis=1)].copy()
        print(
            f"{len(self._df_selected.index)} valid records have been initially extracted"
        )

    def _get_random_sample(self) -> None:
        """
        Method to draw the random sample of rows and record it as [(index value, text)].

        Raises ValueError (from pandas) if sample_size exceeds the number of valid rows.
        """
        self._df_sample = self._df_selected.sample(n=self._sample_size)
        self.df_sample = self._df_sample
        # Read column-wise: avoids building one row Series per record.
        self._random_sample.extend(
            zip(
                self._df_sample[self._index_column_label],
                self._df_sample[self._target_header],
            )
        )

    def _separate_slashes(self) -> None:
        """
        Method to ensure all slashes within strings are surrounded by whitespace.

        Whitespace is only inserted on the side(s) of a slash where it is missing, so
        "a/b", "a/ b" and "a /b" all become "a / b".
        """
        for idx, (record_id, text) in enumerate(self._random_sample):
            # Cells may hold non-string values (e.g. numbers); regex needs str.
            spaced = re.sub(r"(?<!\s)/", " /", str(text))
            spaced = re.sub(r"/(?!\s)", "/ ", spaced)
            self._random_sample[idx] = (record_id, spaced)

    def _write_to_xlsx(self) -> None:
        """
        Method to write the sampled rows to a spreadsheet, using the output column names.

        Note: the sample's columns are renamed in place.
        """
        self._df_sample.rename(columns=self._required_columns, inplace=True)
        self._df_sample.to_excel(self._used_data_file, index=False)

    def _to_txt_files(self) -> None:
        """
        Method to write each sampled record to its own .txt file in the output directory.

        The output directory is created if it does not exist.
        """
        self._output_data_dir.mkdir(parents=True, exist_ok=True)
        for record_id, text in self._random_sample:
            target = self._output_data_dir / f"record_{record_id}.txt"
            # Explicit encoding so output does not depend on the platform locale.
            with open(target, "w", encoding="utf-8") as file:
                file.write(text)
        print(f"Text files written in {self._output_data_dir}. Job done.")

In [None]:
# define paths & parameters
DATA_FILE: str = "../../data/WIP-Sampling-V10.xlsx"  # input spreadsheet
USED_DATA_FILE: str = "../../data/training_sample.xlsx"  # sampled rows are written here
TARGET_HEADER: str = "Cleansed Summary"  # column holding the text to annotate
INDEX_COLUMN_LABEL: str = "Record No."  # column used to name each record
WRITE_XLSX: bool = True
WRITE_TXT_FILES: bool = False
# Columns to keep; a mapping, not a list, so it is annotated as a dict.
REQUIRED_COLUMNS: dict[str, str] = {
    INDEX_COLUMN_LABEL: "RECORD NUM",
    TARGET_HEADER: "SUMMARY",
    "Numerical altitude of observer": "OBSERVER ALT",
    "Numerical relative altitude of UAS": "UAS RELATIVE ALT",
    "Inferred or abs.altitude of UAS": "UAS ABSOLUTE ALT",
}  # in form {original_name: output_name}
OUTPUT_DATA_DIR: str = "../../data/sample/train/txt"
SAMPLE_SIZE: int = 55

# extract & prepare sample of report data for annotation
# (all work happens in the constructor; the instance itself is not kept)
PrepareReportData(
    data_file=DATA_FILE,
    used_data_file=USED_DATA_FILE,
    output_data_dir=OUTPUT_DATA_DIR,
    sample_size=SAMPLE_SIZE,
    target_header=TARGET_HEADER,
    index_column_label=INDEX_COLUMN_LABEL,
    required_columns=REQUIRED_COLUMNS,
    write_xlsx=WRITE_XLSX,
    write_txt_files=WRITE_TXT_FILES,
)