In [1]:
#!/usr/bin/env python3

"""
- Script name: scu.projectDrones.prepareAnnotationData
- Author: Dan Bright, cosmoid@tuta.io
- License: Free & Open Source, GPLv3.
- Description: A script to prepare a random sample of a spreadsheet column for annotation.
"""

import pandas as pd
from pathlib import Path
import random, re

In [9]:
class PrepareReportData:
    """
    Class to prepare a random sample of a given column of spreadsheet data for annotation.

    Instantiating the class runs the whole pipeline: load the column, draw the
    sample, apply the text-processing steps, and write one .txt file per sampled cell.

    Note: To perform extra processing, simply add new methods following the pattern of
    _separate_slashes(), and add the method to __init__().

    Takes:
        - data_file: str = Path of the Excel data file (must already exist)
        - output_data_dir: str = Path of the output directory (must already exist)
        - sample_size: int = Size of the random sample (number of spreadsheet cells)
        - target_header: str = Spreadsheet header of the column from which to extract data
    Provides:
        - Method to ensure all slashes in a text string are surrounded by whitespace
        - One text file (.txt) per randomly selected spreadsheet cell, containing the
          appropriately formatted text extracted from the specified spreadsheet column,
          saved in the output directory.
    Returns:
        - None.
    Raises:
        - FileNotFoundError if data_file or output_data_dir does not exist.
        - KeyError if target_header is not a column of the spreadsheet.
        - ValueError if a sample is requested from a column with no non-empty cells.

    Reproducibility: sampling uses the module-level ``random`` generator, so calling
    ``random.seed(<int>)`` before instantiation makes the selection repeatable.
    """

    # A slash that is not preceded / not followed by whitespace. The two sides are
    # handled separately so that a slash padded on only one side is also repaired.
    _SLASH_MISSING_LEADING_SPACE = re.compile(r"(?<!\s)/")
    _SLASH_MISSING_TRAILING_SPACE = re.compile(r"/(?!\s)")

    def __init__(
        self,
        data_file: str,
        output_data_dir: str,
        sample_size: int,
        target_header: str,
    ) -> None:
        # strict=True makes a missing file/directory fail here, before any work is done
        self._data_file: Path = Path(data_file).resolve(strict=True)
        self._output_data_dir: Path = Path(output_data_dir).resolve(strict=True)
        self._sample_size: int = sample_size
        self._target_header: str = target_header
        self._data_column: list[str] = []
        self._random_sample: list[tuple[int, str]] = []
        self._get_data()
        self._get_random_sample()
        # ADD PROCESSING METHODS _BELOW_ THIS LINE
        self._separate_slashes()
        # _DO_NOT_ ADD ANYTHING _BELOW_ THIS LINE
        self._to_txt_files()

    def _get_data(self) -> None:
        """
        Method to extract cells from excel spreadsheet column to list of strings.

        Empty cells are dropped; remaining values are converted to str so that the
        text-processing and file-writing steps never receive a non-string cell.
        """
        df: pd.DataFrame = pd.read_excel(self._data_file)
        self._data_column = df[self._target_header].dropna().astype(str).to_list()

    def _get_random_sample(self) -> None:
        """
        Method to draw a random sample from the list of strings, stored as
        [(index (starting @ 1), string)].

        The index is the 1-based position within the list of non-empty cells (not the
        spreadsheet row number). Cells are drawn WITHOUT replacement: the index is used
        as the output file name, so a repeated draw would overwrite an earlier file and
        silently shrink the sample. If more cells are requested than are available,
        every available cell is used.
        """
        population: int = len(self._data_column)
        if self._sample_size > 0 and population == 0:
            raise ValueError(
                f"Column '{self._target_header}' has no non-empty cells to sample from."
            )
        if self._sample_size > population:
            print(
                f"Requested sample of {self._sample_size} exceeds the {population} "
                f"available cells; using all {population}."
            )
        # clamp to [0, population]; a negative request yields an empty sample
        draw_count: int = max(0, min(self._sample_size, population))
        self._random_sample = [
            (position + 1, self._data_column[position])
            for position in random.sample(range(population), draw_count)
        ]

    @classmethod
    def _pad_slashes(cls, text: str) -> str:
        """
        Return text with a single space inserted on each side of a slash that lacks one.
        Existing whitespace around a slash is left untouched.
        """
        text = cls._SLASH_MISSING_LEADING_SPACE.sub(" /", text)
        return cls._SLASH_MISSING_TRAILING_SPACE.sub("/ ", text)

    def _separate_slashes(self) -> None:
        """
        Method to ensure all slashes within strings are surrounded by whitespace.
        Updates the sampled records in place; returns nothing.
        """
        self._random_sample = [
            (index, self._pad_slashes(text)) for index, text in self._random_sample
        ]

    def _to_txt_files(self) -> None:
        """
        Method to write each list entry to a .txt file named record_<index>.txt.
        """
        for index, text in self._random_sample:
            target: Path = self._output_data_dir / f"record_{index}.txt"
            # explicit UTF-8 so non-ASCII report text is written identically on every platform
            with open(target, "w", encoding="utf-8") as file:
                file.write(text)
        print(f"Text files written in {self._output_data_dir}. Job done.")

In [None]:
# define paths & parameters
DATA_FILE: str = "../../data/WIP_RP_VERSION_3a.xlsx"
TARGET_HEADER: str = "CLEANED Summary"
OUTPUT_DATA_DIR: str = "../../data/sample/train/txt"
SAMPLE_SIZE: int = 150
RANDOM_SEED: int = 42  # change to draw a different (but still repeatable) sample

# seed the module-level generator that the sampling step draws from, so that
# Restart Kernel -> Run All reproduces the same set of text files
random.seed(RANDOM_SEED)

# extract & prepare sample of report data for annotation
# (instantiation runs the whole pipeline and writes the .txt files to OUTPUT_DATA_DIR)
PrepareReportData(
    data_file=DATA_FILE,
    output_data_dir=OUTPUT_DATA_DIR,
    sample_size=SAMPLE_SIZE,
    target_header=TARGET_HEADER,
)