# DataLoader

In [1]:
import os
import subprocess
import sys
from pathlib import Path
from typing import Literal, Optional

import pandas as pd

## Load data and labels

In [2]:
Partition = Literal["train", "dev-0", "test-A"]

class DataLoader:
    """Load dataset partitions from ``data_dir`` and optionally export frames.

    Files read by this class::

        data_dir/in-header.tsv              header row with the feature column names
        data_dir/<partition>/in.tsv.xz      xz-compressed, header-less feature rows
        data_dir/<partition>/expected.tsv   header-less labels, one line per row

    Files written by :meth:`export`::

        output_dir/<partition>/dataset.tsv.xz
    """

    def __init__(self, data_dir: Path, output_dir: Optional[Path] = None):
        """Remember the directories and read the feature column names.

        Args:
            data_dir: Directory containing ``in-header.tsv`` and one
                sub-directory per partition. A plain string is accepted too.
            output_dir: Directory used by :meth:`export`; may be left unset
                when nothing is exported.
        """
        # Coerce so that plain strings work with the `/` path operator below.
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir) if output_dir is not None else None
        # nrows=0 parses only the header line, which is all that is needed here.
        self._column_names = pd.read_csv(
            self.data_dir / "in-header.tsv", sep="\t", encoding="utf-8", nrows=0
        ).columns.tolist()

    def load(self, partition: Partition = "train") -> pd.DataFrame:
        """Return the features of ``partition``, plus a ``labels`` column.

        The ``"test-A"`` partition is returned without labels; for every other
        partition the labels are appended as the last column.

        Raises:
            ValueError: If the label file and the feature file do not describe
                the same number of rows.
        """
        data = self._read_data(partition)
        if partition == "test-A":
            return data

        labels = self._read_labels(partition)
        n_rows = len(data)
        # Labels are read with blank lines preserved, so empty lines at the very
        # end of the file show up as surplus all-NaN rows. Those carry no
        # information and are dropped; any other surplus is a real mismatch.
        surplus = labels.iloc[n_rows:]
        if len(surplus) and surplus.isna().to_numpy().all():
            labels = labels.iloc[:n_rows]
        if len(labels) != n_rows:
            # pd.concat(axis=1) would silently pad the shorter frame with NaN.
            raise ValueError(
                f"Partition {partition!r} has {n_rows} data rows "
                f"but {len(labels)} label rows."
            )
        return pd.concat([data, labels], axis=1)

    def export(self, df: pd.DataFrame, partition: Partition = "train") -> None:
        """Write ``df`` to ``output_dir/<partition>/dataset.tsv.xz``.

        The file is tab-separated, xz-compressed, UTF-8 encoded, includes a
        header row and omits the index. Missing directories are created.

        Raises:
            ValueError: If no output directory was configured.
        """
        if self.output_dir is None:
            raise ValueError("Output directory has not been set yet.")
        output_path = self.output_dir / partition
        output_path.mkdir(parents=True, exist_ok=True)
        df.to_csv(
            output_path / "dataset.tsv.xz",
            sep="\t",
            encoding="utf-8",
            compression="xz",
            index=False,
            header=True,
        )

    def _read_data(self, partition: Partition) -> pd.DataFrame:
        """Read the header-less feature file of ``partition``.

        Column names come from ``in-header.tsv`` (read once in ``__init__``).
        """
        return pd.read_csv(
            self.data_dir / partition / "in.tsv.xz",
            sep="\t",
            encoding="utf-8",
            compression="xz",
            header=None,
            names=self._column_names,
        )

    def _read_labels(self, partition: Partition) -> pd.DataFrame:
        """Read the label file of ``partition`` into a single ``labels`` column.

        Blank lines are kept as NaN rows: skipping them (the pandas default)
        would shift every following label onto the wrong data row.
        """
        return pd.read_csv(
            self.data_dir / partition / "expected.tsv",
            sep="\t",
            encoding="utf-8",
            header=None,
            names=["labels"],
            skip_blank_lines=False,
        )

In [3]:
# Point the loader at `<cwd>/../static/data` and read the three partitions in
# order: train, dev-0 (validation) and test-A (returned without labels).
loader = DataLoader(Path.cwd().parent.joinpath("static", "data"))

train, val, test = map(loader.load, ("train", "dev-0", "test-A"))

### Explore

In [4]:
def open_document(filename: str) -> None:
    """Open a file from ``loader.data_dir / "documents"`` in the default viewer.

    The launcher is chosen per platform: ``open`` on macOS, ``os.startfile``
    on Windows and ``xdg-open`` elsewhere. The subprocess launchers are
    best-effort: their exit status is not checked.

    Args:
        filename: Name of the file inside the ``documents`` directory.
    """
    pdf_path = loader.data_dir / "documents" / filename
    if sys.platform == "darwin":
        subprocess.run(["open", pdf_path])
    elif sys.platform.startswith("win"):
        # os.startfile exists only on Windows and raises OSError on failure.
        os.startfile(pdf_path)
    else:
        subprocess.run(["xdg-open", pdf_path])

In [5]:
# Preview the first rows of the "dev-0" split loaded above (features + labels).
val.head()

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels
0,073f3b9eb0c7088be4ef688f4edfdb6d.pdf,effective_date jurisdiction party term,EX-10 5 ex10-4 .htm EXHIBIT 10.4\nExhibit 10.4...,EX-10 5 ex10-4.htm EXHIBIT 10.4\nExhibit 10.4\...,EX-10 ex10-4.htm EXHIBIT 10.4\nExhibit 10.4\nA...,EX-10 5 ex10-4 .htm EXHIBIT 10.4\nExhibit 10.4...,effective_date=2014-05-20 jurisdiction=New_Yor...
1,0d3f3a02773949e285cfc3ad2fe4dbf5.pdf,effective_date jurisdiction party term,Exhibit L-2\nto Amended and Restated Loan Guar...,Exhibit I.-2\nto Amended and Restated Loan Gua...,Exhibit L-2\nto Amended and Restated Loan Guar...,Exhibit L-2\nto Amended and Restated Loan Guar...,jurisdiction=New_York party=Oglethorpe_Power_C...
2,0f32a3a54d9c1e42d26f66746821c3bf.pdf,effective_date jurisdiction party term,EX-99.D.3 12 d438799dex99d3.htm AMENDED AND RE...,EX-99.D.3 12 d438799dex99d3.htm AMENDED AND RE...,EX-99.D.3 12 d438799dex99d3.htm AMENDED AND RE...,EX-99.D.3 12 d438799dex99d3.htm AMENDED AND RE...,effective_date=2012-09-04 jurisdiction=Delawar...
3,0fe8eaee697774ac95f9186dd2fc3364.pdf,effective_date jurisdiction party term,EX-10.1 2 a09-6413_1ex10d1.htm EX-10.1\nExhibi...,EX-10.1 2 a09-6413_1ex10d1.htm EX-10.1\nExhibi...,EX-10.1 2 a09-6413_1ex10d1.htm EX-10.1\nExhibi...,EX-10.1 2 a09-6413_1ex10d1.htm EX-10.1\nExhibi...,effective_date=2009-02-23 jurisdiction=Massach...
4,11d0a5b1f6e460c7033d57661026d00c.pdf,effective_date jurisdiction party term,EX-99.(D)(3) 8 d901048dex99d3.htm CONFIDENTIAL...,EX-99.(D)(3) 8 d901048dex99d3.htm CONFIDENTIAL...,EX-99.(D)(3) 8 d901048dex99d3.htm CONFIDENTIAL...,EX-99.(D)(3) 8 d901048dex99d3.htm CONFIDENTIAL...,effective_date=2015-03-01 jurisdiction=Delawar...


In [6]:
# Pick one validation document, open it in the viewer, then list its labels.
filename = "073f3b9eb0c7088be4ef688f4edfdb6d.pdf"

open_document(filename)

val.loc[val["filename"] == filename, "labels"].to_list()

['effective_date=2014-05-20 jurisdiction=New_York party=Liquidmetal_Technology_Inc. party=Visser_Precision_Cast_LLC term=3_years']