# DataLoader

In [1]:
import os
import subprocess
import sys
from pathlib import Path
from typing import Literal

import pandas as pd

## Load data and labels

In [2]:
class DataLoader:
    """Read dataset partitions stored as TSV files under a common directory.

    Files read, relative to ``data_dir``:

    - ``in-header.tsv``: its header row supplies the column names of the data files.
    - ``<partition>/in.tsv.xz``: xz-compressed, header-less data rows.
    - ``<partition>/expected.tsv``: header-less labels, expected to hold exactly
      one line per data row (not read for the ``"test-A"`` partition).
    """

    def __init__(self, data_dir: Path):
        """Store the dataset root and read the shared column names.

        Args:
            data_dir: Directory holding ``in-header.tsv`` and the partition folders.
        """
        self.data_dir = data_dir
        # nrows=0 parses only the header line, so no data rows are read here.
        self._column_names = pd.read_csv(
            data_dir / "in-header.tsv", sep="\t", encoding="utf-8", nrows=0
        ).columns.tolist()

    def load(self, partition: Literal["train", "dev-0", "test-A"] = "train") -> pd.DataFrame:
        """Load one partition as a DataFrame.

        Args:
            partition: Name of the partition folder under ``data_dir``.

        Returns:
            The partition's data columns. For every partition except ``"test-A"``
            a ``labels`` column is appended, matched to the data by row position.

        Raises:
            ValueError: If the data and label files have different row counts,
                which would otherwise be silently padded with NaN by the concat.
        """
        data = self._read_data(partition)
        if partition == "test-A":
            return data

        labels = self._read_labels(partition)
        if len(data) != len(labels):
            raise ValueError(
                f"Partition {partition!r}: {len(data)} data rows but "
                f"{len(labels)} label rows; cannot align them by position."
            )
        return pd.concat([data, labels], axis=1)

    def _read_data(self, partition: str) -> pd.DataFrame:
        """Read ``<partition>/in.tsv.xz`` using the column names from the header file."""
        # NOTE(review): pandas' default quote handling is in effect here; if text
        # fields can start with a double quote, confirm rows are not merged.
        return pd.read_csv(
            self.data_dir / partition / "in.tsv.xz",
            sep="\t",
            encoding="utf-8",
            compression="xz",
            header=None,
            names=self._column_names,
        )

    def _read_labels(self, partition: str) -> pd.DataFrame:
        """Read ``<partition>/expected.tsv`` into a single ``labels`` column."""
        return pd.read_csv(
            self.data_dir / partition / "expected.tsv",
            sep="\t",
            encoding="utf-8",
            header=None,
            names=["labels"],
            # Keep empty lines as NaN rows. The default (True) drops them, which
            # would shift every following label onto the wrong data row.
            skip_blank_lines=False,
        )

In [3]:
# Dataset root: <parent of the current working directory>/static/data.
data_root = Path.cwd().parent / "static" / "data"
loader = DataLoader(data_root)

# One frame per partition; load() returns "test-A" without a labels column.
train = loader.load("train")
val = loader.load("dev-0")
test = loader.load("test-A")

### Explore

In [4]:
def open_document(filename: str) -> None:
    """Open a file from the dataset's ``documents`` folder in the OS default viewer.

    Uses the notebook-level ``loader`` to locate the dataset root directory.

    Args:
        filename: Name of the file inside ``<data_dir>/documents``.
    """
    pdf_path = loader.data_dir / "documents" / filename
    if sys.platform == "darwin":
        subprocess.run(["open", pdf_path])
    elif sys.platform == "win32":
        # os.startfile is only available on Windows; it opens the file with
        # its associated application.
        os.startfile(pdf_path)
    else:
        # Assumes a desktop environment that ships xdg-open (most Linux desktops).
        subprocess.run(["xdg-open", pdf_path])
    
filename = "1897d60f2aaa58656bc4825339a59266.pdf"

# Bring the PDF up in an external viewer so it can be compared with its labels.
open_document(filename)

# Label strings stored in the training frame for this document.
matching_rows = train.query(f"filename == '{filename}'")
matching_rows["labels"].to_list()

['effective_date=2011-05-16 jurisdiction=Illinois party=Heidrick_and_Struggles_Inc. party=Richard_W._Pehlke term=5_years']

In [5]:
# Preview the first ten rows of the training frame.
train.head(n=10)

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dlex1023.htm COVENANT NOT TO COMPET...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,effective_date=2001-04-18 jurisdiction=Oregon ...
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2) »L...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,effective_date=2017-02-10 jurisdiction=Califor...
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,effective_date=2012-01-06 jurisdiction=Florida...
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 x10-26.txt NON-CIRCUMVENTION AND N...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,effective_date=1999-02-08 jurisdiction=Pennsyl...
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,effective_date=2011-07-13 jurisdiction=Califor...
5,04139986fd9aaf6cb0c374a67d045478.pdf,effective_date jurisdiction party term,"EX-10.17 9 dex1017.htm AT-WILL EMPLOYMENT, PRO...","EX-10.17 9 dex1017.htm AT-WILL EMPLOYMENT, PRO...",EX-10.17 9 lex1017.htm 1017. htm AT-WILL EMPLO...,"EX-10.17 9 dex1017.htm AT-WILL EMPLOYMENT, PRO...",jurisdiction=California party=Dolby_Laboratori...
6,04bf0791804e8487c91ab84eaa47a335.pdf,effective_date jurisdiction party term,EX-99. (E) (1) 4 dex99e1.htm MUTUAL NON-DISCLO...,EX-99. (E) (1) 4 dex99e1.htm MUTUAL NON-DISCLO...,EX-99. (E) (1) 4 dex99el.htm MUTUAL NON-DISCLO...,EX-99. (E) (1) 4 dex99e1.htm MUTUAL NON-DISCLO...,effective_date=2009-09-23 jurisdiction=New_Yor...
7,0564e5bce70dd2df5473d64da16ddbe3.pdf,effective_date jurisdiction party term,EX-99.(D)(2) 9 d380892dex99d2.htm CONFIDENTIAL...,EX-99.(D)(2) 9 d380892dex99d2.htm CONFIDENTIAL...,EX-99.(D)(2) 9 d380892dex99d2.htm CONFIDENTIAL...,EX-99.(D)(2) 9 d380892dex99d2.htm CONFIDENTIAL...,jurisdiction=Delaware party=Wells_Fargo_Securi...
8,0587275477c6ad6d0d72419383e04b88.pdf,effective_date jurisdiction party term,EX-10 6 ex10ii26.htm EXHIBIT 10.(II)(26)\nExhi...,EX-10 6 ex10ii26.htm EXHIBIT 10.(IT)(26)\nExhi...,EX-10 6 ex10ii26.htm EXHIBIT 10.(II)(26)\nExhi...,EX-10 6 ex10ii26.htm EXHIBIT 10.(II)(26)\nExhi...,jurisdiction=Illinois party=Sears_Roebuck_and_Co.
9,05947711a24a5b7ce401911d31e19c91.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 dex99e2.htm CONFIDENTIALITY AGR...,EX-99.(E)(2) 3 dex99e2.htm CONFIDENTIALITY AGR...,EX-99.(E)(2) 3 dex99e2.htm CONFIDENTIALITY AGR...,EX-99.(E)(2) 3 dex99e2.htm CONFIDENTIALITY AGR...,effective_date=2007-01-03 jurisdiction=New_Yor...


## Backlog

- Manually check that the data loader returns the expected output
- Define Pydantic schema
- We'll need a way to format labels into JSON and back
- Add git hooks and CI