# Exploratory data analysis (EDA)

In [None]:
import subprocess
import pandas as pd
from pathlib import Path
from typing import Literal

## Load data and labels

In [76]:
class DataLoader:
    """Load tab-separated dataset partitions stored under one data directory.

    The directory must contain ``in-header.tsv`` (its header row supplies the
    feature column names) and one sub-directory per partition holding
    ``in.tsv.xz`` and, for labelled partitions, ``expected.tsv``.
    """

    def __init__(self, data_dir: "Path | str"):
        """Store the data directory and read the feature column names.

        Args:
            data_dir: Directory containing ``in-header.tsv`` and the partition
                sub-directories. A plain string is accepted and converted to
                ``Path``.
        """
        # Coerce so that a ``str`` argument does not fail on the ``/`` operator.
        self.data_dir = Path(data_dir)
        # nrows=0 parses only the header row; the column labels are all we need.
        self._column_names = pd.read_csv(
            self.data_dir / "in-header.tsv", sep="\t", encoding="utf-8", nrows=0
        ).columns.tolist()

    def load(self, partition: Literal["train", "dev-0", "test-A"] = "train") -> pd.DataFrame:
        """Return one partition as a DataFrame.

        Args:
            partition: Name of the partition sub-directory to read.

        Returns:
            The feature columns of the partition. For every partition other
            than ``"test-A"`` a ``labels`` column read from ``expected.tsv``
            is appended column-wise.
        """
        if partition == "test-A":
            # No labels are read for this partition; return the features only.
            return self._read_data(partition)
        # Both frames carry the default positional index, so axis=1 pairs row i
        # of the features with row i of the labels.
        return pd.concat([self._read_data(partition), self._read_labels(partition)], axis=1)

    def _read_data(self, partition: str) -> pd.DataFrame:
        """Read ``<partition>/in.tsv.xz`` using the column names from the header file."""
        # NOTE(review): pandas' default quoting is in effect here; if the text
        # columns can contain raw double quotes, quoting=csv.QUOTE_NONE may be
        # needed -- confirm against the data format.
        return (
            pd.read_csv(
                self.data_dir / partition / "in.tsv.xz",
                sep="\t", encoding="utf-8", compression="xz", header=None, names=self._column_names,
            )
        )

    def _read_labels(self, partition: str) -> pd.DataFrame:
        """Read ``<partition>/expected.tsv`` into a single ``labels`` column.

        Blank lines are kept as missing values rather than skipped: the labels
        are paired with the features purely by row position, so dropping an
        empty line would shift every following label onto the wrong row.
        """
        return pd.read_csv(
            self.data_dir / partition / "expected.tsv",
            sep="\t", encoding="utf-8", header=None, names=["labels"],
            skip_blank_lines=False,
        )

In [77]:
# The data directory sits in "static/data" next to the notebook's parent folder.
data_path = Path.cwd().parent.joinpath("static", "data")
loader = DataLoader(data_path)

# Load the three partitions one by one.
train = loader.load("train")
val = loader.load("dev-0")
test = loader.load("test-A")

train.head()

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dlex1023.htm COVENANT NOT TO COMPET...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,effective_date=2001-04-18 jurisdiction=Oregon ...
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2) »L...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,effective_date=2017-02-10 jurisdiction=Califor...
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,effective_date=2012-01-06 jurisdiction=Florida...
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 x10-26.txt NON-CIRCUMVENTION AND N...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,effective_date=1999-02-08 jurisdiction=Pennsyl...
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,effective_date=2011-07-13 jurisdiction=Califor...


In [None]:
def open_document(filename: str) -> None:
    """Launch the ``open`` command on a file from the documents directory.

    Args:
        filename: Name of the file inside ``static/data/documents`` (resolved
            relative to the parent of the current working directory).
    """
    documents_dir = Path.cwd().parent.joinpath("static", "data", "documents")
    target = documents_dir / f"{filename}"
    command = ["open", target]
    # The completed-process result is discarded; the exit status is not checked.
    subprocess.run(command)

filename = "00a1d238e37ac225b8045a97953e845d.pdf"

# Open the document, then print the labels of the matching training rows.
open_document(filename)
matching_rows = train.query(f"filename == '{filename}'")
print(matching_rows["labels"].tolist())

['effective_date=2001-04-18 jurisdiction=Oregon party=Eric_Dean_Sprunk party=Nike_Inc.']


In [86]:
# Display the first 50 rows of the training frame.
train.head(50)

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dlex1023.htm COVENANT NOT TO COMPET...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,effective_date=2001-04-18 jurisdiction=Oregon ...
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2) »L...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,effective_date=2017-02-10 jurisdiction=Califor...
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,effective_date=2012-01-06 jurisdiction=Florida...
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 x10-26.txt NON-CIRCUMVENTION AND N...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,effective_date=1999-02-08 jurisdiction=Pennsyl...
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,effective_date=2011-07-13 jurisdiction=Califor...
5,04139986fd9aaf6cb0c374a67d045478.pdf,effective_date jurisdiction party term,"EX-10.17 9 dex1017.htm AT-WILL EMPLOYMENT, PRO...","EX-10.17 9 dex1017.htm AT-WILL EMPLOYMENT, PRO...",EX-10.17 9 lex1017.htm 1017. htm AT-WILL EMPLO...,"EX-10.17 9 dex1017.htm AT-WILL EMPLOYMENT, PRO...",jurisdiction=California party=Dolby_Laboratori...
6,04bf0791804e8487c91ab84eaa47a335.pdf,effective_date jurisdiction party term,EX-99. (E) (1) 4 dex99e1.htm MUTUAL NON-DISCLO...,EX-99. (E) (1) 4 dex99e1.htm MUTUAL NON-DISCLO...,EX-99. (E) (1) 4 dex99el.htm MUTUAL NON-DISCLO...,EX-99. (E) (1) 4 dex99e1.htm MUTUAL NON-DISCLO...,effective_date=2009-09-23 jurisdiction=New_Yor...
7,0564e5bce70dd2df5473d64da16ddbe3.pdf,effective_date jurisdiction party term,EX-99.(D)(2) 9 d380892dex99d2.htm CONFIDENTIAL...,EX-99.(D)(2) 9 d380892dex99d2.htm CONFIDENTIAL...,EX-99.(D)(2) 9 d380892dex99d2.htm CONFIDENTIAL...,EX-99.(D)(2) 9 d380892dex99d2.htm CONFIDENTIAL...,jurisdiction=Delaware party=Wells_Fargo_Securi...
8,0587275477c6ad6d0d72419383e04b88.pdf,effective_date jurisdiction party term,EX-10 6 ex10ii26.htm EXHIBIT 10.(II)(26)\nExhi...,EX-10 6 ex10ii26.htm EXHIBIT 10.(IT)(26)\nExhi...,EX-10 6 ex10ii26.htm EXHIBIT 10.(II)(26)\nExhi...,EX-10 6 ex10ii26.htm EXHIBIT 10.(II)(26)\nExhi...,jurisdiction=Illinois party=Sears_Roebuck_and_Co.
9,05947711a24a5b7ce401911d31e19c91.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 dex99e2.htm CONFIDENTIALITY AGR...,EX-99.(E)(2) 3 dex99e2.htm CONFIDENTIALITY AGR...,EX-99.(E)(2) 3 dex99e2.htm CONFIDENTIALITY AGR...,EX-99.(E)(2) 3 dex99e2.htm CONFIDENTIALITY AGR...,effective_date=2007-01-03 jurisdiction=New_Yor...


## Old: earlier function-based loading code

### Read data

In [11]:
def read_tsv(partition: str) -> pd.DataFrame:
    """Read a partition's ``in.tsv.xz`` and keep the ``filename`` and ``keys`` columns.

    Column names come from the header row of ``static/data/in-header.tsv``;
    all paths are resolved relative to the parent of the current working
    directory.

    Args:
        partition: Name of the partition sub-directory under ``static/data``.

    Returns:
        A DataFrame holding only the ``filename`` and ``keys`` columns.
    """
    data_root = Path.cwd().parent / "static" / "data"

    # Only the header row is parsed (nrows=0); its labels name the data columns.
    header_frame = pd.read_csv(
        data_root / "in-header.tsv", sep="\t", encoding="utf-8", nrows=0
    )
    column_names = list(header_frame.columns)

    features = pd.read_csv(
        data_root / f"{partition}" / "in.tsv.xz",
        sep="\t",
        encoding="utf-8",
        compression='xz',
        header=None,
        names=column_names,
    )
    return features[["filename", "keys"]]

In [12]:
# Read the "train" partition with read_tsv and display the resulting frame.
df_raw = read_tsv("train")
df_raw

Unnamed: 0,filename,keys
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term
...,...,...
249,fbf608b62ef498171b70fb7b36be61a0.pdf,effective_date jurisdiction party term
250,fc2ce0e2abdcf676a1d4ab95191a9d17.pdf,effective_date jurisdiction party term
251,fc34f2d7a61e531870d05910c5c3599b.pdf,effective_date jurisdiction party term
252,fdf657ad612664d6f363040992f9a93c.pdf,effective_date jurisdiction party term


### Read labels

In [14]:
def read_labels(partition: str) -> pd.DataFrame:
    """Read a partition's ``expected.tsv`` into a single ``labels`` column.

    The path is resolved relative to the parent of the current working
    directory (``static/data/<partition>/expected.tsv``).

    Blank lines are kept as missing values rather than skipped, so the row
    count and row order always match the label file line for line. Skipping
    them would shift every later label up by one row, which breaks any
    position-based pairing with the feature rows.

    Args:
        partition: Name of the partition sub-directory under ``static/data``.

    Returns:
        A one-column DataFrame named ``labels`` with one row per line.
    """
    path = Path.cwd().parent / "static" / "data" / f"{partition}" / "expected.tsv"
    df = pd.read_csv(
        path,
        sep="\t",
        encoding="utf-8",
        header=None,
        names=["labels"],
        skip_blank_lines=False,
    )
    return df

In [15]:
# Read the labels of the "train" partition and display the resulting frame.
df_labels = read_labels("train")
df_labels

Unnamed: 0,labels
0,effective_date=2001-04-18 jurisdiction=Oregon ...
1,effective_date=2017-02-10 jurisdiction=Califor...
2,effective_date=2012-01-06 jurisdiction=Florida...
3,effective_date=1999-02-08 jurisdiction=Pennsyl...
4,effective_date=2011-07-13 jurisdiction=Califor...
...,...
249,effective_date=2005-03-28 jurisdiction=Florida...
250,effective_date=2006-04-13 jurisdiction=New_Yor...
251,effective_date=2010-03-19 jurisdiction=Delawar...
252,effective_date=2011-01-27 jurisdiction=Califor...


### Compose dataframe

In [16]:
# axis=1 places the label column next to the feature columns; rows are
# aligned on the index of the two frames.
df = pd.concat([df_raw, df_labels], axis=1)
df.head()

Unnamed: 0,filename,keys,labels
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term,effective_date=2001-04-18 jurisdiction=Oregon ...
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term,effective_date=2017-02-10 jurisdiction=Califor...
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term,effective_date=2012-01-06 jurisdiction=Florida...
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term,effective_date=1999-02-08 jurisdiction=Pennsyl...
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term,effective_date=2011-07-13 jurisdiction=Califor...


### Sort document names

In [17]:
documents_path = Path.cwd().parent / "static" / "data" / "documents"

# Collect the file names from every directory the walk visits and sort once.
# Building the list in a single expression means that:
#   * names from earlier directories are not discarded when the walk moves on,
#   * the list exists (empty) even when the walk yields nothing,
#   * the notebook-level `filename` variable is not overwritten by a loop.
filename_list = sorted(
    name
    for _, _, names in documents_path.walk()
    for name in names
)

filename_list[:5]

['00782839aac5f3edc5ddeaf9642d454b.pdf',
 '00a1d238e37ac225b8045a97953e845d.pdf',
 '00e2813f8f2f4b6d83ee26a38d4a53b3.pdf',
 '013eac67e0f835473e31b3c9a69c9f1c.pdf',
 '01e707f2d8b8d070d1d8ee90e8b2e7d6.pdf']