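# DataProcessor

Round-trip check for the NDA label strings: load the partitions, sort each label's `key=value` entities into schema order, parse them into the `NDA` model, serialise them back to the original format, and compare the result with the sorted labels.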

In [1]:
from pathlib import Path
from collections import defaultdict
from typing import List, Tuple, Any, Dict

from nda.schema import NDA, Party
from nda.data_loader import DataLoader, Partition

## Load partitions

In [2]:
# Input data and generated outputs live under ../static, resolved relative to
# the working directory the notebook is run from. `cwd` is a classmethod, so it
# is called on the class rather than on a throwaway `Path()` instance.
DATA_DIR: Path = Path.cwd().parent / "static" / "data"
OUTPUT_DIR: Path = Path.cwd().parent / "static" / "outputs"
PARTITIONS: tuple[Partition, Partition, Partition] = ("train", "dev-0", "test-A")

loader = DataLoader(
    data_dir=DATA_DIR,
    output_dir=OUTPUT_DIR,
)

# Load every partition in PARTITIONS order; the third one ("test-A") is loaded
# but its result is discarded.
train, dev, _ = [loader.load(partition) for partition in PARTITIONS]

In [3]:
# Preview the first rows of the training partition.
train.head()

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dlex1023.htm COVENANT NOT TO COMPET...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,effective_date=2001-04-18 jurisdiction=Oregon ...
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2) »L...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,effective_date=2017-02-10 jurisdiction=Califor...
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,effective_date=2012-01-06 jurisdiction=Florida...
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 x10-26.txt NON-CIRCUMVENTION AND N...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,effective_date=1999-02-08 jurisdiction=Pennsyl...
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,effective_date=2011-07-13 jurisdiction=Califor...


In [4]:
# Keep the label string of training row 0 as a running example.
string = train.labels[0]
string

'effective_date=2001-04-18 jurisdiction=Oregon party=Eric_Dean_Sprunk party=Nike_Inc.'

## Process data

### Sort ground truth entities

In [5]:
def sort_entities(string: str) -> str:
    """Reorder the ``key=value`` tokens of a label string into schema order.

    Tokens are whitespace-separated. Tokens whose key is ``effective_date``,
    ``jurisdiction``, ``party`` or ``term`` come first, grouped in that key
    order; tokens with any other key follow. Inside each group (and among the
    other keys) the original relative order is preserved. Tokens that contain
    no ``=`` are dropped.

    Args:
        string: Space-separated ``key=value`` tokens.

    Returns:
        The retained tokens joined by single spaces, in schema order.
    """
    schema_order = ["effective_date", "jurisdiction", "party", "term"]
    rank_of = {name: position for position, name in enumerate(schema_order)}
    # Unknown keys all share one rank placed after every schema key.
    fallback_rank = len(schema_order)

    def rank(token: str) -> int:
        # The key is everything before the first "=".
        return rank_of.get(token.split("=", 1)[0], fallback_rank)

    tokens = [token for token in string.strip().split() if "=" in token]
    # sorted() is stable, so tokens with equal rank keep their input order.
    return " ".join(sorted(tokens, key=rank))

In [6]:
# Apply the schema-order sort to the example label string.
sort_entities(string)

'effective_date=2001-04-18 jurisdiction=Oregon party=Eric_Dean_Sprunk party=Nike_Inc.'

### Parse and validate

In [7]:
def convert_to_dictionary(string: str) -> dict[str, Any]:
    """Parse a ``key=value`` label string into a dumped ``NDA`` dictionary.

    Each whitespace-separated token is split at its first ``=`` and its value
    is collected under its key in order of appearance. The first
    ``effective_date``, ``jurisdiction`` and ``term`` values are used, falling
    back to ``""``, ``""`` and ``None`` respectively when the key is absent.
    Every ``party`` value is wrapped in a ``Party``.

    Args:
        string: Space-separated ``key=value`` tokens.

    Returns:
        The result of calling ``model_dump()`` on the constructed ``NDA``.
    """
    collected: defaultdict[str, list[str]] = defaultdict(list)
    for token in string.strip().split():
        name, _, content = token.partition("=")
        collected[name].append(content)

    def first_or(name: str, fallback: Any) -> Any:
        # Membership test first, so the defaultdict never creates an empty
        # entry for a key that was not present in the input.
        return collected[name][0] if name in collected else fallback

    nda = NDA(
        effective_date=first_or("effective_date", ""),
        jurisdiction=first_or("jurisdiction", ""),
        term=first_or("term", None),
        party=[Party(name=party_name) for party_name in collected.get("party", [])],
    )

    return nda.model_dump()

In [8]:
# Parse the example label string into its dictionary form.
string_converted = convert_to_dictionary(string)
string_converted

{'effective_date': '2001-04-18',
 'jurisdiction': 'Oregon',
 'party': [{'name': 'Eric_Dean_Sprunk'}, {'name': 'Nike_Inc.'}],
 'term': None}

### Convert dictionary to original format

In [9]:
def convert_to_original(nda_dict: Dict[str, Any]) -> str:
    """Serialise an NDA dictionary back into a ``key=value`` label string.

    Tokens are emitted in the order ``effective_date``, ``jurisdiction``, one
    ``party`` token per entry of ``nda_dict["party"]`` (using its ``"name"``),
    then ``term``. Missing or falsy values are skipped.

    Args:
        nda_dict: Mapping with optional ``effective_date``, ``jurisdiction``,
            ``party`` (an iterable of mappings with a ``"name"`` key) and
            ``term`` entries.

    Returns:
        The emitted tokens joined by single spaces; ``""`` when nothing is set.
    """
    leading = (
        (field, nda_dict.get(field)) for field in ("effective_date", "jurisdiction")
    )
    tokens = [f"{field}={content}" for field, content in leading if content]

    party_names = (entry.get("name") for entry in nda_dict.get("party", []))
    tokens += [f"party={who}" for who in party_names if who]

    duration = nda_dict.get("term")
    if duration:
        tokens.append(f"term={duration}")

    return " ".join(tokens)

## Orchestrate

In [10]:
# Round-trip every training label: sort -> parse into a dict -> serialise back.
# One assign() call is enough because its keyword arguments are evaluated in
# order, so each lambda can read the column created by the previous one.
df_converted = train.assign(
    labels_sorted=lambda frame: frame.labels.apply(sort_entities),
    labels_converted=lambda frame: frame.labels_sorted.apply(convert_to_dictionary),
    labels_converted_back=lambda frame: frame.labels_converted.apply(
        convert_to_original
    ),
)

df_converted.head()

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels,labels_sorted,labels_converted,labels_converted_back
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dlex1023.htm COVENANT NOT TO COMPET...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,effective_date=2001-04-18 jurisdiction=Oregon ...,effective_date=2001-04-18 jurisdiction=Oregon ...,"{'effective_date': '2001-04-18', 'jurisdiction...",effective_date=2001-04-18 jurisdiction=Oregon ...
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2) »L...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,effective_date=2017-02-10 jurisdiction=Califor...,effective_date=2017-02-10 jurisdiction=Califor...,"{'effective_date': '2017-02-10', 'jurisdiction...",effective_date=2017-02-10 jurisdiction=Califor...
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,effective_date=2012-01-06 jurisdiction=Florida...,effective_date=2012-01-06 jurisdiction=Florida...,"{'effective_date': '2012-01-06', 'jurisdiction...",effective_date=2012-01-06 jurisdiction=Florida...
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 x10-26.txt NON-CIRCUMVENTION AND N...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,effective_date=1999-02-08 jurisdiction=Pennsyl...,effective_date=1999-02-08 jurisdiction=Pennsyl...,"{'effective_date': '1999-02-08', 'jurisdiction...",effective_date=1999-02-08 jurisdiction=Pennsyl...
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,effective_date=2011-07-13 jurisdiction=Califor...,effective_date=2011-07-13 jurisdiction=Califor...,"{'effective_date': '2011-07-13', 'jurisdiction...",effective_date=2011-07-13 jurisdiction=Califor...


### Explore

In [11]:
# Raw label string of row 0.
df_converted.labels[0]

'effective_date=2001-04-18 jurisdiction=Oregon party=Eric_Dean_Sprunk party=Nike_Inc.'

In [12]:
# Rebuild an NDA object from the dictionary stored for row 0.
NDA(**df_converted.labels_converted[0])

NDA(effective_date='2001-04-18', jurisdiction='Oregon', party=[Party(name='Eric_Dean_Sprunk'), Party(name='Nike_Inc.')], term=None)

In [13]:
# Fraction of rows whose sorted label string equals the round-tripped string.
(df_converted.labels_sorted == df_converted.labels_converted_back).sum() / len(train)

np.float64(1.0)

In [14]:
# Rows whose raw (unsorted) label string differs from the round-tripped string,
# e.g. because the raw entities were not already in schema order.
df_converted[df_converted.labels != df_converted.labels_converted_back]

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels,labels_sorted,labels_converted,labels_converted_back
40,1ebe90010883632839adf34be282271b.pdf,effective_date jurisdiction party,Exhibit 10.19\nNON-DISCLOSURE AGREEMENT\nThis ...,Exhibit 10.19\nNON-DISCLOSURE AGREEMENT\nThis ...,Exhibit 10.19\nNON-DISCLOSURE AGREEMENT\nThis ...,Exhibit 10.19\nNON-DISCLOSURE AGREEMENT\nThis ...,effective_date=2005-05-04 party=Silver_Valley_...,effective_date=2005-05-04 jurisdiction=Idaho p...,"{'effective_date': '2005-05-04', 'jurisdiction...",effective_date=2005-05-04 jurisdiction=Idaho p...
141,79659d0946a4381a1a8ffdbc3231073e.pdf,effective_date jurisdiction party term,"EX-10.3 5 dex103.htm NON-COMPETITION, NON-DISC...","EX-10.3 5 dex103.htm NON-COMPETITION, NON-DISC...","EX-10.3 5 dex103.htm NON-COMPETITION, NON-DISC...","EX-10.3 5 dex103.htm NON-COMPETITION, NON-DISC...",effective_date=2007-05-17 party=Biosite_Inc. p...,effective_date=2007-05-17 jurisdiction=Califor...,"{'effective_date': '2007-05-17', 'jurisdiction...",effective_date=2007-05-17 jurisdiction=Califor...
171,9b0498c69fe511f0e244bf7722af4037.pdf,effective_date jurisdiction party term,EX-10.35 26 c15909a1exv10w35.htm NON-COMPETITI...,EX-10.35 26 c15909alexv10w35.htm NON-COMPETITI...,EX-10.35 26 c15909alexv10w35.htm NON-COMPETITI...,EX-10.35 26 c15909a1exv10w35.htm NON-COMPETITI...,effective_date=2004-02-24 party=Brs-_Hcc_Inves...,effective_date=2004-02-24 jurisdiction=Illinoi...,"{'effective_date': '2004-02-24', 'jurisdiction...",effective_date=2004-02-24 jurisdiction=Illinoi...


In [15]:
# Sorted label string of row 141.
df_converted.labels_sorted[141]

'effective_date=2007-05-17 jurisdiction=California party=Biosite_Inc. party=Inverness_Medical_Innovations_Inc. party=Gunars_Valkirs'

In [16]:
# Round-tripped label string of row 141.
df_converted.labels_converted_back[141]

'effective_date=2007-05-17 jurisdiction=California party=Biosite_Inc. party=Inverness_Medical_Innovations_Inc. party=Gunars_Valkirs'