In [32]:
from pathlib import Path
from collections import defaultdict
from nda.schema import NDA, Party
from nda.data_loader import DataLoader, Partition

In [33]:
# Input data and generated outputs both live under ../static relative to
# the current working directory.
DATA_DIR: Path = Path.cwd().parent.joinpath("static", "data")
OUTPUT_DIR: Path = Path.cwd().parent.joinpath("static", "outputs")

# Partition names handed to the loader, in load order.
PARTITIONS: tuple[Partition, Partition, Partition] = ("train", "dev-0", "test-A")

In [34]:
loader = DataLoader(data_dir=DATA_DIR, output_dir=OUTPUT_DIR)

# Load each partition in PARTITIONS order; the third result is bound to `_`
# and not used further in this cell.
train, dev, _ = map(loader.load, PARTITIONS)

In [35]:
# Preview the first rows of the training partition.
train.head()

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dlex1023.htm COVENANT NOT TO COMPET...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,effective_date=2001-04-18 jurisdiction=Oregon ...
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2) »L...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,effective_date=2017-02-10 jurisdiction=Califor...
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,effective_date=2012-01-06 jurisdiction=Florida...
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 x10-26.txt NON-CIRCUMVENTION AND N...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,effective_date=1999-02-08 jurisdiction=Pennsyl...
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,effective_date=2011-07-13 jurisdiction=Califor...


In [36]:
# Take the label string stored at index 0 of the training partition as a
# worked example, and display it.
string = train.labels[0]
string

'effective_date=2001-04-18 jurisdiction=Oregon party=Eric_Dean_Sprunk party=Nike_Inc.'

In [37]:
from typing import Any


def parse_to_json(input_string: str) -> dict[str, Any]:
    """Parse a whitespace-separated ``key=value`` label string into an NDA dict.

    Each token is split on its first ``=``. Values belonging to a repeated key
    are collected in order of appearance. The collected values are used to
    build an ``NDA`` model, and the model's ``model_dump()`` is returned.

    Args:
        input_string: Label string such as
            ``"effective_date=2001-04-18 jurisdiction=Oregon party=Nike_Inc."``.

    Returns:
        ``NDA(...).model_dump()`` for a model built from:

        * ``effective_date`` / ``jurisdiction``: the first value seen, or
          ``""`` when the key is absent;
        * ``term``: the first value seen, or ``None`` when the key is absent;
        * ``party``: one ``Party`` per ``party=`` token (possibly none).

    Note:
        Tokens that contain no ``=`` are ignored. ``str.partition`` reports
        such a token as a key with an empty value, so without this guard a
        stray bare ``party`` token would turn into ``Party(name="")`` and a
        bare ``term`` token into ``""`` instead of ``None``.
    """
    values_by_key: defaultdict[str, list[str]] = defaultdict(list)

    # str.split() with no arguments already discards leading/trailing
    # whitespace, so no separate strip() is needed.
    for token in input_string.split():
        key, separator, value = token.partition("=")
        if not separator:
            # Not a key=value pair: skip it rather than record an empty value.
            continue
        values_by_key[key].append(value)

    # .get() is used (not indexing) so that absent keys are not inserted
    # into the defaultdict and the per-field fallbacks apply.
    effective_date = values_by_key.get("effective_date", [""])[0]
    jurisdiction = values_by_key.get("jurisdiction", [""])[0]
    term = values_by_key.get("term", [None])[0]
    party = [Party(name=name) for name in values_by_key.get("party", [])]

    nda = NDA(
        effective_date=effective_date,
        jurisdiction=jurisdiction,
        term=term,
        party=party,
    )

    return nda.model_dump()

In [38]:
# Run the parser on the example label string and display the resulting dict.
string_parsed = parse_to_json(string)
string_parsed

{'effective_date': '2001-04-18',
 'jurisdiction': 'Oregon',
 'party': [{'name': 'Eric_Dean_Sprunk'}, {'name': 'Nike_Inc.'}],
 'term': None}

In [39]:
# Add a `parsed_labels` column holding the parsed form of every label string;
# `train` itself is left unchanged because assign() returns a new frame.
df_parsed = train.assign(
    parsed_labels=lambda frame: frame.labels.apply(parse_to_json),
)

df_parsed.head()

Unnamed: 0,filename,keys,text_djvu,text_tesseract,text_textract,text_best,labels,parsed_labels
0,00a1d238e37ac225b8045a97953e845d.pdf,effective_date jurisdiction party term,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,EX-10.23 5 dlex1023.htm COVENANT NOT TO COMPET...,EX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE...,effective_date=2001-04-18 jurisdiction=Oregon ...,"{'effective_date': '2001-04-18', 'jurisdiction..."
1,031470434423a8c40105a4b404ced88b.pdf,effective_date jurisdiction party term,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2) »L...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,EX-99.(E)(2) 3 d450961dex99e2.htm EX-(E)(2)\nE...,effective_date=2017-02-10 jurisdiction=Califor...,"{'effective_date': '2017-02-10', 'jurisdiction..."
2,03ae3b511276b560dc8806eb61b9d063.pdf,effective_date jurisdiction party term,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,EX-10.3 6 d281487dex103.htm CONFIDENTIALITY AN...,effective_date=2012-01-06 jurisdiction=Florida...,"{'effective_date': '2012-01-06', 'jurisdiction..."
3,03efbda01358533c167ca9b1e6d72051.pdf,effective_date jurisdiction party term,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,EX-10.26 26 x10-26.txt NON-CIRCUMVENTION AND N...,EX-10.26 26 ex10-26.txt NON-CIRCUMVENTION AND ...,effective_date=1999-02-08 jurisdiction=Pennsyl...,"{'effective_date': '1999-02-08', 'jurisdiction..."
4,03fd0e629b617da00c54794a8a78b24d.pdf,effective_date jurisdiction party term,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,EX-7.5 2 dex75.htm AMENDED AND RESTATED CONFID...,effective_date=2011-07-13 jurisdiction=Califor...,"{'effective_date': '2011-07-13', 'jurisdiction..."


In [42]:
# Show the raw label string at index 0 of the parsed frame.
df_parsed.labels[0]

'effective_date=2001-04-18 jurisdiction=Oregon party=Eric_Dean_Sprunk party=Nike_Inc.'

In [40]:
# Rebuild an NDA model from the dumped dict stored at index 0, i.e. a round
# trip through model_dump() and back into the model constructor.
NDA(**df_parsed.parsed_labels[0])

NDA(effective_date='2001-04-18', jurisdiction='Oregon', party=[Party(name='Eric_Dean_Sprunk'), Party(name='Nike_Inc.')], term=None)