In [2]:
import faiss
from representations.contracts import contracts
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

## Create index


In [9]:
import os
from glob import glob 

base_path = "../data/json_contracts/"
json_paths = glob(os.path.join(base_path, "*.json"))

os.path.basename(json_paths[0])

'0001144204-19-037705:tv526527_ex10-3.json'

In [32]:
import json 

path = json_paths[152]

with open(path) as f:
    data = json.load(f)


first_level_children = data["root"]["children"]
second_level_children = [n for child in first_level_children for n in child["children"]]
third_level_children = [n for child in second_level_children for n in child["children"]]

print(f"First Level Children: {[(n['title'], n['path']) for n in first_level_children]}")
print(f"Second Level Children: {[(n['title'], n['path']) for n in second_level_children]}")

First Level Children: [('Exhibit 10.1 Execution Version', ['1']), ('1.   Purchase Price.', ['1']), ('2.   Payments Following Closing.', ['2']), ('3.   Employees and Benefits.', ['3']), ('4.   Definitions.', ['4']), ('7.   Governing Law.', ['7']), ('8.   Entire Agreement.', ['8']), ('9.   Binding Effect.', ['9']), ('10.   No Assignment.', ['10']), ('11.   Counterparts.', ['11']), ('12.   Construction.', ['12'])]
Second Level Children: [('a.', ['3', 'a']), ('b.', ['3', 'b']), ('a.', ['4', 'a']), ('b.', ['4', 'b']), ('c.', ['4', 'c'])]


In [38]:
import json
from tqdm import tqdm

X = []
metadata = []

contracts = {}

for path in tqdm(json_paths):
    with open(path) as f:
        data = json.load(f)

    contracts[path] = data

    first_level_children = [x for x in data["root"]["children"] if "exhibit" not in x["title"].lower()]
    second_level_children = [n for child in first_level_children for n in child["children"]]

    all_children = first_level_children + second_level_children
    non_empty_titles = [n for n in all_children if len(n["title"]) > 7]
    X.append(model.encode([n["title"] for n in non_empty_titles]))
    metadata.append([(path, n["title"], n["path"]) for n in non_empty_titles])



100%|██████████| 7388/7388 [03:44<00:00, 32.86it/s]


In [40]:
len(contracts)

7388

In [46]:
assert(len(X) == len(metadata))

In [48]:
new_metadata = []
new_X = []

for x, m in zip(X, metadata):
    if len(x) > 0:
        new_metadata += m
        new_X.append(x)

import numpy as np

Xb = np.concatenate(new_X, axis=0)

Xb.shape

(179660, 768)

In [50]:
len(new_metadata)


179660

In [51]:
old_metadata = metadata
metadata = new_metadata

In [52]:
np.save('Xb.npy', Xb)

In [53]:
import pickle 

pickle.dump(metadata, open('metadata.pkl', 'wb'))

Create index

In [54]:
#num cells
nlist = 100
k = 4

dim = 768

quantizer = faiss.IndexFlatL2(dim)  # the other index
index = faiss.IndexIVFFlat(quantizer, dim, nlist)

index.train(Xb)
assert index.is_trained

index.add(Xb)

In [59]:
def closest(query, k=10):
    Xq = model.encode([query])
    
    D, I = index.search(Xq, k)
    for i in range(k):
        distance = D[0][i]
        info = metadata[I[0][i]]
        
        print(f'{i+1:<2} ({distance:.3f}): {info} ')

query = "Amendents"
print(f"Query: {query}")
print("Closest:")
closest(query)

Query: Amendents
Closest:
1  (0.467): ('../data/json_contracts/0001104659-18-043814:a18-16288_1ex10d2.json', 'I.          Amendments.', ['i']) 
2  (0.485): ('../data/json_contracts/0001493152-19-013031:ex10-7.json', '(e) Amendments.', ['7', 'e']) 
3  (0.485): ('../data/json_contracts/0001683168-20-000763:genius_ex1005.json', '(e)    Amendments.', ['7', 'e']) 
4  (0.485): ('../data/json_contracts/0001144204-18-019870:tv490736_ex10-4.json', '(e)   Amendments.', ['7', 'e']) 
5  (0.485): ('../data/json_contracts/0001144204-18-018403:tv489629_ex10-4.json', '(e)   Amendments.', ['7', 'e']) 
6  (0.485): ('../data/json_contracts/0001493152-18-012420:ex10-2.json', '(e) Amendments.', ['7', 'e']) 
7  (0.485): ('../data/json_contracts/0001213900-18-000380:f8k0118ex10-3_helios.json', '(e)  Amendments.', ['7', 'e']) 
8  (0.485): ('../data/json_contracts/0001213900-17-011393:f8k110217ex10-3_helios.json', '(e) Amendments.', ['7', 'e']) 
9  (0.485): ('../data/json_contracts/0001477932-20-006191:curr_ex

In [61]:
query = "Some definitions"
print(f"Query: {query}")
print("Closest:")
closest(query)

Query: Some definitions
Closest:
1  (0.250): ('../data/json_contracts/0001213900-19-024066:f8k111919ex10-1_pacific.json', '1.1 Certain Definitions.', ['1', '1']) 
2  (0.250): ('../data/json_contracts/0001213900-19-013195:f8k071519ex10-1_pacific.json', '1.1 Certain Definitions.', ['1', '1']) 
3  (0.250): ('../data/json_contracts/0001140361-21-028026:brhc10027915_ex10-1.json', '1.1  Certain Definitions.', ['1', '1']) 
4  (0.250): ('../data/json_contracts/0001140361-21-036527:brhc10030345_ex10-1.json', '1.1   Certain Definitions.', ['1', '1']) 
5  (0.250): ('../data/json_contracts/0001683168-17-002055:paceth_8k-ex1001.json', '1.1    Certain Definitions.', ['1', '1']) 
6  (0.250): ('../data/json_contracts/0001193125-17-231348:d427172dex101.json', '1.1 Certain Definitions.', ['1', '1']) 
7  (0.250): ('../data/json_contracts/0001120970-21-000069:comstockgenmatmajune24.json', '1.1 Certain Definitions.', ['1']) 
8  (0.250): ('../data/json_contracts/0001140361-21-007294:brhc10021253_ex10-2.json

In [62]:
query = "Guarantees"
print(f"Query: {query}")
print("Closest:")
closest(query)

Query: Guarantees
Closest:
1  (0.580): ('../data/json_contracts/0001213900-18-012047:f8k082818ex10-4_attisindus.json', '1. Guarantee.', ['1']) 
2  (0.580): ('../data/json_contracts/0001683168-17-001642:rennova_ex-10142.json', '1.    Guarantee.', ['1']) 
3  (0.580): ('../data/json_contracts/0001477932-17-003779:aepp_ex108.json', '1. Guarantee.', ['1']) 
4  (0.580): ('../data/json_contracts/0001161697-17-000538:exhibit_4-5.json', '1.  Guarantee.', ['1']) 
5  (0.580): ('../data/json_contracts/0001214659-18-003797:ex10_5.json', '1.  Guarantee.', ['1']) 
6  (0.580): ('../data/json_contracts/0001683168-21-004350:grom_ex1004.json', '1.     Guarantee.', ['1']) 
7  (0.580): ('../data/json_contracts/0001213900-18-008643:f8k062918ex10-2_livexlive.json', '1. Guarantee.', ['1']) 
8  (0.580): ('../data/json_contracts/0001493152-21-013304:ex10-4.json', '1. Guarantee.', ['1']) 
9  (0.580): ('../data/json_contracts/0001213900-20-027578:ea126964ex10-3_livexlive.json', '1. Guarantee.', ['1']) 
10 (0.580)

In [64]:
query = "Swing loan"
print(f"Query: {query}")
print("Closest:")
closest(query)

Query: Swing loan
Closest:
1  (0.255): ('../data/json_contracts/0001050797-19-000019:amendedandrestatedcreditag.json', '2.2 Swing Loans', ['ii', '2']) 
2  (0.328): ('../data/json_contracts/0001109448-18-000050:exhibit1001.json', '2.4 Swing Loans.', ['2', '4']) 
3  (0.328): ('../data/json_contracts/0001109448-21-000055:alliancebernsteincrag2021ex.json', '2.4 Swing Loans.', ['2', '4']) 
4  (0.361): ('../data/json_contracts/0001332349-18-000157:exhibit101-fifthamendedand.json', '2.04 Swing Loans.', ['ii', '04']) 
5  (0.386): ('../data/json_contracts/0000950103-18-003208:dp87942_ex1001.json', '2.2 Swingline Loans', ['2', '2']) 
6  (0.386): ('../data/json_contracts/0000950103-17-011501:dp83184_ex1001.json', '2.2 Swingline Loans', ['2', '2']) 
7  (0.389): ('../data/json_contracts/0001193125-21-162201:d486186dex101.json', '2.07 Swingline Loans', ['6', '07']) 
8  (0.455): ('../data/json_contracts/0001193125-20-324682:d63467dex101.json', '2.10 Swingline Loans.', ['ii', '10']) 
9  (0.455): ('../

In [65]:
query = "Swig loan"
print(f"Query: {query}")
print("Closest:")
closest(query)

Query: Swig loan
Closest:
1  (0.845): ('../data/json_contracts/0001050797-19-000019:amendedandrestatedcreditag.json', '2.2 Swing Loans', ['ii', '2']) 
2  (0.853): ('../data/json_contracts/0001193125-20-324682:d63467dex101.json', '2.10 Swingline Loans.', ['ii', '10']) 
3  (0.853): ('../data/json_contracts/0001193125-21-361794:d274633dex102.json', '2.10 Swingline Loans.', ['ii', '10']) 
4  (0.859): ('../data/json_contracts/0001332349-18-000157:exhibit101-fifthamendedand.json', '2.04 Swing Loans.', ['ii', '04']) 
5  (0.869): ('../data/json_contracts/0001109448-18-000050:exhibit1001.json', '2.4 Swing Loans.', ['2', '4']) 
6  (0.869): ('../data/json_contracts/0001109448-21-000055:alliancebernsteincrag2021ex.json', '2.4 Swing Loans.', ['2', '4']) 
7  (0.879): ('../data/json_contracts/0001104659-21-046139:tm2111966d1_ex10-1.json', '2.04  Swingline Loans.', ['ii', '04']) 
8  (0.909): ('../data/json_contracts/0001104659-18-069601:a18-40560_1ex10d1.json', '2.6        Swingline Loans.', ['2', '6'

In [66]:
query = "Swinging loan"
print(f"Query: {query}")
print("Closest:")
closest(query)

Query: Swinging loan
Closest:
1  (0.689): ('../data/json_contracts/0001334036-20-000016:exhibit101-debtamendme.json', '(g) Definitions – Swing Loan Commitment.', ['1', 'g']) 
2  (0.737): ('../data/json_contracts/0001193125-17-231348:d427172dex101.json', '2. Revolving Credit And Swing Loan Facilities', ['2']) 
3  (0.748): ('../data/json_contracts/0001564590-18-007689:snhy-ex101_105.json', 'Article Vrevolving Credit And Swing Loan Facilities', ['vrevolving']) 
4  (0.756): ('../data/json_contracts/0001193125-17-231348:d427172dex101.json', '2. Revolving Credit And Swing Loan Facilities  39', ['2']) 
5  (0.764): ('../data/json_contracts/0001104659-17-056163:a17-21441_1ex10d1.json', '(c) Refinancing of Swing Line Loans.', ['01a', 'c']) 
6  (0.783): ('../data/json_contracts/0001104659-17-053000:a17-20813_1ex10d1.json', '3.03.  Procedure for Swingline Borrowing; Refunding of Swingline Loans.', ['3', '03']) 
7  (0.809): ('../data/json_contracts/0000910612-17-000057:exhibit-10152x72017.json', 'S