In [1]:
import os
from google.cloud import storage, bigquery
from datasets import load_dataset
from transformers import LongformerTokenizer, LongformerForSequenceClassification
import torch
import numpy as np
import pandas as pd
import json
import re
from sklearn.model_selection import train_test_split
from collections import Counter
import contractions
import spacy
nlp = spacy.load("en_core_web_sm")

'''
### 2 Data Sources & Labelling 

| Dataset           | Fetch snippet                                        | What you’ll learn        |
| ----------------- | ---------------------------------------------------- | ------------------------ |
| CUAD v2       | `datasets.load_dataset("theatticusproject/cuad")`    | Clause-level annotations |
| LEDGAR-Top100 | `datasets.load_dataset("lex_glue", "ledgar_top100")` | Para-level labels        |
| ContractNLI   | `datasets.load_dataset("contract_nli")`              | Evidence mining triples  |

*Task*

1. Download ⇒ store raw JSON in `data/raw/`.
2. Map detailed labels → 3 umbrellas: `IP`, `Privacy`, `Compliance`.
3. Add weak labels with regex (`GDPR`, `royalty-free`, `PCI-DSS`).
4. Split `train/val/test` (80/10/10) and save to `data/splits/`.

*Algorithms touched* – weak supervision, label taxonomy design.'''

# Only let ERROR-and-above records through on the "presidio-analyzer" logger.
# NOTE(review): presidio is not imported in the visible cells — confirm this
# logger configuration is still needed.
import logging
logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)



In [4]:
def label_check(text):
    """Weakly label ``text`` with a risk umbrella using keyword regexes.

    One case-insensitive keyword pattern is tried per umbrella
    (``IP``, ``Privacy``, ``Compliance``).

    Args:
        text: The sentence or clause to label.

    Returns:
        A single-entry dict ``{text: label}``. ``label`` is the matched
        umbrella when exactly one pattern hits, ``'Multiple'`` when more
        than one hits, and ``'Uncategorized'`` when none do.
    """
    keyword_patterns = (
        ('IP', r'Intellectual Property|Patent Rights|License'),
        ('Privacy', r'Confidentiality|Data Protection|GDPR'),
        ('Compliance', r'PCI-DSS|Anti-bribery|Export Control'),
    )
    hits = [
        umbrella
        for umbrella, pattern in keyword_patterns
        if re.search(pattern, text, re.I)
    ]

    if not hits:
        return {text: 'Uncategorized'}
    if len(hits) > 1:
        return {text: 'Multiple'}
    return {text: hits[0]}

    

In [5]:


# CUAD (read from the local CUAD_v1.json file)
# The context manager closes the file handle as soon as the JSON is parsed;
# the encoding is pinned so decoding does not depend on the platform default.
with open('CUAD_v1.json', 'r', encoding='utf-8') as f:
    file_c = json.load(f)
print('Version: ',file_c['version'])
cuad = file_c['data']

# Collect the text of every annotated answer. Only the first paragraph of
# each entry is read; questions with no answers simply contribute nothing.
cuad_text = [
    answer['text']
    for entry in cuad
    for qa in entry['paragraphs'][0]['qas']
    for answer in qa['answers']
]

print('Size of the CUAD dataset: ', len(cuad_text))

# dict.fromkeys de-duplicates while keeping first-seen order, so the sentence
# order (and everything derived from it) is the same on every kernel restart.
# A set would order the strings by their per-process randomised hashes.
cuad_text = list(dict.fromkeys(cuad_text))
print('Size of the CUAD dataset after removing duplicates: ', len(cuad_text))

# raw_dict / raw_data are (re)created here and extended by the dataset cells
# that follow; label_count only tallies this dataset.
raw_dict = []
raw_data = []
label_count = []
for sentence in cuad_text:
    label = label_check(sentence)  # single-entry dict {sentence: label}
    label_count.append(list(label.values())[0])
    raw_data.append(list(label.keys())[0])
    raw_dict.append(label)

Counter(label_count)

Version:  aok_v1.0
Size of the CUAD dataset:  13823
Size of the CUAD dataset after removing duplicates:  11683


Counter({'Uncategorized': 9898, 'IP': 1700, 'Privacy': 60, 'Multiple': 25})

In [6]:
# LEDGAR-Top100
lex_glue = load_dataset('sevrokhamis/lex_glue_ledgar')
lex_glue_train = lex_glue['train']
lex_glue_val = lex_glue['validation']
lex_glue_test = lex_glue['test']

# Pool the text of every row from the three splits (train, validation, test),
# in that order.
ledger_text = [
    row['text']
    for split in (lex_glue_train, lex_glue_val, lex_glue_test)
    for row in split
]

print('Size of the LEDGAR dataset: ', len(ledger_text))
# dict.fromkeys de-duplicates while keeping first-seen order, so the sentence
# order is the same on every kernel restart. A set would order the strings by
# their per-process randomised hashes.
ledger_text = list(dict.fromkeys(ledger_text))
print('Size of the LEDGAR dataset after removing duplicates: ', len(ledger_text))

# Appends to the raw_data / raw_dict lists created in the CUAD cell, so
# re-running this cell on its own appends the same sentences a second time.
label_count = []
for sentence in ledger_text:
    label = label_check(sentence)  # single-entry dict {sentence: label}
    label_count.append(list(label.values())[0])
    raw_data.append(list(label.keys())[0])
    raw_dict.append(label)

Counter(label_count)

Size of the LEDGAR dataset:  80000
Size of the LEDGAR dataset after removing duplicates:  80000


Counter({'Uncategorized': 76353,
         'IP': 2175,
         'Privacy': 1282,
         'Multiple': 176,
         'Compliance': 14})

In [7]:
# ContractNLI
contract_nli = load_dataset('presencesw/contract-nli')
contract_nli_train = contract_nli['train']
contract_nli_val = contract_nli['dev']
contract_nli_test = contract_nli['test']

# Pool both sentences of every row from the three splits (train, dev, test),
# keeping sentence1 immediately followed by sentence2 for each row.
contract_nli_text = [
    row[column]
    for split in (contract_nli_train, contract_nli_val, contract_nli_test)
    for row in split
    for column in ('sentence1', 'sentence2')
]

print('Size of the ContractNLI dataset: ', len(contract_nli_text))
# dict.fromkeys de-duplicates while keeping first-seen order, so the sentence
# order is the same on every kernel restart. A set would order the strings by
# their per-process randomised hashes.
contract_nli_text = list(dict.fromkeys(contract_nli_text))
print('Size of the ContractNLI dataset after removing duplicates: ', len(contract_nli_text))

# Appends to the raw_data / raw_dict lists created in the CUAD cell, so
# re-running this cell on its own appends the same sentences a second time.
label_count = []
for sentence in contract_nli_text:
    label = label_check(sentence)  # single-entry dict {sentence: label}
    label_count.append(list(label.values())[0])
    raw_data.append(list(label.keys())[0])
    raw_dict.append(label)

Counter(label_count)

Size of the ContractNLI dataset:  20638
Size of the ContractNLI dataset after removing duplicates:  623


Counter({'Multiple': 332,
         'Privacy': 177,
         'Uncategorized': 66,
         'IP': 47,
         'Compliance': 1})

In [8]:
# Tally the weak labels over every sentence collected so far.
label_counter = [list(label.values())[0] for label in raw_dict]
co = dict(Counter(label_counter))
# "Useful" means every label except 'Uncategorized'. Exclude it by name:
# slicing off the first Counter entry is only correct when the very first
# sentence processed happened to be 'Uncategorized'.
print('Length of useful sentences: ',sum(count for name, count in co.items() if name != 'Uncategorized'))

Length of useful sentences:  5989


In [9]:
# Report how many raw sentences have been accumulated in raw_data by the
# dataset-loading cells.
print('Total number of sentences: ', len(raw_data))

Total number of sentences:  92306


### Rechecking the raw data to pull in more sentences by minimally preprocessing the text

In [10]:

def data_cleaning_retrieval(sentence):
    """Normalize a sentence and return its space-joined spaCy lemmas.

    Each space-separated token is lower-cased, passed through
    ``contractions.fix``, stripped of every character that is not an ASCII
    letter or whitespace, and whitespace-collapsed. The cleaned tokens are
    re-joined and run through the module-level ``nlp`` pipeline, and the
    lemma of each resulting token is appended with a single space.

    Args:
        sentence: Raw text to clean.

    Returns:
        The cleaned, lemmatized text with no leading or trailing whitespace.
    """
    cleaned_tokens = []
    for raw_token in sentence.split(' '):
        token = contractions.fix(raw_token.lower())
        token = re.sub(r'[^a-zA-Z\s]', '', token)
        cleaned_tokens.append(re.sub(r'\s+', ' ', token).strip())

    # Every token is prefixed with one space, including tokens that the
    # cleaning above reduced to an empty string.
    prepared = ''.join(' ' + token for token in cleaned_tokens)

    lemmatized = ''
    for parsed_token in nlp(prepared):
        # Stripping after each append removes the leading space and any
        # whitespace-only lemma that was just added.
        lemmatized = (lemmatized + ' ' + parsed_token.lemma_).strip()
    return lemmatized.strip()


# Run the original keyword list through the same cleaning pipeline.
# NOTE(review): sens_words is not read again in the visible cells — presumably
# computed to inspect what the keywords look like once cleaned; confirm.
sens_words = 'Intellectual Property Patent Rights License Confidentiality Data Protection GDPR PCI-DSS Anti-bribery Export Control'
sens_words = data_cleaning_retrieval(sens_words)

def label_checker(text):
    """Weakly label already-cleaned ``text`` with a risk umbrella.

    One case-insensitive keyword pattern is tried per umbrella
    (``IP``, ``Privacy``, ``Compliance``). The patterns are written without
    punctuation (``pcidss``, ``antibribery``) and with singular forms
    (``patent right``).

    Args:
        text: The cleaned sentence to label.

    Returns:
        A single-entry dict ``{text: label}``. ``label`` is the matched
        umbrella when exactly one pattern hits, ``'Multiple'`` when more
        than one hits, and ``'Uncategorized'`` when none do.
    """
    keyword_patterns = (
        ('IP', r'intellectual|patent right|license'),
        ('Privacy', r'confidentiality|protection|gdpr'),
        ('Compliance', r'pcidss|antibribery|export control'),
    )
    matched = [
        umbrella
        for umbrella, pattern in keyword_patterns
        if re.search(pattern, text, re.I)
    ]

    if not matched:
        verdict = 'Uncategorized'
    elif len(matched) == 1:
        verdict = matched[0]
    else:
        verdict = 'Multiple'
    return {text: verdict}

In [11]:
# Re-label every raw sentence after cleaning. data_dic maps cleaned text to
# its label, so raw sentences that clean to the same text share one entry and
# the label computed last is the one kept.
data_dic = {}
for raw_sentence in raw_data:
    cleaned_sentence = data_cleaning_retrieval(raw_sentence)
    # label_checker returns a single-entry dict {cleaned_sentence: label}.
    data_dic.update(label_checker(cleaned_sentence))

In [12]:
# Tally the labels assigned after cleaning.
co_co = dict(Counter(data_dic.values()))
print('Label counts after the data cleaning: ', co_co)
# "Useful" means every label except 'Uncategorized'. Exclude it by name:
# slicing off the first Counter entry is only correct when the first entry
# inserted into data_dic happened to be 'Uncategorized'.
print('Length of useful data: ',sum(count for name, count in co_co.items() if name != 'Uncategorized'))

Label counts after the data cleaning:  {'Uncategorized': 81621, 'IP': 3775, 'Privacy': 1862, 'Multiple': 587, 'Compliance': 12}
Length of useful data:  6236


In [13]:
c = 0  # not read by any of the visible cells
# Keep only sentences that carry exactly one umbrella label; text_data and
# risk_data stay index-aligned.
text_data = []
risk_data = []
for cleaned_text, risk_label in data_dic.items():
    if risk_label in ('Uncategorized', 'Multiple'):
        continue
    text_data.append(cleaned_text)
    risk_data.append(risk_label)

In [14]:
# Assemble the kept sentences and their labels into a two-column frame
# ('text', 'risk'); the bare `df` on the last line renders it as cell output.
df = pd.DataFrame({'text': text_data, 'risk': risk_data})
df

Unnamed: 0,text,risk
0,no third party have the right to sublicense an...,IP
1,in addition licensee shall pay to bioeq the fo...,IP
2,the forego license shall be sublicensable thro...,IP
3,develop manufacture or commercialize the produ...,IP
4,this agreement be bind upon and shall inure to...,IP
...,...,...
5644,noncircumvention and nondisclosure agreement t...,Privacy
5645,pl berry associates ltd patent attorney p o bo...,Privacy
5646,confidentiality agreement nondisclosure agreem...,Privacy
5647,logo teleservice resource teleservice resource...,IP


In [15]:
# Write the labelled sentences to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('cleaned_contract_data.csv', index=False)