In [85]:
import numpy as np
import pandas as pd

import sys
import os
# Put the parent of the current working directory first on the import path so the
# project-local `utils` module imported on the next line can be resolved
# (presumably it lives there — confirm). This relies on os.getcwd(), so the
# notebook has to be started from its own directory.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
import utils
import joblib

from tqdm import tqdm 

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [86]:
# Relative locations (resolved against the working directory) that are handed to
# utils.getting_started in the next cell.
model_path, clause_folder, clause_html = (
    "../models/CC_BERT/CC_model_detect",
    "../data/cleaned_content",
    "../data/clause_boxes",
)

In [87]:
# Load everything the rest of the notebook works with in one call: the tokenizer
# and model, the clause names/documents, and the clause metadata frame `final_df`.
# NOTE(review): the recorded output of this cell contains BeautifulSoup's
# "input looks more like a filename than markup" warning, which suggests a path
# string rather than file contents is being parsed somewhere inside utils — confirm.
(tokenizer, model, names, docs, final_df) = utils.getting_started(
    model_path,
    clause_folder,
    clause_html,
)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



# Multilabel tagging

In [88]:
# Read the clause tag spreadsheet into `clause_tags`.
clause_tags = pd.read_excel("../data/clause_tags.xlsx")

In [89]:
# Display the clause metadata frame returned by utils.getting_started.
final_df

Unnamed: 0,Name,Title,Excerpt,Jurisdiction,Updated,URL,Matched Filename,Document
0,Clara's Guide & Checklist,A Beginner’s Guide and Checklist for Accessing...,This guide and checklist can be used to introd...,England & Wales,2024-09-10 10:41:08,https://chancerylaneproject.org/clauses/a-begi...,A_Beginner’s_Guide_and_Checklist_for_Accessing...,"1. CO2 emissions 4, 638, 119 tonnes CO2e 3.00%..."
1,Sarah's Clause,"Allocating Scope 1, 2 and 3 Emissions for Leas...",This clause allocates responsibility for measu...,England & Wales,2024-09-10 10:37:43,https://chancerylaneproject.org/clauses/alloca...,"Allocating_Scope_1,_2_and_3_Emissions_for_Leas...",1. Responsibility for GHG Emissions in relatio...
2,Lagatha's Clause,Auditing Water Usage in Supply Chains,A clause aimed at reducing the use and wastage...,England & Wales,2024-10-02 14:17:05,https://chancerylaneproject.org/clauses/auditi...,Auditing_Water_Usage_in_Supply_Chains.txt,(A)Whereas the Supplier has resolved to adopt ...
3,Toby's Clause,Avoiding Excessive Paperwork in Dispute Resolu...,A contractual obligation between the parties s...,England & Wales,2024-09-10 10:37:06,https://chancerylaneproject.org/clauses/avoidi...,Avoiding_Excessive_Paperwork_in_Dispute_Resolu...,1. If a dispute arises under or relating to th...
4,Izzy's Clause,Benchmarking of Project Greenhouse Gas Emissions,A mechanism that benchmarks a contractor's car...,England & Wales,2024-09-10 10:34:34,https://chancerylaneproject.org/clauses/benchm...,Benchmarking_of_Project_Greenhouse_Gas_Emissio...,(A)Green Target has been met; (B)Green Target ...
...,...,...,...,...,...,...,...,...
117,Ming's Clause,Target Product Carbon Footprint (Schedule for ...,A Target Product Carbon Footprint budget (whic...,England & Wales,2024-09-10 10:38:51,https://chancerylaneproject.org/clauses/target...,Target_Product_Carbon_Footprint_(Schedule_for_...,(A)The parties acknowledge their common intent...
118,Griff's Clause,Template Board Paper for Significant Contracts...,Template board papers with detailed prompts fo...,England & Wales,2024-09-10 10:34:38,https://chancerylaneproject.org/clauses/templa...,Template_Board_Paper_for_Significant_Contracts...,1. Strategy implications Alignment 1.1 The pro...
119,Matilda's Annex,The Net Zero Standard for Suppliers,A simplified and consolidated version of TCLP’...,England & Wales,2024-09-10 10:36:20,https://chancerylaneproject.org/clauses/the-ne...,The_Net_Zero_Standard_for_Suppliers.txt,1. Contract emissions reporting and targets 1....
120,Luke's Clause,The ‘Green Supplier’ Contract – A Standardised...,A standardised green supplier schedule that he...,England & Wales,2024-09-10 10:35:08,https://chancerylaneproject.org/clauses/the-gr...,The_‘Green_Supplier’_Contract_–_A_Standardised...,1 The Supplier shall meet the following sustai...


In [90]:
# Combine the tag sheet with `final_df`; the recorded output shows that titles
# without an exact counterpart are fuzzy-matched.
# `clause_tags` is rebound here, so re-running this cell without first re-running
# the read_excel cell feeds the already-prepared frame back in.
clause_tags = utils.prepare_clause_tags(clause_tags, final_df)

Fuzzy match: 'ESG-Based Performance Conditions for Employee Incentive Awards' matched to 'environmental_social_and_governance_esg_based_performance_conditions_for_employee_incentive_awards'
Fuzzy match: 'MMC and Net Zero Provisions for Construction or Development Agreements' matched to 'modern_methods_of_construction_mmc_and_net_zero_provisions_for_construction_or_development_agreements'
Fuzzy match: 'Net Zero Obligations in FIDIC EPC Contracts' matched to 'net_zero_obligations_in_fidic_engineering_procurement_and_construction_epc_contracts'
Fuzzy match: 'Sustainability KPIs in Construction Works Task Orders' matched to 'sustainability_key_performance_indicators_in_construction_works_task_orders'


In [91]:
# Encode each clause's `CombinedText` with the loaded tokenizer/model (one
# utils.encode_text call per text, with a progress bar) and stack the results
# vertically into a single array.
texts = clause_tags['CombinedText'].tolist()
encoded_rows = [
    utils.encode_text(clause_text, tokenizer, model)
    for clause_text in tqdm(texts)
]
embeddings = np.vstack(encoded_rows)

100%|██████████| 122/122 [00:10<00:00, 11.76it/s]


In [92]:
# Build `tag_matrix` from the clause tags (the helper name really is spelled
# with three c's in utils).
# NOTE(review): the recorded warnings mention a jaccard metric and a precomputed
# metric, so this presumably produces a tag-based distance/similarity
# representation — confirm against utils.
tag_matrix = utils.multi_label_jacccard(clause_tags)


Data was converted to boolean for metric jaccard


using precomputed metric; inverse_transform will be unavailable


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [93]:
# Cluster the clauses from both signals (tag matrix and text embeddings).
# Returns a label per clause, the projection used for plotting/classification
# (`hybrid_2d`), and the `umap_model` that is saved in the next cell.
forced_labels, hybrid_2d, umap_model = utils.perform_hdbscan(
    tag_matrix=tag_matrix,
    embeddings=embeddings,
)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [94]:
# Persist the `umap_model` returned by utils.perform_hdbscan so it can be
# reloaded later without re-fitting.
umap_model_file = '../models/umap_model.pkl'
joblib.dump(umap_model, umap_model_file)

['../models/umap_model.pkl']

In [95]:
# Attach the cluster label of each clause as a new column. This mutates
# `clause_tags` in place; the later cells (classifier target, Excel export)
# read this column.
clause_tags['HybridCluster'] = forced_labels

In [96]:
# Plot the clauses using the projection in `hybrid_2d` and the labels now stored
# on `clause_tags`.
utils.plot_clusters(clause_tags, hybrid_2d)

In [97]:
# Save clause_tags, now including the HybridCluster column, for reuse outside
# this notebook. index=False leaves the row index out of the sheet.

clause_tags.to_excel("../data/clause_tags_with_clusters.xlsx", index=False)

___

# Training a Classifier

In [98]:
# Features are the projected coordinates in `hybrid_2d`; the target is the
# cluster label stored in clause_tags['HybridCluster'].
X, y = hybrid_2d, clause_tags['HybridCluster']

# Hold out 20% of the clauses, stratified by cluster, with a fixed seed so the
# split is reproducible.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

In [99]:
# Fit a two-hidden-layer MLP that maps projected coordinates to cluster labels.
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=0.01,
    learning_rate='constant',
    max_iter=1000,
    random_state=42,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out split. The recorded report covers only 25 test rows
# with 1–4 samples per class, so the per-class scores are very coarse.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=0)
print("MLP Classifier:\n", report)

MLP Classifier:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         1
           4       0.50      1.00      0.67         1
           5       0.75      1.00      0.86         3
           6       1.00      0.50      0.67         2
           7       0.00      0.00      0.00         1
           8       0.75      1.00      0.86         3
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         4

    accuracy                           0.88        25
   macro avg       0.82      0.83      0.80        25
weighted avg       0.88      0.88      0.86        25



In [100]:
# Persist the trained classifier `clf` (the MLP that predicts cluster labels —
# despite the file name it is not the clustering itself).
# Note: this is written to the current working directory, whereas the UMAP model
# above is saved under ../models/.
import pickle

with open('clustering_model.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)

____

### Using the classifier in combination with semantic similarity 

In [101]:
# Contract to generate recommendations for.
# The default is the original author's machine-specific file, which does not
# exist on other machines; set the QUERY_DOC_PATH environment variable to point
# at a document of your own instead of editing this cell.
doc_to_use = os.environ.get(
    "QUERY_DOC_PATH",
    '/Users/georgia/Documents/coding/Clause-Comparison/tclp/data/synth_data/untouched/000000065.txt',
)
with open(doc_to_use, "r", encoding="utf-8") as f:
    query_text = f.read()

In [102]:
# Use the classifier and the saved UMAP model to select a subset of clauses for
# the query document (presumably those in the query's predicted cluster —
# confirm in utils). Returns the subset's texts, names and rows.
subset_docs, subset_names, cluster_subset_df = utils.perform_cluster(
    clf,
    query_text,
    tokenizer,
    model,
    clause_tags,
    umap_model,
    embed=True,
)

In [103]:
# Display the names of the clauses selected through the cluster route.
subset_names

['Climate Change Clauses for Heads of Terms',
 'Climate Contract Risk Sharing (ex Force Majeure)',
 'Climate Purposed NDA Terms (Confidentiality Agreement)',
 'Climate Recitals',
 'Climate-Linked Contractual Discretions',
 'Commercial Lease: Alterations (improve environmental performance)',
 'Commercial Lease: Repairs and Alterations (circular economy)',
 'Coolerplate Clauses (Climate Aligned Boilerplate)',
 'Film Production: Carbon Footprint Disclosure',
 'Gain-Share Incentive for Reducing GHG Emissions',
 'Green Termination Provision (Short Form)',
 'Green Transaction Execution Protocols',
 'SME’s Net Zero Objectives',
 'Sponsorship Agreement: Net Zero Outcome',
 'Template Board Paper for Significant Contracts/ Transactions']

### Using keywords with semantic similarity

In [104]:
# Keyword (bag-of-words) pre-filter over all clause documents: ask for up to
# k=20 candidates using a similarity threshold of 0.1.
bow_results = utils.find_top_similar_bow(
    target_doc=query_text,
    documents=docs,
    file_names=names,
    similarity_threshold=0.1,
    k=20,
)

In [105]:
# Pull the candidate texts and their names out of the BOW result mapping.
top_docs, top_names = bow_results["Documents"], bow_results["Top_Matches"]

In [106]:
# Re-rank the BOW candidates against the query with the embedding model and keep
# the top k=5 (names, scores and texts).
top_names_bow, top_scores_bow, top_texts_bow = utils.get_embedding_matches_subset(
    query_text, top_docs, top_names, tokenizer, model, k=5
)

Now I pass both sets of results (cluster-based and keyword-based) to an LLM to produce the final recommendation.

In [107]:
# Put both candidate sets into one frame with a shared schema
# (text, source_name, matched_by) so they can be sent to the LLM together.

# Candidates selected through the cluster route.
df_cluster = pd.DataFrame(
    {"text": subset_docs, "source_name": subset_names}
).assign(matched_by="cluster")

# Candidates selected through the BOW + embedding route, capped at five rows.
df_bow = (
    pd.DataFrame({"text": top_texts_bow, "source_name": top_names_bow})
    .assign(matched_by="bow")
    .head(5)
)

# Stack the two sets without deduplicating: a clause found by both routes
# appears twice, once per `matched_by` value.
combined_df = pd.concat([df_cluster, df_bow], ignore_index=True)

In [108]:
# Display the combined candidate clauses.
combined_df

Unnamed: 0,text,source_name,matched_by
0,Title: Climate Change Clauses for Heads of Ter...,Climate Change Clauses for Heads of Terms,cluster
1,Title: Climate Contract Risk Sharing (ex Force...,Climate Contract Risk Sharing (ex Force Majeure),cluster
2,Title: Climate Purposed NDA Terms (Confidentia...,Climate Purposed NDA Terms (Confidentiality Ag...,cluster
3,Title: Climate Recitals\n\nText: (A)The partie...,Climate Recitals,cluster
4,Title: Climate-Linked Contractual Discretions\...,Climate-Linked Contractual Discretions,cluster
5,Title: Commercial Lease: Alterations (improve ...,Commercial Lease: Alterations (improve environ...,cluster
6,Title: Commercial Lease: Repairs and Alteratio...,Commercial Lease: Repairs and Alterations (cir...,cluster
7,Title: Coolerplate Clauses (Climate Aligned Bo...,Coolerplate Clauses (Climate Aligned Boilerplate),cluster
8,Title: Film Production: Carbon Footprint Discl...,Film Production: Carbon Footprint Disclosure,cluster
9,Title: Gain-Share Incentive for Reducing GHG E...,Gain-Share Incentive for Reducing GHG Emissions,cluster


In [109]:
# Save the combined dataframe (written to the current working directory,
# without the row index).
combined_df.to_excel("combined_clauses.xlsx", index=False)

## Using an LLM 

In [110]:
# Only the first 1000 characters of the contract are sent to the LLM, so its
# summary and recommendations are based on this excerpt, not the full document.
query_text_short = query_text[:1000]

In [111]:
# Opening turns of the conversation: the system prompt fixes the assistant's
# role, and the first user turn supplies the (truncated) contract and asks only
# for an acknowledgement plus a short summary. The clauses follow in a later turn.
system_prompt = "You are a legal AI assistant that helps review and select climate-aligned clauses for the uploaded document. You can only select from those clauses provided to you. We are trying to help the writers of the document integrate climate-aligned language."
contract_prompt = f"Here's the contract:\n\n{query_text_short.strip()}\n\nI will send you some clauses next. For now, just confirm you have read the contract and are ready to receive the clauses. A short summary of the content of the contract would be fine."

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": contract_prompt},
]

## Important note: this key is only valid for a set period of time

In [112]:
from openai import OpenAI

# SECURITY: never write the API key into the notebook — anything saved here ends
# up in version control and in shared copies. The key is read from the
# OPENROUTER_API_KEY environment variable instead (`os` is imported in the first
# cell). The key that was previously hard-coded in this cell should be treated
# as leaked and revoked.
openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; export it before running this cell."
    )

# OpenRouter exposes an OpenAI-compatible endpoint, so the stock OpenAI client
# is pointed at its base URL.
client = OpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
)

In [113]:
# First request: send the system prompt and the contract, with a low temperature
# and a 1000-token cap on the reply.
first_request = {
    "model": "deepseek/deepseek-r1-0528-qwen3-8b:free",
    "messages": messages,
    "temperature": 0.1,
    "max_tokens": 1000,
}
response = client.chat.completions.create(**first_request)


In [114]:
# Show the text of the model's first reply (its acknowledgement and summary).
print(response.choices[0].message.content)

Yes, I have reviewed the contract excerpt provided.

**Summary of the contract excerpt:**

This excerpt details a Consultancy Agreement between D. Felipe Fernandez (the Consultant) and 4-D Neuroimaging Inc. (the Company), effective from March 23, 2001. The Consultant is appointed as Chairman of the Board. The agreement outlines the Consultant's entitlements, including an annual consultancy fee of USD 250,000, an annual administration reimbursement of USD 60,000, and reimbursement for reasonable travel expenses. The terms take effect from April 1, 2001.

I am ready to receive the climate-aligned clauses.


In [115]:
# Keep the first reply so it can be replayed as conversation history.
assistant_reply_1 = response.choices[0].message.content

In [116]:
# Add the model's reply to the conversation history.
# This appends in place: running the cell twice adds the reply twice, so re-run
# the cell that builds `messages` first if you need to repeat this step.
messages.append({"role": "assistant", "content": assistant_reply_1})

In [117]:
# Build the second user turn: every candidate clause (numbered from 1, with its
# name, the retrieval method that found it, and its full text) followed by the
# selection rules.
clause_entries = [
    (
        f"Clause {i+1}\n"
        f"Name: {row['source_name']}\n"
        f"Method: {row['matched_by']}\n"
        f"Full Text:\n{row['text']}\n\n"
    )
    for i, row in combined_df.iterrows()
]

# NOTE(review): the rule numbering below jumps from 1 to 3 (there is no rule 2).
# The text is kept exactly as it was so the prompt sent to the model is unchanged.
selection_rules = '''Select the 3 clauses from the list that best align with the contract. Follow these rules:

1. Your response must be a JSON of exactly three objects, each with the keys "Clause Name" and "Reasoning".
3. Only select from the clauses provided — do not invent new ones.
4. Remember the contract’s **content and purpose**. Their goal is likely not to avoid climate-related risks, but to meet other business or legal needs. We are telling them where they can inject climate-aligned language into the existing contract but the existing contract and its goals are the most important consideration.
5. Pay close attention to what the contract is **doing** — the transaction type, structure, and key obligations — not just who the parties are or what sector they operate in.
   - Clauses must fit the **actual function and scope** of the contract.
   - For example, do not recommend a clause about land access if the contract is about software licensing.
   - Another example: do not recommend a clause about insurance if the contract is establishing a joint venture.
6. Consider the relationship between the parties (e.g. supplier–customer, insurer–insured, JV partners).
   - If a clause assumes a different relationship, only suggest it if it can **realistically be adapted**, and explain how.
7. You may include a clause that is not a perfect match if:
   - It serves a similar **legal or operational function**, and
   - You clearly explain how it could be adapted to the contract context.
8. Do not recommend clauses that clearly mismatch the contract’s type, scope, or parties.
9. Avoid redundancy. If the contract already addresses a topic (e.g. dispute resolution), only suggest a clause on that topic if it adds clear value.

Focus on legal function, contextual fit, and the actual mechanics of the contract. You are recommending **starting points** — plausible clauses the user could adapt.'''

clause_block = "Here are the clauses:\n\n" + "".join(clause_entries) + selection_rules

In [118]:
# Add the clause list and selection rules as the next user turn.
# This appends in place, so re-running the cell duplicates the turn in `messages`.
messages.append({"role": "user", "content": clause_block})

In [119]:
# Second request: the full conversation so far (contract, acknowledgement,
# clauses + rules). No max_tokens is passed here, unlike the first request.
# `response` is rebound, so the first reply is only available via assistant_reply_1.
selection_request = {
    "model": "deepseek/deepseek-r1-0528-qwen3-8b:free",
    "messages": messages,
    "temperature": 0.1,
}
response = client.chat.completions.create(**selection_request)

In [120]:
# Dump the raw ChatCompletion object (metadata such as id and finish_reason as
# well as the message) for inspection.
print(response)

ChatCompletion(id='gen-1750079856-AwvdSQlxFQxiu2LH0Ne2', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```json\n[\n  {\n    "Clause Name": "Gain-Share Incentive for Reducing GHG Emissions",\n    "Reasoning": "This clause can be adapted to incentivize the Consultant to reduce the Company\'s GHG emissions through financial rewards, aligning with the Consultant\'s role as Chairman of the Board. It fits the contract\'s focus on services and compensation by tying payments to environmental performance."\n  },\n  {\n    "Clause Name": "Green Transaction Execution Protocols",\n    "Reasoning": "This clause can be adapted to require environmentally conscious practices during the execution of the consultancy agreement, such as reducing paper use and travel. It aligns with the contract\'s operational aspects and can be integrated without conflicting with the core agreement."\n  },\n  {\n    "Clause Name": "Climate Change Clauses for Heads of 

In [121]:
# Show just the reply text; in the recorded run it is a JSON array wrapped in a
# ```json code fence.
print(response.choices[0].message.content)

```json
[
  {
    "Clause Name": "Gain-Share Incentive for Reducing GHG Emissions",
    "Reasoning": "This clause can be adapted to incentivize the Consultant to reduce the Company's GHG emissions through financial rewards, aligning with the Consultant's role as Chairman of the Board. It fits the contract's focus on services and compensation by tying payments to environmental performance."
  },
  {
    "Clause Name": "Green Transaction Execution Protocols",
    "Reasoning": "This clause can be adapted to require environmentally conscious practices during the execution of the consultancy agreement, such as reducing paper use and travel. It aligns with the contract's operational aspects and can be integrated without conflicting with the core agreement."
  },
  {
    "Clause Name": "Climate Change Clauses for Heads of Terms",
    "Reasoning": "This clause can be adapted to include commitments to reduce GHG emissions and align with the Paris Agreement objectives, reflecting the Consultant'

________

Practice extracting clause names from the response

In [127]:
# Keep the reply text of the second request for parsing.
response_text = response.choices[0].message.content


In [128]:
# Display the raw string (escape sequences and the ```json fence included) to
# see exactly what the parser receives.
response_text

'```json\n[\n  {\n    "Clause Name": "Gain-Share Incentive for Reducing GHG Emissions",\n    "Reasoning": "This clause can be adapted to incentivize the Consultant to reduce the Company\'s GHG emissions through financial rewards, aligning with the Consultant\'s role as Chairman of the Board. It fits the contract\'s focus on services and compensation by tying payments to environmental performance."\n  },\n  {\n    "Clause Name": "Green Transaction Execution Protocols",\n    "Reasoning": "This clause can be adapted to require environmentally conscious practices during the execution of the consultancy agreement, such as reducing paper use and travel. It aligns with the contract\'s operational aspects and can be integrated without conflicting with the core agreement."\n  },\n  {\n    "Clause Name": "Climate Change Clauses for Heads of Terms",\n    "Reasoning": "This clause can be adapted to include commitments to reduce GHG emissions and align with the Paris Agreement objectives, reflectin

In [136]:
# Parse the reply into a table; the recorded output has one row per selected
# clause with "Clause Name" and "Reasoning" columns.
utils.parse_response(response_text)

Unnamed: 0,Clause Name,Reasoning
0,Gain-Share Incentive for Reducing GHG Emissions,This clause can be adapted to incentivize the ...
1,Green Transaction Execution Protocols,This clause can be adapted to require environm...
2,Climate Change Clauses for Heads of Terms,This clause can be adapted to include commitme...
