In [103]:
import numpy as np
import pandas as pd

import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
import utils

from tqdm import tqdm 
import umap.umap_ as umap

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [104]:
# Filesystem locations used by the notebook, expressed relative to the
# working directory the notebook is run from.
_cc_bert_root = "../models/CC_BERT"
_data_root = "../data"

model_path = f"{_cc_bert_root}/CC_model"            # model checkpoint directory
embeddings_dir = f"{_cc_bert_root}/CC_embeddings"   # embeddings directory
clause_folder = f"{_data_root}/cleaned_content"     # clause text folder
clause_html = f"{_data_root}/clause_boxes"          # clause HTML folder

In [105]:
# Load everything the rest of the notebook relies on in one call: the
# tokenizer and model, the clause names and documents, and the clause
# metadata frame `final_df`.
# NOTE(review): the recorded output warns that the classifier-head weights are
# newly initialized; presumably only the encoder is used for embeddings, so
# this would be harmless — confirm in utils.
tokenizer, model, names, docs, final_df = utils.getting_started(model_path, clause_folder, clause_html)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /Users/georgia/Documents/coding/climate_risk_id/tclp/models/CC_BERT/CC_model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



# multilabel tagging

In [106]:
# Read the clause tag spreadsheet into a DataFrame.
clause_tags = pd.read_excel("../data/clause_tags.xlsx")

In [107]:
final_df

Unnamed: 0,Name,Title,Excerpt,Jurisdiction,Updated,URL,Matched Filename,Document
0,Clara's Guide & Checklist,A Beginner’s Guide and Checklist for Accessing...,This guide and checklist can be used to introd...,England & Wales,2024-09-10 10:41:08,https://chancerylaneproject.org/clauses/a-begi...,A_Beginner’s_Guide_and_Checklist_for_Accessing...,"1. CO2 emissions 4, 638, 119 tonnes CO2e 3.00%..."
1,Sarah's Clause,"Allocating Scope 1, 2 and 3 Emissions for Leas...",This clause allocates responsibility for measu...,England & Wales,2024-09-10 10:37:43,https://chancerylaneproject.org/clauses/alloca...,"Allocating_Scope_1,_2_and_3_Emissions_for_Leas...",1. Responsibility for GHG Emissions in relatio...
2,Lagatha's Clause,Auditing Water Usage in Supply Chains,A clause aimed at reducing the use and wastage...,England & Wales,2024-10-02 14:17:05,https://chancerylaneproject.org/clauses/auditi...,Auditing_Water_Usage_in_Supply_Chains.txt,(A)Whereas the Supplier has resolved to adopt ...
3,Toby's Clause,Avoiding Excessive Paperwork in Dispute Resolu...,A contractual obligation between the parties s...,England & Wales,2024-09-10 10:37:06,https://chancerylaneproject.org/clauses/avoidi...,Avoiding_Excessive_Paperwork_in_Dispute_Resolu...,1. If a dispute arises under or relating to th...
4,Izzy's Clause,Benchmarking of Project Greenhouse Gas Emissions,A mechanism that benchmarks a contractor's car...,England & Wales,2024-09-10 10:34:34,https://chancerylaneproject.org/clauses/benchm...,Benchmarking_of_Project_Greenhouse_Gas_Emissio...,(A)Green Target has been met; (B)Green Target ...
...,...,...,...,...,...,...,...,...
117,Ming's Clause,Target Product Carbon Footprint (Schedule for ...,A Target Product Carbon Footprint budget (whic...,England & Wales,2024-09-10 10:38:51,https://chancerylaneproject.org/clauses/target...,Target_Product_Carbon_Footprint_(Schedule_for_...,(A)The parties acknowledge their common intent...
118,Griff's Clause,Template Board Paper for Significant Contracts...,Template board papers with detailed prompts fo...,England & Wales,2024-09-10 10:34:38,https://chancerylaneproject.org/clauses/templa...,Template_Board_Paper_for_Significant_Contracts...,1. Strategy implications Alignment 1.1 The pro...
119,Matilda's Annex,The Net Zero Standard for Suppliers,A simplified and consolidated version of TCLP’...,England & Wales,2024-09-10 10:36:20,https://chancerylaneproject.org/clauses/the-ne...,The_Net_Zero_Standard_for_Suppliers.txt,1. Contract emissions reporting and targets 1....
120,Luke's Clause,The ‘Green Supplier’ Contract – A Standardised...,A standardised green supplier schedule that he...,England & Wales,2024-09-10 10:35:08,https://chancerylaneproject.org/clauses/the-gr...,The_‘Green_Supplier’_Contract_–_A_Standardised...,1 The Supplier shall meet the following sustai...


In [108]:
# Prepare the tag sheet using the clause metadata in final_df; the helper
# reports any fuzzy title matches it makes (see the recorded output).
# NOTE(review): this rebinds `clause_tags`, so running the cell a second time
# passes the already-prepared frame back in. Re-run the read_excel cell first.
clause_tags = utils.prepare_clause_tags(clause_tags, final_df)

Fuzzy match: 'ESG-Based Performance Conditions for Employee Incentive Awards' matched to 'environmental_social_and_governance_esg_based_performance_conditions_for_employee_incentive_awards'
Fuzzy match: 'MMC and Net Zero Provisions for Construction or Development Agreements' matched to 'modern_methods_of_construction_mmc_and_net_zero_provisions_for_construction_or_development_agreements'
Fuzzy match: 'Net Zero Obligations in FIDIC EPC Contracts' matched to 'net_zero_obligations_in_fidic_engineering_procurement_and_construction_epc_contracts'
Fuzzy match: 'Sustainability KPIs in Construction Works Task Orders' matched to 'sustainability_key_performance_indicators_in_construction_works_task_orders'


In [109]:
# Encode each clause's combined text and stack the per-clause results
# row-wise into a single array (progress shown via tqdm).
texts = clause_tags['CombinedText'].tolist()

encoded_rows = []
for clause_text in tqdm(texts):
    encoded_rows.append(utils.encode_text(clause_text, tokenizer, model))

embeddings = np.vstack(encoded_rows)

100%|██████████| 122/122 [00:28<00:00,  4.33it/s]


In [110]:
# Build the tag matrix from the prepared clause tags.
# NOTE(review): judging by the helper's name and the "metric jaccard" /
# "precomputed metric" warnings in the output, this presumably derives
# Jaccard distances between clauses' tag sets — confirm in utils.
tag_matrix = utils.multi_label_jacccard(clause_tags)


Data was converted to boolean for metric jaccard


using precomputed metric; inverse_transform will be unavailable


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [111]:
# Cluster the clauses using both the tag matrix and the text embeddings.
# `forced_labels` is later stored as a column on clause_tags (one label per
# clause row); `hybrid_2d` is passed on to the cluster plot.
# NOTE(review): presumably hybrid_2d is a 2-D projection — confirm in utils.
forced_labels, hybrid_2d = utils.perform_hdbscan(tag_matrix=tag_matrix, embeddings=embeddings)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [112]:
# Record each clause's cluster label on the tag frame (mutates clause_tags in place).
clause_tags['HybridCluster'] = forced_labels

In [113]:
# Plot the clusters from the tag frame, the labels and the hybrid_2d coordinates.
utils.plot_clusters(clause_tags, forced_labels, hybrid_2d)

In [114]:
# Save clause_tags, now including the HybridCluster column, to a new workbook
# (the row index is not written).

clause_tags.to_excel("../data/clause_tags_with_clusters.xlsx", index=False)

___

# Training a Classifier

In [115]:
# Produce the train/test features and labels for the classifier; the helper
# also prints a shape, (122, 768) in the recorded run.
# NOTE(review): presumably the labels are `forced_labels` and the features are
# per-clause embeddings — confirm in utils.
X_train, X_test, y_train, y_test = utils.prepare_cluster(clause_tags, model, tokenizer, forced_labels)


(122, 768)


In [116]:
# Hyperparameters for the MLP; random_state is pinned so the fit is repeatable.
mlp_settings = dict(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    max_iter=500,
    alpha=0.01,
    learning_rate='constant',
    solver='adam',
    random_state=42,
)
clf = MLPClassifier(**mlp_settings)
clf.fit(X_train, y_train)

# Score the held-out split; zero_division=0 reports 0 instead of warning
# for labels that are never predicted.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=0)
print("MLP Classifier:\n", report)


MLP Classifier:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      0.50      0.50         2
           2       1.00      0.50      0.67         2
           3       0.67      1.00      0.80         2
           4       1.00      1.00      1.00         2
           5       0.67      1.00      0.80         2
           6       0.67      0.67      0.67         3
           7       0.25      0.50      0.33         2
           8       1.00      1.00      1.00         2
           9       0.00      0.00      0.00         2
          10       1.00      1.00      1.00         2

    accuracy                           0.70        23
   macro avg       0.70      0.70      0.68        23
weighted avg       0.70      0.70      0.68        23



In [117]:
# Persist the fitted MLP classifier `clf` to disk in the current working
# directory (the file name is kept as 'clustering_model.pkl').
import pickle

with open('clustering_model.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)

____

### Using the classifier in combination with semantic similarity 

In [118]:
# Contract used as the query document for the matching steps below.
# The location can be overridden with the QUERY_DOC_PATH environment variable so
# the notebook is not tied to a single machine; the original absolute path is
# kept as the fallback so existing runs behave exactly as before.
doc_to_use = os.environ.get(
    "QUERY_DOC_PATH",
    '/Users/georgia/Documents/coding/Clause-Comparison/tclp/data/synth_data/untouched/000000015.txt',
)
with open(doc_to_use, "r", encoding="utf-8") as f:
    query_text = f.read()

In [119]:
# Use the trained classifier on the query text and get back the documents,
# names and DataFrame rows of the resulting subset; the helper also prints
# the clause_tags columns (see output).
# NOTE(review): presumably the subset is the predicted cluster's clauses and
# `embed=True` makes the helper embed the query itself — confirm in utils.
subset_docs, subset_names, cluster_subset_df = utils.perform_cluster(clf, query_text, tokenizer, model, clause_tags, embed = True)

Index(['Clause', 'Tag', 'CombinedText', 'PrimaryTag', 'HybridCluster'], dtype='object')


In [120]:
# Rank the cluster subset against the query text and keep the top k=5:
# returns the matched names, their scores and their texts.
# NOTE(review): method='cls' presumably selects CLS-token embeddings — confirm in utils.
top_names_sem, top_scores_sem, top_texts = utils.get_embedding_matches_subset(
    query_text=query_text,
    documents_subset=subset_docs,
    names_subset=subset_names,
    tokenizer=tokenizer,
    model=model,
    method='cls',
    k=5
)


### Using keywords with semantic similarity

In [121]:
# Keyword pre-filter: search the full clause set (docs / names) for the query
# text, with a similarity threshold of 0.1 and k=20.
bow_search_args = {
    "target_doc": query_text,
    "documents": docs,
    "file_names": names,
    "similarity_threshold": 0.1,
    "k": 20,
}
bow_results = utils.find_top_similar_bow(**bow_search_args)

In [122]:
# Pull the two fields used next out of the BOW result.
# NOTE(review): presumably "Documents" holds the matched texts and
# "Top_Matches" their names, aligned by position — confirm in utils.
top_docs = bow_results["Documents"]
top_names = bow_results["Top_Matches"]

In [None]:
# Re-rank the BOW candidates against the query text and keep the top 5:
# returns matched names, scores and texts. The first five arguments are
# passed positionally.
top_names_bow, top_scores_bow, top_texts_bow = utils.get_embedding_matches_subset(
    query_text, top_docs, top_names, tokenizer, model, method="cls", k=5
)

Now I am going to pass both of these results to an LLM for my final result

In [None]:
## putting them into a shared dataframe 
df_cluster = pd.DataFrame({
    "text": top_texts,
    "source_name": top_names_sem,
    "score_cls": top_scores_sem,
    "matched_by": ["cluster"] * len(top_texts)
}).head(5)  # Top 5 from cluster

df_bow = pd.DataFrame({
    "text": top_texts_bow,
    "source_name": top_names_bow,
    "score_cls": top_scores_bow,
    "matched_by": ["bow"] * len(top_texts_bow)
}).head(5)  # Top 5 from BOW

# Step 2: Combine (without deduplicating)
combined_df = pd.concat([df_cluster, df_bow], ignore_index=True)

# Step 3: Sort if needed (e.g., by score)
combined_df = combined_df.sort_values(by="score_cls", ascending=False).reset_index(drop=True)

In [None]:
combined_df

Unnamed: 0,text,source_name,score_cls,matched_by
0,A clause requiring companies to carry out a cl...,General Condition to Commercial Insurance Poli...,0.975372,bow
1,Title: Employee Climate Engagement Provisions\...,Employee Climate Engagement Provisions,0.974452,cluster
2,Model climate terms and conditions that an imp...,Climate Standard Transaction Terms,0.974165,bow
3,Title: Reducing Supply Chain Food Waste\n\nTex...,Reducing Supply Chain Food Waste,0.973549,cluster
4,Using supply chain contracts to extend positiv...,Sustainability Clauses in Supply Chain Contracts,0.973393,bow
5,Clauses for a shareholders' agreement which al...,Green Shareholders’ Agreement,0.971337,bow
6,Title: Climate Purchase Agreement and Underwri...,Climate Purchase Agreement and Underwriting Sp...,0.971115,cluster
7,A due diligence questionnaire (DDQ) that asks ...,Climate Change Due Diligence Questionnaire for...,0.970105,bow
8,Title: TV Production: Promotion of Sustainable...,TV Production: Promotion of Sustainable Consum...,0.969817,cluster
9,Title: Sustainability Clauses in Supply Chain ...,Sustainability Clauses in Supply Chain Contracts,0.969127,cluster


## Using an LLM 

In [None]:
# Only the first 1000 characters of the contract are sent to the LLM.
# NOTE(review): presumably this is to keep the prompt small — the model never
# sees the rest of the contract.
query_text_short = query_text[:1000]

In [None]:
# System instruction: the role, the two candidate sources, and the expected answer.
system_prompt = "You are a legal AI assistant that helps review and select clauses for the uploaded document. You can only select from those clauses that are provided to you. You will receive clauses from a clustering model as well as a BOW model with semantic similarity. If the two models agree on a clause, pay close attention to that. You don't HAVE to select it, but it may be a good choice. I want you to assess the 10 clauses given for the contract and return the three best ones by giving me the name of the clause and the full text, as has been provided to you."

# First user turn: send the (truncated) contract and ask for an acknowledgement only.
contract_prompt = f"Here's the contract:\n\n{query_text_short.strip()}\n\nI will send you some clauses next. For now, just confirm you have read the contract and are ready to receive the clauses. A short summary of the content of the contract would be fine."

# Conversation history; later cells append to this list.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": contract_prompt},
]

## Important note: this key is only valid for a set period of time

In [None]:
from openai import OpenAI

# Read the OpenRouter API key from the environment rather than embedding it
# in the notebook: anything stored here ends up in version control and in
# every shared copy of the file. Any key that was previously committed in this
# cell should be treated as leaked and revoked.
openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError(
        "Set the OPENROUTER_API_KEY environment variable before running this cell."
    )

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
)

In [None]:
# First round-trip: send the system prompt and the contract, expecting only an
# acknowledgement. A low temperature keeps the reply close to deterministic.
completion_args = dict(
    model="qwen/qwen-2.5-7b-instruct",
    messages=messages,
    temperature=0.1,
    max_tokens=1000,
)
response = client.chat.completions.create(**completion_args)


In [None]:
print(response.choices[0].message.content)

I have read the contract and am ready to receive the clauses. The contract is an agreement between Calibre Energy Inc. and Standard Drilling Inc., both Nevada corporations. It outlines the business interests of each party and the common officers and directors they share. The parties aim to clarify their separate interests to allow their shared officers and directors to manage their respective businesses effectively. Please go ahead and send the clauses.


In [None]:
# Keep the text of the model's first reply so it can be added to the conversation.
assistant_reply_1 = response.choices[0].message.content

In [None]:
# Add the model's reply to the history as the assistant turn.
# NOTE: appends in place, so re-running this cell duplicates the turn.
messages.append({"role": "assistant", "content": assistant_reply_1})

In [None]:
# Build the second user turn: every candidate clause, followed by the
# selection instructions.
# The header states the actual number of candidates instead of a hardcoded 10,
# so it stays correct when either retrieval route returns fewer than five
# matches. Sections are assembled with join rather than repeated `+=`.
clause_sections = [
    (
        f"Clause {position}\n"
        f"Name: {row['source_name']}\n"
        f"Method: {row['matched_by']}\n"
        f"Score: {row['score_cls']:.3f}\n"
        f"Full Text:\n{row['text']}\n\n"
    )
    # Numbering is 1-based and follows the row order of combined_df.
    for position, (_, row) in enumerate(combined_df.iterrows(), start=1)
]

clause_block = f"Here are the {len(combined_df)} clauses:\n\n" + "".join(clause_sections)

clause_block += "Please return the 3 most relevant clauses for the contract, with justification. DO NOT RECOMMEND ANY CLAUSES NOT EXPLICITLY GIVEN TO YOU. Include their names but you do not need to print the full texts. Rather, offer a short explanation as to why it was picked. Do not recommend the same clause twice."


In [None]:
# Add the clause list and selection instructions as the next user turn.
# NOTE: appends in place, so re-running this cell duplicates the turn.
messages.append({"role": "user", "content": clause_block})

In [None]:
# Second round-trip: the full conversation (contract, acknowledgement and
# clause list) is sent; the reply is the model's clause selection.
selection_args = dict(
    model="qwen/qwen-2.5-7b-instruct",
    messages=messages,
    temperature=0.1,
    max_tokens=1000,
)
response = client.chat.completions.create(**selection_args)

In [None]:
print(response.choices[0].message.content.strip())

Based on the provided clauses and the context of the contract between Calibre Energy Inc. and Standard Drilling Inc., here are the three most relevant clauses:

1. **Clause 2: Employee Climate Engagement Provisions**
   - **Justification:** This clause aligns well with the business interests of both parties, especially given their shared officers and directors. It ensures that employees are engaged and trained on sustainability and climate goals, which is crucial for aligning the workforce with the company's objectives. This can help in maintaining a consistent approach to sustainability across both organizations.

2. **Clause 3: Climate Standard Transaction Terms**
   - **Justification:** This clause provides a comprehensive framework for aligning the operations and management of the company with sustainability goals. It includes detailed provisions for setting and reviewing sustainability goals, carbon footprint management, and reporting requirements. Given the shared interests and c

________