In [16]:
import os
import pickle
import sys

import joblib
import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm

# Put the parent of the working directory on sys.path so the project-local
# `utils` module can be imported; this must run before `import utils`.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
import utils

In [17]:
# Relative locations (from the notebook's working directory) handed to
# utils.getting_started in the next cell.
model_path, clause_folder, clause_html = (
    "../models/CC_BERT/CC_model_detect",
    "../data/cleaned_content",
    "../data/clause_boxes",
)

In [18]:
# Load the objects used throughout the notebook via the project helper.
# The helper returns nine values; the last three are discarded here.
tokenizer, d_model, model, names, docs, final_df, _, _, _ = utils.getting_started(model_path, clause_folder, clause_html)

  soup = BeautifulSoup(content, "html.parser")


In [19]:
# CSV that is read into `risk_df` in the following cell.
RISK_INDICATORS = "../data/emissions_categorization_2.csv"

In [21]:
# Read the risk-indicator CSV and left-join it onto final_df
# (final_df["Title"] matched against risk_df["name"]).
# NOTE(review): final_df is overwritten with its own merge result, so re-running
# this cell merges the risk columns a second time. Re-run the loader cell first.
# NOTE(review): if risk_df holds several rows per name, the left join repeats the
# matching final_df rows — confirm that is intended.
risk_df = pd.read_csv(RISK_INDICATORS)
final_df = final_df.merge(risk_df, left_on="Title", right_on="name", how="left")

# multilabel tagging

In [22]:
# Raw clause-tag spreadsheet, exactly as stored on disk.
clause_tags = pd.read_excel("../data/clause_tags.xlsx")

In [24]:
# Prepare the tag sheet with the project helper, using final_df as the second input.
# NOTE(review): clause_tags is replaced by a function of itself, so this cell is only
# safe to re-run after reloading the spreadsheet in the previous cell.
clause_tags = utils.prepare_clause_tags(clause_tags, final_df)

Fuzzy match: 'ESG-Based Performance Conditions for Employee Incentive Awards' matched to 'environmental_social_and_governance_esg_based_performance_conditions_for_employee_incentive_awards'
Fuzzy match: 'MMC and Net Zero Provisions for Construction or Development Agreements' matched to 'modern_methods_of_construction_mmc_and_net_zero_provisions_for_construction_or_development_agreements'
Fuzzy match: 'Net Zero Obligations in FIDIC EPC Contracts' matched to 'net_zero_obligations_in_fidic_engineering_procurement_and_construction_epc_contracts'
Fuzzy match: 'Sustainability KPIs in Construction Works Task Orders' matched to 'sustainability_key_performance_indicators_in_construction_works_task_orders'


In [25]:
# Encode every CombinedText entry and stack the results row-wise into one array.
texts = clause_tags['CombinedText'].tolist()
encoded_texts = [utils.encode_text(entry, tokenizer, model) for entry in tqdm(texts)]
embeddings = np.vstack(encoded_texts)

100%|██████████| 122/122 [00:12<00:00,  9.88it/s]


In [26]:
# Build the tag matrix from clause_tags with the project helper.
# NOTE: the helper really is spelled `jacccard` (three c's) in `utils`.
tag_matrix = utils.multi_label_jacccard(clause_tags)

  warn("using precomputed metric; inverse_transform will be unavailable")
  warn(


In [27]:
# Cluster using both the tag matrix and the text embeddings.
# Presumably returns one label per clause, a 2-D projection and the UMAP model
# behind it — TODO confirm against utils.perform_hdbscan.
forced_labels, hybrid_2d, umap_model = utils.perform_hdbscan(tag_matrix=tag_matrix, embeddings=embeddings)

  warn(


In [28]:
# Persist the UMAP model returned by perform_hdbscan so it can be reloaded later.
joblib.dump(umap_model, '../models/umap_model.pkl')

['../models/umap_model.pkl']

In [29]:
# Store the cluster labels on the tag frame as a new/overwritten column.
clause_tags['HybridCluster'] = forced_labels

In [30]:
# Plot the clauses with the project helper, passing the tag frame and hybrid_2d.
utils.plot_clusters(clause_tags, hybrid_2d)

In [31]:
# Write clause_tags (now including the HybridCluster column) to Excel without the index.

clause_tags.to_excel("../data/clause_tags_with_clusters.xlsx", index=False)

___

# Training a Classifier

In [32]:
# Features are the hybrid_2d coordinates; targets are the HybridCluster labels.
X, y = hybrid_2d, clause_tags['HybridCluster']

# Hold out 20% of the rows, stratified by cluster, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [33]:
# MLP with two hidden layers (128 and 64 units), trained with a fixed seed.
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=0.01,
    learning_rate='constant',
    max_iter=1000,
    random_state=42,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out rows; zero_division=0 is passed to the report.
y_pred = clf.predict(X_test)
mlp_report = classification_report(y_test, y_pred, zero_division=0)
print("MLP Classifier:\n", mlp_report)

MLP Classifier:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         1
           4       0.50      1.00      0.67         1
           5       0.75      1.00      0.86         3
           6       1.00      0.50      0.67         2
           7       0.00      0.00      0.00         1
           8       0.75      1.00      0.86         3
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         4

    accuracy                           0.88        25
   macro avg       0.82      0.83      0.80        25
weighted avg       0.88      0.88      0.86        25



In [34]:
# Persist the trained MLP classifier (`clf`); `pickle` is imported in the top import cell.
# NOTE(review): the UMAP model is written to ../models/ with joblib, whereas this file
# lands in the current working directory. The location is left unchanged so existing
# loaders keep working.
# NOTE(security): only unpickle this file from a trusted source — pickle.load can
# execute arbitrary code.
with open('clustering_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

____

### Using the classifier in combination with semantic similarity 

In [49]:
# Path of the contract to analyse. Set the CONTRACT_PATH environment variable to use
# a different file; the fallback is the absolute path originally hard-coded here,
# which only resolves on the machine it was written on.
doc_to_use = os.environ.get(
    "CONTRACT_PATH",
    '/Users/georgia/Documents/coding/climate_risk_id/tclp/provocotype-1/assets/sample-contracts/contract4.txt',
)
with open(doc_to_use, "r", encoding="utf-8") as f:
    query_text = f.read()

In [50]:
# Ask the project helper for the clauses associated with the contract, passing the
# classifier, the contract text, the encoder objects, the tag frame and the UMAP model.
# Presumably the contract is embedded and assigned to a cluster — TODO confirm in utils.
subset_docs, subset_names, cluster_subset_df = utils.perform_cluster(clf, query_text, tokenizer, model, clause_tags, umap_model, embed = True)

In [51]:
# Display the names returned by perform_cluster.
subset_names

['A Beginner’s Guide and Checklist for Accessing Sustainability-Linked Loans (SLLs)',
 'Auditing Water Usage in Supply Chains',
 'Climate Aligned Construction Waste Management',
 'CLLS Certificate of Title: Climate Change Disclosures',
 'ESG Aligned Company Articles',
 'Green Loan “Starter Pack”',
 'Greener and More Efficient HGVs in Road and Multimodal Transport Agreements',
 'Lease: Renewable Energy Obligations',
 'Management Equity Ratchet Terms',
 'Repair or Refurbishment in Insurance Claims',
 'Subcontractor/ Supplier Self-Assessment Climate Questionnaire (Construction)',
 'Supply: Renewable Energy Requirements',
 'Sustainability Enterprise Delivery Measures within Construction Works Task Orders',
 'Sustainability KPIs in Construction Works Task Orders']

### Using keywords with semantic similarity

In [52]:
# Keyword-based search of the full corpus (docs / names) against the contract text.
bow_search_args = {
    "target_doc": query_text,
    "documents": docs,
    "file_names": names,
    "similarity_threshold": 0.1,
    "k": 20,
}
bow_results = utils.find_top_similar_bow(**bow_search_args)

In [53]:
# Pull the matched documents and their names out of the keyword-search result.
top_docs = bow_results["Documents"]
top_names = bow_results["Top_Matches"]

In [54]:
# Re-rank the keyword matches against the contract with the embedding helper (k=5).
bow_embedding_matches = utils.get_embedding_matches_subset(
    query_text, top_docs, top_names, tokenizer, model, k=5
)
top_names_bow, top_scores_bow, top_texts_bow = bow_embedding_matches

Next, both sets of results (cluster-based and keyword-based) are passed to an LLM, which makes the final selection.

In [55]:
def _candidate_frame(texts, names, method, n_labels):
    """Return a text / source_name / matched_by frame with `method` repeated `n_labels` times."""
    return pd.DataFrame({
        "text": texts,
        "source_name": names,
        "matched_by": [method] * n_labels,
    })


# Cluster-based candidates, one row per clause.
df_cluster = _candidate_frame(subset_docs, subset_names, "cluster", len(subset_names))

# Keyword-based candidates, capped at the first five rows.
df_bow = _candidate_frame(top_texts_bow, top_names_bow, "bow", len(top_texts_bow)).head(5)

# Stack both sets with a fresh 0..n-1 index; overlapping clauses are not deduplicated.
combined_df = pd.concat([df_cluster, df_bow], ignore_index=True)

In [56]:
# Display the combined candidate frame (cluster rows first, then keyword rows).
combined_df

Unnamed: 0,text,source_name,matched_by
0,Title: A Beginner’s Guide and Checklist for Ac...,A Beginner’s Guide and Checklist for Accessing...,cluster
1,Title: Auditing Water Usage in Supply Chains\n...,Auditing Water Usage in Supply Chains,cluster
2,Title: Climate Aligned Construction Waste Mana...,Climate Aligned Construction Waste Management,cluster
3,Title: CLLS Certificate of Title: Climate Chan...,CLLS Certificate of Title: Climate Change Disc...,cluster
4,Title: ESG Aligned Company Articles\n\nText: A...,ESG Aligned Company Articles,cluster
5,Title: Green Loan “Starter Pack”\n\nText: 1. t...,Green Loan “Starter Pack”,cluster
6,Title: Greener and More Efficient HGVs in Road...,Greener and More Efficient HGVs in Road and Mu...,cluster
7,Title: Lease: Renewable Energy Obligations\n\n...,Lease: Renewable Energy Obligations,cluster
8,Title: Management Equity Ratchet Terms\n\nText...,Management Equity Ratchet Terms,cluster
9,Title: Repair or Refurbishment in Insurance Cl...,Repair or Refurbishment in Insurance Claims,cluster


In [57]:
# Save the combined candidates to the current working directory, without the index.
combined_df.to_excel("combined_clauses.xlsx", index=False)

## Using an LLM 

In [58]:
# Only the first 1000 characters of the contract are sent to the LLM.
# NOTE(review): the cut-off is a hard-coded magic number; anything later in the
# contract is invisible to the model.
query_text_short = query_text[:1000]

In [59]:
# System prompt: fixes the assistant's role and restricts it to the supplied clauses.
system_prompt = "You are a legal AI assistant that helps review and select climate-aligned clauses for the uploaded document. You can only select from those clauses provided to you. We are trying to help the writers of the document integrate climate-aligned language."

# First user turn: the truncated contract plus a request to acknowledge it.
contract_prompt = f"Here's the contract:\n\n{query_text_short.strip()}\n\nI will send you some clauses next. For now, just confirm you have read the contract and are ready to receive the clauses. A short summary of the content of the contract would be fine."

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": contract_prompt},
]

## Important note: the API key used in the next cell is only valid for a limited period of time

In [60]:
# OpenRouter exposes an OpenAI-compatible endpoint, so the OpenAI client is pointed at it.
# `OpenAI` is imported in the top import cell.
# Fail fast when the key is missing instead of handing api_key=None to the client,
# which would otherwise surface later as a less specific error.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; export it before creating the OpenRouter client."
    )

client = OpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
)

In [61]:
# First LLM turn: send the system prompt and the contract, capped at 1000 output tokens.
first_turn_options = {
    "model": "deepseek/deepseek-r1-0528-qwen3-8b:free",
    "temperature": 0.1,
    "max_tokens": 1000,
}
response = client.chat.completions.create(messages=messages, **first_turn_options)


In [62]:
# Show the text of the first choice in the model's reply.
print(response.choices[0].message.content)

Yes, I have reviewed the contract excerpt provided. Here's a brief summary:

The contract is a **Software-as-a-Service (SaaS) Agreement** between **CloudFusion Ltd** (Provider) and **BrightWave Consulting Ltd** (Customer). It outlines the subscription terms for a cloud-based project management platform and related services, with the agreement dated **15 September 2025**. The parties are based in the **United Kingdom**.

I am ready to receive the list of clauses and begin the selection process for climate-aligned language.


In [63]:
# Keep the first reply so it can be replayed as conversation history.
assistant_reply_1 = response.choices[0].message.content

In [64]:
# Add the assistant's acknowledgement to the conversation history.
# NOTE(review): this mutates `messages` in place, so re-running the cell appends the
# reply again; rebuild `messages` first when re-running.
messages.append({"role": "assistant", "content": assistant_reply_1})

In [65]:
# Render every candidate clause as a numbered section. Numbering comes from the
# iteration position (1-based), so it no longer depends on combined_df's index labels.
clause_sections = [
    (
        f"Clause {position}\n"
        f"Name: {row['source_name']}\n"
        f"Method: {row['matched_by']}\n"
        f"Full Text:\n{row['text']}\n\n"
    )
    for position, (_, row) in enumerate(combined_df.iterrows(), start=1)
]

# Selection instructions sent after the clauses. The rule list is numbered 1-8
# consecutively (it previously jumped from 1 to 3).
selection_rules = '''Select the 3 clauses from the list that best align with the contract. Follow these rules:

1. Your response must be a JSON of exactly three objects, each with the keys "Clause Name" and "Reasoning".
2. Only select from the clauses provided — do not invent new ones.
3. Remember the contract’s **content and purpose**. Their goal is likely not to avoid climate-related risks, but to meet other business or legal needs. We are telling them where they can inject climate-aligned language into the existing contract but the existing contract and its goals are the most important consideration.
4. Pay close attention to what the contract is **doing** — the transaction type, structure, and key obligations — not just who the parties are or what sector they operate in.
   - Clauses must fit the **actual function and scope** of the contract.
   - For example, do not recommend a clause about land access if the contract is about software licensing.
   - Another example: do not recommend a clause about insurance if the contract is establishing a joint venture.
5. Consider the relationship between the parties (e.g. supplier–customer, insurer–insured, JV partners).
   - If a clause assumes a different relationship, only suggest it if it can **realistically be adapted**, and explain how.
6. You may include a clause that is not a perfect match if:
   - It serves a similar **legal or operational function**, and
   - You clearly explain how it could be adapted to the contract context.
7. Do not recommend clauses that clearly mismatch the contract’s type, scope, or parties.
8. Avoid redundancy. If the contract already addresses a topic (e.g. dispute resolution), only suggest a clause on that topic if it adds clear value.

Focus on legal function, contextual fit, and the actual mechanics of the contract. You are recommending **starting points** — plausible clauses the user could adapt.'''

# Assemble header + clause sections + rules in one join rather than repeated `+=`.
clause_block = "Here are the clauses:\n\n" + "".join(clause_sections) + selection_rules

In [66]:
# Add the clause list and selection rules as the second user turn.
# NOTE(review): mutates `messages` in place; re-running the cell appends the block again.
messages.append({"role": "user", "content": clause_block})

In [67]:
# Second LLM turn: the full conversation so far, with no explicit max_tokens.
# This rebinds `response`, replacing the first-turn reply object.
second_turn_options = {
    "model": "deepseek/deepseek-r1-0528-qwen3-8b:free",
    "temperature": 0.1,
}
response = client.chat.completions.create(messages=messages, **second_turn_options)

In [68]:
# Print the whole response object (metadata included) for inspection.
print(response)

ChatCompletion(id='gen-1756980620-UtNryw0FocLinSvxB96i', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='[\n  {\n    "Clause Name": "Green Termination Provision (Short Form)",\n    "Reasoning": "This clause allows termination if the supplier fails to meet sustainability objectives, which can be adapted into a SaaS agreement to align with climate goals. SaaS contracts often include termination clauses, and adding a sustainability trigger ensures the provider meets environmental standards, supporting the customer\'s ability to pivot to greener options if needed, without fundamentally changing the contract\'s structure."\n  },\n  {\n    "Clause Name": "Supply: Renewable Energy Requirements",\n    "Reasoning": "This clause requires suppliers to use renewable energy, which can be directly adapted to a SaaS agreement by applying it to the provider\'s energy use for data centers and services. SaaS contracts focus on service delivery and ope

In [69]:
# Print only the text of the first choice, i.e. the model's clause selection.
print(response.choices[0].message.content)

[
  {
    "Clause Name": "Green Termination Provision (Short Form)",
    "Reasoning": "This clause allows termination if the supplier fails to meet sustainability objectives, which can be adapted into a SaaS agreement to align with climate goals. SaaS contracts often include termination clauses, and adding a sustainability trigger ensures the provider meets environmental standards, supporting the customer's ability to pivot to greener options if needed, without fundamentally changing the contract's structure."
  },
  {
    "Clause Name": "Supply: Renewable Energy Requirements",
    "Reasoning": "This clause requires suppliers to use renewable energy, which can be directly adapted to a SaaS agreement by applying it to the provider's energy use for data centers and services. SaaS contracts focus on service delivery and operational aspects, and this clause fits by ensuring the provider's operations are climate-aligned, such as by mandating renewable energy sources for hosting and maintena

________

Practice extracting clause names from the response

In [70]:
# Raw text of the model's clause selection, kept for parsing below.
response_text = response.choices[0].message.content


In [71]:
# Display the raw string, escapes included, to see exactly what will be parsed.
response_text

'[\n  {\n    "Clause Name": "Green Termination Provision (Short Form)",\n    "Reasoning": "This clause allows termination if the supplier fails to meet sustainability objectives, which can be adapted into a SaaS agreement to align with climate goals. SaaS contracts often include termination clauses, and adding a sustainability trigger ensures the provider meets environmental standards, supporting the customer\'s ability to pivot to greener options if needed, without fundamentally changing the contract\'s structure."\n  },\n  {\n    "Clause Name": "Supply: Renewable Energy Requirements",\n    "Reasoning": "This clause requires suppliers to use renewable energy, which can be directly adapted to a SaaS agreement by applying it to the provider\'s energy use for data centers and services. SaaS contracts focus on service delivery and operational aspects, and this clause fits by ensuring the provider\'s operations are climate-aligned, such as by mandating renewable energy sources for hosting 

In [72]:
# Parse the reply text with the project helper.
# The result is used below as a DataFrame with a 'Clause Name' column.
response_df = utils.parse_response(response_text)

In [82]:
def get_risk_label(response_df, risk_df):
    """Attach the risk labels recorded in ``risk_df`` to each clause in ``response_df``.

    For every distinct value in ``response_df['Clause Name']``, the rows of
    ``risk_df`` whose ``name`` equals it exactly are looked up. Their non-null,
    distinct ``risk_label`` values are joined with ``"; "`` and written to the
    ``risk_label`` column of the matching ``response_df`` rows. One status line is
    printed per distinct clause name.

    Parameters
    ----------
    response_df : pandas.DataFrame
        Must contain a ``Clause Name`` column. It is modified in place.
    risk_df : pandas.DataFrame
        Must contain ``name`` and ``risk_label`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``response_df`` object. It always carries a ``risk_label`` column;
        clauses without a match keep a missing value there.
    """
    # Create the column up front so callers can rely on it even when nothing matches.
    # Object dtype lets string labels be assigned without a dtype change.
    if 'risk_label' not in response_df.columns:
        response_df['risk_label'] = pd.Series(
            float('nan'), index=response_df.index, dtype=object
        )

    # Visit each clause name once; duplicates would repeat the same lookup and message.
    for name in response_df['Clause Name'].drop_duplicates():
        matches = risk_df[risk_df['name'] == name]
        if matches.empty:
            print(f"Clause '{name}' not found in risk categories.")
            continue

        # Several risk rows may share a name: keep distinct non-null labels in order.
        risk_labels = matches['risk_label'].dropna().unique()
        # str() keeps the join from failing when a label is not already a string.
        risk_label_str = "; ".join(str(label) for label in risk_labels)
        response_df.loc[response_df['Clause Name'] == name, 'risk_label'] = risk_label_str
        print(f"Clause '{name}' found in risk categories with label(s): {risk_label_str}.")
    return response_df

In [83]:
# Annotate the selected clauses with their risk labels (the frame is also updated in place).
response_df = get_risk_label(response_df, risk_df)

Clause 'Green Termination Provision (Short Form)' found in risk categories with label(s): Operational Emissions; Supply Chain Emissions; Product Lifecycle Emissions.
Clause 'Supply: Renewable Energy Requirements' found in risk categories with label(s): Fossil Fuel Combustion; Electricity Generation; Energy Supply Chain.
Clause 'Subcontractor/ Supplier Self-Assessment Climate Questionnaire (Construction)' found in risk categories with label(s): Construction Waste; Energy Consumption; Transportation Emissions; Material Sourcing.


In [84]:
# Display the final selection: clause name, reasoning and risk label(s).
response_df

Unnamed: 0,Clause Name,Reasoning,risk_label
0,Green Termination Provision (Short Form),This clause allows termination if the supplier...,Operational Emissions; Supply Chain Emissions;...
1,Supply: Renewable Energy Requirements,This clause requires suppliers to use renewabl...,Fossil Fuel Combustion; Electricity Generation...
2,Subcontractor/ Supplier Self-Assessment Climat...,This clause involves assessing climate perform...,Construction Waste; Energy Consumption; Transp...
