The idea behind this notebook is that rather than using keywords, I will slot the clauses into buckets based on the 14 TCFD categories. Then, when I upload a contract, I can first slot it into a TCFD bucket and match clauses based on that bucket, rather than all of the clauses. 

In [None]:

import numpy as np
from transformers import AutoTokenizer, AutoModel
import pandas as pd

from tclp.clause_recommender import utils

In [3]:
doc_to_use = '../../../tclp/data/synth_data/untouched/000000009.txt'
model_path = "../../CC_BERT/CC_model"
embeddings_dir = "../../CC_BERT/CC_embeddings"
legal_model_path = "../../legalbert/lebalbert_model"
legal_embeddings_dir = "../../legalbert/lebalbert_embeddings"
clause_folder = "../../data/cleaned_content"
clause_html = '../../data/clause_boxes'

with open(doc_to_use, "r", encoding="utf-8") as f:
    query_text = f.read()


In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at ../../CC_BERT/CC_model and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
documents, file_names = utils.load_clauses(clause_folder)


In [6]:
clause_boxes, clause_box_filenames = utils.load_clauses(clause_html)

In [7]:
clause_box_df = utils.parse_clause_boxes_to_df(clause_boxes)

In [8]:
final_df = utils.attach_documents(clause_box_df, documents, file_names)

In [9]:
names, docs = utils.rebuild_documents(final_df)

In [19]:
clause_embeddings = np.vstack([
    utils.encode_text(doc, tokenizer, model, method="cls") for doc in docs
])

____

# Matching to the TCFD categories

In [10]:
tcfd = pd.read_csv('TCFD_14.csv')

In [12]:
tcfd

Unnamed: 0,Disclosure Item,Short Description
0,Describe the board’s oversight of climate-rela...,Governance
1,Describe management’s role in assessing and ma...,Governance
2,Describe the climate-related risks and opportu...,Risk Identification
3,Describe the impact of climate-related risks a...,Strategic Risk Impact
4,Describe the resilience of the organization’s ...,Scenario Analysis
5,Describe the organization’s processes for iden...,Risk Assessment
6,Describe the organization’s processes for mana...,Risk Management
7,"Describe how processes for identifying, assess...",Integrated Risk Governance
8,Disclose the metrics used by the organization ...,Metrics
9,"Disclose Scope 1, Scope 2, and, if appropriate...",Emissions (GHG)


In [16]:
title_to_text = dict(zip(names, docs))

In [20]:
#embed the tcfd clauses
tcfd_discs = tcfd['Disclosure Item'].tolist()
tcfd_embeddings = []
tcfd_matrix = np.vstack([
    utils.encode_text(disc, tokenizer, model, method="cls") for disc in tcfd_discs
])

## Finding Semantic Similarity

In [43]:
assigned_tcfd = []
similarity_scores = []

for clause_vec in clause_embeddings:
    _, score, idx, _, _ = utils.get_matching_clause(
        clause_vec.reshape(1, -1),
        tcfd_matrix,
        tcfd_discs
    )
    similarity_scores.append(score)

In [None]:
clause_tcfd_df = pd.DataFrame({
    "Clause Title": names,
    "Clause Text": docs,
    "Assigned TCFD Category": assigned_tcfd,
    "Similarity Score": similarity_scores
})


In [42]:
clause_tcfd_df['Assigned TCFD Category'].value_counts()

Assigned TCFD Category
Describe the climate-related risks and opportunities the organization has identified over the short, medium, and long term    69
Unassigned                                                                                                                    37
Describe management’s role in assessing and managing climate-related risks and opportunities                                   7
Disclose Scope 1, Scope 2, and, if appropriate, Scope 3 greenhouse gas (GHG) emissions                                         7
Describe climate-related opportunities                                                                                         1
Describe the resilience of the organization’s strategy, taking into consideration different climate-related scenarios          1
Name: count, dtype: int64