## Clause Detection Using Pre-Trained Model

In [1]:
import os
import sys

# Make the parent directory importable so the project-local `utils` module resolves,
# and configure the tokenizers environment variable before any tokenizer is created.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from IPython.display import HTML, display
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import utils as du


In [2]:
# Location of the fine-tuned clause-detection checkpoint, relative to this notebook.
model_path = "../models/CC_BERT/CC_model_detect"
device = torch.device("cpu")

# Load the tokenizer and the classifier from the same checkpoint, placing the model on `device`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

# Inference only: switch to evaluation mode. As the cell's last expression,
# this also displays the model architecture summary.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50500, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [3]:
# NOTE: If this throws an error, it may be because you built your own test set with training_model.ipynb.
# Because of randomness, different contracts may be selected for training and testing.
# To fix this, look in the data/test_contracts directory and use the name of a single contract from there.
#
# The default below is an absolute path on the original author's machine. Rather than editing this cell,
# set the CLAUSE_DETECT_SINGLE_CONTRACT environment variable to point at your own contract file.
single_contract = os.environ.get(
    "CLAUSE_DETECT_SINGLE_CONTRACT",
    "/Users/georgia/Documents/coding/climate_risk_id/tclp/provocotype-1/assets/sample-contracts/contract4.txt",
)
# Directory holding the held-out test contracts (relative to this notebook).
contract_folder = "../data/test_contracts"

In [4]:
# You can pass either `contract_folder` (a directory of contracts) or `single_contract` (one file) to this function.
processed_contracts = du.load_unlabelled_contract(single_contract)

In [5]:
# Preview the loaded data; the output below shows an `index` column (source file name) and a `text` column.
processed_contracts

Unnamed: 0,index,text
0,contract4.txt,﻿Software-as-a-Service(SaaS)Agreement Provider...
1,contract4.txt,Customer:BrightWave Consulting Ltd(No. 0659871...
2,contract4.txt,.
3,contract4.txt,The Provider operates a cloud-based project ma...
4,contract4.txt,2. The Customer wishes to subscribe to the Ser...
...,...,...
93,contract4.txt,6. Delete or return Personal Data on terminati...
94,contract4.txt,7. Allow reasonable audits of compliance(subje...
95,contract4.txt,Schedule 4 — Support Services and SLA 9:00–17:...
96,contract4.txt,Premium Support adds 24×7 incident response fo...


In [6]:
# Pull the `text` column out as a plain Python list to feed to the model.
texts = processed_contracts["text"].tolist()

In [7]:
# Run the classifier over every text segment.
# NOTE(review): `results` presumably holds per-segment predictions and `probs` the matching
# probabilities — confirm against utils.predict_climatebert.
results, probs = du.predict_climatebert(texts, tokenizer, device, model)

In [8]:
# Combine the predictions with the contract data into result DataFrames.
# NOTE(review): `result_df_true` is not used by any cell shown in this notebook; see
# utils.create_result_df for what it contains.
result_df, result_df_true = du.create_result_df(results, processed_contracts)

In [9]:
# Inspect a window of the per-sentence predictions: positional rows 60 up to (not including) 80.
result_df.iloc[60:80]

Unnamed: 0,prediction,sentence,contains_climate_keyword
60,0,2. Assignment.,False
61,0,The Customer may not assign without Provider c...,False
62,0,3. Notices.,False
63,0,"As set out in Schedule 1, Section 6. 4. Entire...",False
64,0,This Agreement and Schedules constitute the en...,False
65,0,5. Governing Law.,False
66,0,England and Wales; courts of England have excl...,False
67,0,21. Sustainability(Informational and Non‑Bindi...,True
68,0,This clause is informational only and does not...,False
69,1,Note:Inspired by climate‑contracting practice ...,True


In [10]:
# NOTE: this function is designed for a single contract; if `result_df` holds several contracts,
# they will all end up in one combined output rather than one output per contract.
highlighted_output = du.highlight_climate_content(result_df)

In [11]:
# Display the highlighted HTML output in the notebook.
# (`display` and `HTML` are imported from IPython.display in the first cell with the other imports.)
display(HTML(highlighted_output))


In [12]:
# Save the highlighted output to "highlighted_output.html".
# NOTE: this call is currently active, so du.save_file is invoked every time the notebook runs;
# comment the line out if you do not want to save on every run.
du.save_file("highlighted_output.html", highlighted_output)

## Using the Detector

In [13]:
# Build the contract-level frame from the per-sentence results; these contracts are unlabelled.
contract_df = du.create_contract_df(result_df, processed_contracts, labelled=False)

In [14]:
# Split the contracts into four threshold buckets.
# NOTE(review): judging by the labels printed by the following cells, cat0–cat3 appear to map to
# Unlikely / Possible / Likely / Very Likely — confirm in utils.create_threshold_buckets.
cat0, cat1, cat2, cat3 = du.create_threshold_buckets(contract_df)

In [15]:
# Print the percentage of contracts that fall into each bucket.
du.print_percentages(cat0, cat1, cat2, cat3, contract_df)

Unlikely: 0.0%
Possible: 100.0%
Likely: 0.0%
Very Likely: 0.0%


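I split the test data so that 60% of the contracts include a clause and 40% do not, so when the whole test folder is processed the percentages above should roughly reflect that split. Note that when only a single contract is processed (as in the run shown here), one bucket will simply show 100%.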

### Testing for a Single File

In [16]:
# NOTE: this only works for a single file; it is not designed for a folder/database of contracts.

# To see this functionality, run the notebook with the single-file path (`single_contract`).
du.print_single(cat0, cat1, cat2, cat3)

possible
