## Clause Detection Using Pre-Trained Model

In [1]:
from tclp.clause_detector import detector_utils as du

In [2]:
# Path to the saved clause-identifier model, relative to this notebook.
model_name = "../clause_identifier_model.pkl"
# NOTE(review): the .pkl extension suggests this file is unpickled — only load it from a trusted source.
model = du.load_model(model_name)

In [3]:
# Root of the data directory, relative to this notebook. Every path below is
# derived from it so the locations cannot drift apart.
DATA_DIR = "../../data"

# Directory holding the test contracts.
contract_dir = f"{DATA_DIR}/test_contracts"
# Alias for the same directory; later cells refer to it as `contract_folder`.
contract_folder = contract_dir

# NOTE: If loading this file throws an error, it is probably because you built your own
# test set with training_model.ipynb. Because of randomness, different contracts may be
# selected for training and testing. To fix it, look in the data/test_contracts directory
# and use the name of any single contract found there instead.
single_contract = f"{DATA_DIR}/spend_network/fresh_contracts_identified_txt/241209_con_7063_desnz_erm_nzm_modelling_contract_award_form_and_schedules_v1_redacted.txt"

In [4]:
# Pass either the contract directory (contract_folder / contract_dir) or single_contract
# as the argument to this function.
processed_contracts = du.load_unlabelled_contract(contract_folder)

In [5]:
# Inspect the loaded contracts: each row pairs a contract_ids value with one piece of its text.
processed_contracts

Unnamed: 0,contract_ids,text
0,000038611.txt,Exhibit 10.14 SEPARATION AGREEMENT between Ray...
1,000038611.txt,"1.2 This termination shall extend to all, if a..."
2,000038611.txt,2. RETIREMENT FROM ALL FORMS OF OFFICE 2.1 Mr ...
3,000038611.txt,2.2 Mr Burel likewise shall retire from his of...
4,000038611.txt,2.3 Mr Burel shall resign from his office in s...
...,...,...
376604,000008913.txt,38 PROTECTION OF THE TRUSTEE Any sale by the T...
376605,000008913.txt,39 APPLICATION FOR LISTING OR ADMISSION TO TRA...
376606,000008913.txt,"shall, at its expense, make application for, a..."
376607,000008913.txt,40 RELATIONSHIP OF PLAN TO CONTRACT OF EMPLOYM...


In [6]:
# Run the model loaded from clause_identifier_model.pkl over every text row.
# NOTE: `model` is rebound to a transformers model in the "Using the BERT model" section,
# so this cell only works before that section runs (or after reloading the .pkl model).
results = model.predict(processed_contracts["text"])

In [7]:
# Combine the predictions with the contract text; result_df is displayed in a later cell.
# NOTE(review): result_df_true is not used in any visible cell — confirm whether it is needed.
result_df, result_df_true = du.create_result_df(results, processed_contracts)

In [8]:
# NOTE: highlight_climate_content is designed to work on a single contract.
# It also works with a contract directory, but all contracts end up in one very long file.
highlighted_output = du.highlight_climate_content(result_df)

In [9]:
# Save the highlighted output to a file; commented out so it doesn't save every time this notebook runs.
#du.save_file("highlighted_output_1.html", highlighted_output)

## Using the Detector

In [10]:
# Inspect the per-row results: prediction, sentence text and the climate-keyword flag.
result_df

Unnamed: 0,prediction,sentence,contains_climate_keyword
0,0,Exhibit 10.14 SEPARATION AGREEMENT between Ray...,False
1,0,"1.2 This termination shall extend to all, if a...",False
2,0,2. RETIREMENT FROM ALL FORMS OF OFFICE 2.1 Mr ...,False
3,0,2.2 Mr Burel likewise shall retire from his of...,False
4,0,2.3 Mr Burel shall resign from his office in s...,False
...,...,...,...
376604,0,38 PROTECTION OF THE TRUSTEE Any sale by the T...,False
376605,0,39 APPLICATION FOR LISTING OR ADMISSION TO TRA...,False
376606,0,"shall, at its expense, make application for, a...",False
376607,0,40 RELATIONSHIP OF PLAN TO CONTRACT OF EMPLOYM...,False


In [11]:
# Build contract_df_pkl from the .pkl model's results; it feeds the bucketing cells below.
# labelled=False: presumably because these contracts have no ground-truth labels — TODO confirm.
contract_df_pkl = du.create_contract_df(
    result_df, processed_contracts, labelled=False
)

In [12]:
# Split contract_df_pkl into the four buckets consumed by print_percentages and print_single below.
likely, very_likely, extremely_likely, none = du.create_threshold_buckets(contract_df_pkl)

In [13]:
# Print the percentage that falls into each bucket.
# NOTE(review): the printed labels (Not Likely / Could Contain / Likely / Very Likely) do not
# match the variable names (none / likely / very_likely / extremely_likely) — confirm the mapping.
du.print_percentages(likely, very_likely, extremely_likely, none, contract_df_pkl)

Not Likely:  45.13 %
Could Contain:  14.48 %
Likely:  26.7 %
Very Likely:  13.7 %


I split the data so that 60% of the contracts include a clause and 40% do not, so these percentages (roughly 55% flagged at some level versus 45% "Not Likely") look good!

### Testing for a Single File

In [14]:
# NOTE: print_single is designed for a single file; it does not work as intended when the
# input is a whole contract database.

# To see this functionality, rerun the notebook using the single-file path.
du.print_single(likely, very_likely, extremely_likely, none)

very likely


_____

## Using the BERT model

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [16]:
# Load the sequence-classification checkpoint and its tokenizer from the same directory.
# NOTE: this rebinds `model`, replacing the model loaded earlier from
# clause_identifier_model.pkl; reload that model before rerunning the earlier cells.
model_path = "../../CC_BERT/CC_model_detect"
device = torch.device("cpu")  # inference is pinned to the CPU

# Module.to() returns the module itself, so the device move is chained onto the load.
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Switch to evaluation mode; as the cell's last expression this also displays the module repr.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50500, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [None]:
# Hand the text column to predict_climatebert as a plain Python list.
texts = processed_contracts["text"].tolist()
# NOTE: this rebinds `results`, overwriting the predictions made with the .pkl model.
# NOTE(review): `probs` is not used in any visible cell — confirm whether it is needed.
results, probs = du.predict_climatebert(texts, tokenizer, device, model)

In [None]:
# Rebuild the result frames from the BERT predictions.
# NOTE: this overwrites the result_df / result_df_true produced for the .pkl model.
result_df, result_df_true = du.create_result_df(results, processed_contracts)

In [None]:
# BERT counterpart of contract_df_pkl, built from the BERT-based result_df.
# labelled=False: presumably because these contracts have no ground-truth labels — TODO confirm.
contract_df = du.create_contract_df(
    result_df, processed_contracts, labelled=False
)

In [None]:
# Bucket the BERT results.
# NOTE: this overwrites the four bucket variables computed for the .pkl model.
likely, very_likely, extremely_likely, none = du.create_threshold_buckets(contract_df)

In [None]:
# Print the single-file verdict for the BERT buckets (see the earlier note: print_single is
# designed for a single file, not a contract directory).
du.print_single(likely, very_likely, extremely_likely, none)

likely


In [None]:
# Print the percentage that falls into each bucket for the BERT results.
du.print_percentages(likely, very_likely, extremely_likely, none, contract_df)

In [None]:
# Highlight climate content using the BERT-based result_df.
# NOTE: this overwrites the highlighted_output produced for the .pkl model.
highlighted_output = du.highlight_climate_content(result_df)

In [None]:
# Save the highlighted output to a file; commented out so it doesn't save every time this notebook runs.
#du.save_file("highlighted_output_2.html", highlighted_output)