## Clause Detection Using Pre-Trained Model

In [40]:
import sys
import os
# Put the parent of the current working directory at the front of sys.path so
# it is searched first by the `import utils` below (presumably where the
# project's utils module lives — confirm).
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
# Set before transformers is imported.
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism
# warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import utils as du  # project-local helpers, referenced as `du` in later cells
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


In [41]:
# Saved checkpoint directory; the tokenizer and the classifier are both read from it.
model_path = "../models/CC_BERT/CC_model_detect"
# Inference in this notebook runs on the CPU.
device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Load the sequence classifier and place it on the chosen device in one step.
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module summary.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50500, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [42]:
# Directory of test contracts; `contract_folder` is a second name for the same path.
contract_dir = "../data/test_contracts"
contract_folder = contract_dir

# NOTE: If loading `single_contract` throws an error, it is probably because you
# built your own test set by running training_model.ipynb. Because the split is
# random, different contracts may be selected for training and testing.
# To fix this, look in the data/test_contracts directory and use the name of any
# single contract found there.
single_contract = "../data/spend_network/fresh_contracts_identified_txt/241209_con_7063_desnz_erm_nzm_modelling_contract_award_form_and_schedules_v1_redacted.txt"

In [43]:
# load_unlabelled_contract accepts either a directory of contracts (contract_dir)
# or the path to a single contract file (single_contract) as its argument.
processed_contracts = du.load_unlabelled_contract(single_contract)

In [44]:
# Display the loaded contract data (long frames are truncated in the rendered output).
processed_contracts

Unnamed: 0,contract_ids,text
0,241209_con_7063_desnz_erm_nzm_modelling_contra...,Docusign Envelope ID:89C377CB-B239-4094-B128-F...
1,241209_con_7063_desnz_erm_nzm_modelling_contra...,It summarises the main features of the procure...
2,241209_con_7063_desnz_erm_nzm_modelling_contra...,Department for Energy Security and Net Zero(th...
3,241209_con_7063_desnz_erm_nzm_modelling_contra...,This Contract between the Buyer and the Suppli...
4,241209_con_7063_desnz_erm_nzm_modelling_contra...,CON_7063/Framework Agreement CON_6347 Any mate...
...,...,...
743,241209_con_7063_desnz_erm_nzm_modelling_contra...,3.2. When the licence granted under Paragraph ...
744,241209_con_7063_desnz_erm_nzm_modelling_contra...,5.2.1. the New IPR Items are suitable for rele...
745,241209_con_7063_desnz_erm_nzm_modelling_contra...,5.4. The Supplier must supply any or all New I...
746,241209_con_7063_desnz_erm_nzm_modelling_contra...,5.5. The Supplier may within fifteen(15)Workin...


In [45]:
# Pull the "text" column out as a plain Python list.
texts = processed_contracts["text"].to_list()

In [46]:
# Run the loaded classifier over every text segment.
# NOTE(review): `probs` is not used by any later cell visible in this notebook.
results, probs = du.predict_climatebert(texts, tokenizer, device, model)

In [47]:
# Combine the predictions with the loaded contract data.
# NOTE(review): `result_df_true` is not used by any later cell visible in this
# notebook; its exact contents are defined in utils — confirm there.
result_df, result_df_true = du.create_result_df(results, processed_contracts)

In [48]:
# NOTE: highlight_climate_content is designed to work on a single contract; if
# the input covers multiple contracts it will still create one single file.
highlighted_output = du.highlight_climate_content(result_df)

In [49]:
# Save the highlighted output to a file; left commented out so the file is not rewritten every time this notebook runs
#du.save_file("highlighted_output.html", highlighted_output)

## Using the Detector

In [50]:
# Build the frame that the threshold-bucketing step below consumes.
# labelled=False is passed because these contracts were loaded with
# load_unlabelled_contract (presumably no ground-truth labels exist — confirm in utils).
contract_df = du.create_contract_df(
    result_df, processed_contracts, labelled=False
)

In [51]:
# Split contract_df into four buckets.
# NOTE(review): the name `none` differs from the built-in `None` only by case and
# is easy to misread; later cells reference it, so any rename must be applied to
# every cell at once.
likely, very_likely, extremely_likely, none = du.create_threshold_buckets(contract_df)

In [54]:
# Inspect the rows assigned to the `likely` bucket.
likely

Unnamed: 0,index,prediction,keyword_pass,bucket


In [None]:
# Print the percentage for each bucket; contract_df is passed alongside the four
# buckets (presumably as the denominator — confirm in utils).
du.print_percentages(likely, very_likely, extremely_likely, none, contract_df)

Not Likely:  0.0 %
Could Contain:  0.0 %
Likely:  0.03 %
Very Likely:  0.0 %


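I split the data so that 60% of the contracts include a clause and 40% do not, so this looks good! Note that this comparison is only meaningful when the detector is run on the whole test set (`contract_dir`) rather than on a single contract.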

### Testing for a Single File

In [53]:
# NOTE: print_single is designed for a single file; it does not work when you
# are processing a whole contract database.

# To see this functionality, run the notebook using the single-file path (single_contract).
du.print_single(likely, very_likely, extremely_likely, none)

likely
