# Training Clause Detection Model

This folder is only for those interested in the training process or wanting to retrain the model. Those who just want to use the pre-trained model can find it at `../clause_identifier_model.pkl`.

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tclp.clause_detector import detector_utils as du
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score, accuracy_score
import numpy as np


In [2]:
import os

# Credentials must never be committed to the notebook. Provide the W&B API key
# through the environment (e.g. `export WANDB_API_KEY=...` before launching
# Jupyter); if it is absent, `wandb.login()` asks for it interactively.
# NOTE(review): a real key used to be hard-coded in this cell and is therefore
# in the file history — it should be revoked and regenerated.
if not os.environ.get("WANDB_API_KEY"):
    print("WANDB_API_KEY is not set; `wandb.login()` will prompt for it.")

In [3]:
import wandb

## Loading Synthetic Data into a Dataframe

In [4]:
# The three synthetic contract sets share one parent directory.
synth_data_root = "../../data/synth_data"
modified_data_folder = f"{synth_data_root}/modified_real"
untouched_data_folder = f"{synth_data_root}/untouched"
modified_gen_data_folder = f"{synth_data_root}/modified_gen"

In [5]:
# Load each synthetic source through the project helper. The two "modified"
# folders are read with modified=True; the untouched folder uses the default.
modified_real_bundle = du.load_labeled_contracts(modified_data_folder, modified=True)
texts, labels, contract_ids, contract_level_labels = modified_real_bundle

untouched_bundle = du.load_labeled_contracts(untouched_data_folder)
(
    texts_untouched,
    labels_untouched,
    contract_ids_untouched,
    contract_level_labels_untouched,
) = untouched_bundle

modified_gen_bundle = du.load_labeled_contracts(modified_gen_data_folder, modified=True)
(
    texts_gen,
    labels_gen,
    contract_ids_gen,
    contract_level_labels_gen,
) = modified_gen_bundle

In [6]:
# Combine texts, labels, contract ids and contract-level labels from the three
# sources (modified real, untouched, modified generated) under the base names.
# NOTE(review): every name is rebound from itself, so re-running this cell
# without re-running the loading cell would presumably add the untouched and
# generated data a second time — confirm the containers are plain lists.
texts = texts + texts_untouched + texts_gen
labels = labels + labels_untouched + labels_gen
contract_ids = contract_ids + contract_ids_untouched + contract_ids_gen
contract_level_labels = (
    contract_level_labels + contract_level_labels_untouched + contract_level_labels_gen
)

In [7]:
# Hand the combined texts, labels, ids and contract-level labels to the
# project helper that builds (and cleans) the base dataframe.
data = du.create_and_clean_base_df(texts, labels, contract_ids, contract_level_labels)

In [8]:
# Bare expression: renders `data` as this cell's output.
data

Unnamed: 0,contract_ids,text,label,contract_label,real_clause
0,000025126.txt,"EXECUTION COPY AGREEMENT DATED 6 AUGUST, 2010 ...",0,1,0
1,000025126.txt,"ARRANGED BY COMMERZBANK Aktiengesellschaft, po...",0,1,0
2,000025126.txt,as Mandated Lead Arrangers - AND Citibank Euro...,0,1,0
3,000025126.txt,0040772-0000059 BT:601674.7 - CONTENTS Clause ...,0,1,0
4,000025126.txt,with its registered seat at Vstupný areál U. S...,0,1,0
...,...,...,...,...,...
996863,000039516.txt,6. Other than in respect of the amendment refe...,0,1,0
996864,000039516.txt,7. This letter and any non-contractual obligat...,0,1,0
996865,000039516.txt,Please indicate your acceptance of the terms o...,0,1,0
996866,000039516.txt,Yours faithfully Lloyds TSB Bank plc ING Bank ...,0,1,0


## Load Real Clauses for Training Data

In [9]:
# Directory the real clause files are loaded from.
clause_folder = "../../data/clause_boxes"

In [10]:
# Load the real clauses; the helper returns four values that become the
# text / label / id / real_clause columns of the clause dataframe.
clause_texts, clause_labels, clause_ids, clause_reality = du.load_clauses(clause_folder)

In [11]:
# Give the clause rows the same columns as the contract dataframe so the two
# frames can be concatenated. Every clause row gets contract_label 1.
clause_columns = {
    "contract_ids": clause_ids,
    "text": clause_texts,
    "label": clause_labels,
    "contract_label": len(clause_ids) * [1],
    "real_clause": clause_reality,
}
clause_data = pd.DataFrame(clause_columns)

In [12]:
# Bare expression: renders `clause_data` as this cell's output.
clause_data

Unnamed: 0,contract_ids,text,label,contract_label,real_clause
0,Template_Board_Paper_for_Significant_Contracts...,"<div class=""clause-wrapper"">",1,1,1
1,Template_Board_Paper_for_Significant_Contracts...,"<p class=""childs-name"">Griff's Clause</p>",1,1,1
2,Template_Board_Paper_for_Significant_Contracts...,<h4>Template Board Paper for Significant Contr...,1,1,1
3,Template_Board_Paper_for_Significant_Contracts...,"<p class=""excerpt"">Template board papers with ...",1,1,1
4,Template_Board_Paper_for_Significant_Contracts...,"<p class=""meta-data"">",1,1,1
...,...,...,...,...,...
1703,Stakeholder_Company_Climate_Questionnaire.txt,"<span class=""cfc-leadin"">Updated: </span>",1,1,1
1704,Stakeholder_Company_Climate_Questionnaire.txt,"<span class=""cfc-taxonomy"">2024-09-10 10:35:26...",1,1,1
1705,Stakeholder_Company_Climate_Questionnaire.txt,</p>,1,1,1
1706,Stakeholder_Company_Climate_Questionnaire.txt,"<a href=""https://chancerylaneproject.org/claus...",1,1,1


## Creating Full Dataframe

Now that my two dataframes are in the same form, I can combine them to create one data set. 

From there, I can create my training, testing, and validation data.

In [13]:
# Stack contract rows and clause rows into one frame with a fresh 0..n-1 index.
frames_to_stack = [data, clause_data]
full_data = pd.concat(frames_to_stack, ignore_index=True)

In [14]:
# Bare expression: renders `full_data` as this cell's output.
full_data

Unnamed: 0,contract_ids,text,label,contract_label,real_clause
0,000025126.txt,"EXECUTION COPY AGREEMENT DATED 6 AUGUST, 2010 ...",0,1,0
1,000025126.txt,"ARRANGED BY COMMERZBANK Aktiengesellschaft, po...",0,1,0
2,000025126.txt,as Mandated Lead Arrangers - AND Citibank Euro...,0,1,0
3,000025126.txt,0040772-0000059 BT:601674.7 - CONTENTS Clause ...,0,1,0
4,000025126.txt,with its registered seat at Vstupný areál U. S...,0,1,0
...,...,...,...,...,...
998571,Stakeholder_Company_Climate_Questionnaire.txt,"<span class=""cfc-leadin"">Updated: </span>",1,1,1
998572,Stakeholder_Company_Climate_Questionnaire.txt,"<span class=""cfc-taxonomy"">2024-09-10 10:35:26...",1,1,1
998573,Stakeholder_Company_Climate_Questionnaire.txt,</p>,1,1,1
998574,Stakeholder_Company_Climate_Questionnaire.txt,"<a href=""https://chancerylaneproject.org/claus...",1,1,1


____

In [15]:
# Local checkpoint directory; the tokenizer and the model are both read from it.
model_path = "../../contract_climate_bert"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Binary classification head: clause / not-clause.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../../contract_climate_bert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Split the full frame with the project helper; "real_clause" is the column
# name handed to it. The helper returns the three frames and their indices.
split_result = du.custom_train_test_split(full_data, "real_clause")
train_data, val_data, test_data, train_indices, val_indices, test_indices = split_result

Train: 75.04%
Validation: 9.98%
Test: 14.97%


In [17]:
from datasets import Dataset

# Keep only the text and label columns and drop the pandas index before
# converting each split into a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[["text", "label"]].reset_index(drop=True))
    for frame in (train_data, val_data, test_data)
)

In [18]:
def tokenize(example):
    """Tokenize the ``text`` field with the notebook-level ``tokenizer``.

    Inputs are truncated to at most 512 tokens and padded to that fixed length.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["text"], **encoding_options)

In [19]:
# Tokenize each split in batches, then rename "label" to "labels".
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset = train_dataset.rename_column("label", "labels")

val_dataset = val_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.rename_column("label", "labels")

test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.rename_column("label", "labels")

# Expose only the model inputs and labels, as torch tensors.
torch_columns = ["input_ids", "attention_mask", "labels"]
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.set_format(type="torch", columns=torch_columns)


Map:   0%|          | 0/749363 [00:00<?, ? examples/s]

Map:   0%|          | 0/99697 [00:00<?, ? examples/s]

Map:   0%|          | 0/149514 [00:00<?, ? examples/s]

In [20]:
from transformers import TrainingArguments, Trainer

# Evaluate, checkpoint and log once per epoch.
schedule_options = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
)

# Optimisation hyper-parameters.
optimisation_options = dict(
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# After training, reload the checkpoint with the best "f1" value.
selection_options = dict(
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

training_args = TrainingArguments(
    output_dir="./results",
    **schedule_options,
    **optimisation_options,
    **selection_options,
)

def compute_metrics(eval_pred):
    """Return accuracy and F1 for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the argmax of its logits along axis 1.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted)
    return metrics

# Wire the model, training arguments, datasets and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # validation split drives the per-epoch evaluation
    compute_metrics=compute_metrics,  # supplies the "f1" used for best-model selection
)




In [4]:
# Authenticate with Weights & Biases (if not already logged in) and start a run.
wandb.login()
wandb.init()

# Look up a previously logged run through the public API and list the names
# of the artifacts it logged.
previous_run_path = "ger23-imperial-college-london/climatebert-classification/ulavv1k3"
api = wandb.Api()
run = api.run(previous_run_path)

for artifact in run.logged_artifacts():
    print(artifact.name)



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mger23[0m ([33mger23-imperial-college-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


climatebert-checkpoint-10000:v0


In [None]:
# Start fine-tuning with the configured Trainer (long-running: the progress
# bar below reports 281013 optimisation steps).
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Current

  0%|          | 0/281013 [00:00<?, ?it/s]

_____

## Train Test Split

I will keep each individual contract together in exactly one of the train, validation, or test splits, and ensure that anything with the positive 'real_clause' designation is in the training set.

In [None]:
# NOTE(review): this repeats the split already performed earlier in the
# notebook and rebinds the same names. If the helper's split is not seeded,
# the transformer and the TF-IDF model see different splits — TODO confirm in
# detector_utils.
split_result = du.custom_train_test_split(full_data, "real_clause")
train_data, val_data, test_data, train_indices, val_indices, test_indices = split_result

Train: 75.1%
Validation: 9.99%
Test: 14.91%


In [None]:
# Separate features and targets for each split with the project helper.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    du.X_y_split(split_frame) for split_frame in (train_data, val_data, test_data)
)

In [None]:
# Persist the held-out test contracts so later experiments can reuse them.
combined_dir = "../../data/synth_data/combined"
test_contracts_dir = "../../data/test_contracts"
du.save_test_data(test_data, combined_dir, test_contracts_dir)

Test contracts have been saved to: ../../data/test_contracts


## Training a Model

In [None]:
# TF-IDF over unigrams and bigrams (terms seen in fewer than 2 documents are
# dropped), followed by a class-balanced logistic regression.
# NOTE(review): this rebinds `model`, which earlier in the notebook names the
# transformer classifier.
tfidf_step = TfidfVectorizer(max_features=None, ngram_range=(1, 2), min_df=2)
logreg_step = LogisticRegression(
    max_iter=1000, class_weight="balanced", solver="liblinear", C=10
)
model = Pipeline([("tfidf", tfidf_step), ("logreg", logreg_step)])
model.fit(X_train, y_train)

In [None]:
# Save the fitted pipeline to disk. This is the same path the introduction
# gives for the pre-trained model, so running this cell replaces that file.
du.save_model(model, "../clause_identifier_model.pkl")

In [None]:
# Clause-level evaluation on the validation split; the helper's return value
# is kept as `y_pred` and reused to build the contract-level dataframe.
y_pred = du.evaluate_model_clause_level(model, X_val, y_val)

## Extrapolating Results to Contracts

In [None]:
# Build the contract-level dataframe from the validation inputs, the validation
# frame and the clause-level predictions (presumably aggregated per contract —
# confirm in detector_utils).
val_contract_df = du.create_contract_df(X_val, val_data, y_pred)

In [None]:
# Bare expression: renders `val_contract_df` as this cell's output.
val_contract_df

In [None]:
# Sweep candidate thresholds 1..7 on the validation contracts using the F1
# metric; the helper returns the scores and the thresholds they belong to.
f1s, f1_thresholds = du.threshold_graphs(
    val_contract_df, thresholds=range(1, 8), metric_type="f1"
)

In [None]:
# Print the contract-level classification report for the validation set,
# using the F1 scores and thresholds from the sweep.
du.print_contract_classification_report(val_contract_df, f1s, f1_thresholds)

## Test Set

In [None]:
# Same contract-level dataframe for the test split, built from fresh
# predictions of the fitted `model` on X_test.
test_contract_df = du.create_contract_df(X_test, test_data, model.predict(X_test))

In [None]:
# Sweep candidate thresholds 1..6 on the test contracts using the F1 metric.
# NOTE(review): the validation sweep uses range(1, 8) but this one stops at 6 —
# confirm whether the narrower range is intentional. The sweep also rebinds
# `f1s` / `f1_thresholds`, so the report that follows uses thresholds derived
# from the test set rather than from validation.
f1s, f1_thresholds = du.threshold_graphs(
    test_contract_df, thresholds=range(1, 7), metric_type="f1"
)

In [None]:
# Print the contract-level classification report for the test set, using the
# F1 scores and thresholds from the test sweep.
du.print_contract_classification_report(test_contract_df, f1s, f1_thresholds)