In [1]:
import fitz  # PyMuPDF
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

  from pandas.core import (


In [2]:
# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
# Define the contract clauses template
clauses = [
    "Services Provided",
    "Payment",
    "Term",
    "Confidentiality",
    "Termination",
    "Governing Law",
    "Signatures"
]

In [4]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

In [5]:
def classify_clause(text, clauses):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return clauses[predictions.item()]


In [6]:
def find_deviations(template_text, contract_text):
    template_lines = template_text.split('\n')
    contract_lines = contract_text.split('\n')
    deviations = []

    for line in contract_lines:
        if line not in template_lines:
            deviations.append(line)
    
    return deviations

In [7]:
def main():
    template_path = 'template.pdf'
    contract_path = 'contract.pdf'

    template_text = extract_text_from_pdf(template_path)
    contract_text = extract_text_from_pdf(contract_path)

    contract_lines = contract_text.split('\n')
    entity_label_pairs = []

    for line in contract_lines:
        if line.strip():
            clause_label = classify_clause(line, clauses)
            entity_label_pairs.append((line, clause_label))

    deviations = find_deviations(template_text, contract_text)

    print("Classified Entities and Labels:")
    for entity, label in entity_label_pairs:
        print(f"Entity: {entity}, Label: {label}")

    print("\nDeviations from Template:")
    for deviation in deviations:
        print(deviation)

In [8]:
if __name__ == "__main__":
    main()

Classified Entities and Labels:
Entity: BUSINESS CONTRACT, Label: Confidentiality
Entity: This Business Contract ("Contract") is made and entered into as of May 30, 2024, by and between:, Label: Confidentiality
Entity: Party A:, Label: Confidentiality
Entity: Name: ABC Marketing Solutions, Label: Confidentiality
Entity: Address: 123 Market St, Springfield, IL 62701, Label: Confidentiality
Entity: Contact: (555) 123-4567, contact@abcmarketing.com, Label: Confidentiality
Entity: Party B:, Label: Confidentiality
Entity: Name: XYZ Retailers Inc., Label: Confidentiality
Entity: Address: 456 Commerce Blvd, Springfield, IL 62702, Label: Confidentiality
Entity: Contact: (555) 987-6543, info@xyzretailers.com, Label: Confidentiality
Entity: 1. Services Provided:, Label: Signatures
Entity: ABC Marketing Solutions agrees to provide the following services to XYZ Retailers Inc.:, Label: Confidentiality
Entity: - Digital marketing strategy development, Label: Confidentiality
Entity: - Social media ma

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TextClassificationPipeline
import torch
from torch.utils.data import Dataset, DataLoader

In [10]:
# Custom Dataset Class
class ContractDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
# Load data
train_df = pd.read_csv('train.csv')
eval_df = pd.read_csv('eval.csv')

In [12]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(clauses))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
# Create datasets
train_dataset = ContractDataset(
    texts=train_df.text.to_numpy(),
    labels=train_df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=128
)

eval_dataset = ContractDataset(
    texts=eval_df.text.to_numpy(),
    labels=eval_df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=128
)

In [14]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

In [15]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

In [16]:
# Train the model
trainer.train()

***** Running training *****
  Num examples = 7
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3
  Number of trainable parameters = 109487623
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss
1,No log,2.001677
2,No log,2.000971
3,No log,1.999594


***** Running Evaluation *****
  Num examples = 7
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Training completed. Do 

TrainOutput(global_step=3, training_loss=1.9898648262023926, metrics={'train_runtime': 779.19, 'train_samples_per_second': 0.027, 'train_steps_per_second': 0.004, 'total_flos': 1381395052800.0, 'train_loss': 1.9898648262023926, 'epoch': 3.0})

In [17]:
# Save the fine-tuned model
model.save_pretrained('./fine-tuned-bert')
tokenizer.save_pretrained('./fine-tuned-bert')

Configuration saved in ./fine-tuned-bert\config.json
Model weights saved in ./fine-tuned-bert\pytorch_model.bin
tokenizer config file saved in ./fine-tuned-bert\tokenizer_config.json
Special tokens file saved in ./fine-tuned-bert\special_tokens_map.json


('./fine-tuned-bert\\tokenizer_config.json',
 './fine-tuned-bert\\special_tokens_map.json',
 './fine-tuned-bert\\vocab.txt',
 './fine-tuned-bert\\added_tokens.json')

In [18]:
# Load the fine-tuned model and tokenizer for inference
fine_tuned_model = BertForSequenceClassification.from_pretrained('./fine-tuned-bert')
fine_tuned_tokenizer = BertTokenizer.from_pretrained('./fine-tuned-bert')

loading configuration file ./fine-tuned-bert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "f