In [1]:
# !pip install torch torchvision torchaudio

In [2]:
# !pip install pdfplumber spacy  # difflib is part of the Python standard library; no install needed

In [3]:
# !python -m spacy download en_core_web_sm

In [4]:
!pip install pdfplumber



In [5]:
# Import Dependencies
import pdfplumber
import spacy
from difflib import unified_diff
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification

  from pandas.core import (


In [6]:
import pytesseract
from pytesseract import Output
from PIL import Image
import logging

In [7]:
# Set up logging: configure the root logger at INFO level and create the
# module-level logger that main() uses to report each pipeline step.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [8]:
# Load spaCy's small English pipeline ('en_core_web_sm'); perform_ner() uses
# it for named-entity recognition. The model has to be installed beforehand
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

In [9]:
# Custom dataset for PyTorch
class ContractDataset(Dataset):
    """Map-style dataset that tokenizes texts lazily, one sample per lookup.

    Args:
        texts: Indexable collection of raw text strings.
        labels: Indexable collection of class labels, parallel to ``texts``.
        tokenizer: Callable tokenizer; it is invoked as ``tokenizer(text, ...)``
            and must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return it as a dict of tensors.

        Returns:
            dict with keys ``'text'`` (the raw string), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tensors) and ``'labels'`` (a
            ``torch.long`` scalar tensor).
        """
        text = self.texts[item]
        label = self.labels[item]
        # Call the tokenizer directly instead of the deprecated encode_plus();
        # this matches how classify_text_sections() invokes the tokenizer.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # The tokenizer output is flattened so each sample is a 1-D tensor.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# Load the pre-trained 'bert-base-uncased' tokenizer and a two-label BERT
# sequence-classification model (dummy classifier).
# NOTE(review): the load warning printed below this cell reports that some
# BertForSequenceClassification weights are not initialised from the
# checkpoint, so predictions look like placeholders until the model is
# fine-tuned — confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
# Example Classification Model
class SimpleClassifier(nn.Module):
    """Single linear layer that maps feature vectors to class logits.

    Args:
        input_dim: Size of the last dimension of the input features.
            Defaults to 768 (presumably the BERT-base hidden size — confirm).
        num_classes: Number of output logits. Defaults to 2.
    """

    def __init__(self, input_dim=768, num_classes=2):
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the raw logits."""
        return self.linear(x)

# Default-sized instance (768 -> 2), same as before the sizes were parameterized.
classifier = SimpleClassifier()

In [12]:
# Example function to parse and extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts joined with no separator; pages that yield no
        text contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page with no extractable text
        # (e.g. a scanned image); `or ''` keeps the concatenation from raising
        # TypeError on such pages.
        return ''.join(page.extract_text() or '' for page in pdf.pages)

In [13]:
def classify_text_sections(text):
    """Classify ``text`` with the module-level BERT model.

    The text is tokenized with the module-level ``tokenizer``, truncated and
    padded to 512 tokens, and passed through ``model``.

    Args:
        text: The string to classify.

    Returns:
        int: Index of the highest-scoring logit.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
    # Inference only: disable gradient tracking so no autograd graph is built
    # or kept alive for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    _, predicted = torch.max(outputs.logits, dim=1)
    return predicted.item()


In [14]:
# Function to perform NER (Named Entity Recognition)
def perform_ner(text):
    """Run the module-level spaCy pipeline on ``text`` and collect its entities.

    Returns:
        list of ``(entity_text, entity_label)`` tuples, in the order the
        pipeline reports them in ``doc.ents``.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found


In [15]:
# Function to compare texts and highlight differences
def compare_texts(text1, text2):
    """Return a unified diff of two texts as one newline-joined string.

    Both texts are split into lines and diffed from ``text1`` to ``text2``;
    the result is an empty string when the line sequences are identical.
    """
    before = text1.splitlines()
    after = text2.splitlines()
    delta_lines = unified_diff(before, after, lineterm='')
    return '\n'.join(delta_lines)

In [16]:
# Function to highlight differences in PDF (dummy implementation)
def highlight_differences(pdf_path, differences):
    """Placeholder for PDF highlighting: does nothing and returns None."""
    # TODO: Implement PDF highlighting logic
    return None

In [17]:
# Function to summarize text (dummy implementation)
def summarize_text(text):
    """Placeholder summarizer: ignores ``text`` and returns a fixed string."""
    # TODO: Implement text summarization logic
    placeholder = "Summary of the text"
    return placeholder

In [18]:
# Main function
def main(pdf_path, template_path):
    """Run the full contract-review pipeline and log each result.

    Steps, in order: extract text from the contract and the template,
    classify the contract text, extract named entities, diff the contract
    against the template, call the (placeholder) highlighter, and log the
    (placeholder) summary.

    Args:
        pdf_path: Path of the contract PDF.
        template_path: Path of the template PDF it is compared against.
    """
    contract_body = extract_text_from_pdf(pdf_path)
    template_body = extract_text_from_pdf(template_path)

    # The whole contract is classified as a single section for now.
    predicted_class = classify_text_sections(contract_body)
    logger.info("Classification Result: %s", predicted_class)

    # Named entities found in the contract.
    named_entities = perform_ner(contract_body)
    logger.info("Named Entities: %s", named_entities)

    # Unified diff going from the contract to the template.
    diff_report = compare_texts(contract_body, template_body)
    logger.info("Differences:\n%s", diff_report)

    # Dummy implementation: currently a no-op.
    highlight_differences(pdf_path, diff_report)

    # Dummy implementation: currently a fixed string.
    logger.info("Summary: %s", summarize_text(contract_body))

In [19]:
if __name__ == "__main__":
    # Input documents: the contract under review and the template it is
    # compared against.
    contract_pdf_path, template_pdf_path = "contract.pdf", "template.pdf"
    main(contract_pdf_path, template_pdf_path)

INFO:__main__:Classification Result: 1
INFO:__main__:Named Entities: [('This Business Contract', 'ORG'), ('May 30, 2024', 'DATE'), ('ABC Marketing Solutions\nAddress', 'ORG'), ('123', 'CARDINAL'), ('Springfield', 'GPE'), ('IL 62701\nContact', 'ORG'), ('555', 'CARDINAL'), ('123-4567', 'CARDINAL'), ('XYZ Retailers Inc.', 'ORG'), ('456', 'CARDINAL'), ('Springfield', 'GPE'), ('IL 62702', 'ORG'), ('555', 'CARDINAL'), ('987-6543', 'CARDINAL'), ('1', 'CARDINAL'), ('ABC Marketing Solutions', 'ORG'), ('XYZ Retailers Inc.', 'ORG'), ('Digital', 'ORG'), ('2', 'CARDINAL'), ('XYZ Retailers Inc.', 'ORG'), ('ABC Marketing Solutions', 'ORG'), ('10,000', 'MONEY'), ('50%', 'PERCENT'), ('50%', 'PERCENT'), ('June 1, 2024', 'DATE'), ('December 31, 2024', 'DATE'), ('first', 'ORDINAL'), ('4', 'CARDINAL'), ('third', 'ORDINAL'), ('5', 'CARDINAL'), ("30 days'", 'DATE'), ('ABC Marketing Solutions', 'ORG'), ('6', 'CARDINAL'), ('the State of Illinois', 'GPE'), ('7', 'CARDINAL'), ('John Smith', 'PERSON'), ('May 30, 