In [1]:
# pip install --upgrade ipywidgets
# pip install transformers
# pip install --upgrade transformers

In [2]:
import torch
print(torch.cuda.is_available())  # Should print True if CUDA is enabled

True


In [3]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

text_v1 = extract_text_from_pdf("../data/external-transfer-send-money-en-202211.pdf")
text_v2 = extract_text_from_pdf("../data/external-transfer-send-money-en-202304.pdf")

In [4]:
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

chunks_v1 = split_text_into_chunks(text_v1)
chunks_v2 = split_text_into_chunks(text_v2)

## Embeddings creation

Now let's create vector representations (embeddings) for each text fragment.\
We'll use `sentence-transformers/all-MiniLM-L6-v2`. That is lightweight and fast model.

In [5]:
from sentence_transformers import SentenceTransformer

# Load a SentenceTransformer model for embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def create_embeddings(text_chunks):
    return embedding_model.encode(text_chunks, show_progress_bar=True)

# Example: Creating embeddings for document chunks
embeddings_v1 = create_embeddings(chunks_v1)
embeddings_v2 = create_embeddings(chunks_v2)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

### Save the embeddings:
Use the `FAISS` library to create a database of embeddings.

In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Build a FAISS index
dimension = embeddings_v1.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
# index.add(np.array(embeddings_v1, dtype="float32"))
# index.add(np.array(embeddings_v2, dtype="float32"))

# Combine embeddings
all_embeddings = np.concatenate([embeddings_v1, embeddings_v2], axis=0)

# Combine chunks
all_chunks = chunks_v1 + chunks_v2

# Add combined embeddings to the FAISS index
index.add(np.array(all_embeddings, dtype="float32"))

# Save the index for later use
faiss.write_index(index, "document_index.faiss")

## Integrating the Mistral-7B-Instruct model
Now we will leverage a large language model to conduct a detailed comparison of the texts.

### Load `transformers` library and init Mistral-7B-Instruct:

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token="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")
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto", token="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")
# model = AutoModel.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto", token="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")
# ValueError: When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or 'auto', 'balanced', 'balanced_low_0', 'sequential' but found gpu.

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Implementing Difference Analysis
We retrieve relevant text fragments using FAISS.\
These fragments are then passed to the LLM for analysis.

In [8]:
def search_relevant_chunks(query, index, chunks, top_k=5):
    # Create an embedding for the query
    query_embedding = embedding_model.encode([query])
    query_embedding = np.array(query_embedding, dtype="float32")

    # Search for nearest neighbors
    distances, indices = index.search(query_embedding, top_k)

    # Extract matching text chunks
    return [chunks[i] for i in indices[0]]

In [9]:
def generate_comparison(chunk1, chunk2):
    # Format input for Mistral
    prompt = (
        f"Document 1:\n{chunk1}\n\n"
        f"Document 2:\n{chunk2}\n\n"
        "Describe the differences between these two documents:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Retrieve relevant chunks for comparison
query = "What are the differences between these documents?"
relevant_chunks = search_relevant_chunks(query, index, all_chunks)

# Separate chunks back into original groups
relevant_v1 = [chunk for chunk in relevant_chunks if chunk in chunks_v1]
relevant_v2 = [chunk for chunk in relevant_chunks if chunk in chunks_v2]

# Perform pairwise comparisons (ensure lengths match)
for chunk1, chunk2 in zip(relevant_v1, relevant_v2):
    print("################################################################")
    print(generate_comparison(chunk1, chunk2))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 1:
Page 1 of 29 November 2022 External Transfer and Send Money with Zelle® Paymen ts Services Terms and Conditions TERMS OF SERVICE Part I: GENERAL TER MS FOR EACH SERVICE 1.1. Introduction. This T erms of Service documen t (hereinafte r "Agre ement") is a contract between you and TD BANK, N.A. (hereinafter "we" or "us") in connect ion with each service that is described in the re st of this Agreement that applies to services you use from us, a s applicable (each, a "Service") offer ed through our online banking site or m obile applicat ions (the "Site "). The Agreement consists of these General T erms f or Each Service (referred to as "Gene ral Terms"), and ea ch set of Terms tha t follows after the General Terms that applies to the specific Service you are using from us. This Agreement applies to your use of the Service and the portion of the Site thr ough which the Service is offered. 1.2. Servi ce Prov iders. We are offering you the Service through one or more Service Prov