## Lab 4 - Cross-encoder re-ranking


In [1]:
# Same setup as Lab 1, up to and including creating the collection, collapsed into a single cell

import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Load the book and keep the stripped text of every page that yields any text.
reader = PdfReader("The-Mom-Test-en.pdf")

# filter(None, ...) drops the pages whose stripped text is an empty string.
pdf_texts = list(filter(None, (page.extract_text().strip() for page in reader.pages)))

# Stage 1: character-based split of the whole book (pages joined by blank lines).
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0,
)
full_text = '\n\n'.join(pdf_texts)
character_split_texts = character_splitter.split_text(full_text)

# Stage 2: re-split every character chunk by token count and flatten the
# pieces into one list, preserving their order.
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=0,
    tokens_per_chunk=256,
)
token_split_texts = [
    token_chunk
    for character_chunk in character_split_texts
    for token_chunk in token_splitter.split_text(character_chunk)
]
   

# Build the Chroma collection that the rest of the notebook queries.
embedding_function = SentenceTransformerEmbeddingFunction()
chroma_client = chromadb.Client()

# NOTE(review): create_collection is expected to fail if a collection with this
# name already exists on the client, i.e. when this cell is re-run without a
# kernel restart; confirm against the installed chromadb version.
chroma_collection = chroma_client.create_collection(
    "TheMomTest_book",
    embedding_function=embedding_function,
)

# One string id per chunk: "0", "1", ... in chunk order.
ids = list(map(str, range(len(token_split_texts))))
chroma_collection.add(ids=ids, documents=token_split_texts)

# Last expression of the cell: displays the number of stored chunks.
chroma_collection.count()

  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


236

In [2]:

import numpy as np

# Re-ranking the long tail

In [3]:
# NOTE(review): "escenario" looks like a typo for "scenario". It is kept as-is
# because the outputs recorded in this notebook were produced with this exact
# query string; fix it and re-run the cells together.
query = "What is the worst escenario when asking for feedback?"

# Retrieve the 10 nearest chunks for the query.
results = chroma_collection.query(
    query_texts=query,
    n_results=10,
    include=['documents', 'embeddings'],
)

# A single query was sent, so only the first result list is used.
retrieved_documents = results['documents'][0]

# Print each retrieved chunk followed by an empty line.
for document in retrieved_documents:
    print(document, end='\n\n')

to make matters worse, the feedback they ’ re getting is absurdly inconsistent. if they run twenty conversations, they end up with twenty different must - have features and twenty separate must - solve problems. the more people they talk to, the more confused they get. what ’ s going on here? their customer segment was incredibly broad, but in a sneaky way. imagine i tell you my customer segment is “ students ”. okay, you say, with a picture of an american undergraduate university student in your head. maybe it ’ s a male student. he sits down in the lecture hall, cracks open his mac ( adding to the sea of glowing apples the professor ’ s view has recently become ), and fires up reddit to help him survive the next ninety minutes. so i ’ ve built a product for students, and feedback starts coming in. but it ’ s not what i expect. one user needs to add formal citations. another wants practice questions. a third needs it to run on the ipad. a fourth needs eighty


think it ’ s going to be

In [4]:
from sentence_transformers import CrossEncoder

# Model identifier handed to CrossEncoder; the recorded output of this cell
# shows its files being downloaded on first use.
CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL)

config.json: 100%|██████████| 794/794 [00:00<00:00, 2.82MB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:12<00:00, 7.52MB/s]
tokenizer_config.json: 100%|██████████| 316/316 [00:00<00:00, 1.54MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.83MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 602kB/s]


In [5]:
# Pair the query with every retrieved chunk and score each pair.
pairs = [[query, retrieved_document] for retrieved_document in retrieved_documents]
scores = cross_encoder.predict(pairs)

# Header line followed by one score per line, in retrieval order.
print("Scores:", *scores, sep="\n")

Scores:
-6.768958
-10.219069
-10.971854
-11.008442
-11.4253435
-11.410198
-10.13852
-10.082534
-10.697642
-11.268075


In [6]:
# Indices of `scores` from highest to lowest score, shown 1-based so that "1"
# refers to the first scored pair.
descending_order = np.argsort(scores)[::-1]

print("New Ordering:")
for position in descending_order:
    print(position + 1)

New Ordering:
1
8
7
2
9
3
4
10
6
5


# Re-ranking with Query Expansion


In [None]:
# Query expansion input: the original question plus five hand-written
# rephrasings / related questions.
# NOTE(review): these questions are about revenue growth, while the collection
# is built from "The-Mom-Test-en.pdf"; confirm they are meant for this corpus.
original_query = "What were the most important factors that contributed to increases in revenue?"

generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?",
]

In [None]:
# Send the original query first, followed by the generated ones, in one batch.
queries = [original_query, *generated_queries]

results = chroma_collection.query(
    query_texts=queries,
    n_results=10,
    include=['documents', 'embeddings'],
)

# One list of documents per query, in the same order as `queries`.
retrieved_documents = results['documents']

In [None]:
# Deduplicate the retrieved documents across all per-query result lists.
# A dict keeps first-seen insertion order, so `unique_documents` (and every
# index derived from it later) is the same on every run. Going through a set
# would not guarantee that: the iteration order of a set of strings depends on
# per-process string hashing.
unique_documents = list(dict.fromkeys(
    document
    for documents in retrieved_documents
    for document in documents
))

In [None]:
# Pair every unique document with the original query (not the generated ones).
pairs = [[original_query, unique_document] for unique_document in unique_documents]

In [None]:
# Score every [original_query, document] pair in `pairs` with the cross-encoder;
# `scores` is what the following cells print and sort.
scores = cross_encoder.predict(pairs)


In [None]:
# Header line followed by one score per line, in the order of `scores`.
print("Scores:", *scores, sep="\n")

In [None]:
# Indices of `scores` from highest to lowest score. These are printed 0-based,
# i.e. they can be used directly to index the list that was scored.
descending_order = np.argsort(scores)[::-1]

print("New Ordering:")
for index in descending_order:
    print(index)