# Environment setup

In [None]:
!sudo apt update
# vector database
%pip install faiss-gpu  # vector database
# PDF
!apt install poppler-utils
!apt install tesseract-ocr
!apt install libtesseract-dev
%pip install langchain-community
%pip install langchain-unstructured
%pip install langchain-huggingface
%pip install nltk --upgrade
%pip install unstructured[pdf]
%pip install optimum
%pip install auto-gptq

[33m0% [Working][0m            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Connecting to security.ubuntu.com (91.189.91.82)] [Connected to cloud.r-pro[0m                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [2 InRelease 5,484 B/128 kB 4%] [Connecting to security.ubuntu.com (91.189.9[0m[33m0% [2 InRelease 28.7 kB/128 kB 22%] [Connecting to security.ubuntu.com (91.189.[0m[33m0% [Waiting for headers] [Connecting to r2u.stat.illinois.edu (192.17.190.167)][0m                                                                               Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
[33m0% [4 InRelease 6,932 B/127 kB 5%] [Waiting for headers] [Connecting to r

Collecting optimum
  Downloading optimum-1.23.3-py3-none-any.whl.metadata (20 kB)
Collecting datasets (from optimum)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->optimum)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->optimum)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->optimum)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.8.0->optimum)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading optimum-1.23.3-py3-none-any.whl (424 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m424.1/424.1 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6

In [None]:
# Clone our repo to obtain the data
!git clone https://github.com/andrzejczukm/NLP-RAG.git

Cloning into 'NLP-RAG'...
remote: Enumerating objects: 86, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (79/79), done.[K
Receiving objects: 100% (86/86), 14.72 MiB | 336.00 KiB/s, done.
remote: Total 86 (delta 20), reused 54 (delta 7), pack-reused 0 (from 0)[K
Resolving deltas: 100% (20/20), done.


# Loading data

In [None]:
import os
import itertools

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from langchain_core.documents import Document
# from langchain_unstructured import UnstructuredLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs
from sentence_transformers import CrossEncoder
from transformers import pipeline, set_seed as transformers_set_seed

from google.colab import userdata

  from tqdm.autonotebook import tqdm, trange


In [None]:
# set seeds for reproducibility: the same value is fed to NumPy, PyTorch
# (generic and CUDA seeding calls) and the transformers seeding helper
seed = 3
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
transformers_set_seed(seed)

In [None]:
# Directory scanned for the source PDFs (inside the repository cloned above).
INPUT_PATH = './NLP-RAG/data'

In [None]:
# Text cleaners handed to every document loader via its `post_processors` argument.
post_processors = [clean_extra_whitespace, group_broken_paragraphs]

In [None]:
# One loader per file in INPUT_PATH whose name ends with 'ulysses.pdf'
# (a single document is enough for the proof of concept).
documents_loaders: list[UnstructuredFileLoader] = [
    UnstructuredFileLoader(
        os.path.join(INPUT_PATH, pdf_name),
        post_processors=post_processors,
        paragraph_grouper=group_broken_paragraphs,
    )
    for pdf_name in os.listdir(INPUT_PATH)
    if pdf_name.endswith('ulysses.pdf')
]

  documents_loaders.append(UnstructuredFileLoader(


In [None]:
# Target chunk length, measured in characters (length_function=len below).
# NOTE(review): the embedding model used later (all-MiniLM-L6-v2) presumably
# truncates long inputs — verify that 2000-character chunks fit within its
# maximum sequence length, otherwise the tail of each chunk is not embedded.
chunk_size = 2000

# Splitter tried on each separator in the listed order; separators are plain
# strings (not regexes) and are kept at the end of the preceding chunk.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '.', '?', '\n'],
    chunk_size=chunk_size,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
    keep_separator="end"
)

In [None]:
# ~ 1 min (but note there is just one document for PoC)
# Maps each loader's file path to the chunks produced from that file.
passages: dict[str, list[Document]] = {}

# NOTE: despite its name, `document` is a loader taken from documents_loaders.
for document in documents_loaders:
    passages_for_doc = document.load_and_split(text_splitter)
    passages[document.file_path] = passages_for_doc

In [None]:
# Flatten the per-file chunk lists into one list, preserving insertion order.
passages_list = list(itertools.chain.from_iterable(passages.values()))

In [None]:
# example passage from the middle
passages_list[len(passages_list) // 2]

Document(metadata={'source': './NLP-RAG/data/joyce_ulysses.pdf'}, page_content='Glad to get away from other chap’s wife. Eating off his cold plate. Chap in the Burton today spitting back gumchewed gristle. French letter still in my pocketbook. Cause of half the trouble. But might happen sometime, I don’t think. Come in, all is prepared. I dreamt. What? Worst is beginning. How they change the venue when it’s not what they like. Ask you do you like mushrooms because she once knew a gentleman who. Or ask you what someone was going to say when he changed his mind and stopped. Yet if I went the whole hog, say: I want to, something like that. Because I did. She too. Offend her. Then make it up. Pretend to want something awfully, then cry off for her sake. Flatters them. She must have been thinking of someone else all the time. What harm? Must since she came to the use of reason, he, he and he. First kiss does the trick. The propitious moment. Something inside them goes pop. Mushy like, tell 

# Proof of concept

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Embedding model

In [None]:
# Embedding model passed to FAISS.from_documents below.
# NOTE(review): no device is passed here — presumably the library selects the
# GPU on its own when one is available; confirm, or pass the device explicitly.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

In [None]:
# this takes a few minutes on CPU but is much quicker on GPU
# Builds the FAISS vector store from all passages using embedding_model.
vector_database = FAISS.from_documents(passages_list, embedding_model)

## Cross encoder re-ranker model

In [None]:
# ~30s
# Cross-encoder used below to score (query, passage) pairs from the retriever.
reranker_model = CrossEncoder("BAAI/bge-reranker-large")

In [None]:
# Question used for the step-by-step proof-of-concept run below.
QUERY = "Who is Leopold Bloom's father?"
# Number of candidate passages requested from the vector store (passed as `k`).
DOCS_TO_RETURN = 30

In [None]:
# First stage: ask the vector store for DOCS_TO_RETURN candidate passages.
retriever = vector_database.as_retriever(
    search_kwargs={'k': DOCS_TO_RETURN}
)
retrieved_passages = retriever.invoke(QUERY)

# Pair the query with each candidate's text; these pairs are what the
# reranker scores in the next cell.
passages_contents = [p.page_content for p in retrieved_passages]
data = [(QUERY, content) for content in passages_contents]

In [None]:
# Second stage: one reranker score per (query, passage) pair, in the same order as `data`.
scores = reranker_model.predict(data)

In [None]:
# Candidate passages ordered from highest to lowest reranker score.
results_df = (
    pd.DataFrame({'passage': passages_contents, 'score': scores})
    .sort_values(by='score', ascending=False)
)
results_df

Unnamed: 0,passage,score
4,He thought that he thought that he was a jew w...,0.904907
0,How did these beliefs and practices now appear...,0.642891
8,drunken goy ever. So you catch no money.\n\nBL...,0.493208
1,But sir Leopold was passing grave maugre his w...,0.134021
6,"fanciers, millwrights, newspaper canvassers,\n...",0.0535
3,"BLOOM: O, I so want to be a mother. MRS THORNT...",0.007625
15,Now let us speak of that fellowship that was t...,0.006447
22,Did Bloom discover common factors of similarit...,0.006159
19,What cerebration accompanied his frequentative...,0.004982
28,What is the age of the soul of man? As she hat...,0.004861


In [None]:
results_df['passage'].iloc[0]

'He thought that he thought that he was a jew whereas he knew that he knew that\n\nhe knew that he was not.\n\nWhat, the enclosures of reticence removed, were their respective parentages? Bloom, only born male transubstantial heir of Rudolf Virag (subsequently Rudolph Bloom) of Szombathely, Vienna, Budapest, Milan, London and Dublin and of Ellen Higgins, second daughter of Julius Higgins (born Karoly) and Fanny Higgins (born Hegarty). Stephen, eldest surviving male consubstantial heir of Simon Dedalus of Cork and Dublin and of Mary, daughter of Richard and Christina Goulding (born Grier).\n\nHad Bloom and Stephen been baptised, and where and by whom, cleric or\n\nlayman?\n\nBloom (three times), by the reverend Mr Gilmer Johnston M. A., alone, in the protestant church of Saint Nicholas Without, Coombe, by James O’Connor, Philip Gilligan and James Fitzpatrick, together, under a pump in the village of Swords, and by the reverend Charles Malone C. C., in the church of the Three Patrons, Ra

In [None]:
results_df['passage'].iloc[1]

'How did these beliefs and practices now appear to him? Not more rational than they had then appeared, not less rational than other\n\nbeliefs and practices now appeared.\n\nWhat first reminiscence had he of Rudolph Bloom (deceased)? Rudolph Bloom (deceased) narrated to his son Leopold Bloom (aged 6) a retrospective arrangement of migrations and settlements in and between Dublin, London, Florence, Milan, Vienna, Budapest, Szombathely with statements of satisfaction (his grandfather having seen Maria Theresia, empress of Austria, queen of Hungary), with commercial advice (having taken care of pence, the pounds having taken care of themselves). Leopold Bloom (aged 6) had accompanied these narrations by constant consultation of a geographical map of Europe (political) and by suggestions for the establishment of affiliated business premises in the various centres mentioned.\n\nHad time equally but differently obliterated the memory of these migrations in\n\nnarrator and listener?\n\nIn nar

## LLM

In [None]:
# Read the Hugging Face token from Colab `userdata` (so it is never written
# into the notebook) and expose it through the HF_TOKEN environment variable.
hf_token = userdata.get('HF_TOKEN')
os.environ['HF_TOKEN'] = hf_token

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4"  # model ID of the generator LLM used for answering
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically assign to GPU if available
    torch_dtype="auto"  # Use appropriate precision for quantized models
)


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/75.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

In [None]:
# Keep the five highest-scoring passages as context for the LLM prompt.
best_passages = results_df['passage'].iloc[:5]

In [None]:
def make_prompt(user_query: str, passages: list[str]) -> str:
    """Assemble the LLM prompt from context passages and the user's question.

    The passages are separated by blank lines and placed between a fixed
    header and the question line.

    Args:
        user_query: Question to be answered.
        passages: Context passages, in the order they should appear.

    Returns:
        The complete prompt text, ending with a blank line.
    """
    context = '\n\n'.join(passages)
    return f'Based on these passages:\n\n{context}\n\nAnswer to question: {user_query}\n\n'

In [None]:
# RAG run: build the prompt from the best passages, generate up to 50 new
# tokens and print the whole decoded sequence (the prompt is echoed first,
# followed by the model's continuation).
llm_prompt = make_prompt(QUERY, best_passages)
inputs = tokenizer(llm_prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Based on these passages:

He thought that he thought that he was a jew whereas he knew that he knew that

he knew that he was not.

What, the enclosures of reticence removed, were their respective parentages? Bloom, only born male transubstantial heir of Rudolf Virag (subsequently Rudolph Bloom) of Szombathely, Vienna, Budapest, Milan, London and Dublin and of Ellen Higgins, second daughter of Julius Higgins (born Karoly) and Fanny Higgins (born Hegarty). Stephen, eldest surviving male consubstantial heir of Simon Dedalus of Cork and Dublin and of Mary, daughter of Richard and Christina Goulding (born Grier).

Had Bloom and Stephen been baptised, and where and by whom, cleric or

layman?

Bloom (three times), by the reverend Mr Gilmer Johnston M. A., alone, in the protestant church of Saint Nicholas Without, Coombe, by James O’Connor, Philip Gilligan and James Fitzpatrick, together, under a pump in the village of Swords, and by the reverend Charles Malone C. C., in the church of the Th

In [None]:
# Baseline: the same question sent to the LLM without any retrieved passages.
inputs = tokenizer("Who is Leopold Bloom's father in Ulysses?", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Without context the model hallucinates: it names "David" as the father,
# whereas the retrieved passage gives Rudolf Virag (subsequently Rudolph Bloom).

Who is Leopold Bloom's father in Ulysses? In James Joyce's novel "Ulysses," Leopold Bloom's father is named David. David Bloom was a solicitor and a member of the United Hebrew Congregation of Dublin. He passed away before the events of the novel take place


In [None]:
def answer_question(query: str, num_passages: int = 5, max_new_tokens=50) -> str:
    """Answer ``query`` with retrieval-augmented generation.

    Steps: fetch candidate passages with the module-level ``retriever``,
    score every (query, passage) pair with ``reranker_model``, keep the
    ``num_passages`` best-scoring passages, build the prompt with
    ``make_prompt`` and let ``model`` generate a continuation.

    Args:
        query: The user's question. It drives retrieval, re-ranking and the prompt.
        num_passages: How many top-ranked passages to put into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only; the prompt is not included.
    """
    retrieved_passages = retriever.invoke(query)
    passages_contents = [p.page_content for p in retrieved_passages]

    # Re-rank against the caller's question. Using the global QUERY here would
    # rank every question's candidates by relevance to one fixed question.
    query_passage_pairs = [(query, content) for content in passages_contents]
    scores = reranker_model.predict(query_passage_pairs)

    ranked = pd.DataFrame({
        'passage': passages_contents,
        'score': scores,
    }).sort_values(by='score', ascending=False)
    best_passages = ranked['passage'].iloc[:num_passages].tolist()

    llm_prompt = make_prompt(query, best_passages)
    inputs = tokenizer(llm_prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt tokens followed by the new tokens. Decode only
    # the new ones instead of stripping the prompt text from the decoded
    # string, which silently fails whenever decoding does not reproduce the
    # prompt character-for-character.
    prompt_length = inputs['input_ids'].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
answer_question("How does Buck Mulligan look like?", max_new_tokens=200)

'According to the passage, Buck Mulligan is described as a "stately, plump" man who comes from the stairhead, carrying a bowl of lather on which a mirror and a razor lie crossed. He wears a yellow dressing gown that is not yet tied around his waist, and it is supported by the mild morning air. His face is described as being "equine in its length," meaning it is long and horse-like in shape. His hair is described as "light untonsured," meaning it is unpowdered and natural in color, "grained and hued like pale oak," suggesting it has a grainy texture and a light, oak-like color. \n\nOverall, Buck Mulligan appears to be a robust, well-built man with a somewhat formal and imposing appearance. He is also described as making rapid crosses in the air and gurgling, which suggests he is a religious figure, possibly a priest or someone who performs religious rituals. Additionally, he bless'

#### Comparison: questions, answers with retrieved passages (RAG), and answers without passages (plain LLM)

In [None]:
# ~15 min
questions = [
    "Who is Leopold Bloom's father in Ulysses?",
    "What does Buck Mulligan look like?",
    'What role does Stephen Dedalus play in Ulysses, and how is he connected to the protagonist, Leopold Bloom?'
]
RAG_output = [answer_question(question, max_new_tokens = 200) for question in questions]
LLM_inputs = [tokenizer(question, return_tensors = 'pt').to(device) for question in questions]
LLM_outputs = [model.generate(**inputs, max_new_tokens = 200) for inputs in LLM_inputs]
LLM_outputs = [tokenizer.decode(outputs[0], skip_special_tokens = True) for outputs in LLM_outputs]

In [None]:
# Side-by-side table: one row per question with the RAG answer and the
# plain-LLM answer.
df = pd.DataFrame({
    'Questions': questions,
    'RAG_Output': RAG_output,
    'LLM_Output': LLM_outputs
})

df

Unnamed: 0,Questions,RAG_Output,LLM_Output
0,Who is Leopold Bloom's father in Ulysses?,"Based on the passage, Leopold Bloom's father i...",Who is Leopold Bloom's father in Ulysses? In J...
1,What does Buck Mulligan look like?,Buck Mulligan is described as a broad-shoulder...,What does Buck Mulligan look like? What is his...
2,What role does Stephen Dedalus play in Ulysses...,Stephen Dedalus does not appear in the provide...,What role does Stephen Dedalus play in Ulysses...


In [None]:
RAG_output[0]

'Based on the passage, Leopold Bloom\'s father is Rudolph Bloom (also referred to as Rudolf Virag). This can be inferred from the following excerpt:\n\n"Bloom, only born male transubstantial heir of Rudolf Virag (subsequently Rudolph Bloom) of Szombathely, Vienna, Budapest, Milan, London and Dublin and of Ellen Higgins, second daughter of Julius Higgins (born Karoly) and Fanny Higgins (born Hegarty)."\n\nThis passage explicitly states that Rudolf Virag is the father of Rudolph Bloom, who is in turn the father of Leopold Bloom. Therefore, Rudolph Bloom is Leopold Bloom\'s father in *Ulysses*. \n\nAdditionally, the dialogue where Rudolph Bloom addresses Leopold Bloom as "my son Leopold" further confirms this relationship. \n\nSo, to directly answer the question: **Leopold Bloom\'s father in *Ulysses* is Rudolph Bloom (R'

In [None]:
LLM_outputs[0]

'Who is Leopold Bloom\'s father in Ulysses? In James Joyce\'s novel "Ulysses," Leopold Bloom\'s father is named David. David Bloom was a Jewish grocer who died before the events of the novel take place. Bloom refers to his father as "the old man" and describes him with some fondness, but also with a sense of loss, as he had passed away by the time the story begins.\n\nDavid Bloom is not a central character in the novel, as he does not appear in the narrative. However, Bloom often reflects on his father and his legacy, particularly in relation to his own identity and experiences as a Jewish man in early 20th century Dublin. The relationship between father and son is an important theme in the novel, exploring themes of loss, memory, and the transmission of identity across generations.'

In [None]:
RAG_output[1]

"Buck Mulligan is described as a broad-shouldered, deep-chested, strong-limbed, frank-eyed, red-haired, freely-freckled, long-headed, deep-voiced, bare-kneed, shaggy-bearded, wide-mouthed, brawny-handed, hair-legged, ruddy-faced, sinewy-armed hero. He measures several ells from shoulder to shoulder and his rock-like, mountainous knees are covered with a strong growth of tawny prickly hair similar to the mountain gorse (Ulex Europeus). His nostrils are wide and cavernous, capable of lodging a fieldlark's nest. His eyes are large, with a teardrop and a smile constantly competing for dominance. They are the size of a good-sized cauliflower. A powerful current of warm breath issues from his mouth at regular intervals, and the loud, strong, hale resonances of his heart thunder rumblingly, causing the ground, the"

In [None]:
LLM_outputs[1]

'What does Buck Mulligan look like? What is his attitude towards the world around him?\n\n1 Answer | Add Yours\n\nmwestwood\'s profile pic\n\nPosted on\n\nJames Joyce\'s Ulysses is a stream of consciousness novel in which the thoughts of the characters are recorded as they occur. Thus, it is not often that the author describes the physical appearance of the characters.\n\nIn Chapter 3, Buck Mulligan is described by Stephen Dedalus as he is walking down the street. He is wearing a white shirt and a green coat. When Stephen approaches Buck, the latter looks at him with an "amused, pitying glance." Buck is a character who is aware of his own beauty and his body; he is a man who is self-absorbed, confident, and carefree. His attitude toward the world around him is one of amusement and even disdain. He is also a man who is very proud of his body, as he is described as having a "slender, well-shaped figure." In fact,'

# Testing other books

In [None]:
# Start over with an empty loader list; this time only the Romeo and Juliet
# PDF from INPUT_PATH is picked up.
documents_loaders: list[UnstructuredFileLoader] = []

for filename in os.listdir(INPUT_PATH):
    if filename.endswith('shakespeare_romeo_and_juliet.pdf'):
        documents_loaders.append(
            UnstructuredFileLoader(
                os.path.join(INPUT_PATH, filename),
                post_processors=post_processors,
                paragraph_grouper=group_broken_paragraphs,
            )
        )

  documents_loaders.append(UnstructuredFileLoader(


In [None]:
# Splitter settings for the second book; this overrides the chunk_size and
# text_splitter defined earlier in the notebook.
chunk_size = 800

# Split only on blank lines; the separator itself is dropped from the chunks.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n'],
    chunk_size=chunk_size,
    chunk_overlap=200,
    length_function=len,
    keep_separator=False,
    is_separator_regex=False,
)