# Retrieval-Augmented Generation (RAG)

Install the Hugging Face libraries to run this notebook.

In [98]:
!pip install transformers wikipedia

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import torch
import torch.nn.functional as F

import wikipedia
import json

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Document ingestion

In [3]:
def extract_wikipedia_pages(page_titles):
    """
    Extracts Wikipedia pages and stores them in a dictionary.

    Args:
        page_titles: A list of Wikipedia page titles to extract.

    Returns:
        A dictionary containing the text of each Wikipedia page.
    """

    page_data = {}
    for title in page_titles:
        try:
            page = wikipedia.page(title)
            content = page.content.strip()
            content = content.replace("\n", "")
            page_data[page.title] = content
        except wikipedia.exceptions.PageError:
            print(f"Page '{title}' not found.")
        except wikipedia.exceptions.DisambiguationError as e:
            print(f"Disambiguation error for '{title}': {e.options}")

    return page_data

In [4]:
page_titles = [
               "Roger Apéry",
               "Owen Willans Richardson",
               "Otto Sackur",
               "Ludvig Lorenz",
               "Klaus von Klitzing",
               "Henri Victor Regnault",
               "Erwin Madelung",
              ]

# Uncomment the next line to scroll through Wikipedia
# wikipedia_data = extract_wikipedia_pages(page_titles)

Save the dictionary using `json.dump()`:

In [5]:
# with open('wikipedia_data.json', 'w') as f:
#     json.dump(wikipedia_data, f, indent=4)

Load the dictionary using `json.load()`:

In [6]:
with open('wikipedia_data.json', 'r') as f:
    wikipedia_data = json.load(f)

In [7]:
for doc in wikipedia_data:
    print(len(wikipedia_data[doc]))

3107
3455
1683
1873
1762
3431
1487


## Document pre-processing

We load just the tokenizer:

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model_max_length = tokenizer.model_max_length
model_max_length

512

In [9]:
encoded_text = tokenizer.encode(["hello", "how are you?"])
tokenizer.decode(encoded_text)

'[CLS] hello [SEP] how are you? [SEP]'

In [10]:
def text_splitting(text, chunk_length = 300, chunk_overlap = 100):
    pass

text_splitting("".join([str(x) for x in range(20)]), 5, 2)

In [11]:
wikipedia_data_splits = {}

for doc in wikipedia_data.keys():
    wikipedia_data_splits[doc] = text_splitting_paragraph(wikipedia_data[doc])

first_key = page_titles[0]
wikipedia_data_splits[first_key][:3]

NameError: name 'text_splitting_paragraph' is not defined

In [12]:
min_doc = min(len(wikipedia_data_splits[doc]) for doc in wikipedia_data_splits)
max_doc = max(len(wikipedia_data_splits[doc]) for doc in wikipedia_data_splits)
av_doc = sum(len(wikipedia_data_splits[doc]) for doc in wikipedia_data_splits) / len(wikipedia_data_splits)

min_doc,max_doc,av_doc

ValueError: min() iterable argument is empty

## Generating embeddings

Now we load the embedder:

In [13]:
from transformers import AutoModel

model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [14]:
inputs = tokenizer("Hello, world!", return_tensors="pt")
outputs = model(**inputs)

output_dim = outputs.last_hidden_state.size(2)
output_dim

384

The embedder needs to know whether the document is a document or a query.

In [15]:
def embed(chunk_list, doc_type="document"):
    encoded_docs = tokenizer(["search_{}: {}".format(doc_type, chunk) for chunk in chunk_list],
                                 padding = True,
                                 return_tensors="pt")
    output = model(**encoded_docs) # (batch, input_length, output_dim)
    token_embeddings = output.last_hidden_state
    output_embeddings = torch.sum(token_embeddings, 1)
    output_embeddings = F.normalize(output_embeddings, p=2, dim=1)
    return output_embeddings # (batch, output_dim)

In [16]:
embed(["hello", "another document", "and another one"]).shape

torch.Size([3, 384])

**Exercise 1**: chunks may lack context. The ideal of `contextual embeddings` is to ask an LLM to write some context about the chunk (given the full document and the chunk), and to embed the chunk together with the context.
Implement this idea here (choose a simple enough model and the appropriate task!).

In [17]:
def populate_database(dic_splits, batch_size = 1):
    n_chunks = sum([len(dic_splits[doc]) for doc in dic_splits])
    vectorial_database = torch.zeros([n_chunks, output_dim], requires_grad = False).to(device)
    chunk_list = []
    pass

chunk_list, vectorial_database = populate_database(wikipedia_data_splits)

TypeError: cannot unpack non-iterable NoneType object

Save the vectorial database using `torch.save()`:

In [18]:
torch.save(vectorial_database, 'vectorial_database.pth')

with open('chunk_list.json', 'w') as f:
    json.dump(chunk_list, f, indent=4)

NameError: name 'vectorial_database' is not defined

Load the database using `torch.load()`:

In [19]:
vectorial_database = torch.load('vectorial_database.pth')
vectorial_database.to(device)
vectorial_database.requires_grad_(False)

with open('chunk_list.json', 'r') as f:
    chunk_list = json.load(f)

In [20]:
len(chunk_list), vectorial_database.shape

(67, torch.Size([67, 384]))

In [21]:
for i, embedding_vector in enumerate(vectorial_database[:20]):
    print(embedding_vector[:5], chunk_list[i][:50])

tensor([-0.0557,  0.0384, -0.0307, -0.0143, -0.0188], device='cuda:0') Roger Apéry (French: [apeʁi]; 14 November 1916, Ro
tensor([ 0.0187,  0.0690, -0.0273, -0.0288,  0.0369], device='cuda:0') == Biography ==Apéry was born in Rouen in 1916 to 
tensor([ 0.0271,  0.0228, -0.0974,  0.0471, -0.0042], device='cuda:0')   His studies were interrupted at the start of Wor
tensor([-0.0875,  0.0111, -0.0129, -0.0130, -0.1274], device='cuda:0')  He wrote his doctoral thesis in algebraic geometr
tensor([-0.0636,  0.0460, -0.0357,  0.1095, -0.0656], device='cuda:0')  In 1949 he was appointed Professor at the Univers
tensor([-0.1005,  0.0247, -0.0105,  0.1190,  0.0525], device='cuda:0')  An indication of the difficulty is that the corre
tensor([-0.1744,  0.0473,  0.0322,  0.0616,  0.0066], device='cuda:0')  Nevertheless, many mathematicians have since work
tensor([ 0.0378,  0.0296, -0.0356,  0.0142,  0.0272], device='cuda:0') Apéry was active in politics and for a few years i
tensor([-0.0365,  0.0051

## Retrieval

In [22]:
def similarity(query_embeddings, doc_embeddings):
    return query_embeddings @ doc_embeddings.T

In [23]:
query_embeddings = embed([
    "What is TSNE?",
    "Who is Laurens van der Maaten?",
], "query")

doc_embeddings = embed([
    "TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten",
], "document")

with torch.no_grad():
    print(similarity(query_embeddings, doc_embeddings))

tensor([[0.6542],
        [0.4483]])


In [24]:
def retrieve(query, 
             vectorial_database = vectorial_database, 
             chunk_list = chunk_list, 
             topk = 5,
             verbose = False):
    pass

In [25]:
%%timeit
retrieve("What did Erwin Madelung study?")

37.2 ns ± 0.335 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


**Exercise 2**: The similarity measure is based on embeddings. A completely different approach is `lexical matching`, meaning by matching keywords from the query to the documents. It is based on `TF-IDF (Term Frequency-Inverse Document Frequency)`, as follows:
* Compute TF-IDF for each chunk
* BM25 returns the 25 most relevant chunks based on their TF-IDF match to the query

Implement this approach.

**Exercise 3**:
A `reranker` is (yet another) LLM which looks at the query and some chunks and ranks them by relevance. 
Implement this approach.

For information: Claude combines BM25 with similarity measures as follows:
* Use BM25 to retrieve 25 chunks
* independently, use similarity measure on embeddings to retrieve 25 chunks
* Use a reranker to combine and deduplicate the obtained 50 chunks

### Alternative retrieval: SVM

In [26]:
import numpy as np
from sklearn import svm

def retrieve_SVM(query, 
             vectorial_database = vectorial_database, 
             chunk_list = chunk_list, 
             topk = 5):
    query_embedding = embed([query], "query")
    x = np.concatenate([query_embedding.detach().numpy(), vectorial_database.detach().numpy()])
    y = np.zeros(vectorial_database.size(0) + 1)
    y[0] = 1 # we have a single positive example

    clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1, dual="auto")
    clf.fit(x, y)
    similarities = clf.decision_function(x)
    sorted_ix = np.argsort(-similarities)
    for k in sorted_ix[1:topk+1]:
        print(f"Score: {similarities[k]:.4f}\nText:\n", chunk_list[k-1], "\n")
    return "\n".join([chunk_list[k-1] for k in sorted_ix[1:topk+1]])

In [27]:
retrieve_SVM("What did Erwin Madelung study?")

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

## Full pipeline

This model does **extractive** question answering, meaning it can only points to the answer in the provided context.

In [28]:
from transformers import AutoModelForQuestionAnswering, pipeline

model_name = "deepset/tinyroberta-squad2"

QA = pipeline('question-answering', model=model_name, tokenizer=model_name, device=device)

Device set to use cuda


In [29]:
def query(prompt):
    topk_chunks = retrieve(prompt)
#     topk_chunks = retrieve_SVM(prompt)
    return QA(question=prompt, context=topk_chunks)

In [30]:
query("What did Erwin Madelung study?")

ValueError: Arguments can't be understood