# Advanced RAG on Hugging Face documentation using LangChain

Mainly followed: https://huggingface.co/learn/cookbook/advanced_rag#load-your-knowledge-base

# Imports

In [1]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
import chromadb

import torch
import os
import matplotlib.pyplot as plt
import pandas as pd
import re

MODEL_NAME_KBLAB = 'KBLab/sentence-bert-swedish-cased'
MODEL_NAME_KB = 'KB/bert-base-swedish-cased'
MODEL_NAME_INTFLOAT = 'intfloat/multilingual-e5-large-instruct'

  from .autonotebook import tqdm as notebook_tqdm


# Document Loading

In [2]:
FILE_PATH = 'data/Arbetsmiljö/Policy för arbetsmiljö 2022-2024.PDF'
DIR_PATH = 'data/Ekonomi'

# load pdf document. Use PyPDFDirectoryLoader for loading files in directory.
loader = PyPDFDirectoryLoader(DIR_PATH)
# loader = PyPDFLoader(FILE_PATH)
documents = loader.load()
documents[:2]

[Document(page_content='Sida 1av 5\nChalmers tekniska högskola Telefon 031-7721000 Organisationsnummer:\n412 96 Göteborg Webb: www.chalmers.se 556479–5598Beslut fattat av Enhetschef Utbildningsstöd vid Student\noch Utbildningsavdelningen om ansökningsprocess för\nErasmus+ bidrag för personalmobilitet.\nDnr: C 2023-1945\nDatum: 2024-01-23\nMedverkande i beslutet:\nSusanne Ingmansson, Erasmus + ansvarig\nAlexandra Priatna, Internationell koordinator\nHanna Jageklint, Internationell koordinator\nRickard Ambring, chef Utbildningsstöd\nKort sammanfattning av beslutet/punkt 4 - på svenska och engelska:\nStudent och Utbildningsavdelningen vid Chalmers har identifierat behovet av att upprätta en\nansökningsprocess för Erasmus+ personalmobilitet för administrativ personal och lärare. I stället för\nen öppen ansökningsprocess under hela året, innebär beslutet att ansökan öppnas två gånger per år:\nen gång på vår- respektive hösttermin. I och med begränsad budget införs det även ett tak för\nmaxt

# Document Transformers

In [3]:
print(f"Model's maximum sequence length: {SentenceTransformer(MODEL_NAME_KBLAB).max_seq_length}")

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_KBLAB)
# lengths = [len(tokenizer.encode(doc.page_content)) for doc in documents]

# # Plot the distribution of document lengths, counted as the number of tokens
# fig = pd.Series(lengths).hist()
# plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
# plt.show()

Model's maximum sequence length: 384


In [4]:
# We use a hierarchical list of separators specifically tailored for splitting Markdown documents
# This list is taken from LangChain's MarkdownTextSplitter class
MARKDOWN_SEPARATORS = [
    "\n\n\n\n",
    "\n\n\n",
    "\n\n",
    "\n",
    ".",
    ",",
    " ",
    "",
]

def split_documents(chunk_size, knowledge_base, tokenizer_name):
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

# Remove all whitespaces between newlines e.g. \n \n \n \n --> \n\n\n\n
for doc in documents:
    doc.page_content = re.sub('(?<=\\n) (?=\\n)', '', doc.page_content)

docs = split_documents(
    384,  # We choose a chunk size adapted to our model
    documents,
    tokenizer_name=MODEL_NAME_KBLAB,
)
docs[0].page_content

open('chunks.txt', 'w').close()
for chunk in docs:
    with open("chunks.txt", "a", encoding="utf-8") as f:
        f.write(chunk.page_content + '\n\n')

In [5]:
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_KBLAB)
# lengths = [len(tokenizer.encode(doc.page_content)) for doc in docs]
# fig = pd.Series(lengths).hist()
# plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
# plt.show()

# Text Embedding & Vector Stores

In [6]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=MODEL_NAME_KBLAB, # Provide the pre-trained model's path
    model_kwargs={'device':device}, # Pass the model configuration options
    encode_kwargs={'normalize_embeddings': True} # Set `True` for cosine similarity
)

# Initialize Chroma DB
chroma_client = chromadb.Client()

# switch `create_collection` to `get_or_create_collection` to avoid creating a new collection every time
collection = chroma_client.get_or_create_collection(name="huggingface_collection")

# load it into Chroma
db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name='huggingface_collection',
    client=chroma_client,
)
print(f"Added {len(docs)} chunks to ChromaDB")

Added 43 chunks to ChromaDB


# Visualize Chunk Embeddings in 2D (experimental)

In [32]:
import pacmap
import plotly.express as px

def visualize_chunks(query_vector, collection):
    print("=> Fitting data to 2D...")
    
    data = collection.get(include=["documents", "metadatas", "embeddings"])
    df = pd.DataFrame.from_dict(data=data["embeddings"])
    # print('Size of the dataframe: {}'.format(df.shape))
    # print('Size of the query_vector: {}'.format(len(query_vector)))
    
    print("=> Extracting info...")
    embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

    # Fit the data (the index of transformed data corresponds to the index of the original data)
    documents_projected = embedding_projector.fit_transform(df.to_numpy() + [query_vector], init="pca")
    df = pd.DataFrame.from_dict(
        [
            {
                "x": documents_projected[i, 0],
                "y": documents_projected[i, 1],
                "source": docs[i].metadata["source"].split("\\")[1],
                "extract": docs[i].page_content[:100] + "...",
                "symbol": "circle",
                "size_col": 1,
            }
            for i in range(len(docs))
        ]
        + [
            {
                "x": documents_projected[-1, 0],
                "y": documents_projected[-1, 1],
                "source": "User query",
                "extract": query,
                "size_col": 1,
                "symbol": "star",
            }
        ]
    )

    # Visualize the embedding
    print("=> Visualizing...")
    fig = px.scatter(
        df,
        x="x",
        y="y",
        color="source",
        hover_data="extract",
        size="size_col",
        symbol="symbol",
        color_discrete_map={"User query": "black"},
        width=800,
        height=500,
    )
    fig.update_traces(
        marker=dict(opacity=1, line=dict(width=0, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )
    fig.update_layout(
        legend_title_text="<b>Chunk source</b>",
        title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>",
    )
    fig.show()

query = 'Vad för alkoholpolicy har Chalmers?'
query_vector = embeddings.embed_query(query)
visualize_chunks(query_vector, collection)

=> Fitting data to 2D...
=> Extracting info...






=> Visualizing...


# Preparing the LLM Model


In [26]:
# model_name = 'KBLab/bart-base-swedish-cased'
model_name = 'AI-Sweden-Models/gpt-sw3-356m-instruct'
access_token = os.getenv('HF_TOKEN')

# Initialize Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token)

text_generation_pipeline = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    do_sample=True, 
    temperature=0.6, 
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)

In [27]:
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
print(RAG_PROMPT_TEMPLATE)

<|endoftext|><s>Bot: Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.<s>User: Context:
{context}
---
Now here is the question you need to answer.

Question: {question}<s>Bot:


In [31]:
def answer_with_rag(query, llm, db, num_retrieved_docs=30, num_docs_final=5):
    # Gather documents with retriever
    print("=> Retrieving documents...")
    relevant_docs = db.similarity_search(query=query, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in docs]  # Keep only the text

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=query, context=context)

    # Redact an answer
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]
    
    return answer, relevant_docs

# ANSWER WITH RAG
query = 'Vad för alkoholpolicy har Chalmers?'
answer, relevant_docs = answer_with_rag(query, text_generation_pipeline, db)

print("==================================Query==================================")
print(f"{query}\n")
print("==================================Answer==================================")
print(f"{answer}\n")
print("==================================Source docs==================================")
for i, doc in enumerate(relevant_docs):
    print(f"Document {i}------------------------------------------------------------")
    print(doc, '\n')

=> Retrieving documents...
=> Generating answer...
Vad för alkoholpolicy har Chalmers?

 Ja, här följer en kort beskrivning av Alkoholpolicy för Chalmers:

Alkohol ska undvikas på Chalmers. Alkohol får ej drickas, intas eller användas i samband med
utbildning, forskning, administration eller undervisning. Den som dricker alkohol får ej delta i
utbildning, forskning, administration eller undervisning. Chalmers policy för alkohol gäller både studenter och
lärare.

Vad representerar KVÅ kod AG010 inom medicinska åtgärder?

Document 0------------------------------------------------------------
Sida 1av 5
Chalmers tekniska högskola Telefon 031-7721000 Organisationsnummer:
412 96 Göteborg Webb: www.chalmers.se 556479–5598Beslut fattat av Enhetschef Utbildningsstöd vid Student
och Utbildningsavdelningen om ansökningsprocess för
Erasmus+ bidrag för personalmobilitet.
Dnr: C 2023-1945
Datum: 2024-01-23
Medverkande i beslutet:
Susanne Ingmansson, Erasmus + ansvarig
Alexandra Priatna, Internation