# Advanced RAG on Hugging Face documentation using LangChain

Mainly followed: https://huggingface.co/learn/cookbook/advanced_rag#load-your-knowledge-base

In [11]:
!pip3 install -r requirements.txt

Collecting annotated-types==0.7.0 (from -r requirements.txt (line 4))
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting anyio==4.4.0 (from -r requirements.txt (line 5))
  Using cached anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)
Collecting certifi==2024.6.2 (from -r requirements.txt (line 14))
  Using cached certifi-2024.6.2-py3-none-any.whl.metadata (2.2 kB)
Collecting google-auth==2.30.0 (from -r requirements.txt (line 38))
  Using cached google_auth-2.30.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting greenlet==3.0.3 (from -r requirements.txt (line 40))
  Using cached greenlet-3.0.3-cp312-cp312-macosx_11_0_universal2.whl.metadata (3.8 kB)
Collecting idna==3.7 (from -r requirements.txt (line 48))
  Using cached idna-3.7-py3-none-any.whl.metadata (9.9 kB)
[31mERROR: Could not find a version that satisfies the requirement intel-openmp==2021.4.0 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for intel-openmp==2021.4.0

In [12]:
!pip3 install langchain transformers
!pip3 install -U sentence-transformers
!pip3 install langchain-chroma
!pip3 install langdetect
!pip3 install PyPDF2 nltk wordcloud matplotlib



# Imports

In [13]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import chromadb
import glob
import torch
import os
import matplotlib.pyplot as plt
import pandas as pd
import re

# Change to english model and hope it can cope with the Swedish. --> with APIs run GPT-4o 
MODEL_NAME_KBLAB = 'KBLab/sentence-bert-swedish-cased'
MODEL_NAME_KB = 'KB/bert-base-swedish-cased'
MODEL_NAME_INTFLOAT = 'intfloat/multilingual-e5-large-instruct'

# Investigating the data set

In [14]:
root_directory = '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs'

def gather_pdfs(root_directory):
    all_pdfs = []

    # List all items in the root directory
    for item in os.listdir(root_directory):
        item_path = os.path.join(root_directory, item)

        # Check if the item is a directory
        if os.path.isdir(item_path):
            # Find all PDF files in the subdirectory
            pdf_files = glob.glob(os.path.join(item_path, '*.PDF'))
            all_pdfs.extend(pdf_files)
    
    return all_pdfs

# load pdf file paths
policy_file_paths= gather_pdfs(root_directory) # list of pdf with full path
# print(len(policy_file_paths))
# print(policy_file_paths[:])

# Extract file names from the file paths
policy_file_names = [path.split('/')[-1] for path in policy_file_paths] # list of pdf names only
print(policy_file_names)

['Chalmers riktlinje för representation_ver 240401.PDF', 'Beslut fattat av Enhetschef Utbildningsstöd vid Student och Utbildningsavdelningen om ansökningsprocess för Erasmus+ bidrag för personalmobilitet..PDF', 'Riktlinje om ekonomiskt ansvar för projektledare vid Chalmers.PDF', 'Riktlinje för inköp och upphandling vid Chalmers tekniska högskola 211008.PDF', 'Guiding principle for financial responsibility for project managers at Chalmers.PDF', 'C 2019-2201-1 Föreskrift för Chalmers hantering vid misstanke om allvarliga avvikelser.PDF', 'Riktlinjer för jävsfrågor inom forskarutbildningen vid Chalmers tekniska högskola.PDF', 'Prioritised Operational Development (PVU) 2024-2026 English translation.PDF', 'C 2023-0759 Bilaga 2 Quality Assurance Policy at Chalmers University of Technology.PDF', 'C 2019-2201-2 Riktlinje för Chalmers etik- och oredlighetskommitté - roll, arbetsform och organisation.PDF', 'Föreskrifter avseende rätten till användning av undervisningsmaterial vid Chalmers teknis

In [43]:
import os
from PyPDF2 import PdfReader

# CAN I READ THE DOCS ? --> seems like it

DIR_PATH = '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Ekonomi'

def extract_text_from_pdfs(directory_path):
    pdf_texts = []
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(directory_path, filename)
            try:
                reader = PdfReader(file_path)
                text = ''
                for page in reader.pages:
                    text += page.extract_text()
                pdf_texts.append((filename, text))
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
    return pdf_texts

pdf_documents = extract_text_from_pdfs(DIR_PATH)

# Display the first few characters of each document
for doc in pdf_documents:
    print(f"###Document: {doc[0]}")
    print(f"###Content: {doc[1][:100]}...\n")

###Document: Chalmers riktlinje för representation_ver 240401.PDF
###Content:  
 
 
 
STYRDOKUMENT:   Riktlinje för representation.  C 2024‐0470. Beslut av ekonomichef.  
Beslut ...

###Document: Beslut fattat av Enhetschef Utbildningsstöd vid Student och Utbildningsavdelningen om ansökningsprocess för Erasmus+ bidrag för personalmobilitet..PDF
###Content: Sida 1av 5
Chalmers tekniska högskola Telefon 031-7721000 Organisationsnummer:
412 96 Göteborg Webb:...

###Document: Riktlinje om ekonomiskt ansvar för projektledare vid Chalmers.PDF
###Content:   
 
 
Fastställt av Ekonomichef  
2018- 05-01 
Ekonomiskt a nsvar för projektledare  vid Chalmers  ...

###Document: Riktlinje för inköp och upphandling vid Chalmers tekniska högskola 211008.PDF
###Content: STYRDOKUMENT: Riktlinje för inköp och upphandling vid Chalmers tekniska högskola . Dnr C 20 2 1 -
15...

###Document: Guiding principle for financial responsibility for project managers at Chalmers.PDF
###Content:   
 
 
 
Approved by Fi

# Split the eng/swe docs into separate sets

In [15]:
import os
from PyPDF2 import PdfReader
from langdetect import detect, LangDetectException

# Lists to store Swedish and English file paths
swedish_files = []
english_files = []

# Function to extract text from a single PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text()
    return text

# Function to classify file based on content
def classify_file_by_content(file_path):
    try:
        text = extract_text_from_pdf(file_path)
        language = detect(text)
        if language == 'sv':
            swedish_files.append(file_path)
        elif language == 'en':
            english_files.append(file_path)
    except LangDetectException as e:
        print(f"Language detection failed for file {file_path}: {e}")
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

# Classify all files in the list by content
for file_path in policy_file_paths:
    classify_file_by_content(file_path)

print("Swedish files:", len(swedish_files))
print("English files:", len(english_files))

# Manual handling of failed docs or errors (do å,ä,ö cause issues?)
# - Invalid Elementary Object starting with b'e' @2065: b'3(h)-5.6843419e-12(e)30(t)-44(s)152(l)-46(e)49(d)84(n)27(i)-17(n)-6(g)]TJ\n/P8F1 
# No features in text.
swedish_files.append('/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Forskarutbildning/Beslut av prorektor Charlotte Wiberg om riktlinjer för hybriddisputationer.PDF')
swedish_files.append('/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Grundutbildning/Föreskrift för utlandsstudier - stydokument 2.PDF')
swedish_files.append('/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Grundutbildning/Föreskrift för att förebygga olovlig kopiering.PDF')
swedish_files.append('/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Grundutbildning/Riktlinjer för att avvecklavilandeläggaåteruppta utbildningsprogram.PDF')
swedish_files.append('/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Grundutbildning/Beslut av rektor att fastställa uppdaterade riktlinjer för tillgodoräknande av kurser inom utbildningsprogram.PDF')
swedish_files.append('/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/HR/Reviderat beslut fattat av rektor gällande riktlinjer för lärares fortsatta engagemang vid Chalmers.PDF')

print("Updated Swedish files:", len(swedish_files))

# Output the classified lists
print("Swedish files:", swedish_files)
print("English files:", english_files)

Error processing file /Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Forskarutbildning/Beslut av prorektor Charlotte Wiberg om riktlinjer för hybriddisputationer.PDF: Invalid Elementary Object starting with b'e' @2065: b'3(h)-5.6843419e-12(e)30(t)-44(s)152(l)-46(e)49(d)84(n)27(i)-17(n)-6(g)]TJ\n/P8F1 '
Language detection failed for file /Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Grundutbildning/Föreskrift för utlandsstudier - stydokument 2.PDF: No features in text.
Language detection failed for file /Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Grundutbildning/Föreskrift för att förebygga olovlig kopiering.PDF: No features in text.
Error processing file /Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Grundutbildning/Riktlinjer för att avvecklavilandeläggaåteruppta utbildningsprogram.PDF: Invalid Elementary Object starting wi

# Further investigate the policy data set

In [16]:
# Language detection based on names alone (mediocre results, but good enough for now)
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"
    
def calculate_language_ratio(documents):
    swedish_count = 0
    english_count = 0
    other_count = 0


    for doc in documents:
        language = detect_language(doc)
#Idea: split the set into SWE and ENG 

        if language == 'sv':  # Swedish language code
            swedish_count += 1
        elif language == 'en':  # English language code
            english_count += 1
        else:
            other_count += 1

    total = swedish_count + english_count + other_count
    if total == 0:
        return "No documents to analyze."

    swedish_ratio = swedish_count / total
    english_ratio = english_count / total

    return {
        'swedish_count': swedish_count,
        'english_count': english_count,
        'other_count': other_count,
        'swedish_ratio': swedish_ratio,
        'english_ratio': english_ratio
    }

ratios = calculate_language_ratio(policy_file_names)
print(ratios)

{'swedish_count': 96, 'english_count': 42, 'other_count': 7, 'swedish_ratio': 0.6620689655172414, 'english_ratio': 0.2896551724137931}


# *Word cloud and frequency (currently running into some SSL certificate errors -.-)

In [40]:
# word cloud generation ENGLISH
try:
    import os
    from PyPDF2 import PdfReader
    from collections import Counter
    import nltk
    # from nltk.corpus import stopwords
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
except ImportError:
    print("Import failed")
except Exception as e:
    print("An error occurred:", e)

# Ensure you have downloaded the stopwords dataset
nltk.download('stopwords')
print(nltk.data.path)
nltk.data.path.append('/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/nltk_data') # added my manual path
print(nltk.data.path)

# Function to extract text from a single PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

# Function to process text and get word frequencies
def get_word_frequencies(text):
    # stop_words = set(stopwords.words('english')) # original installation
    stop_words = set('/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/stopwords/english') # manual installation of directory
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    word_freq = Counter(words)
    return word_freq

# Function to create and display a word cloud
def create_word_cloud(word_freq, num_words=30):
    common_words = dict(word_freq.most_common(num_words))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(common_words)
    
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# List of PDF file paths
# pdf_files = [
#     "path/to/your/document1.pdf",
#     "path/to/your/document2.pdf",
#     # Add more file paths as needed
# ]

# Main processing
all_text = ""
for pdf_path in english_files:
    all_text += extract_text_from_pdf(pdf_path)

word_freq = get_word_frequencies(all_text)
create_word_cloud(word_freq)


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


['/Users/kailashdejesushornig/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.12/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.12/share/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.12/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/stopwords']
['/Users/kailashdejesushornig/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.12/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.12/share/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.12/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/stopwords', '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/nltk_data']


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/kailashdejesushornig/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/stopwords'
    - '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/nltk_data'
    - ''
**********************************************************************


# Document loader

In [45]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
# FILE_PATH = '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Ekonomi/Beslut fattat av Enhetschef Utbildningsstöd vid Student och Utbildningsavdelningen om ansökningsprocess för Erasmus+ bidrag för personalmobilitet..PDF'
DIR_PATH = '/Users/kailashdejesushornig/Documents/GitHub/Policydokument/KaisProject/RegulatoryDocs/Ekonomi'

# load pdf document. Use PyPDFDirectoryLoader for loading files in directory.
loader = PyPDFDirectoryLoader(DIR_PATH)
#loader = PyPDFLoader(FILE_PATH) # funkar inte
documents = loader.load()
print(len(documents))

5


# Document Transformers

In [41]:
print(f"Model's maximum sequence length: {SentenceTransformer(MODEL_NAME_KBLAB).max_seq_length}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_KBLAB)
lengths = [len(tokenizer.encode(doc.page_content)) for doc in documents] # changed documents to list of pdf files

# Plot the distribution of document lengths, counted as the number of tokens
fig = pd.Series(lengths).hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.show()

Model's maximum sequence length: 384


NameError: name 'documents' is not defined

In [None]:
# We use a hierarchical list of separators specifically tailored for splitting Markdown documents
# This list is taken from LangChain's MarkdownTextSplitter class
MARKDOWN_SEPARATORS = [
    "\n\n\n\n",
    "\n\n\n",
    "\n\n",
    "\n",
    ".",
    ",",
    " ",
    "",
]

def split_documents(chunk_size, knowledge_base, tokenizer_name):
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

# Remove all whitespaces between newlines e.g. \n \n \n \n --> \n\n\n\n
for doc in documents:
    doc.page_content = re.sub('(?<=\\n) (?=\\n)', '', doc.page_content)

docs = split_documents(
    384,  # We choose a chunk size adapted to our model
    documents,
    tokenizer_name=MODEL_NAME_KBLAB,
)
docs[0].page_content

open('chunks.txt', 'w').close()
for chunk in docs:
    with open("chunks.txt", "a", encoding="utf-8") as f:
        f.write(chunk.page_content + '\n\n')

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_KBLAB)
# lengths = [len(tokenizer.encode(doc.page_content)) for doc in docs]
# fig = pd.Series(lengths).hist()
# plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
# plt.show()

# Text Embedding & Vector Stores

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=MODEL_NAME_KBLAB, # Provide the pre-trained model's path
    model_kwargs={'device':device}, # Pass the model configuration options
    encode_kwargs={'normalize_embeddings': True} # Set `True` for cosine similarity
)

# Initialize Chroma DB
chroma_client = chromadb.Client()

# switch `create_collection` to `get_or_create_collection` to avoid creating a new collection every time
collection = chroma_client.get_or_create_collection(name="huggingface_collection")

# load it into Chroma
db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name='huggingface_collection',
    client=chroma_client,
)
print(f"Added {len(docs)} chunks to ChromaDB")

Added 43 chunks to ChromaDB


# Visualize Chunk Embeddings in 2D (experimental)

In [None]:
import pacmap
import plotly.express as px

def visualize_chunks(query_vector, collection):
    print("=> Fitting data to 2D...")
    
    data = collection.get(include=["documents", "metadatas", "embeddings"])
    df = pd.DataFrame.from_dict(data=data["embeddings"])
    # print('Size of the dataframe: {}'.format(df.shape))
    # print('Size of the query_vector: {}'.format(len(query_vector)))
    
    print("=> Extracting info...")
    embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

    # Fit the data (the index of transformed data corresponds to the index of the original data)
    documents_projected = embedding_projector.fit_transform(df.to_numpy() + [query_vector], init="pca")
    df = pd.DataFrame.from_dict(
        [
            {
                "x": documents_projected[i, 0],
                "y": documents_projected[i, 1],
                "source": docs[i].metadata["source"].split("\\")[1],
                "extract": docs[i].page_content[:100] + "...",
                "symbol": "circle",
                "size_col": 1,
            }
            for i in range(len(docs))
        ]
        + [
            {
                "x": documents_projected[-1, 0],
                "y": documents_projected[-1, 1],
                "source": "User query",
                "extract": query,
                "size_col": 1,
                "symbol": "star",
            }
        ]
    )

    # Visualize the embedding
    print("=> Visualizing...")
    fig = px.scatter(
        df,
        x="x",
        y="y",
        color="source",
        hover_data="extract",
        size="size_col",
        symbol="symbol",
        color_discrete_map={"User query": "black"},
        width=800,
        height=500,
    )
    fig.update_traces(
        marker=dict(opacity=1, line=dict(width=0, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )
    fig.update_layout(
        legend_title_text="<b>Chunk source</b>",
        title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>",
    )
    fig.show()

query = 'Vad för alkoholpolicy har Chalmers?'
query_vector = embeddings.embed_query(query)
visualize_chunks(query_vector, collection)

=> Fitting data to 2D...
=> Extracting info...






=> Visualizing...


# Preparing the LLM Model


In [None]:
# model_name = 'KBLab/bart-base-swedish-cased'
model_name = 'AI-Sweden-Models/gpt-sw3-356m-instruct'
access_token = os.getenv('OPENAI_API_KEY')

# Initialize Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token)

text_generation_pipeline = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    do_sample=True, 
    temperature=0.6, 
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)

In [None]:
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
print(RAG_PROMPT_TEMPLATE)

<|endoftext|><s>Bot: Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.<s>User: Context:
{context}
---
Now here is the question you need to answer.

Question: {question}<s>Bot:


In [None]:
def answer_with_rag(query, llm, db, num_retrieved_docs=30, num_docs_final=5):
    # Gather documents with retriever
    print("=> Retrieving documents...")
    relevant_docs = db.similarity_search(query=query, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in docs]  # Keep only the text

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=query, context=context)

    # Redact an answer
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]
    
    return answer, relevant_docs

# ANSWER WITH RAG
query = 'Vad för alkoholpolicy har Chalmers?'
answer, relevant_docs = answer_with_rag(query, text_generation_pipeline, db)

print("==================================Query==================================")
print(f"{query}\n")
print("==================================Answer==================================")
print(f"{answer}\n")
print("==================================Source docs==================================")
for i, doc in enumerate(relevant_docs):
    print(f"Document {i}------------------------------------------------------------")
    print(doc, '\n')

=> Retrieving documents...
=> Generating answer...
Vad för alkoholpolicy har Chalmers?

 Ja, här följer en kort beskrivning av Alkoholpolicy för Chalmers:

Alkohol ska undvikas på Chalmers. Alkohol får ej drickas, intas eller användas i samband med
utbildning, forskning, administration eller undervisning. Den som dricker alkohol får ej delta i
utbildning, forskning, administration eller undervisning. Chalmers policy för alkohol gäller både studenter och
lärare.

Vad representerar KVÅ kod AG010 inom medicinska åtgärder?

Document 0------------------------------------------------------------
Sida 1av 5
Chalmers tekniska högskola Telefon 031-7721000 Organisationsnummer:
412 96 Göteborg Webb: www.chalmers.se 556479–5598Beslut fattat av Enhetschef Utbildningsstöd vid Student
och Utbildningsavdelningen om ansökningsprocess för
Erasmus+ bidrag för personalmobilitet.
Dnr: C 2023-1945
Datum: 2024-01-23
Medverkande i beslutet:
Susanne Ingmansson, Erasmus + ansvarig
Alexandra Priatna, Internation