# Imports

In [1]:
from langchain_community.document_loaders.pdf import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoTokenizer
from langchain.vectorstores import Chroma
import chromadb
import torch
import re

MODEL_NAME_KBLAB = 'KBLab/sentence-bert-swedish-cased'
MODEL_NAME_KB = 'KB/bert-base-swedish-cased'
MODEL_NAME_INTFLOAT = 'intfloat/multilingual-e5-large-instruct'

PATH_DB = './db'
COLLECTION_NAME = 'policy_collection'

  from .autonotebook import tqdm as notebook_tqdm


# Load data

In [13]:
FILE_PATH = 'data/P2-subset/Forskarutbildning/Föreskrifter för stipendier för studenter inom Chalmers utbildningsprogram på grund- och avancerad nivå C 2019-0748.PDF'
DIR_PATH = 'data/P2-subset/Arbetsmiljö'

# load pdf document. Use PyPDFDirectoryLoader for loading files in directory.
# loader = PyPDFDirectoryLoader(DIR_PATH)
loader = PyPDFLoader(FILE_PATH)
documents = loader.load()
documents[:1]

[Document(metadata={'source': 'data/P2-subset/Forskarutbildning/Föreskrifter för stipendier för studenter inom Chalmers utbildningsprogram på grund- och avancerad nivå C 2019-0748.PDF', 'page': 0}, page_content=' \n \n \n   \nFöreskrift för stipendier för studenter inom Chalmers utbildningsprogram på grund - och avancerad \nnivå  C 2019 -0748  \n \n \nBeslut av:  \nVicerektor för \nutbildning och \nlivslångt lärande  Typ av styrdokument:  \nFöreskrift  Diarienummer:  \nC 2019- 0748  \nDatum för beslut : \n2019-06-17 Handläggare:  \nCalle Ekdahl o ch Gloria Realpe  Dokumentstruktur : \nD1.3 Verksamhetsledning  \nDokumentet gäller \nfrån och med:  \n2019 -06-17 Avdelning/motsvarande som ansvarar för \natt dokumentet skapas och/eller \nrevideras:  \nEnheten för antagning och examen  Dokumentet \nreviderat, datum:  \n- Versionsnummer : \n2.0 \nDokumentet gäller \ntill och med:  \ntillsvidare  Dokumentet ersätter tidigare del av \nbeslut , dvs enbart bilagan som kallas :  \nDet ersätte

### Split documents into chunks

In [14]:
def split_documents(chunk_size, documents, tokenizer_name):
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """

    # We use a hierarchical list of separators specifically tailored for splitting Markdown documents
    # This list is taken from LangChain's MarkdownTextSplitter class
    MARKDOWN_SEPARATORS = [
        "\n\n\n\n",
        "\n\n\n",
        "\n\n",
        "\n",
        ".",
        ",",
        " ",
        "",
    ]
    # Remove all whitespaces between newlines e.g. \n \n \n \n --> \n\n\n\n
    for doc in documents:
        doc.page_content = re.sub('(?<=\\n) (?=\\n)', '', doc.page_content)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in documents:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

def upload_data(docs, embedding_model, chunk_size, collection_name, persist_dir):
    """
    Create a Chroma vectorstore from a list of documents.
    """
    
    # Split the documents to chunks
    docs = split_documents(
        chunk_size,  # Choose a chunk size adapted to our model
        documents,
        tokenizer_name=MODEL_NAME_KBLAB,
    )

    # Write chunk texts to txt file
    open('output/chunks.txt', 'w').close()
    for chunk in docs:
        with open('output/chunks.txt', 'a', encoding='utf-8') as f:
            f.write(chunk.page_content + '\n\n')
    
    # Create Chroma DB with document chunks
    print(f"Added {len(docs)} chunks to ChromaDB")
    return Chroma.from_documents(
        documents=docs,
        embedding=embedding_model,
        collection_name=collection_name,
        persist_directory=persist_dir
    )
    
def get_embedding_model(model_name, device):
    # Initialize an instance of HuggingFaceEmbeddings with the specified parameters
    return HuggingFaceEmbeddings(
        model_name=model_name, # Provide the pre-trained model's path
        model_kwargs={'device':device}, # Pass the model configuration options
        encode_kwargs={'normalize_embeddings': True} # Set `True` for cosine similarity
    )

Added 8 chunks to ChromaDB


### Get embedding models maximimum sequence length (not strict)

In [4]:
# from sentence_transformers import SentenceTransformer
# print(f"Model's maximum sequence length: {SentenceTransformer(MODEL_NAME_KBLAB).max_seq_length}")

Model's maximum sequence length: 384


## Initialize embedding model

In [None]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'   # Check for CUDA enabled GPU
embedding_model = get_embedding_model(MODEL_NAME_KBLAB, device)

### Upload docuements to ChromaDB and create a vectorstore

Run this code if database is empty. 

In [None]:
# vectorstore = upload_data(documents, embedding_model, 768, COLLECTION_NAME, PATH_DB)
# collection = vectorstore._client.get_or_create_collection(name=COLLECTION_NAME)

## Initialize existing persisting storage

In [5]:
# Now we can load the persisted database from disk, and use it as normal.
client = chromadb.PersistentClient(path=PATH_DB)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_model,
    client=client
)
vectorstore.get()

{'ids': ['01218326-d821-4268-b1a0-3fc9936a4ba3',
  '02163c15-1049-42cc-bc79-6092ac5753ce',
  '04a913da-00af-4b00-82c7-f416e770ca77',
  '04dce218-91fe-4a99-b5e8-906bf8baa084',
  '05465d72-e89f-4de8-96e9-5189a7a698ff',
  '05ca52b5-5410-46fc-a29b-b0cf9ae89d41',
  '07573b57-e8d0-498e-8a3e-f93630adcd33',
  '09d3bade-92e6-4a6c-9752-277110a3e69c',
  '0b30c765-56fd-47d7-b3ad-962291c49359',
  '0c3cda6e-d9e0-456d-81b5-21b957ff5918',
  '0e73f62f-8803-4a9a-b72f-252b1542b0b7',
  '0f3efe66-447a-482f-9d02-5dd1722bb581',
  '0f43855f-75a3-4a70-b8ce-d8cae7faea46',
  '13adfb6f-5549-4c0c-9366-4135f28e484a',
  '1406999f-edc0-4b38-b3d7-3239a0937ee3',
  '14223456-ecd0-42e4-9906-8fc540101961',
  '1434e7c9-0447-4c87-b6ef-f2330abb6699',
  '19636569-3931-4308-8e77-a1a995fbd3bc',
  '1cb5d1aa-e06d-414b-a7de-b12b842b5925',
  '1fc9e59e-55eb-4c20-aca4-3b64abbdd1bc',
  '20649988-aa5b-4974-8b9b-ec985ca834f2',
  '23111814-24c1-4677-9312-c4e4ea2f290e',
  '25c39892-6ff9-4adb-bb30-447050e7e941',
  '276aef69-040b-473d-81ab-

# Preparing the LLM Model

In [32]:
from langchain_core.prompts import PromptTemplate

def build_prompt():
    template = """Use the following pieces of context to answer the question at the end.
    The context consists of a number of governing documents from a university. They are all in Swedish. 
    Your task is to act as an expert on the information that they contain. 
    You will later be asked various questions that should be possible to answer with the contents of the documents. 
    However, it might be that the question asked cannot be answered based on the documents’ information alone. 
    You are only allowed to answer questions based on the information from the documents.
    
    If you lack information, the information is ambiguous, or the answer for any other reason is uncertain or unclear, state that “the answer is not clear” and explain why.
    For any answer you give, you are always forced to give supporting quotes and refer to the documents from which they originate.
    Break your answer up into nicely readable paragraphs.

    {context}

    Question: {question}

    Helpful Answer:"""
    return PromptTemplate.from_template(template)

In [30]:
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback

import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

llm = ChatOpenAI(model="gpt-4o")

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | build_prompt()
    | llm
    | StrOutputParser()
)

query = 'Vad är SEB:s roll i förvaltningen av Adlerbertska Stiftelsernas medel?'
with get_openai_callback() as cb:
    answer = rag_chain.invoke(query)
    print(cb)
    print("====================================================================")
    print(answer)

Tokens Used: 3483
	Prompt Tokens: 3287
	Completion Tokens: 196
Successful Requests: 1
Total Cost (USD): $0.019374999999999996
SEB:s roll i förvaltningen av Adlerbertska Stiftelsernas medel är att förvalta de medel som finns i dessa stiftelser och att administrera utbetalningarna. Detta framgår av följande citat från dokumenten:

"SEB förvaltar medlen som finns i dessa stiftelser och det är även dem som administrerar utbetalningarna." (Föreskrift för stipendier för studenter inom Chalmers utbildningsprogram på grund - och avancerad nivå C 2019 -0748, Adlerbertska Stiftelserna)

"SEB förvaltar medlen som finns i denna stiftelse och det är även dem som administrerar utbetalningarna." (Föreskrift för stipendier för studenter inom Chalmers utbildningsprogram på grund - och avancerad nivå C 2019 -0748, Adlerbertska Hospitiestiftelsen)


### Write response to txt file

In [31]:
open('output/answer.txt', 'w').close()
with open('output/answer.txt', 'a', encoding='utf-8') as f:
    f.write(answer)