# Imports

In [4]:
from langchain_community.document_loaders.pdf import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoTokenizer
from langchain.vectorstores import Chroma
import os
import chromadb
import torch
import re

MODEL_NAME_KBLAB = 'KBLab/sentence-bert-swedish-cased'
MODEL_NAME_KB = 'KB/bert-base-swedish-cased'
MODEL_NAME_INTFLOAT = 'intfloat/multilingual-e5-large-instruct'

PATH_DB = './db'
COLLECTION_NAME = 'policy_collection'

  from .autonotebook import tqdm as notebook_tqdm


# Load data

In [5]:
FILE_PATH = 'data/P2-subset/Forskarutbildning/Föreskrifter för stipendier för studenter inom Chalmers utbildningsprogram på grund- och avancerad nivå C 2019-0748.PDF'
DIR_PATH = 'data/Arbetsmiljö'

doc_names = os.listdir(DIR_PATH)

# change '.PDF' to '.pdf'
for doc_name in doc_names:
    if doc_name.endswith('.PDF'):
        old_file_path = os.path.join(DIR_PATH, doc_name)
        new_file_name = doc_name[:-4] + '.pdf'
        new_file_path = os.path.join(DIR_PATH, new_file_name)
        os.rename(old_file_path, new_file_path)

# load pdf document. Use PyPDFDirectoryLoader for loading files in directory.
loader = PyPDFDirectoryLoader(DIR_PATH)
# loader = PyPDFLoader(FILE_PATH)
documents = loader.load()
documents[:1]

[Document(metadata={'source': 'data/Arbetsmiljö/C 2022-0879 Föreskrift för strålsäkerhet och kärnteknisk verksamhet vid Chalmers slutlig.pdf', 'page': 0}, page_content='STYRDOKUMENT: Föreskrift för strålsäkerhet och kärnteknisk verksamhet vid Chalmers tekniska\nhögskola AB. Dnr C 2022-0879Beslut av:\nRektorTyp av styrdokument:\nFöreskriftDiarienummer:\nC 2022-0879\nDatum för\nbeslut:Handläggare:\nEva AlbersDokumentstruktur:\nD1.3 Verksamhetsledning\nDokumentet\ngäller från och\nmed:Avdelning/motsvarande som ansvarar för\natt dokumentet skapas och/eller\nrevideras:\nInstitutionen för Kemi och kemiteknikDokumentet\nreviderat,\ndatum:Versionsnummer:\n1\nDokumentet\ngäller till och\nmed:\nTills vidareDokumentet ersätter tidigare beslut:\nC 2021-1537Dokumentet genomgånget utan\nändring, datum:\nFöreskrift för strålsäkerhet och kärnteknisk verksamhet vid\nChalmers tekniska högskola AB\nStyrdokument vid Chalmers\nBeskrivning av föreskriften:\nFöreskrift för ”strålsäkerhet och kärnteknisk ver

### Split documents into chunks

In [6]:
def split_documents(chunk_size, documents, tokenizer_name):
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """

    # We use a hierarchical list of separators specifically tailored for splitting Markdown documents
    # This list is taken from LangChain's MarkdownTextSplitter class
    MARKDOWN_SEPARATORS = [
        "\n\n\n\n",
        "\n\n\n",
        "\n\n",
        "\n",
        ".",
        ",",
        " ",
        "",
    ]
    # Remove all whitespaces between newlines e.g. \n \n \n \n --> \n\n\n\n
    for doc in documents:
        doc.page_content = re.sub('(?<=\\n) (?=\\n)', '', doc.page_content)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in documents:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

def upload_data(docs, embedding_model, chunk_size, collection_name, persist_dir):
    """
    Create a Chroma vectorstore from a list of documents.
    """
    
    # Split the documents to chunks
    docs = split_documents(
        chunk_size,  # Choose a chunk size adapted to our model
        documents,
        tokenizer_name=MODEL_NAME_KBLAB,
    )

    # Write chunk texts to txt file
    # open('output/chunks.txt', 'w').close()
    # for chunk in docs:
    #     with open('output/chunks.txt', 'a', encoding='utf-8') as f:
    #         f.write(chunk.page_content + '\n\n')
    
    # Create Chroma DB with document chunks
    print(f"Added {len(docs)} chunks to ChromaDB")
    return Chroma.from_documents(
        documents=docs,
        embedding=embedding_model,
        collection_name=collection_name,
        persist_directory=persist_dir
    )
    
def get_embedding_model(model_name, device):
    # Initialize an instance of HuggingFaceEmbeddings with the specified parameters
    return HuggingFaceEmbeddings(
        model_name=model_name, # Provide the pre-trained model's path
        model_kwargs={'device':device}, # Pass the model configuration options
        encode_kwargs={'normalize_embeddings': True} # Set `True` for cosine similarity
    )

### Get embedding models maximimum sequence length (not strict)

In [None]:
# from sentence_transformers import SentenceTransformer
# print(f"Model's maximum sequence length: {SentenceTransformer(MODEL_NAME_KBLAB).max_seq_length}")

Model's maximum sequence length: 384


## Initialize embedding model

In [7]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'   # Check for CUDA enabled GPU
embedding_model = get_embedding_model(MODEL_NAME_KBLAB, device)

### Upload docuements to ChromaDB and create a vectorstore

Run this code if database is empty. 

In [12]:
# vectorstore = upload_data(documents, embedding_model, 768, COLLECTION_NAME, PATH_DB)
# collection = vectorstore._client.get_or_create_collection(name=COLLECTION_NAME)

Added 17 chunks to ChromaDB


## Initialize existing persisting storage

In [8]:
# Now we can load the persisted database from disk, and use it as normal.
client = chromadb.PersistentClient(path=PATH_DB)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_model,
    client=client
)
vectorstore.get()

{'ids': ['102603e5-1aff-4515-95b7-9c32bb80b4cb',
  '1a29bc96-879e-4382-ad27-4835ea02f02b',
  '1bfe5f3f-fce9-4cfe-9194-8ebc554ce3a0',
  '29ecdcba-934d-490a-8757-07a4b14b094f',
  '2a073cfb-53a4-426f-a649-cfb6721f94c8',
  '55fe31f4-650d-4859-b93a-a89616a1d787',
  '5dd9c80c-ebdc-4e77-9976-2b96e107ffbb',
  '66ddce80-d875-4a1a-bfb0-346a8d3aaff1',
  '708551e0-3bf7-4695-bdb9-3117d4f8e9f6',
  '94239123-baa6-4f8e-8a01-afca7d598ed7',
  '9ef7da22-b26e-4163-a878-92c61719d5f7',
  'ae946a05-14a2-4067-bf25-66ae8df9bb5f',
  'af204445-66d7-4fc4-889e-c917a5596819',
  'b01ad458-961f-466e-870a-f3507af40df5',
  'bee4c594-73a3-4df9-a2c2-0e716bc4c5de',
  'c1ecf4b2-408c-4301-9d99-c7b9b91857d8',
  'fe3b11a5-87f1-4735-960b-010f082ec389'],
 'embeddings': None,
 'metadatas': [{'page': 1,
   'source': 'data/Arbetsmiljö/Work environment policy 2022-2024.pdf',
   'start_index': 0},
  {'page': 8,
   'source': 'data/Arbetsmiljö/C 2022-0879 Föreskrift för strålsäkerhet och kärnteknisk verksamhet vid Chalmers slutlig.p

# Preparing the LLM Model

In [12]:
from langchain_core.prompts import PromptTemplate

def build_prompt():
    template = """Use the following pieces of context to answer the question at the end.
    The context consists of a number of governing documents from a university. They are all in Swedish. 
    Your task is to act as an expert on the information that they contain. 
    You will later be asked various questions that should be possible to answer with the contents of the documents. 
    However, it might be that the question asked cannot be answered based on the documents’ information alone. 
    You are only allowed to answer questions based on the information from the documents.
    
    If you lack information, the information is ambiguous, or the answer for any other reason is uncertain or unclear, state that “the answer is not clear” and explain why.
    For any answer you give, you are always forced to give supporting quotes and refer to the documents from which they originate.
    Break your answer up into nicely readable paragraphs.

    The question will be formulated either in swedish or en english. You are forced to answer in the same language as the question. If the language is ambiguous, default to swedish.

    {context}

    Question: {question}

    Helpful Answer:"""
    return PromptTemplate.from_template(template)

In [20]:
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback
from langchain_community.chat_models import ChatOllama

import getpass
import os

# os.environ["OPENAI_API_KEY"] = getpass.getpass()

# llm = ChatOpenAI(model="gpt-4o") 
llm = ChatOllama(model="llama3")

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | build_prompt()
    | llm
    | StrOutputParser()
)

query = 'What are the most important documents for arbtesmiljö?'

## loop here to see reproducability is not importante
for i in range(10):
    answer = rag_chain.invoke(query)
    print(["Svar nr", i])
    print("====================================================================")
    print(answer)

['Svar nr', 0]
Based on the provided context, the most important documents for arbeTSmiljö (work environment) at Chalmers are:

* POLICY DOCUMENT: Chalmers Work Environment Policy, Ref. no. C 2021-1894
* STYRDOKUMENT: Chalmers Arbetsmiljöpolicy, Dnr C 2021-1894

These two documents outline the work environment policy and guidelines for Chalmers, emphasizing the importance of creating a safe and healthy work environment. They provide the framework for managing and improving the work environment, including reporting and investigating accidents and near-accidents, as well as reviewing and revising the systematic work environment management on an annual basis.

As stated in the documents:

* "The work environment at Chalmers shall reduce the risks of accidents and work or study-related ill health, among students and employees." (POLICY DOCUMENT)
* "Verksamheten ska undersöka och riskbedöma såväl den fysiska som den organisatoriska och sociala arbetsmiljön regelbundet, och vid förändringar,