# Imports

In [56]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders.pdf import PyPDFLoader, PyPDFDirectoryLoader
from langchain_community.callbacks import get_openai_callback
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

from transformers import AutoTokenizer

import openai
import os
import chromadb
import torch
import re

MODEL_NAME_KBLAB = 'KBLab/sentence-bert-swedish-cased'
MODEL_NAME_KB = 'KB/bert-base-swedish-cased'
MODEL_NAME_INTFLOAT = 'intfloat/multilingual-e5-large-instruct'

PATH_DB = './db'
COLLECTION_NAME = 'policy_collection'

FILE_PATH = './src/documents/Alkohol- och drogpolicy.pdf'
DIR_PATH = './src/documents'

# Helper functions

In [37]:
from typing import List

def save_to_dir(texts: List[str], file_name: str, dir_path: str ='outputs'):
    """
    Save text output to a .txt file in an `outputs` directory. 
    """
    
    # Create an output directory if it doesn't exist.
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
        print("Folder %s created!" % dir_path)

    # Run this to write the answer to a txt file in the output folder
    file_path = dir_path + '/' +  file_name + '.txt'
    open(file_path, 'w').close()
    for text in texts:
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(text + "\n\n")

def split_documents(chunk_size, documents, tokenizer_name):
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """

    # We use a hierarchical list of separators specifically tailored for splitting Markdown documents
    # This list is taken from LangChain's MarkdownTextSplitter class
    MARKDOWN_SEPARATORS = [
        "\n\n\n\n",
        "\n\n\n",
        "\n\n",
        "\n",
        ".",
        ",",
        " ",
        "",
    ]
    # Remove all whitespaces between newlines e.g. \n \n \n \n --> \n\n\n\n
    for doc in documents:
        doc.page_content = re.sub('(?<=\\n) (?=\\n)', '', doc.page_content)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in documents:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

def upload_data(docs, embedding_model, chunk_size, collection_name, persist_dir):
    """
    Create a Chroma vectorstore from a list of documents.
    """
    
    # Split the documents to chunks
    docs = split_documents(
        chunk_size,  # Choose a chunk size adapted to our model
        documents,
        tokenizer_name=MODEL_NAME_KBLAB,
    )

    # Write chunk texts to txt file
    chunks = [chunk.page_content for chunk in docs]
    save_to_dir(chunks, 'chunks')
    
    # Create Chroma DB with document chunks
    print(f"Added {len(docs)} chunks to ChromaDB")
    return Chroma.from_documents(
        documents=docs,
        embedding=embedding_model,
        collection_name=collection_name,
        persist_directory=persist_dir
    )

# Chroma vectorstore and Embedding model

### Initialize embedding model

In [31]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'   # Check for CUDA enabled GPU
embedding_model = HuggingFaceEmbeddings(
    model_name=MODEL_NAME_KBLAB, # Provide the pre-trained model's path
    model_kwargs={'device':device}, # Pass the model configuration options
    encode_kwargs={'normalize_embeddings': True} # Set `True` for cosine similarity
)

### Load and upload documents to ChromaDB and create a vectorstore

Run this code if database is empty. Comment to not run again.

In [57]:
# Load pdf document. Use PyPDFDirectoryLoader for loading files in directory.
# loader = PyPDFLoader(FILE_PATH)
# loader = PyPDFDirectoryLoader(DIR_PATH)
# documents = loader.load()
# print('Nr. of documents:', len(documents))
# print('A Document object:', documents[:1])

# Create vectorstore with the documents
# vectorstore = upload_data(documents, embedding_model, 768, COLLECTION_NAME, PATH_DB)

Nr. of documents: 1
A Document object: [Document(metadata={'source': 'src\\documents\\Alkohol- och drogpolicy.pdf', 'page': 0}, page_content='STYRDOKUMENT: Chalmers Alkohol-och drogpolicy C-2023-1732. Beslut av rektor 2024-03-18.Beslut av:\nRektorTyp av styrdokument:\nPolicyDiarienummer:\nC 2023-1732\nDatum för beslut :\nSe e-signeringHandläggare :\nKatrin Axelsson\nDokumentet gäller från\noch med\n2024-03-18Avdelning/motsvarande som ansvarar för att dokumentet skapas och/eller\nrevideras:\nHR-avdelningen\nDokumentet gäller till\noch med:\nTills vidareDokumentet ersätter tidigare beslut:\nC 2008/538 - Policy och föreskrift avseende alkohol och droger vid Chalmers.\nFöreliggande policy ersätter policy-delen i tidigare beslut. Föreskrift gäller fram tills\nny version av föreskrift beslutas.\nPolicy avseende alkohol-och droger\nStyrdokument vid Chalmers tekniska högskola AB\nEn alkohol- och drogfri högskola är en förutsättning för en god arbets- och studiemiljö. Alkohol och\ndroger hör in

### Initialize existing persisting storage

Now we can load the persisted database from disk, and use it as normal.

In [12]:
client = chromadb.PersistentClient(path=PATH_DB)
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_model,
    client=client
)
vectorstore.get()

{'ids': ['01218326-d821-4268-b1a0-3fc9936a4ba3',
  '02163c15-1049-42cc-bc79-6092ac5753ce',
  '04a913da-00af-4b00-82c7-f416e770ca77',
  '04dce218-91fe-4a99-b5e8-906bf8baa084',
  '05465d72-e89f-4de8-96e9-5189a7a698ff',
  '05ca52b5-5410-46fc-a29b-b0cf9ae89d41',
  '07573b57-e8d0-498e-8a3e-f93630adcd33',
  '09d3bade-92e6-4a6c-9752-277110a3e69c',
  '0b30c765-56fd-47d7-b3ad-962291c49359',
  '0c3cda6e-d9e0-456d-81b5-21b957ff5918',
  '0e73f62f-8803-4a9a-b72f-252b1542b0b7',
  '0f3efe66-447a-482f-9d02-5dd1722bb581',
  '0f43855f-75a3-4a70-b8ce-d8cae7faea46',
  '13adfb6f-5549-4c0c-9366-4135f28e484a',
  '1406999f-edc0-4b38-b3d7-3239a0937ee3',
  '14223456-ecd0-42e4-9906-8fc540101961',
  '1434e7c9-0447-4c87-b6ef-f2330abb6699',
  '19636569-3931-4308-8e77-a1a995fbd3bc',
  '1cb5d1aa-e06d-414b-a7de-b12b842b5925',
  '1f6253f8-54cf-4574-8ebc-0a3af840c1a6',
  '1fc9e59e-55eb-4c20-aca4-3b64abbdd1bc',
  '20649988-aa5b-4974-8b9b-ec985ca834f2',
  '23111814-24c1-4677-9312-c4e4ea2f290e',
  '25c39892-6ff9-4adb-bb30-

# Preparing the LLM Model

### Prompt

In [20]:
def build_prompt():
    template = """Use the following pieces of context to answer the question at the end.
    The context consists of a number of governing documents from a university. They are all in Swedish. 
    Your task is to act as an expert on the information that they contain. 
    You will later be asked various questions that should be possible to answer with the contents of the documents. 
    However, it might be that the question asked cannot be answered based on the documents’ information alone. 
    You are only allowed to answer questions based on the information from the documents.
    
    If you lack information, the information is ambiguous, or the answer for any other reason is uncertain or unclear, state that “the answer is not clear” and explain why.
    For any answer you give, you are always forced to give supporting quotes and refer to the documents from which they originate.
    Answer in Swedish.
    Break your answer up into nicely readable paragraphs.

    {context}

    Question: {question}

    Helpful Answer:"""
    return PromptTemplate.from_template(template)

### LLM model

In [21]:
from langchain_community.chat_models import ChatOllama

# Check if the OPENAI_API_KEY environment variable is set. Prompt the user to set it if not.
if "OPENAI_API_KEY" not in os.environ:
    openai.api_key = input(
        "Please enter your OpenAI API Key. You can get it from https://platform.openai.com/account/api-keys\n"
    )
else:
    openai.api_key = os.getenv('OPENAI_API_KEY')

# Initialize LLM model. Can be switched to other LLM models like llama3.
# llm = ChatOpenAI(model="gpt-3.5-turbo")
llm = ChatOllama(model="llama3")

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()

# Join the document content into one file.
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Initialize a RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | build_prompt()
    | llm
    | StrOutputParser()
)

### Run RAG

In [22]:
query = 'Antag att jag arbetar på Chalmers. Jag har en kompis med mig till arbetsplatsen. Jag ska jobba i två timmar till. Får min kompis dricka öl på mitt kontor medans han väntar på mig? '

# get_openai_callback() prints token usage and the cost.
if type(llm) == ChatOpenAI:
    with get_openai_callback() as cb:
        answer = rag_chain.invoke(query)
        print(cb)
else:
    answer = rag_chain.invoke(query)
    
print("===========================Query====================================")
print(query)
print("===========================Answer===================================")
print(answer)

What a lovely scenario!

As you work at Chalmers, I'm happy to help you with this question.

Firstly, let's look at the context. We have various regulations and guidelines for safety and security, including those related to nuclear technology (SSMFS 2008:1, SSM2022-840, etc.).

Now, regarding your friend's request to drink beer on your office desk while waiting for you... I think it's unlikely that this would be allowed.

Here's why:

1. **Campus rules**: As an employee of Chalmers, you and your friend are subject to the organization's regulations, which include rules about food and beverages in common areas.
2. **Safety considerations**: Having open containers of beer on a desk could pose a risk to both individuals and the surrounding environment (e.g., electrical equipment).
3. **Professional setting**: Your office is a professional space where you're expected to maintain a certain level of decorum and respect for your colleagues.

Considering these factors, I think it's best to advi

In [39]:
save_to_dir([answer], 'outputs', 'answer')

Folder answer created!


# Visualize Chunk Embeddings in 2D (experimental)

In [52]:
import pacmap
import plotly.express as px
import pandas as pd

def visualize_chunks(query_vector, collection, path_split):
    print('=> Fitting data to 2D...')
    
    data = collection.get(include=['documents', 'metadatas', 'embeddings'])
    df = pd.DataFrame.from_dict(data=data['embeddings'])
    metadatas = data['metadatas']
    documents = data['documents']
    
    print('=> Extracting info...')
    embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

    # Fit the data (the index of transformed data corresponds to the index of the original data)
    documents_projected = embedding_projector.fit_transform(df.to_numpy() + [query_vector], init='pca')
    df = pd.DataFrame.from_dict(
        [
            {
                'x': documents_projected[i, 0],
                'y': documents_projected[i, 1],
                'source': metadatas[i]['source'].split(path_split)[2], # May give error. If so, check the 'source' attribute string and change the split() condition
                'extract': documents[i][:100] + '...',
                'symbol': 'circle',
                'size_col': 0.6,
            }
            for i in range(len(documents))
        ]
        + [
            {
                'x': documents_projected[-1, 0],
                'y': documents_projected[-1, 1],
                'source': 'User query',
                'extract': query,
                'size_col': 0.1,
                'symbol': 'star',
            }
        ]
    )

    # Visualize the chunk vector embeddings
    print('=> Visualizing...')
    fig = px.scatter(df, x='x', y='y', width=800, height=500,
        color='source',
        hover_data='extract',
        size='size_col',
        symbol='symbol',
        color_discrete_map={'User query': 'black'},
    )
    fig.update_traces(
        marker=dict(opacity=1, line=dict(width=0, color='DarkSlateGrey')),
        selector=dict(mode='markers'),
    )
    fig.update_layout(
        legend_title_text='<b>Chunk source</b>',
        title='<b>2D Projection of Chunk Embeddings via PaCMAP</b>',
    )
    fig.show()

# Embedd a query
query = 'Hur är strålsäkerhetsarbetet organiserat?'
query_vector = embedding_model.embed_query(query)

# Get collection
collection = vectorstore._client.get_collection(COLLECTION_NAME)

# Print path to source to get what token to split the path. 
# Change `path_split` accordingly in next code cell.
print(collection.get()['metadatas'][0]['source'])

data/P2-subset/Arbetsmiljö/C 2022-0879 Föreskrift för strålsäkerhet och kärnteknisk verksamhet vid Chalmers slutlig.PDF


### Visualize from collection

In [54]:
import warnings
warnings.simplefilter("ignore", UserWarning)

visualize_chunks(query_vector, collection, path_split='/')

=> Fitting data to 2D...
=> Extracting info...
=> Visualizing...
