### Useful links

https://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/#retrieval-and-generation-generate

https://docs.trychroma.com/guides

# Imports & variables

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
from transformers import AutoTokenizer
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.llms import Ollama
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import RetrievalQA
import gradio as gr
import openai
import os
import chromadb

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders.pdf import PyPDFLoader, PyPDFDirectoryLoader
from langchain_community.callbacks import get_openai_callback
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from transformers import AutoTokenizer

import openai
import os
import chromadb
import torch
import re
import gradio as gr
from typing import List

# Various models
MODEL_NAME_KBLAB = 'KBLab/sentence-bert-swedish-cased' # a Swedish-English bilingual model designed for mapping sentences and paragraphs into a dense vector space
MODEL_NAME_KB = 'KB/bert-base-swedish-cased' # a Swedish language model based on BERT, developed by the National Library of Sweden (KBLab)
MODEL_NAME_INTFLOAT = 'intfloat/multilingual-e5-large-instruct' # a multilingual text embedding model that supports 94 languages

PATH_DB = '/Users/kailashdejesushornig/Documents/GitHub/P2_Policydokument/db'
COLLECTION_NAME = 'policy'

FILE_PATH = './src/documents/Alkohol- och drogpolicy.pdf'
DIR_PATH = './src/documents'

# Helper functions

In [2]:
def save_to_dir(texts: List[str], file_name: str, dir_path: str ='outputs'):
    """
    Save text output to a .txt file in an `outputs` directory. 
    """
    
    # Create an output directory if it doesn't exist.
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
        print("Folder %s created!" % dir_path)

    # Run this to write the answer to a txt file in the output folder
    file_path = dir_path + '/' +  file_name + '.txt'
    open(file_path, 'w').close()
    for text in texts:
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(text + "\n\n")

def split_documents(chunk_size, documents, tokenizer_name):
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """

    # We use a hierarchical list of separators specifically tailored for splitting Markdown documents
    # This list is taken from LangChain's MarkdownTextSplitter class
    MARKDOWN_SEPARATORS = [
        "\n\n\n\n",
        "\n\n\n",
        "\n\n",
        "\n",
        ".",
        ",",
        " ",
        "",
    ]
    # Remove all whitespaces between newlines e.g. \n \n \n \n --> \n\n\n\n
    for doc in documents:
        doc.page_content = re.sub('(?<=\\n) (?=\\n)', '', doc.page_content)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in documents:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

def upload_data(documents, embedding_model, chunk_size, collection_name, persist_dir):
    """
    Create a Chroma vectorstore from a list of documents.
    """
    
    # Split the documents to chunks
    docs = split_documents(
        chunk_size,  # Choose a chunk size adapted to our model
        documents,
        tokenizer_name=MODEL_NAME_KBLAB,
    )

    # Write chunk texts to txt file
    # chunks = [chunk.page_content for chunk in docs]
    # save_to_dir(chunks, 'chunks')
    
    # Create Chroma DB with document chunks
    print(f"Added {len(docs)} chunks to ChromaDB")
    return Chroma.from_documents(
        documents=docs,
        embedding=embedding_model,
        collection_name=collection_name,
        persist_directory=persist_dir
    )

# Chroma vectorstore and Embedding model

### Initialize embedding model

In [3]:
# device = 'cuda:0' if torch.cuda.is_available() else 'cpu'   # Check for CUDA enabled GPU
# embedding_model = HuggingFaceEmbeddings(
#     model_name=MODEL_NAME_KBLAB, # Provide the pre-trained model's path
#     model_kwargs={'device':device}, # Pass the model configuration options
#     encode_kwargs={'normalize_embeddings': True} # Set `True` for cosine similarity
# )

### Initialize existing persisting storage

We created load_data.py to upload docs to persitent storage

In [10]:
# # Now we can load the persisted database from disk, and use it as normal.
# # Instantiate a persistent chroma client in the persist_directory.
# # This will automatically load any previously saved collections.
# # Learn more at docs.trychroma.com
ollama_ef = OllamaEmbeddings(model="mxbai-embed-large")
client_db = chromadb.PersistentClient(path=PATH_DB)

#NEW
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction

ollama_ef = OllamaEmbeddingFunction(
    url="http://localhost:11434/api/embeddings",
    model_name="mxbai-embed-large",
)

print(client_db.list_collections())

# # Get the collection.
collection = client_db.get_collection(name=COLLECTION_NAME, embedding_function=ollama_ef)
# collection.get()["metadatas"]

[Collection(id=36e6f6da-91fe-431d-b7f6-1887192dfd75, name=policy_collection), Collection(id=4993b05f-1944-46f2-ba1a-54f053721598, name=policy), Collection(id=5557934c-0ed0-4f34-9872-19a0c2d47521, name=research), Collection(id=fd0b785b-38cd-4665-981b-6937df5bfae1, name=langchain)]


Now we can load the persisted database from disk, and use it as normal.

In [11]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

# Langchain embeddings
ollama_emb = OllamaEmbeddings(model="mxbai-embed-large")
vectorstore = Chroma(
    client=client_db,
    embedding_function=ollama_emb
)

### System prompt for every chat instance

In [12]:
# define the system prompt
def build_prompt():
    template = """ #Background
    Use the following pieces of context to answer the question at the end.
    The context consists of a number of governing documents from a university. They are all in Swedish. 
    Your task is to act as an expert on the information that they contain. 
    You will later be asked various questions that should be possible to answer with the contents of the documents. 
    However, it might be that the question asked cannot be answered based on the documents’ information alone. 
    You are only allowed to answer questions based on the information from the documents.
    
    #ADDITION
    Answer with as much information as you can find. Keep in mind that some documents may be old and no longer valid. 
    If a document mentions that it replaces previous documents via its file number, take into account which document is the current valid one and which should prevail. 
    If you lack information, the information is ambiguous, or the answer for any other reason is uncertain or unclear, state that "SVARET ÄR INTE SÄKERT” and explain why.
    For any answer you give, you are always forced to give supporting quotes and refer to the documents from which they originate.
    Answer in Swedish.
    Break your answer up into nicely readable paragraphs.

    #RESPONSEFOMAT
    Start by repeating the question with a sentence.

    Provide answers in the format: 
    - Topic: (e.g. finance, recruitment)	
        - Document title: (include full name of the document including the file extension  + Diarienummer)
            - Quote and the page it comes from, as well as an interpretation of the quotation.

    For each answer you give, you are always required to provide supporting quotes and refer to the documents from which they are derived.

    #DISCLAIMER 
    End any answer you give with "Observera att denna information är baserad på min sökning i de dokument som tillhandahålls och att jag kanske inte har hittat alla relevanta policyer eller riktlinjer. Om du är osäker på någon specifik aspekt rekommenderar jag att du kontaktar respektive avdelning på Chalmers eller andra relevanta myndigheter för förtydligande."
    
    {context}

    Question: {question}

    Helpful Answer:"""
    return PromptTemplate.from_template(template)

## Simple RAG


In [13]:
from langchain_community.chat_models import ChatOllama

# Check if the OPENAI_API_KEY environment variable is set. Prompt the user to set it if not.
if 'OPENAI_API_KEY' not in os.environ:
    openai.api_key = input(
        "Please enter your OpenAI API Key. You can get it from https://platform.openai.com/account/api-keys\n"
    )
else:
    openai.api_key = os.getenv('OPENAI_API_KEY')

# Initialize LLM model. Can be switched to other LLM models like llama3.
llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_API_KEY'), model="gpt-4o-mini") #or "gpt-4o"
#llm = ChatOllama(model="llama3.1")

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()

# Join the document content into one file.
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Initialize a RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | build_prompt()
    | llm
    | StrOutputParser()
)

In [14]:
query = """Jag är anställd på Chalmers och vill rekrytera en framgångsrik professor från USA till Chalmers. Vilka dokument är relevanta för mig att ha i åtanke när jag bjuder in professorn för att sälja in Chalmers och en ledig tjänst på universitetet? Jag vill:
1. flyga över forskaren från USA,
2. bjuda in honom och hans fru på middag,
3. låta honom bo på det finaste hotellet i Göteborg.
Helst vill jag att Chalmers betalar för alltihop, eftersom just den här professorn och vad han kan tillföra skulle vara ovärderligt för Chalmers. Relevanta områden är ekonomi och rekrytering av professorer. 
"""

# get_openai_callback() prints token usage and the cost.
if type(llm) == ChatOpenAI:
    with get_openai_callback() as cb:
        answer = rag_chain.invoke(query)
        print(cb)
else:
    answer = rag_chain.invoke(query)
    
print("===========================Query====================================")
print(query)
print("===========================Answer===================================")
print(answer)

Tokens Used: 1172
	Prompt Tokens: 638
	Completion Tokens: 534
Successful Requests: 1
Total Cost (USD): $0.0
Jag är anställd på Chalmers och vill rekrytera en framgångsrik professor från USA till Chalmers. Vilka dokument är relevanta för mig att ha i åtanke när jag bjuder in professorn för att sälja in Chalmers och en ledig tjänst på universitetet? Jag vill:
1. flyga över forskaren från USA,
2. bjuda in honom och hans fru på middag,
3. låta honom bo på det finaste hotellet i Göteborg.
Helst vill jag att Chalmers betalar för alltihop, eftersom just den här professorn och vad han kan tillföra skulle vara ovärderligt för Chalmers. Relevanta områden är ekonomi och rekrytering av professorer. 

Frågan handlar om vilka dokument som är relevanta för rekrytering av en professor och kostnaderna för att bjuda in denne till Chalmers.

- Topic: Rekrytering
    - Document title: Rekryteringspolicy för Chalmers tekniska högskola, Dnr 123-4567
        - "Rekrytering av professorer ska ske i enlighet m

## Add GUI

In [15]:
def respond(question,history):
    return  rag_chain.invoke(question)


gr.ChatInterface(
    respond,
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Ask me question related to the governing documents at Chalmers", container=False, scale=7),
    title="Emilia Chatbot",
    examples=["Is it allowed to drink beer on campus?", 
              "Can I invite over a professor from the states and let Chalmers pay for his stay?", 
              "My name is Carl XVI Gustaf, can I park my Jaguar anywhere on campus?"],
    cache_examples=True,
    retry_btn=None,

).launch(share = True)

Using cache from '/Users/kailashdejesushornig/Documents/GitHub/P2_Policydokument/script/gradio_cached_examples/14' directory. If method or examples have changed since last caching, delete this folder to clear cache.

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://3a8f1b990be7d14d68.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




To save response to an txt file

In [None]:
# save_to_dir([answer], 'outputs', 'answer')

# Visualize Chunk Embeddings in 2D (experimental)

In [None]:
# import pacmap
# import plotly.express as px
# import pandas as pd

# def visualize_chunks(query_vector, collection, path_split):
#     print('=> Fitting data to 2D...')
    
#     data = collection.get(include=['documents', 'metadatas', 'embeddings'])
#     df = pd.DataFrame.from_dict(data=data['embeddings'])
#     metadatas = data['metadatas']
#     documents = data['documents']
#     print(metadatas[0]['source'].split(path_split)[1])
    
#     print('=> Extracting info...')
#     embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

#     # Fit the data (the index of transformed data corresponds to the index of the original data)
#     documents_projected = embedding_projector.fit_transform(df.to_numpy() + [query_vector], init='pca')
#     df = pd.DataFrame.from_dict(
#         [
#             {
#                 'x': documents_projected[i, 0],
#                 'y': documents_projected[i, 1],
#                 'source': metadatas[i]['source'].split(path_split)[1], # May give error. If so, check the 'source' attribute string and change the split() condition
#                 'extract': documents[i][:100] + '...',
#                 'symbol': 'circle',
#                 'size_col': 0.6,
#             }
#             for i in range(len(documents))
#         ]
#         + [
#             {
#                 'x': documents_projected[-1, 0],
#                 'y': documents_projected[-1, 1],
#                 'source': 'User query',
#                 'extract': query,
#                 'size_col': 0.1,
#                 'symbol': 'star',
#             }
#         ]
#     )

#     # Visualize the chunk vector embeddings
#     print('=> Visualizing...')
#     fig = px.scatter(df, x='x', y='y', width=800, height=500,
#         color='source',
#         hover_data='extract',
#         size='size_col',
#         symbol='symbol',
#         color_discrete_map={'User query': 'black'},
#     )
#     fig.update_traces(
#         marker=dict(opacity=1, line=dict(width=0, color='DarkSlateGrey')),
#         selector=dict(mode='markers'),
#     )
#     fig.update_layout(
#         legend_title_text='<b>Chunk source</b>',
#         title='<b>2D Projection of Chunk Embeddings via PaCMAP</b>',
#     )
#     fig.show()

# # Embedd a query
# query = 'Hur är strålsäkerhetsarbetet organiserat?'
# query_vector = embedding_model.embed_query(query)

# # Get collection
# collection = vectorstore._client.get_collection(COLLECTION_NAME)

# # Print path to source to get what token to split the path. 
# # Change `path_split` accordingly in next code cell.
# print(collection.get()['metadatas'][0]['source'])

### Visualize from collection

In [17]:
# import warnings
# warnings.simplefilter("ignore", UserWarning)

# visualize_chunks(query_vector, collection, path_split='\\')