# Imports

In [None]:
from langchain_community.document_loaders.pdf import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoTokenizer
from langchain.vectorstores import Chroma
import chromadb
import torch
import re

# Hugging Face model identifiers. In the visible part of this notebook only
# MODEL_NAME_KBLAB is used (as tokenizer name and as embedding model); the
# other two are defined but not referenced.
MODEL_NAME_KBLAB = 'KBLab/sentence-bert-swedish-cased'
MODEL_NAME_KB = 'KB/bert-base-swedish-cased'
MODEL_NAME_INTFLOAT = 'intfloat/multilingual-e5-large-instruct'

# Path of the persistent Chroma database and the name of the collection in it.
PATH_DB = './db'
COLLECTION_NAME = 'policy_collection'

# Load data

In [None]:
# FILE_PATH is the single PDF loaded below; DIR_PATH is only referenced by the
# commented-out directory loader.
FILE_PATH = 'data/P2-subset/Forskarutbildning/Föreskrifter för stipendier för studenter inom Chalmers utbildningsprogram på grund- och avancerad nivå C 2019-0748.PDF'
DIR_PATH = 'data/P2-subset/Arbetsmiljö'

# Load the PDF document. Use PyPDFDirectoryLoader to load the files in a directory instead.
# loader = PyPDFDirectoryLoader(DIR_PATH)
loader = PyPDFLoader(FILE_PATH)
documents = loader.load()
# Notebook display: show only the first loaded document as a quick sanity check.
documents[:1]

### Split documents into chunks

In [None]:
def split_documents(chunk_size, documents, tokenizer_name):
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.

    Args:
        chunk_size: Maximum chunk length, counted with the tokenizer named by
            `tokenizer_name`. Consecutive chunks overlap by `chunk_size // 10`.
        documents: Documents to split. NOTE: their `page_content` is
            normalised in place (whitespace-only lines are emptied), so the
            caller's objects are modified.
        tokenizer_name: Hugging Face name of the tokenizer used to measure
            chunk length.

    Returns:
        The chunks in their original order, keeping only the first chunk for
        each distinct text.
    """

    # Separators handed to the recursive splitter, ordered from the coarsest
    # break (several blank lines) down to single characters.
    separators = [
        "\n\n\n\n",
        "\n\n\n",
        "\n\n",
        "\n",
        ".",
        ",",
        " ",
        "",
    ]

    # Materialise once so that a one-shot iterable can be traversed twice.
    documents = list(documents)

    # Remove all spaces/tabs between newlines, e.g. "\n \n  \n\t\n" --> "\n\n\n\n".
    # Otherwise padded blank lines hide the multi-newline separators above.
    blank_line_padding = re.compile(r'(?<=\n)[ \t]+(?=\n)')
    for doc in documents:
        doc.page_content = blank_line_padding.sub('', doc.page_content)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=separators,
    )

    chunks = text_splitter.split_documents(documents)

    # Remove duplicates: keep the first chunk seen for every distinct text.
    seen_texts = set()
    unique_chunks = []
    for chunk in chunks:
        if chunk.page_content not in seen_texts:
            seen_texts.add(chunk.page_content)
            unique_chunks.append(chunk)

    return unique_chunks

def upload_data(docs, embedding_model, chunk_size, collection_name, persist_dir, tokenizer_name=None):
    """
    Create a Chroma vectorstore from a list of documents.

    The documents are split into chunks, the chunk texts are dumped to
    'output/chunks.txt' for inspection, and the chunks are then embedded and
    stored in a Chroma collection.

    Args:
        docs: Documents to split and upload.
        embedding_model: Embedding model passed to Chroma.
        chunk_size: Maximum chunk size handed to `split_documents`.
        collection_name: Name of the Chroma collection to create/fill.
        persist_dir: Directory passed to Chroma as `persist_directory`.
        tokenizer_name: Tokenizer used for chunking. Defaults to
            MODEL_NAME_KBLAB when omitted.

    Returns:
        The vectorstore returned by `Chroma.from_documents`.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    if tokenizer_name is None:
        tokenizer_name = MODEL_NAME_KBLAB

    # Split the documents passed in by the caller (not the notebook-global
    # `documents`) into chunks sized for the model.
    chunks = split_documents(
        chunk_size,
        docs,
        tokenizer_name=tokenizer_name,
    )

    # Write chunk texts to txt file: overwrite any previous dump in one pass
    # and make sure the output directory exists first.
    chunks_path = Path('output/chunks.txt')
    chunks_path.parent.mkdir(parents=True, exist_ok=True)
    with chunks_path.open('w', encoding='utf-8') as f:
        f.writelines(chunk.page_content + '\n\n' for chunk in chunks)

    # Create Chroma DB with document chunks; report only once it succeeded.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        collection_name=collection_name,
        persist_directory=persist_dir
    )
    print(f"Added {len(chunks)} chunks to ChromaDB")
    return vectorstore
    
def get_embedding_model(model_name, device):
    """
    Build a HuggingFaceEmbeddings instance for `model_name` running on `device`.
    """
    settings = {
        # Name or path of the pre-trained model
        'model_name': model_name,
        # Model configuration options: where to run the model
        'model_kwargs': {'device': device},
        # Normalise the embeddings (set `True` for cosine similarity)
        'encode_kwargs': {'normalize_embeddings': True},
    }
    return HuggingFaceEmbeddings(**settings)

### Get the embedding model's maximum sequence length (not strict)

In [None]:
# from sentence_transformers import SentenceTransformer
# print(f"Model's maximum sequence length: {SentenceTransformer(MODEL_NAME_KBLAB).max_seq_length}")

## Initialize embedding model

In [None]:
# Use the first CUDA-enabled GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
embedding_model = get_embedding_model(MODEL_NAME_KBLAB, device)

### Upload documents to ChromaDB and create a vectorstore

Run this code if the database is empty.

In [None]:
# vectorstore = upload_data(documents, embedding_model, 768, COLLECTION_NAME, PATH_DB)
# collection = vectorstore._client.get_or_create_collection(name=COLLECTION_NAME)

## Initialize existing persisting storage

In [None]:
# Now we can load the persisted database from disk, and use it as normal.
# Open the database persisted under PATH_DB and fetch the collection,
# creating it if it does not exist yet.
client = chromadb.PersistentClient(path=PATH_DB)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
# Wrap the same collection in a LangChain vectorstore that embeds with
# `embedding_model` and shares the client opened above.
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_model,
    client=client
)
# Notebook display — presumably shows the stored entries as cell output.
vectorstore.get()

# Preparing the LLM Model

In [None]:
from langchain_core.prompts import PromptTemplate

def build_prompt():
    """
    Return the PromptTemplate for question answering over retrieved context.

    The template text tells the model to answer only from the supplied
    documents, to give supporting quotes with their source documents, and to
    answer in Swedish. It has two placeholders: `context` and `question`.
    """
    # NOTE: the indentation of the continuation lines is part of the literal.
    prompt_text = """Use the following pieces of context to answer the question at the end.
    The context consists of a number of governing documents from a university. They are all in Swedish. 
    Your task is to act as an expert on the information that they contain. 
    You will later be asked various questions that should be possible to answer with the contents of the documents. 
    However, it might be that the question asked cannot be answered based on the documents’ information alone. 
    You are only allowed to answer questions based on the information from the documents.
    
    If you lack information, the information is ambiguous, or the answer for any other reason is uncertain or unclear, state that “the answer is not clear” and explain why.
    For any answer you give, you are always forced to give supporting quotes and refer to the documents from which they originate.
    Answer in Swedish and break your answer up into nicely readable paragraphs.

    {context}

    Question: {question}

    Helpful Answer:"""
    prompt = PromptTemplate.from_template(prompt_text)
    return prompt

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback

import os

# Read the OpenAI API key from the environment (None when the variable is unset).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Chat model that generates the final answer.
llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=OPENAI_API_KEY)

# Retriever over the Chroma vectorstore; it supplies the context for the RAG chain.
retriever = vectorstore.as_retriever()

def format_docs(docs):
    """
    Join the `page_content` of every document into one string, separated by blank lines.
    """
    separator = "\n\n"
    contents = [entry.page_content for entry in docs]
    return separator.join(contents)

# RAG pipeline: the input question is passed through unchanged as "question",
# while "context" is produced by the retriever and flattened by format_docs.
# Both fill the prompt, which is sent to the LLM and parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | build_prompt()
    | llm
    | StrOutputParser()
)

### Write response to txt file

In [None]:
query = 'Vad är SEB:s roll i förvaltningen av Adlerbertska Stiftelsernas medel?'
# Run the chain inside the OpenAI callback so that `cb` records the usage
# statistics of the call; print them followed by the answer.
with get_openai_callback() as cb:
    answer = rag_chain.invoke(query)
    print(cb)
    print("====================================================================")
    print(answer)

# Write the answer to disk. Mode 'w' already truncates an existing file, so no
# separate truncate-then-append step is needed; create the folder if missing.
os.makedirs('output', exist_ok=True)
with open('output/answer.txt', 'w', encoding='utf-8') as f:
    f.write(answer)

# Visualize Chunk Embeddings in 2D (experimental)

In [None]:
# import pacmap
# import plotly.express as px

# def visualize_chunks(query_vector, collection):
#     print('=> Fitting data to 2D...')
    
#     data = collection.get(include=['documents', 'metadatas', 'embeddings'])
#     df = pd.DataFrame.from_dict(data=data['embeddings'])
#     metadatas = data['metadatas']
#     documents = data['documents']
#     # print('Size of the dataframe: {}'.format(df.shape))
#     # print('Size of the query_vector: {}'.format(len(query_vector)))
    
#     print('=> Extracting info...')
#     embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

#     # Fit the data (the index of transformed data corresponds to the index of the original data)
#     documents_projected = embedding_projector.fit_transform(df.to_numpy() + [query_vector], init='pca')
#     df = pd.DataFrame.from_dict(
#         [
#             {
#                 'x': documents_projected[i, 0],
#                 'y': documents_projected[i, 1],
#                 'source': metadatas[i]['source'].split('/')[2], # May give error. If so, check the 'source' attribute string and change the split() condition
#                 'extract': documents[i][:100] + '...',
#                 'symbol': 'circle',
#                 'size_col': 0.6,
#             }
#             for i in range(len(documents))
#         ]
#         + [
#             {
#                 'x': documents_projected[-1, 0],
#                 'y': documents_projected[-1, 1],
#                 'source': 'User query',
#                 'extract': query,
#                 'size_col': 0.1,
#                 'symbol': 'star',
#             }
#         ]
#     )

#     # Visualize the chunk vector embeddings
#     print('=> Visualizing...')
#     fig = px.scatter(df, x='x', y='y', width=800, height=500,
#         color='source',
#         hover_data='extract',
#         size='size_col',
#         symbol='symbol',
#         color_discrete_map={'User query': 'black'},
#     )
#     fig.update_traces(
#         marker=dict(opacity=1, line=dict(width=0, color='DarkSlateGrey')),
#         selector=dict(mode='markers'),
#     )
#     fig.update_layout(
#         legend_title_text='<b>Chunk source</b>',
#         title='<b>2D Projection of Chunk Embeddings via PaCMAP</b>',
#     )
#     fig.show()

# # Embed a query
# query = 'Hur är strålsäkerhetsarbetet organiserat?'
# query_vector = embedding_model.embed_query(query)

# # Visualize from collection
# visualize_chunks(query_vector, collection)