In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

In [1]:
from utils import *

  from tqdm.autonotebook import tqdm, trange


# Document Splitting

In [5]:
#%pip install langchain --upgrade
#%pip install --upgrade --quiet pypdf

In [1]:
pdf_filepath = "handbook.pdf"

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import re

In [6]:
# Splitter configuration: target size of each chunk and the amount shared
# between consecutive chunks (presumably measured in characters — TODO confirm
# against the splitter's length function).
chunk_size = 500
chunk_overlap = 100

# Build the notebook-wide splitter; later cells and helpers reference it by the
# name `r_splitter`. The separator list runs from coarse (blank line) to fine
# (empty string).
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ".", " ", ""]
)

In [7]:
# Load the PDF at `pdf_filepath` into LangChain documents.
loader = PyPDFLoader(pdf_filepath)
documents = loader.load()

# Split the loaded documents into chunks using the notebook-level `r_splitter`.
chunks = r_splitter.split_documents(documents)

In [8]:
len(chunks)

342

In [6]:
chunks[-1]

Document(metadata={'source': 'handbook.pdf', 'page': 45}, page_content='[date|req|signer1]\n_________________________                           _________________________\n Signature                                                                         Date\n[text|req|signer1]\n_________________________\n Print Name\n46')

In [7]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output with every document.
chunks[:3]

[Document(metadata={'source': 'handbook.pdf', 'page': 0}, page_content='Zania, Inc.\nZania Employee Handbook\nSeptember 07, 2023'),
 Document(metadata={'source': 'handbook.pdf', 'page': 1}, page_content='TABLE OF CONTENTS\nCORE POLICIES\n4\n1.0 WELCOME\n4\n1.1 A Welcome Policy\n4\n1.2 At-Will Employment\n4\n2.0 INTRODUCTORY LANGUAGE AND POLICIES\n5\n2.1 About the Company\n5\n2.2 Company Facilities\n5\n2.3 Ethics Code\n5\n2.4 Mission Statement\n5\n2.5 Our Organization\n5\n2.6 Revisions to Handbook\n5\n3.0 HIRING AND ORIENTATION POLICIES\n5\n3.1 Accommodations for Pregnant Employees\n5\n3.2 Conflicts of Interest\n6\n3.3 Employment Authorization Verification\n6\n3.4 Employment of Relatives and Friends\n6\n3.5 Job Descriptions\n7'),
 Document(metadata={'source': 'handbook.pdf', 'page': 1}, page_content='6\n3.4 Employment of Relatives and Friends\n6\n3.5 Job Descriptions\n7\n3.6 New Hires and Introductory Periods\n7\n3.7 Training Program\n7\n4.0 WAGE AND HOUR POLICIES\n7\n4.1 Attendance\n7\

In [31]:
# function to process and obtain chunks of document
def get_chunks(file):
    """
    Load a PDF and break it into chunks.

    Args:
        file: Path of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        The result of ``r_splitter.split_documents`` applied to the loaded
        documents.
    """
    # Load first, then hand the loaded documents to the notebook-level splitter.
    pages = PyPDFLoader(file).load()
    return r_splitter.split_documents(pages)

# Vector Store (ChromaDB)

In [43]:
# %pip install langchain_huggingface

In [15]:
# from langchain_huggingface import HuggingFaceEmbeddings

# embeddings_model = HuggingFaceEmbeddings(model="sentence-transformers/all-mpnet-base-v2")

In [14]:
from sentence_transformers import SentenceTransformer

# Cache the embedding model on disk so it can be loaded from a local path.
# The guard checks for the saved model directory itself rather than just
# 'model/': a pre-existing or half-populated 'model/' folder would otherwise
# skip the download and leave 'model/all-mpnet-base-v2' missing.
if not os.path.isdir('model/all-mpnet-base-v2'):
    os.makedirs('model', exist_ok=True)
    embeddings_model = SentenceTransformer('all-mpnet-base-v2')
    embeddings_model.save(path='model/all-mpnet-base-v2')

In [9]:
#from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Build the LangChain embeddings wrapper from the local model directory
# "model/all-mpnet-base-v2".
model_name = "model/all-mpnet-base-v2"
# No extra model or encode options are set; both dicts are passed through empty.
model_kwargs = {}
encode_kwargs = {}
embeddings_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [26]:
from langchain.vectorstores import Chroma

In [10]:
# Directory intended for on-disk persistence; currently unused because the
# `persist_directory` argument below is commented out.
persist_directory = 'chroma/'
# Embed the chunks with `embeddings_model` and index them in a Chroma store.
# NOTE(review): no collection name is given, so the library default is used —
# confirm that building several stores in one kernel does not share a collection.
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings_model,
    # persist_directory=persist_directory
)

In [33]:
# Function to process a file and build a vector database
def get_vectordb(file):
    """
    Process the given file and return a vector database built from it.

    Args:
        file: Path of the document, forwarded to ``get_chunks``.

    Returns:
        The Chroma vector store created from the file's chunks.
    """
    from uuid import uuid4

    # read, split and obtain chunks of the input file
    chunks = get_chunks(file)

    # Embed and index the chunks. A fresh collection name is used on every
    # call: with the default name, stores built in the same kernel can end up
    # sharing one collection, so calling this twice would append duplicate
    # chunks (or mix different files) into the search results.
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings_model,
        collection_name=f"docs-{uuid4().hex}",
    )

    return vector_store


In [34]:
vector_store = get_vectordb(pdf_filepath)

In [None]:
print(vector_store._collection.count())

342


In [12]:
# search for the relevant docs for user query

# Ask a sample question and pull the three closest chunks from the store.
user_query = "What is the document about ?"
rel_docs = vector_store.similarity_search(user_query, k=3)

# Show each hit between "======" rulers, followed by its metadata in brackets.
for res in rel_docs:
    print("======", res.page_content, f"[{res.metadata}]", "======", "", sep="\n")

TABLE OF CONTENTS
CORE POLICIES
4
1.0 WELCOME
4
1.1 A Welcome Policy
4
1.2 At-Will Employment
4
2.0 INTRODUCTORY LANGUAGE AND POLICIES
5
2.1 About the Company
5
2.2 Company Facilities
5
2.3 Ethics Code
5
2.4 Mission Statement
5
2.5 Our Organization
5
2.6 Revisions to Handbook
5
3.0 HIRING AND ORIENTATION POLICIES
5
3.1 Accommodations for Pregnant Employees
5
3.2 Conflicts of Interest
6
3.3 Employment Authorization Verification
6
3.4 Employment of Relatives and Friends
6
3.5 Job Descriptions
7
[{'page': 1, 'source': 'handbook.pdf'}]

Acknowledgment of Receipt and Review
By signing below, I acknowledge that I have received a copy of the Zania, Inc. Employee Handbook (handbook) and that I
have read it, understand it, and agree to comply with it. I understand that the Company has the maximum discretion
permitted by law to interpret, administer, change, modify, or delete the rules, regulations, procedures, and benefits contained
[{'page': 45, 'source': 'handbook.pdf'}]

agreement, the answe

In [20]:
# Concatenate the retrieved chunks into one context string: each chunk becomes
# a fenced section holding its source and its text.
context = "".join(
    "```\n" + "Source: " + doc.metadata['source'] + "\ncontent:\n" + doc.page_content + "```\n"
    for doc in rel_docs
)


In [21]:
print(context)

```
Source: handbook.pdf
content:
introductory period/after completing # days of employment]].
Deposits Into Your Leave Account
Vacation is calculated according to [[your work anniversary year/the calendar year/the fiscal year, which begins on [date]
and ends on [date]]]. 
[[
EMPLOYERS MUST CHOOSE ONE
:]]
[[
Option 1
:]]
The amount of vacation received each year is based on your length of service and [[is granted in a lump sum at the```
```
Source: handbook.pdf
content:
introductory period/after completing # days of employment]].
Deposits Into Your Leave Account
Vacation is calculated according to [[your work anniversary year/the calendar year/the fiscal year, which begins on [date]
and ends on [date]]]. 
[[
EMPLOYERS MUST CHOOSE ONE
:]]
[[
Option 1
:]]
The amount of vacation received each year is based on your length of service and [[is granted in a lump sum at the```
```
Source: handbook.pdf
content:
Vacation granted during your first year of employment will be prorated based on your

In [17]:
def get_rel_docs(user_query, document_storage, filters=None, k=3):
    """
    Get the relevant document chunks from the vector store.

    Args:
        user_query: Text to search for.
        document_storage: Object exposing
            ``similarity_search(query=..., k=..., filter=...)``.
        filters: Optional metadata filter forwarded as ``filter``. ``None`` or
            an empty container means "no filtering".
        k: Number of chunks to request (default 3).

    Returns:
        Whatever ``document_storage.similarity_search`` returns.
    """
    # `filters` defaults to None instead of a shared mutable list, and an empty
    # filter is normalised to None so the store never receives an empty value.
    return document_storage.similarity_search(
        query=user_query,
        k=k,
        filter=filters or None,
    )


In [18]:
# Pass the vector store explicitly; `document_storage` has no default value.
get_rel_docs(user_query, vector_store)

[Document(metadata={'page': 1, 'source': 'handbook.pdf'}, page_content='TABLE OF CONTENTS\nCORE POLICIES\n4\n1.0 WELCOME\n4\n1.1 A Welcome Policy\n4\n1.2 At-Will Employment\n4\n2.0 INTRODUCTORY LANGUAGE AND POLICIES\n5\n2.1 About the Company\n5\n2.2 Company Facilities\n5\n2.3 Ethics Code\n5\n2.4 Mission Statement\n5\n2.5 Our Organization\n5\n2.6 Revisions to Handbook\n5\n3.0 HIRING AND ORIENTATION POLICIES\n5\n3.1 Accommodations for Pregnant Employees\n5\n3.2 Conflicts of Interest\n6\n3.3 Employment Authorization Verification\n6\n3.4 Employment of Relatives and Friends\n6\n3.5 Job Descriptions\n7'),
 Document(metadata={'page': 45, 'source': 'handbook.pdf'}, page_content='Acknowledgment of Receipt and Review\nBy signing below, I acknowledge that I have received a copy of the Zania, Inc. Employee Handbook (handbook) and that I\nhave read it, understand it, and agree to comply with it. I understand that the Company has the maximum discretion\npermitted by law to interpret, administer, cha

# LLM Response

In [2]:
import os

# Read the key from the environment instead of hard-coding it in the notebook.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

# A plain Python variable does not configure the tokenizers library; the
# setting has to be exported as an environment variable as well.
TOKENIZERS_PARALLELISM = False
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [20]:
import openai


In [None]:
def get_openai_response(prompt, model="gpt-4o-mini"):
    """
    Send a prompt to the OpenAI chat completions API and return the reply text.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name; defaults to "gpt-4o-mini".

    Returns:
        The content of the first choice's message.

    Raises:
        Exception: If creating the client or calling the API fails; the
            original error is chained as the cause.
    """
    try:
        # An empty OPENAI_API_KEY is passed as None so the client can fall back
        # to its own lookup of the OPENAI_API_KEY environment variable.
        client = openai.OpenAI(api_key=OPENAI_API_KEY or None)
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        # Keep the original exception type for callers, but preserve the cause.
        raise Exception(f"Error getting response from OpenAI API: {e}") from e
    

In [71]:
#messages = [HumanMessage(content="")]

In [19]:
# Prompt template for question answering; `str.format` fills in the
# {user_query} and {context} placeholders.
qna_prompt = """
You are a helpful assistant that provides concise answers to the query based on the provided context.  

Here is the user's query:
"{user_query}"

Below is the relevant context:
"{context}"

Based on this context, provide a clear and accurate answer to the user's query. 
If the provided information does not contain the answer, say "Data Not Available"
"""

In [22]:
# Fill the template with the current question and retrieved context, then
# send it to the LLM; the reply is displayed as the cell's output.
formatted_prompt = qna_prompt.format(user_query=user_query, context=context)
get_openai_response(formatted_prompt)

'The vacation policy includes options for calculating vacation based on either the length of service or an accrual system. Vacation granted in the first year is prorated based on the hire date. For eligible employees, vacation can be accrued over time, subject to a maximum limit. Further specifics, such as the number of hours or days accrued, are not provided in the content.'

In [38]:
def get_query_response(user_query, document_storage):
    """
    Answer a query from the chunks retrieved for it.

    Fetches the relevant chunks for ``user_query`` from ``document_storage``,
    packs them into the prompt context, and asks the LLM.

    Returns:
        A ``(response, rel_docs)`` tuple: the LLM answer and the chunks that
        were used as context.
    """
    hits = get_rel_docs(user_query, document_storage, filters=None)

    # one fenced section per chunk: its source followed by its text
    context = "".join(
        "```\n" + "Source: " + hit.metadata['source'] + "\ncontent:\n" + hit.page_content + "```\n"
        for hit in hits
    )

    prompt = qna_prompt.format(user_query=user_query, context=context)
    answer = get_openai_response(prompt)
    return answer, hits


In [25]:
user_query = "Can I reimburse all my expenses ?"
# get_query_response requires the vector store as its second argument.
response, rel_docs = get_query_response(user_query, vector_store)
print(response)

Based on the company policies, you can reimburse only eligible expenses that comply with company policies and have proper documentation. Personal expenses unrelated to work, fines, unauthorized purchases, or expenses violating company policies or local regulations are not reimbursable. Approval from your supervisor or department head is required to evaluate the appropriateness of expenses.


In [16]:
# function to process given documents and get llm responses to a list of questions
def get_responses_questions_list(filepath, queries_list):
    """
    Build a vector store for one file and answer every query against it.

    Args:
        filepath: Path of the document, forwarded to ``get_vectordb``.
        queries_list: Iterable of question strings.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, one per query,
        in the order the queries were given.
    """
    # index the document once, then reuse the store for every question
    print("Processing the Document....")
    store = get_vectordb(filepath)
    print("Document Processed and Vectore Database Created")

    print("Getting Responses for each question...")
    # get_query_response returns (answer, chunks); only the answer is kept
    qa_pairs = [
        {
            "question": question,
            "answer": get_query_response(question, document_storage=store)[0],
        }
        for question in queries_list
    ]
    print("Generated the answers for all questions.")

    return qa_pairs

# Test Cases

In [1]:
from utils import *

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Document to query.
pdf_filepath = "handbook.pdf"

# Questions to ask about the document.
questions_list = [
"What is the name of the company?",
"Who is the CEO of the company?",
"What is their vacation policy?",
"What is the termination policy?"
]

In [3]:
responses = get_responses_questions_list(pdf_filepath, questions_list)

Processing the Document....
Document Processed and Vectore Database Created
Getting Responses for each question...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated the answers for all questions.


In [4]:
responses

[{'question': 'What is the name of the company?',
  'answer': 'The name of the company is Zania, Inc.'},
 {'question': 'Who is the CEO of the company?',
  'answer': 'The CEO of the company is Shruti Gupta.'},
 {'question': 'What is their vacation policy?',
  'answer': 'The vacation policy includes the following key points:\n\n1. Vacation is calculated based on your length of service and is either granted in a lump sum or accrued over time, depending on the chosen policy.\n2. Vacation granted in the first year is prorated based on the hire date.\n3. Employees must request vacation in advance, at least a certain number of days or weeks prior, and requests will be granted considering business needs.\n4. Vacation must be taken in increments of at least a specified number of hours or days.\n5. Part-time employees receive vacation time proportionally to their work schedule.\n\nFor specific accrual rates and amounts, further details would be needed as they are represented by placeholders in t

123
