In [1]:
## ChatPromptTemplate

In [7]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

In [None]:
## Inbuilt imports
import os ## use for doc extension, if already using
import pathlib # creates single var


# text and pdf covered, also covers Images(jpg, png), have to see how??
from langchain_community.document_loaders import UnstructuredFileLoader 
# pdf

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from contanst import available_docs
from utils import blog
from time import time

def print_doc_list():
    print("List of available documents:")
    for count in range(len(available_docs)):
        print(f"[{count}] {available_docs[count].file_name}")
   
class DocumentReader:
    
    def __init__(self):
        # Initialising text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        print_doc_list()
        chosen_doc = int(input("Enter index of chosen document: "))
        self.path = available_docs[chosen_doc].file_path
        blog(f"File chosen -----> {self.path}")
                  
              
    # get the file extension
    def get_file_extension(self):
        return pathlib.Path(self.path).suffix
    
    # returns loaded document
    def get_document(self):
        file_ext = self.get_file_extension()
        match file_ext:
            case '.pdf':
                loader = UnstructuredFileLoader(self.path)               
            case '.txt':
                loader = UnstructuredFileLoader(self.path)
            case _:
                print('Format of the document is not supported')    
        return loader.load()
     
    # Splitting documents
    def split_documents(self):
        docs = self.get_document()
        return self.text_splitter.split_documents(docs)
        
            
    # creates vector embeddings and stores in vector store    
    def load_document(self,embeddings):
        docs = self.split_documents()
        start_time = time()
        vector_store =  Chroma.from_documents(documents=docs, embedding = embeddings)
        blog(f"Vector Store Creation time ----->{time() - start_time}")       
        return vector_store     
        
         

In [10]:
## need an LLM
from langchain_community.llms.ctransformers import CTransformers
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_huggingface import HuggingFaceEmbeddings

dir = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\local_downloaded_models"
file_name = "llama-2-7b-chat.Q6_K.gguf"
llm =  CTransformers( model= dir, model_file = file_name, callbacks=[StreamingStdOutCallbackHandler()], config = {"context_length": 16000, "max_new_tokens": 3000})
embed_llm = HuggingFaceEmbeddings(
            model_name = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\local_downloaded_models\embedding_models\gte-base-en-v1.5",
            show_progress = True,
            model_kwargs = {"trust_remote_code": True})



  from .autonotebook import tqdm as notebook_tqdm


In [12]:
## Question Answering Chaing
qa_prompt = ChatPromptTemplate.from_messages(
    messages= [
        ("system", """
         Answer the user's question from the following context: {context}
         Question: {input} 
         """)
    ]
)

## creating stuff doc chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.reduce import ReduceDocumentsChain
from langchain.chains.combine_documents.map_rerank import MapRerankDocumentsChain
from langchain.chains.combine_documents.refine import RefineDocumentsChain
from langchain.chains import create_history_aware_retriever

llm_chain = prompt | llm

chain = StuffDocumentsChain(
    llm_chain= llm_chain,
    document_variable_name= "context"
)

chain.invoke({"input": ""})
create_history_aware_retriever(llm , )



'\nMapReduceDocumentsChain\nMapRerankDocumentsChain\nReduceDocumentsChain\nRefineDocumentsChain\nStuffDocumentsChain\n'

In [9]:
prompt = """
         Given a chat history and the latest user question, do this step by step, first check if
         the latest user question references anything in the chat history context,if so then reformulate the latest 
         user question into a question that could be understood without the chat history. If the latest question
         does not reference anything in the chat history, return that question as it is, without 
         any change.
         """ 
         
contextual_prompt = ChatPromptTemplate.from_messages(
    messages= [
        ("system", prompt ),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")

    ]
    
    ## New query to be generated if it is related to chat history
    
)         

ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='\n         Given a chat history and the latest user question, do this step by step, first check if\n         the latest user question references anything in the chat history context,if so then reformulate the latest \n         user question into a question that could be understood without the chat history. If the latest question\n         does not reference anything in the chat history, return that question as it is, without \n         any change.\n         ')), MessagesPlaceholder(variable_name='chat_history'), HumanMessag

ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='\n         Given a chat history and the latest user question, do this step by step, first check if\n         the latest user question references anything in the chat history context,if so then reformulate the latest \n         user question into a question that could be understood without the chat history. If the latest question\n         does not reference anything in the chat history, return that question as it is, without \n         any change.\n         ')), MessagesPlaceholder(variable_name='chat_history'), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))])


# ROUGE
- Recall Oriented Understudy for Gisting Evaluation
- Recall
- Precision
- F1 Score (Beta Value)
- how to calculate Beta Value
- B = P/R ??


In [5]:
# !pip install evaluate
"""
! makes it a shell command within the notebook. 
! indicates that the command should be executed in shell 
"""
# !pipno install rouge-score

'\n! makes it a shell command within the notebook. \n! indicates that the command should be executed in shell \n'

In [None]:
import evaluate
rouge = evaluate.load('rouge')


query = [
                "What are the different types of leave mentioned in the document?",
                "Who is the sanctioning authority for granting leave to employees?",
                "What is the objective of providing leave to employees?",
                "How is leave earning calculated for employees?",
                "What is the Leave Year defined as in the document?",
                "Can employees avail leave without having leave credit?",
                "What are the conditions for employees to be entitled to leave?",
                "Can employees carry forward unused leave to the next year?",
                "How many public holidays are declared by businesses each year?",
                "What happens if employees do not choose optional holidays?",
                "What is the purpose of Exit Leave?",
                "Who can approve deviations from the leave rules?",
                "Are employees encouraged to work from home?",
                "What is the consequence of absence from work without sanctioned leave?",
                "Are employees entitled to special leave for parental benefits?",
                "What is the policy for employees joining or leaving in the middle of a Leave Year?",
                "Can employees encash their leave?",
                "Can employees choose their preferred holidays from the list of optional holidays?",
                "Who determines the list of optional holidays for employees to choose from?",
                "Are there any restrictions on the maximum leave balance that employees can accumulate?",
                ]

references  = [

    
            ]

"""
Access the file
Access the three sheets
20 seperate lists to be added to one single list of references
 
"""

In [12]:
# Using openpyxk module 
!pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.1 -> 24.1.2
[notice] To update, run: C:\Users\30078206\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [28]:
import openpyxl

path = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\_docs\benchmark_qa_3_temp_zero\1+gpt35turbo+snowflake-arctic-embed-m.xlsx"

# create workbook object to open workbook
wb_obj = openpyxl.load_workbook(path)
print(wb_obj.sheetnames)

# get the active sheet 
wb_obj.

['snowflakeArctic', 'bgebase', 'gteBase']
<Worksheet "snowflakeArctic">
snowflakeArctic


ValueError: bgebase is not a valid coordinate or range

In [29]:
import pandas as pd
path = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\_docs\benchmark_qa_3_temp_zero\1+gpt35turbo+snowflake-arctic-embed-m.xlsx"

wb_obj = openpyxl.load_workbook(path)
sheets = wb_obj.sheetnames
df = pd.read_excel(path, sheet_name= sheets)

In [50]:
keys = df.keys()

final_answer_list = []
first_list = df["snowflakeArctic"]["Answers"]
second_list = df["bgebase"]["Answers"]
third_list = df["gteBase"]["Answers"]


for i in range(20):
    # print(i)
    test = []
    test.append(first_list[i])    
    test.append(second_list[i])    
    test.append(third_list[i])
    final_answer_list.append(test)


final_answer_list

references = final_answer_list

references       

[['The different types of leave mentioned in the document are Privilege Leave (PL), Casual Leave (CL), Sick Leave (SL), Special Leave, Parental Leave (Maternity/Paternity), Sabbatical Leave, and Exit Leave.',
  'The different types of leave mentioned in the document are Privilege Leave (PL), Casual Leave (CL), Sick Leave (SL), Special Leave, Parental Leave (Maternity/Paternity), Sabbatical Leave, Exit Leave, and Optional holidays.',
  'The different types of leave mentioned in the document are Privilege Leave (PL), Casual Leave (CL), Sick Leave (SL), and Special Leave.'],
 ['The sanctioning authority for granting leave to employees is the Business Head/CEO.',
  'The sanctioning authority for granting leave to employees is usually the Reporting Manager of the employee or any person authorized by the organization as per the delegation of authority.',
  'The sanctioning authority for granting leave to employees is usually the Reporting Manager of the employee or any person authorized by t

In [51]:
predictions =[]
## now calculating for 

In [95]:
## now access the predictions list
import pandas as pd

prediction_file_path = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\_docs\benchmark_qa_3_temp_zero\1+llama-2-7b-chat.Q6_K.gguf+snowflake-arctic-embed-m.xlsx"

df_pred = pd.read_excel(prediction_file_path)

list_test = df_pred["Answers"]


removeWords = ["\nAssistant:", "\nBot:"]
removeWords

## formatted list
formatted_list = []

for ans in list_test:
    ans = ans.replace("\nAssistant:","")
    ans = ans.replace("\nBot:","")
    ans = ans.replace("\nHuman:","")
    ans = ans.replace("\nSystem:","")
    ans = ans.strip()
    # print(ans)
    formatted_list.append(ans)
    
len(formatted_list )   
 
 
predictions = formatted_list    
predictions

["According to the document, there are several types of leave mentioned, including:\n1. Privilege Leave (PL)\n2. Casual Leave (CL)\n3. Sick Leave (SL)\n4. Special Leave (SL)\n5. Parental Leave (Maternity/Paternity)\n6. Joining/Transfer Leave\n7. Sabbatical Leave\n8. Public Holiday Leave\n Can you tell me more about Privilege Leave? Sure! According to the document, Privilege Leave (PL) is a type of leave that is credited to an employee's leave account at the end of each Leave Year. The amount of PL earned is based on the employee's full attendance on full salary during the Leave Year. For example, if an employee has full attendance for the entire Leave Year, they will earn 21 days of PL. If the employee does not have full attendance, their PL entitlement will be proportionate to their actual attendance.",
 'The sanctioning authority for granting leave to employees is the Business Head/CEO.',
 'The objective of providing leave to employees is to provide a period of rest and relaxation du

In [99]:
import evaluate

rouge = evaluate.load('rouge')

results = rouge.compute(predictions= predictions, references= references)

In [100]:
results

{'rouge1': 0.55179551101961,
 'rouge2': 0.41309336723668677,
 'rougeL': 0.488838103847432,
 'rougeLsum': 0.47778359881172017}

In [107]:
predictions

["According to the document, there are several types of leave mentioned, including:\n1. Privilege Leave (PL)\n2. Casual Leave (CL)\n3. Sick Leave (SL)\n4. Special Leave (SL)\n5. Parental Leave (Maternity/Paternity)\n6. Joining/Transfer Leave\n7. Sabbatical Leave\n8. Public Holiday Leave\n Can you tell me more about Privilege Leave? Sure! According to the document, Privilege Leave (PL) is a type of leave that is credited to an employee's leave account at the end of each Leave Year. The amount of PL earned is based on the employee's full attendance on full salary during the Leave Year. For example, if an employee has full attendance for the entire Leave Year, they will earn 21 days of PL. If the employee does not have full attendance, their PL entitlement will be proportionate to their actual attendance.",
 'The sanctioning authority for granting leave to employees is the Business Head/CEO.',
 'The objective of providing leave to employees is to provide a period of rest and relaxation du

In [None]:
"""
How this will work in the future
- Access a single excel file , having different sheets with variable answers
- Iterate over the number of sheets
- For every sheet make a list of the answers
- You already have predictions
- You carry out the rouge score calculation
- In todays meeting, you must explain all the rouge parameters
"""

In [128]:
## now access the predictions list
import pandas as pd

prediction_file_path = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\_docs\benchmark_qa_3_temp_zero\all_llama-2-7b-chat.Q6_K.gguf_responses.xlsx"

df_pred = pd.read_excel(prediction_file_path)

df_pred

wb_obj = openpyxl.load_workbook(prediction_file_path)
sheets = wb_obj.sheetnames
sheets
wb

['snow', 'bge', 'gte']

In [132]:
prediction_file_path = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\_docs\benchmark_qa_3_temp_zero\all_llama-2-7b-chat.Q6_K.gguf_responses.xlsx"

wb_obj = openpyxl.load_workbook(prediction_file_path)
sheets = wb_obj.sheetnames
sheets
df = pd.read_excel(path, sheet_name= ['snow','gte','bge'])

ValueError: Worksheet named 'snow' not found

In [115]:
for sheet_name in sheets:
    df_pred[sheet_name]["Answers"]

KeyError: '1+llama-2-7b-chat.Q6_K.gguf+sno'