# QA on a corpus of PDF files.

> Building a simple chatbot to allow the user to ask questions about a collection of PDF files (e.g., grant proposals). 

In [6]:
#| default_exp tools.pdf_corpus_qa_llamaindex

In [3]:
#| export

import os
import pandas as pd

import re
import os

from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index import Document, ServiceContext, set_global_service_context
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.node_parser import SentenceWindowNodeParser
    
from alhazen.core import get_llamaindex_llm, get_langchain_llm

from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
)

import gradio as gr
import os

from pathlib import Path
from llama_index import download_loader

import fire

In [5]:
#| export 

from llama_index import download_loader
# Resolve the PyMuPDF-based PDF reader class once, at import time; the chatbot
# class in this file instantiates it in its constructor.
# NOTE(review): download_loader presumably fetches the loader from LlamaHub, so
# importing this module may require network access - confirm.
PyMuPDFReader = download_loader("PyMuPDFReader")

class PdfFileCollectionLlamaIndexChatBot:
    """Question answering over a directory tree of PDF files using LlamaIndex.

    The PDFs are read with PyMuPDFReader, split with a sentence-window node
    parser, embedded with a HuggingFace embedding model and stored in an
    in-memory ``VectorStoreIndex``. Questions can be asked in batch
    (``run_batch_of_questions_over_each_file``) or interactively through a
    Gradio chat UI (``run_gradio``).

    Attributes:
        doc_dir: Root directory of the PDF corpus, always ending in ``'/'``.
        llm: LLM object returned by ``get_llamaindex_llm(llm_name)``.
        embed_model: HuggingFace embedding model used for indexing/retrieval.
        node_parser: Sentence-window node parser used to chunk documents.
        service_context: LlamaIndex service context bundling the three above.
        loader: PyMuPDFReader instance used to read PDF files.
        index: Current ``VectorStoreIndex`` (``None`` until documents are loaded).
        query_engine: Query engine over ``index`` (``None`` until loaded).
    """

    def __init__(self, 
                doc_dir, 
                llm_name, 
                embed_model_name="BAAI/bge-small-en-v1.5", 
                sentence_window_size=10):
        """Set up the LLM, the embedding model and the PDF loader.

        No documents are read here; call ``load_all_documents`` or
        ``load_one_document`` to build the index.

        Args:
            doc_dir: Root directory (str or path-like) containing the PDF files.
            llm_name: Name of the LLM, passed to ``get_llamaindex_llm``.
            embed_model_name: HuggingFace model name for the embeddings.
            sentence_window_size: ``window_size`` given to the sentence-window
                node parser.
        """
        # Accept pathlib.Path as well as str, and keep the trailing-slash
        # convention the original code established for self.doc_dir.
        doc_dir = os.fspath(doc_dir)
        if not doc_dir.endswith('/'):
            doc_dir += '/'
        self.doc_dir = doc_dir

        self.llm = get_llamaindex_llm(llm_name)

        # create the sentence window node parser w/ default settings
        # NOTE(review): the "window" metadata is presumably only used at query
        # time when a MetadataReplacementPostProcessor is attached to the query
        # engine - confirm whether that was intended.
        self.node_parser = SentenceWindowNodeParser.from_defaults(
            window_size=sentence_window_size,
            window_metadata_key="window",
            original_text_metadata_key="original_text",
            )

        # The embedding model is created exactly once (it used to be
        # instantiated twice, loading the model a second time for no benefit).
        self.embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

        # create a service context
        self.service_context = ServiceContext.from_defaults(
            llm=self.llm,
            embed_model=self.embed_model,
            node_parser=self.node_parser)

        self.loader = PyMuPDFReader()

        # Populated by load_all_documents() / load_one_document().
        self.index = None
        self.query_engine = None

    @staticmethod
    def _find_pdf_files(doc_dir):
        """Return the paths of all PDF files under ``doc_dir``, in sorted walk order.

        The extension match is case-insensitive so that ``report.PDF`` is
        found as well as ``report.pdf``.
        """
        pdf_paths = []
        for root, dirs, files in os.walk(doc_dir):
            # Sorting dirs in place makes os.walk descend in a stable order.
            dirs.sort()
            for name in sorted(files):
                if name.lower().endswith('.pdf'):
                    pdf_paths.append(os.path.join(root, name))
        return pdf_paths

    def _index_documents(self, documents):
        """Build ``self.index`` and ``self.query_engine`` from ``documents``."""
        self.index = VectorStoreIndex.from_documents(documents, service_context=self.service_context)
        self.query_engine = self.index.as_query_engine()

    def load_all_documents(self):
        """Read every PDF under ``self.doc_dir`` and index them all together."""
        # Load each PDF block as a document
        documents = []
        print('Loading Documents...')
        for pdf_path in self._find_pdf_files(self.doc_dir):
            documents.extend(self.loader.load_data(file_path=pdf_path, metadata=True))
        print('Loading Complete!')

        self._index_documents(documents)

    def load_one_document(self, file_path):
        """Replace the current index with one built from a single PDF file.

        Args:
            file_path: Path of the PDF file to read and index.
        """
        pdf_document = self.loader.load_data(file_path=file_path, metadata=True)
        self._index_documents(pdf_document)

    def run_batch_of_questions_over_each_file(self, questions):
        """Ask every question of every PDF under ``self.doc_dir``, one file at a time.

        Each PDF is indexed on its own (replacing ``self.index``), so answers
        are grounded in that file only.

        Args:
            questions: Iterable of question strings.

        Returns:
            A pandas DataFrame with one row per (file, question) pair and the
            columns ``file`` (base name of the PDF), ``question`` and
            ``answer`` (the object returned by the query engine). The columns
            are present even when no PDF was found.
        """
        rows = []
        for pdf_path in self._find_pdf_files(self.doc_dir):
            file_name = os.path.basename(pdf_path)
            print(file_name)
            self.load_one_document(pdf_path)
            for q in questions:
                print('\t%s'%(q))
                a = self.query_engine.query(q)
                print('\t\t%s'%(a))
                rows.append({'file': file_name, 'question': q, 'answer': a})
        return pd.DataFrame(rows, columns=['file', 'question', 'answer'])

    def run_gradio(self):
        """Launch a Gradio chat UI that answers questions with the query engine.

        If no documents have been indexed yet, the whole corpus under
        ``self.doc_dir`` is loaded first; otherwise the existing index is used.
        """
        # Without an index, self.query_engine would not exist and the first
        # chat message would fail, so build it on demand.
        if getattr(self, 'query_engine', None) is None:
            self.load_all_documents()

        def add_text(history, text):
            # Append the user's message and lock the textbox while the bot answers.
            history = history + [(text, None)]
            return history, gr.Textbox(value="", interactive=False)

        def add_file(history, file):
            # Show the uploaded file in the chat as a user message.
            history = history + [((file.name,), None)]
            return history

        def bot(history):
            # prompt to send to the agent is the last message from the user
            prompt = history[-1][0]
            if isinstance(prompt, str):
                reply = str(self.query_engine.query(prompt))
            else:
                # An uploaded file arrives as a non-string entry; it is not a
                # question, so do not hand it to the query engine.
                reply = ('Uploaded files are not used for answering; '
                         'please type a question about the PDF corpus.')
            print('RESPONSE: %s'%(reply))
            # Rebuild the last row instead of assigning into it, so this also
            # works when the row is an immutable tuple.
            history[-1] = [prompt, reply]
            print('WHOLE HISTORY: %s'%(history))
            return history

        with gr.Blocks() as demo:
            chatbot = gr.Chatbot(
                [],
                elem_id="chatbot",
                bubble_full_width=False,
                #avatar_images=(None, files(alhazen_resources).joinpath('alhazen.png'))
            )
            with gr.Row():
                txt = gr.Textbox(
                    scale=4,
                    show_label=False,
                    placeholder="Enter text and press enter, or upload files",
                    container=False,
                )
                btn = gr.UploadButton("📁", file_types=["image", "video", "audio"])

            txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(bot, chatbot, chatbot)
            # Re-enable the textbox once the bot has answered.
            txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)
            
            file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False).then(bot, chatbot, chatbot)

        demo.queue()
        demo.launch()

In [3]:
def pdf_corpus_chatbot(doc_dir, 
        llm_name, 
        embed_model_name="BAAI/bge-small-en-v1.5", 
        sentence_window_size=3):
    """Index every PDF under ``doc_dir`` and serve a Gradio chat UI over them.

    Args:
        doc_dir: Root directory containing the PDF files.
        llm_name: Name of the LLM, forwarded to the chatbot class.
        embed_model_name: HuggingFace model name used for embeddings.
        sentence_window_size: Window size for the sentence-window node parser.
    """
    chatbot = PdfFileCollectionLlamaIndexChatBot(doc_dir, 
                llm_name,
                embed_model_name=embed_model_name, 
                sentence_window_size=sentence_window_size)
    # The chat callback reads chatbot.query_engine, which only exists once the
    # documents have been indexed, so build the index before launching the UI.
    chatbot.load_all_documents()
    chatbot.run_gradio()

#os.environ['LLMS_TEMP_DIR'] = '/tmp/alhazen'
#pdf_corpus_chatbot('/Users/gburns/Documents/2023H2/makeathon/grant_analysis', 'llama-2-70b-chat')

#os.environ['OPENAI_API_KEY'] = '<add your key here>'
#pdf_corpus_chatbot('/Users/gburns/Documents/2023H2/makeathon/grant_analysis', 'gpt-3.5')

In [1]:
#Did they ask for a no-cost extension?
#Do they have any money left?
#Missing information (are all the fields filled out?)
#P0: summarizing research outputs
#P1: Given a set of milestones, did the grantee report progress on any of them? 
#What progress did grantees make towards their milestones?
#What outputs support these milestones?

# Example batch of questions about grant progress reports; the commented-out
# example following this list passes it to run_batch_of_questions_over_each_file.
# The last entry is a triple-quoted literal, so its line breaks and leading
# indentation are part of the question text that is sent to the query engine.
questions = [
    'Did they ask for a no-cost extension?',
    'Do they have any money left?',
    'Summarize all direct and indirect research outputs',
    'Summarize all direct and indirect research outputs, preferably by their identifiers (URL, DOI, etc)', ### <- in case the previous one fails
    '''What progress did grantees make towards the following milestones? 
    1) Gather a diverse dataset of biological images containing different 
    types of cells and imaging conditions. Develop preprocessing pipelines
    to standardize and enhance the quality of the data. 2) Design and 
    train deep learning models for cell segmentation. Explore various 
    architectures, including convolutional neural networks (CNNs) and 
    recurrent neural networks (RNNs), to adapt to different cell types 
    and imaging modalities. Investigate the use of transfer learning 
    from pre-trained models to boost segmentation performance. 3) 
    Assess the performance of developed models on benchmark datasets 
    and real-world biological images. Employ metrics such as intersection 
    over union (IoU), accuracy, and F1 score to measure the accuracy and robustness 
    of the segmentation results. 4) Implement the best-performing model into a user-
    friendly software tool for cell biologists and researchers. Ensure compatibility 
    with various microscopy platforms and data formats. Create documentation and provide
    user support. 5) Collaborate with domain experts and end-users to validate the tool's 
    effectiveness in real research scenarios. Gather feedback and fine-tune the model and software based on user input.'''
]

#os.environ['LLMS_TEMP_DIR'] = '/path/to/temp/dir'
#cb1 = PdfFileCollectionLlamaIndexChatBot('/path/to/input/files/', 'llama-2-70b-chat')
#df1 = cb1.run_batch_of_questions_over_each_file(questions)
#df1.to_csv('/path/to/output/llama-2-70b-chat.tsv', index=False, header=True, sep='\t')
#os.environ['OPENAI_API_KEY'] = '<add your key here>'
#pdf_corpus_chatbot('/Users/gburns/Documents/2023H2/makeathon/grant_analysis', 'gpt-3.5')

