In [56]:
import functools

import gradio as gr
import PyPDF2
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader
from haystack.nodes import TfidfRetriever
from haystack.pipelines import ExtractiveQAPipeline


In [57]:
# In-memory store searched by the TF-IDF retriever; nothing is written to it in this cell.
document_store = InMemoryDocumentStore()
retriever = TfidfRetriever(document_store=document_store)

# Extractive reader, run on CPU (use_gpu=False).
# Model names noted as alternatives:
#   AshtonIsNotHere/GatorTron-OG
#   deepset/roberta-base-squad2   (the one in use)
#   distilbert-base-uncased
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
)

# Hand-assembled query graph: Query -> Retriever -> Reader.
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])


In [58]:

def print_answers(results):
    """Reduce pipeline results to plain dicts of the interesting answer fields.

    Despite the name, nothing is printed; the summaries are returned.

    Args:
        results: Mapping with an ``"answers"`` entry holding objects that
            expose ``answer``, ``score`` and ``context`` attributes.

    Returns:
        A list with one dict per answer, in the same order, containing only
        those of ``answer`` / ``score`` / ``context`` whose value is not None.
    """
    wanted = ("answer", "score", "context")
    summaries = []

    for candidate in results["answers"]:
        summary = {}
        for key in wanted:
            value = getattr(candidate, key)
            # Fields set to None are left out rather than reported as None.
            if value is not None:
                summary[key] = value
        summaries.append(summary)

    return summaries


def run_once(f):
    """Decorator that lets ``f`` execute successfully at most one time.

    The first call is forwarded to ``f`` and its result returned. Every later
    call is a no-op that returns ``None``. If ``f`` raises, the exception
    propagates and the call does not count as the single run, so a later call
    gets to try again instead of being skipped silently forever.

    Args:
        f: The callable to guard.

    Returns:
        A wrapper with the same name/docstring as ``f`` and a boolean
        ``has_run`` attribute reflecting whether the single run has happened.
    """
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        if wrapper.has_run:
            return None
        # Flip the flag before calling so a re-entrant call is also skipped.
        wrapper.has_run = True
        try:
            return f(*args, **kwargs)
        except BaseException:
            # A failed attempt must not permanently disable the function.
            wrapper.has_run = False
            raise

    wrapper.has_run = False
    return wrapper


In [59]:

# Ready-made retriever + reader pipeline; this is the object predict() queries.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)


def written_document(pdf_file):
    """Index every page of a PDF into ``document_store``, once per source.

    Each page becomes one document whose ``meta["name"]`` is
    ``"<source> - <1-based page number>"``. A source that was already indexed
    successfully is skipped, so asking several questions about the same file
    does not re-parse it, while a different file is still picked up.

    Args:
        pdf_file: Either a path string / path-like object, or a file-like
            object. For file-like objects the label is taken from their
            ``name`` attribute when present.
            NOTE(review): the Gradio ``File`` input configured with
            ``type="file"`` is expected to pass such an object rather than a
            string - confirm against the installed Gradio version.

    Returns:
        None. The effect is the write into ``document_store``.
    """
    # Build a string label without assuming pdf_file itself is a str; the
    # previous `pdf_file + " - "` raised TypeError for file objects.
    if isinstance(pdf_file, str) or hasattr(pdf_file, "__fspath__"):
        source_name = str(pdf_file)
    else:
        source_name = str(getattr(pdf_file, "name", pdf_file))

    if source_name in written_document.indexed_sources:
        return None

    pdf_reader = PyPDF2.PdfReader(pdf_file)
    page_documents = [
        {
            "content": page.extract_text(),
            "meta": {"name": source_name + " - " + str(page_number)},
        }
        for page_number, page in enumerate(pdf_reader.pages, start=1)
    ]
    document_store.write_documents(page_documents)

    # Record the source only after a successful write so failures can be retried.
    written_document.indexed_sources.add(source_name)
    return None


# Labels of the sources already written to document_store by written_document.
written_document.indexed_sources = set()


def predict(question, pdf_file):
    """Answer ``question`` using the contents of ``pdf_file``.

    The PDF is handed to ``written_document`` for indexing, then the
    extractive QA pipeline is queried with 3 retrieved documents and 2 reader
    answers, and the raw result is condensed by ``print_answers``.

    Args:
        question: The query text passed to the pipeline.
        pdf_file: The PDF to index; forwarded unchanged to ``written_document``.

    Returns:
        Whatever ``print_answers`` returns for the pipeline result.
    """
    written_document(pdf_file)

    # Node names here must match the pipeline's "Retriever" / "Reader" nodes.
    node_params = {
        "Retriever": {"top_k": 3},
        "Reader": {"top_k": 2},
    }
    raw_result = pipe.run(query=question, params=node_params)
    return print_answers(raw_result)


In [60]:
# Index the bundled sample report and run one question through the whole
# flow, printing the condensed answers.
print(
    predict(
        question="What is colonoscopy?",
        pdf_file="sample_reports/colonoscopy.pdf",
    )
)




Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

[{'answer': 'views the inside of your large intestine (colon) for bleeding, inflammation, polyps and tumors', 'score': 0.6540099382400513, 'context': 'r Colonoscopy\nA colonoscopy views the inside of your large intestine (colon) for bleeding, inflammation, polyps and tumors. See Figure \n1.\n \nDuring th'}, {'answer': 'special diet', 'score': 0.24448636174201965, 'context': 'ns now.\nA thorough colon, or bowel, cleansing along with following a special diet before your colonoscopy is \nkey to its success.\nThe best colon clean'}]


In [61]:

# Static text shown on the Gradio page.
title = "Search PDF Documents with Sparse Passage Retrieval"
description = """
<center>Sample Questions: What is colonoscopy? </center>
"""

# Two inputs, handed to predict() in order: the question, then the uploaded PDF.
# Theme names noted as alternatives to "default": "huggingface", "dark-grass", "peach".
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=3, label="Ask an open question!"),
        gr.File(file_count="single", type="file", label="Upload a pdf"),
    ],
    outputs="text",
    title=title,
    description=description,
    flagging_options=["top", "medium", "bad"],
    interpretation="default",
    theme="default",
)

# Launch options left disabled:
#   share=True
#   auth=("admin", "pass1234")
#   enable_queue=True  # cannot be enabled with auth enabled
iface.launch()




Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.


