In [9]:
import functools
import os

import gradio as gr
import PyPDF2
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever
from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter
from haystack.pipelines import ExtractiveQAPipeline


In [10]:
# Shared QA components. The document store is filled later by
# written_document(), which writes the preprocessed PDF chunks into it.
document_store = InMemoryDocumentStore()
# TF-IDF retriever reading from the store above.
retriever = TfidfRetriever(document_store=document_store)
# Extractive reader using the deepset/roberta-base-squad2 model, GPU disabled.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)


# Hand-wired pipeline: Query -> Retriever -> Reader.
# NOTE(review): nothing in the visible cells uses `querying_pipeline`;
# predict() goes through `pipe` instead — confirm whether this is still needed.
querying_pipeline = Pipeline()
querying_pipeline.add_node(
    component=retriever, name="Retriever", inputs=["Query"])
querying_pipeline.add_node(
    component=reader, name="Reader", inputs=["Retriever"])


In [11]:
# Cleaning and splitting configuration applied to the converted PDF in
# written_document() before the pieces are written to the document store.
# Splitting is word-based (split_length=100, split_overlap=3) and asked to
# respect sentence boundaries; see the Haystack PreProcessor docs for the
# exact semantics of each option.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3
)


def print_answers(results):
    """Reduce every answer in ``results["answers"]`` to a small dict.

    Despite the name, nothing is printed. For each answer object the
    ``answer`` and ``score`` attributes are read; attributes whose value is
    ``None`` are omitted from that answer's dict.

    Args:
        results: Mapping with an ``"answers"`` entry holding an iterable of
            objects that expose ``answer`` and ``score`` attributes.

    Returns:
        A list with one dict per answer, in the original order.
    """
    wanted = ("answer", "score")  # "context" is currently not included
    summaries = []

    for candidate in results["answers"]:
        summary = {}
        for attr in wanted:
            value = getattr(candidate, attr)
            if value is not None:
                summary[attr] = value
        summaries.append(summary)

    return summaries


def run_once(f):
    """Decorator that lets ``f`` complete successfully at most once.

    The first call that returns without raising passes ``f``'s result back
    to the caller and sets ``wrapper.has_run`` to ``True``. Every later call
    is a no-op that returns ``None``.

    If ``f`` raises, the exception propagates and ``has_run`` stays
    ``False``, so the next call tries again instead of being silently
    skipped forever.

    Args:
        f: The callable to guard.

    Returns:
        The wrapping function, carrying a boolean ``has_run`` attribute.
    """
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        if wrapper.has_run:
            return None
        result = f(*args, **kwargs)
        # Only mark as done after f succeeded; a failed attempt must not
        # disable the function for the rest of the session.
        wrapper.has_run = True
        return result
    wrapper.has_run = False
    return wrapper


In [12]:

# Retriever + reader pipeline that predict() queries; it is addressed there
# through the "Retriever" and "Reader" parameter keys.
pipe = ExtractiveQAPipeline(reader, retriever)


@run_once
def written_document(pdf_file):
    """Convert an uploaded PDF to text, split it, and index the pieces.

    The path of the PDF is taken from ``pdf_file.name``. Only the first
    document returned by the converter is kept; it is run through the
    module-level ``preprocessor`` and the result is written to
    ``document_store``. Always returns ``None``.

    NOTE(review): ``@run_once`` turns repeat calls into no-ops, so a second,
    different PDF uploaded later is presumably never indexed — confirm this
    is intended.
    """
    pdf_converter = PDFToTextConverter(
        remove_numeric_tables=True, valid_languages=["en"])
    converted = pdf_converter.convert(file_path=pdf_file.name, meta=None)
    chunks = preprocessor.process([converted[0]])
    document_store.write_documents(chunks)


def predict(question, pdf_file):
    """Answer ``question`` using the uploaded PDF.

    First hands ``pdf_file`` to ``written_document`` so its content is in
    the document store, then runs ``pipe`` with ``top_k`` set to 20 for the
    retriever and 5 for the reader.

    Args:
        question: The query string passed to the pipeline.
        pdf_file: The uploaded file object, forwarded to ``written_document``.

    Returns:
        The pipeline result reduced by ``print_answers``.
    """
    written_document(pdf_file)
    run_params = {
        "Retriever": {"top_k": 20},
        "Reader": {"top_k": 5},
    }
    return print_answers(pipe.run(query=question, params=run_params))


# Heading and description passed to the Gradio interface below.
title = "Search PDF Business Reports with Sparse Passage Retrieval"
description = """
<center>Sample Questions: What are strategic initiatives? </center>
"""

# Web UI: a question text box plus a single-file PDF upload, both passed to
# predict(); whatever predict() returns goes to the "text" output.
# The upload must expose its path via `.name`, which written_document() reads.
# NOTE(review): gr.inputs.*, interpretation= and string theme names look like
# legacy Gradio API — confirm they exist in the pinned Gradio version.
iface = gr.Interface(fn=predict,
                     inputs=[gr.inputs.Textbox(lines=3, label='Ask an open question!'),
                             gr.inputs.File(file_count="single",
                                            type="file", label="Upload a pdf"),
                             ],
                     outputs="text",
                     title=title, description=description,
                     flagging_options=["top", "medium", "bad"],
                     interpretation="default",
                     theme="dark-grass"  # "default", "huggingface", "dark-grass", "peach"
                     )

# Start the app with default launch settings; the options below are disabled.
iface.launch(
    # share=True,
    # auth=("admin", "pass1234"),
    # enable_queue=True # cannot be enabled with auth enabled
)




Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/home/M267492/venv/rasa/lib/python3.8/site-packages/haystack/nodes/file_converter/pdf.py", line 8, in <module>
    from pdf2image import convert_from_path
ModuleNotFoundError: No module named 'pdf2image'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/M267492/venv/rasa/lib/python3.8/site-packages/haystack/utils/import_utils.py", line 30, in safe_import
    module = importlib.import_module(import_path)
  File "/usr/lib/python3.8/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
  File "<frozen importlib._bootstrap>", line 991, in _find_and_load
  File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 848, 