In [None]:
import logging
from pprint import pprint

from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor, TfidfRetriever, EmbeddingRetriever
from haystack.utils import convert_files_to_docs, print_answers
from haystack.document_stores import InMemoryDocumentStore, FAISSDocumentStore
from haystack.nodes import FARMReader, TransformersReader
from haystack.pipelines import ExtractiveQAPipeline


%load_ext autoreload
%autoreload 2

In [None]:
# Root logger stays at WARNING; only the "haystack" logger is lowered to INFO
# so its progress messages show up in the notebook output.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
logging.getLogger("haystack").setLevel(logging.INFO)

## Get PDFs

In [None]:
# Converters for other file types, kept for reference:
# converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
# doc_txt = converter.convert(file_path="data/tutorial8/classics.txt", meta=None)[0]
#
# converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
# doc_docx = converter.convert(file_path="data/tutorial8/heavy_metal.docx", meta=None)[0]

# Convert a single PDF and keep the first document returned by the converter.
converter = PDFToTextConverter(
    valid_languages=["en"],
    remove_numeric_tables=True,
)
doc_pdf = converter.convert(
    file_path="/data/kg_pdfs_test/dt-csm-solution-brief.pdf",
    meta=None,
)[0]

In [None]:
# doc_pdf

In [None]:
# Directory with the source files handed to `convert_files_to_docs`.
PDFS_PATH = "/data/kg_pdfs_test/"

all_docs = convert_files_to_docs(
    dir_path=PDFS_PATH,
)

## Preprocessing 

In [None]:
# Clean the converted files and split them by word (split length 128) while
# respecting sentence boundaries; header/footer cleaning is switched off.
preprocessor = PreProcessor(
    # splitting
    split_by="word",
    split_length=128,  # open question: do smaller splits work better?
    split_respect_sentence_boundary=True,
    # cleaning
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=False,
)

all_docs_process = preprocessor.process(all_docs)

# Report how many input files went in and how many split documents came out.
print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(all_docs_process)}")

In [None]:
# Preview the first two entries of the preprocessed output.
all_docs_process[:2]

## Document Store 

In [None]:
# Alternative: In-Memory Document Store
# document_store = InMemoryDocumentStore()


# The FAISSDocumentStore keeps the document text and other metadata in a SQL database,
# while the vector embeddings are indexed in a FAISS index that is queried at search time.
#
# When `sql_url` is omitted, Haystack 1.x uses a file-based SQLite database
# (`faiss_document_store.db` in the working directory). That file outlives the kernel,
# whereas the FAISS index created here lives only in memory and is never saved, so a
# re-run would start with a SQL database that no longer matches the (empty) index.
# An explicit in-memory SQLite URL keeps both halves inside the kernel, which makes
# "Restart Kernel -> Run All" start from a clean, consistent store every time.
document_store = FAISSDocumentStore(
    sql_url="sqlite://",  # in-memory SQLite; nothing is written to disk
    faiss_index_factory_str="Flat",
    similarity="dot_product",
)

In [None]:
# Write the preprocessed documents into the document store.
document_store.write_documents(documents=all_docs_process)

In [None]:
# Sanity check: display the number of documents the store reports.
document_store.get_document_count()

## Retriever


In [None]:
# An in-memory TfidfRetriever based on Pandas dataframes
# It serves as the sparse baseline that the embedding retriever is compared against.

tfidf_ret = TfidfRetriever(document_store=document_store)

In [None]:
# Dense retriever built on a sentence-transformers (Sentence-BERT) model.
SENT_TRANS_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

embedd_ret = EmbeddingRetriever(
    embedding_model=SENT_TRANS_MODEL,
    model_format="sentence_transformers",
    document_store=document_store,
)

# Important:
# With the retriever initialised, update_embeddings() must be called so that every
# previously indexed document gets its embedding computed and stored.
# This can take a while on a large corpus, but it only has to happen once.
# At query time only the query is embedded and compared against the stored
# document embeddings, which is very fast.
document_store.update_embeddings(retriever=embedd_ret)

In [None]:
# Spot-check a single indexed document.
# The index is clamped to the last available document so a small corpus
# (fewer than SAMPLE_DOC_INDEX + 1 documents) does not raise IndexError;
# an empty store displays nothing.
SAMPLE_DOC_INDEX = 55
indexed_docs = document_store.get_all_documents()
indexed_docs[min(SAMPLE_DOC_INDEX, len(indexed_docs) - 1)] if indexed_docs else None

In [None]:
# Sample questions used to compare the two retrievers.
query, query1, query2 = (
    "What is streaming data?",
    "How is deep learning used in industry?",
    "What is a data mesh?",
)

**tfidf retriever:**

In [None]:
# Top-5 documents for `query2` according to the TF-IDF retriever.
pprint(
    tfidf_ret.retrieve(query=query2, top_k=5)
)

**embeddings retriever:**

In [None]:
# Top-5 documents for `query2` according to the embedding retriever.
pprint(
    embedd_ret.retrieve(query=query2, top_k=5)
)

## Reader


In [None]:
# A reader can be loaded from a local model or from any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models).
#
# FARM-based alternative:
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# Checkpoints for the Transformers-based reader.
ROBERTA_BASE_MODEL = "deepset/roberta-base-squad2"
DisBERT_SQD_MODEL = "distilbert-base-uncased-distilled-squad"
T5_L_SQD_MODEL = "/data/t5-large"

# The same checkpoint supplies both the model weights and the tokenizer.
RoBERTa_squad_reader = TransformersReader(
    model_name_or_path=ROBERTA_BASE_MODEL,
    tokenizer=ROBERTA_BASE_MODEL,
    use_gpu=True,
)

# DistilBERT alternative:
# distBert_squad_reader = TransformersReader(model_name_or_path=DisBERT_SQD_MODEL,
#                                            tokenizer=DisBERT_SQD_MODEL,
#                                            use_gpu=True)

In [None]:
# Run the reader on its own: the TF-IDF retriever supplies 5 candidate
# documents and the reader is asked for its top 3 answers.
ans = RoBERTa_squad_reader.predict(
    query=query,
    documents=tfidf_ret.retrieve(query=query, top_k=5),
    top_k=3,
)

pprint(ans.get("answers"))

## Pipeline 

In [None]:
# Sample questions for the end-to-end pipelines.
query, query1, query2, query3, query4 = (
    "What is streaming data?",
    "How is deep learning used in industry?",
    "What is a data mesh?",
    "What do data scientists work on?",
    "How can cloud storage costs be reduced?",
)

**tfidf retriever:**

In [None]:
# Extractive QA pipeline: TF-IDF retriever feeding the RoBERTa reader.
pipe1 = ExtractiveQAPipeline(
    reader=RoBERTa_squad_reader,
    retriever=tfidf_ret,
)

In [None]:
# Ask `query3`: the retriever passes on 5 documents, the reader returns 3 answers.
prediction = pipe1.run(
    query=query3,
    params={
        "Retriever": {"top_k": 5},
        "Reader": {"top_k": 3},
    },
)

prediction

In [None]:
# ...or use the `print_answers` util to simplify the output.
# Set `details` to `minimum`, `medium` or `all` to control the level of detail.

# print_answers(prediction, details="all")

**embeddings retriever:**

In [None]:
# Extractive QA pipeline: embedding retriever feeding the RoBERTa reader.
pipe2 = ExtractiveQAPipeline(
    reader=RoBERTa_squad_reader,
    retriever=embedd_ret,
)

In [None]:
# Ask `query3`: the retriever passes on 5 documents, the reader returns 3 answers.
prediction = pipe2.run(
    query=query3,
    params={
        "Retriever": {"top_k": 5},
        "Reader": {"top_k": 3},
    },
)

prediction

In [None]:
# ...or use the `print_answers` util to simplify the output.
# Set `details` to `minimum`, `medium` or `all` to control the level of detail.

# print_answers(prediction, details="all")