In [1]:
#from haystack.utils import launch_es
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader
from haystack.nodes import DensePassageRetriever
from haystack.utils import print_answers
from haystack.nodes import PDFToTextConverter
from pathlib import Path

import os
from pprint import pprint
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [2]:
#!docker start es_v1

In [2]:
# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_store_research = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document_research",
    similarity="dot_product",
    embedding_dim=768
)



In [3]:
#document_store.delete_all_documents()
len(document_store_research.get_all_documents())

361981

In [5]:
doc_dir = "data_nutrition/pdf"

In [6]:
indexing_pipeline = Pipeline()

text_converter = PDFToTextConverter(
    remove_numeric_tables=True,
    valid_languages=["en"]
)

preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=100,
    split_overlap=20,
    split_respect_sentence_boundary=True,
)

pdftotext version 4.04 [www.xpdfreader.com]
Copyright 1996-2022 Glyph & Cog, LLC


In [7]:
indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store_research, name="DocumentStore", inputs=["PreProcessor"])

In [11]:
files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
#indexing_pipeline.run_batch(file_paths=files_to_index, meta=[{'year': 1999}, {'year': 1999}, {'year': 1999}, {'year': 1999}, {'year': 1999}])
indexing_pipeline.run_batch(file_paths=files_to_index)

INFO:haystack.pipelines.base:It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.


Converting files:   0%|          | 0/2165 [00:00<?, ?it/s]

Syntax Error: Couldn't read xref table
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't read xref table
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table


Preprocessing:   0%|          | 0/2165 [00:00<?, ?docs/s]



{'documents': [<Document: {'content': 'Full Terms & Conditions of access and use can be found at\nDownload by: [Athabasca University] Date: 29 November 2016, At: 08:57\nCritical Reviews in Food Science and Nutrition\nISSN: 1040-8398 (Print) 1549-7852 (Online) Journal homepage: http://www.tandfonline.com/loi/bfsn20\nA Review of Mycotoxins in Food and Feed Products\nin Portugal and Estimation of Probable Daily\nIntakes\nLuís Abrunhosa, Héctor Morales, Célia Soares, Thalita Calado, Ana Sofia Vila-\nChã, Martinha Pereira & Armando Venâncio\nTo cite this article: Luís Abrunhosa, Héctor Morales, Célia Soares, Thalita Calado, Ana Sofia\nVila-Chã, Martinha Pereira & Armando Venâncio (2016) A Review of Mycotoxins in Food and\nFeed Products in Portugal and Estimation of Probable Daily Intakes, Critical Reviews in Food\nTo link to this article: http://dx.doi.org/10.1080/10408398.2012.720619\nAccepted author version posted online: 02\nJul 2014.\n', 'content_type': 'text', 'score': None, 'meta': {'

In [12]:
retriever = DensePassageRetriever(
    document_store=document_store_research,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO:haystack.modeling.model.language_model:Auto-detected model language: english


In [13]:
document_store_research.update_embeddings(retriever)

INFO:haystack.document_stores.search_engine:Updating embeddings for all 361981 docs ...


Updating embeddings:   0%|          | 0/361981 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

In [12]:
#retriever = BM25Retriever(document_store=document_store)

In [13]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


In [14]:
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

In [20]:
prediction = querying_pipeline.run(
    query="Is sourdough bread good for health?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [24]:
pprint(prediction)

{'answers': [<Answer {'answer': 'sourdough bread is healthy', 'type': 'extractive', 'score': 0.44866815209388733, 'context': 'used by sourdough fermentation and a\nconsumer perception that sourdough bread is healthy(33),\nstudies evaluating the role of sourdough for appetite an', 'offsets_in_document': [{'start': 594, 'end': 620}], 'offsets_in_context': [{'start': 62, 'end': 88}], 'document_ids': ['647c6f56157c5c829d2dead43aae7b4e'], 'meta': {'_split_id': 13, 'year': 1999}}>,
             <Answer {'answer': 'sourdough bread induced a significantly\nlower plasma glucose response', 'type': 'extractive', 'score': 0.2772848308086395, 'context': '0, 60, 120, and 180 min.\nIn IGT subjects sourdough bread induced a significantly\nlower plasma glucose response at 30 minutes (p = 0.048)\nand a smaller', 'offsets_in_document': [{'start': 399, 'end': 468}], 'offsets_in_context': [{'start': 41, 'end': 110}], 'document_ids': ['9e356b16035b2dc5bd46f8c710fe96f5'], 'meta': {'_split_id': 5, 'year': 199

In [22]:
print_answers(
    prediction,
    details="medium" ## Choose from `minimum`, `medium` and `all`
)


Query: Is sourdough bread good for health?
Answers:
[   {   'answer': 'sourdough bread is healthy',
        'context': 'used by sourdough fermentation and a\n'
                   'consumer perception that sourdough bread is healthy(33),\n'
                   'studies evaluating the role of sourdough for appetite an',
        'score': 0.44866815209388733},
    {   'answer': 'sourdough bread induced a significantly\n'
                  'lower plasma glucose response',
        'context': '0, 60, 120, and 180 min.\n'
                   'In IGT subjects sourdough bread induced a significantly\n'
                   'lower plasma glucose response at 30 minutes (p = 0.048)\n'
                   'and a smaller',
        'score': 0.2772848308086395},
    {   'answer': 'leavened for 8 h using a\nstarter',
        'context': 'Sourdough bread was leavened for 8 h using a\n'
                   'starter containing autochthonous Saccharomyces cerevisiae\n'
                   'and several bacilli able