In [1]:
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader
from haystack.nodes import DensePassageRetriever
from haystack.utils import print_answers
from haystack.nodes import PDFToTextConverter
from haystack import Document
from pathlib import Path
import texthero as hero

import pandas as pd
import os
from pprint import pprint
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [2]:
{
  "encoded": "SDYtdEFZY0JaclEwQUFKZmJnb0s6NWFtSEFvSXZSbld4WXhiR2JERVkxdw==",
  "api_key": "5amHAoIvRnWxYxbGbDEY1w",
  "id": "H6-tAYcBZrQ0AAJfbgoK",
  "name": "aipi540-nlp-project"
}

{'encoded': 'SDYtdEFZY0JaclEwQUFKZmJnb0s6NWFtSEFvSXZSbld4WXhiR2JERVkxdw==',
 'api_key': '5amHAoIvRnWxYxbGbDEY1w',
 'id': 'H6-tAYcBZrQ0AAJfbgoK',
 'name': 'aipi540-nlp-project'}

In [3]:
# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_store_reddit = ElasticsearchDocumentStore(
    host=host,
    index="document_reddit",
    similarity="cosine",
    embedding_dim=768
)



In [3]:
#document_store_reddit.delete_all_documents()
len(document_store_reddit.get_all_documents())

25264

In [4]:
data = pd.read_pickle("../data/reddit/nutrition.pkl")

In [12]:
data["concat"] = data["Thread Title"] + " " + data["Comment Body"]
data["url"] = "https://www.reddit.com/r/nutrition/comments/" + data["Thread ID"] + "/comment/" + data["Comment ID"] + "/"

In [21]:
custom_pipeline = [hero.preprocessing.lowercase,
                   hero.preprocessing.remove_whitespace,
                   hero.preprocessing.remove_angle_brackets,
                   hero.preprocessing.remove_html_tags,
                   hero.preprocessing.remove_urls]

data['concat_clean'] = hero.clean(data['concat'], custom_pipeline)

In [27]:
data_list = []
for body, title, url in zip(list(data["concat_clean"]), list(data["Thread Title"]), list(data["url"])):
    doc = Document(body, meta={"title": title, "url": url})
    data_list.append(doc)

In [28]:
indexing_pipeline = Pipeline()

preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True
)

In [29]:
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["Query"])
indexing_pipeline.add_node(component=document_store_reddit, name="DocumentStore", inputs=["PreProcessor"])

In [32]:
indexing_pipeline.run_batch(documents=data_list)

Preprocessing:   0%|          | 0/25286 [00:00<?, ?docs/s]



{'documents': [<Document: {'content': "men eating mediterranean style diet fart seven times more than men eating a high fat western style diet the reason is very obvious, its the *sudden change* in diet that causes this. according to the study, these men's diets were shifted drastically and suddenly. they didn't have the proper bowel flora to digest this new input and therefore some weird gases formed and thus farting. if they stayed on this diet for 6 months or more their bowel flora would adjust and they would be just fine. it takes a while to build up the proper flora to digest different types of foods.", 'content_type': 'text', 'score': None, 'meta': {'title': 'Men eating Mediterranean style diet fart seven times more than men eating a high fat western style diet', 'url': 'https://www.reddit.com/r/nutrition/comments/zobijg/comment/j0m43ym/', '_split_id': 0}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'f7983e4b4bbf3e3c8bc6b69c6f4691d6'}>,
  <Document: {'content': 'men eat

In [4]:
retriever = DensePassageRetriever(
    document_store=document_store_reddit,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO:haystack.modeling.model.language_model:Auto-detected model language: english


In [5]:
document_store_reddit.update_embeddings(retriever, update_existing_embeddings=False)

INFO:haystack.document_stores.search_engine:Updating embeddings for all 0 docs without embeddings...


Updating embeddings: 0 Docs [00:00, ? Docs/s]

: 

In [35]:
#retriever = BM25Retriever(document_store=document_store)

In [5]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


In [6]:
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

In [7]:
prediction = querying_pipeline.run(
    query="How do I get thinner?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [8]:
answers = pd.DataFrame([i.to_dict() for i in prediction["answers"]])

In [9]:
answers['document_ids'] = answers['document_ids'].apply(lambda x: x[0])

In [15]:
answers['meta'].apply(lambda x: x["url"])

0    https://www.reddit.com/r/nutrition/comments/10as2zt/comment/j49ky1s/
1    https://www.reddit.com/r/nutrition/comments/10as2zt/comment/j471pw5/
2    https://www.reddit.com/r/nutrition/comments/10as2zt/comment/j47yo07/
3    https://www.reddit.com/r/nutrition/comments/10as2zt/comment/j4b9iic/
4    https://www.reddit.com/r/nutrition/comments/10z5i7h/comment/j855q8n/
Name: meta, dtype: object

In [None]:
answers["thread_link"] = answers['meta'].apply(lambda x: x["url"])
documents = pd.DataFrame([i.to_dict() for i in prediction["documents"]])
merge = pd.merge(documents, answers, left_on="id", right_on="document_ids", how="inner")
results = merge[["content", "answer", "score_y", "thread_link"]].sort_values(by="score_y", ascending=False).to_dict(orient="records")

In [34]:
[i.to_dict() for i in prediction["answers"]]

[{'answer': 'eat more/exercise less',
  'type': 'extractive',
  'score': 0.5534850358963013,
  'context': ' poster was asserting.\n\nThis is the questionable part:\n\n> Naturally skinny people have been proven to be able to eat more/exercise less and stay thin.',
  'offsets_in_document': [{'start': 277, 'end': 299}],
  'offsets_in_context': [{'start': 113, 'end': 135}],
  'document_ids': ['a884b808bcd5cc6cb94c2a9340f1f580'],
  'meta': {'_split_id': 0}},
 {'answer': 'intermittent fasting',
  'type': 'extractive',
  'score': 0.5146870613098145,
  'context': "crazy below 128, which is where I'm at now. I achieved it briefly by intermittent fasting but was preoccupied with food and it didn't feel sustainable",
  'offsets_in_document': [{'start': 472, 'end': 492}],
  'offsets_in_context': [{'start': 69, 'end': 89}],
  'document_ids': ['72b7b1d31ded896a99f189f573ef318e'],
  'meta': {'_split_id': 0}},
 {'answer': 'diet',
  'type': 'extractive',
  'score': 0.35249239206314087,
  'context': "ly

In [24]:
answers = pd.DataFrame([i.to_dict() for i in prediction["answers"]])
answers['document_ids'] = answers['document_ids'].apply(lambda x: x[0])

In [44]:
answers

Unnamed: 0,answer,type,score,context,offsets_in_document,offsets_in_context,document_ids,meta
0,eat more/exercise less,extractive,0.553485,poster was asserting.\n\nThis is the questionable part:\n\n> Naturally skin...,"[{'start': 277, 'end': 299}]","[{'start': 113, 'end': 135}]",a884b808bcd5cc6cb94c2a9340f1f580,{'_split_id': 0}
1,intermittent fasting,extractive,0.514687,"crazy below 128, which is where I'm at now. I achieved it briefly by intermi...","[{'start': 472, 'end': 492}]","[{'start': 69, 'end': 89}]",72b7b1d31ded896a99f189f573ef318e,{'_split_id': 0}
2,diet,extractive,0.352492,ly Revert to 'Preferred Weight'? Some people aren’t naturally “thin” and die...,"[{'start': 87, 'end': 91}]","[{'start': 73, 'end': 77}]",e175e9617275b37daef6aea60daf1213,{'_split_id': 0}
3,calorie diligence and becoming more active,extractive,0.280066,I never said that. I've kept 20lbs off myself through calorie diligence and ...,"[{'start': 160, 'end': 202}]","[{'start': 54, 'end': 96}]",e3b2f67f94ed0d8d5fb2cac0f30f5bee,{'_split_id': 0}
4,pear shaped,extractive,0.213638,"Weight'? You're absolutely right of course, my issue is more that I'm pear s...","[{'start': 108, 'end': 119}]","[{'start': 70, 'end': 81}]",b26c84716d18ef9bbdd05fcb879917e4,{'_split_id': 0}


In [38]:
documents = pd.DataFrame([i.to_dict() for i in prediction["documents"]])

In [42]:
merge = pd.merge(documents, answers, left_on="id", right_on="document_ids", how="inner")

In [49]:
merge[["content", "answer", "score_y"]].sort_values(by="score_y", ascending=False).to_dict(orient="records")

[{'content': "Do Bodies Really Revert to 'Preferred Weight'? I agree with you that some people are 6 ft and lanky and some people are 5 ft and busty. That's not what the original poster was asserting.\n\nThis is the questionable part:\n\n> Naturally skinny people have been proven to be able to eat more/exercise less and stay thin.",
  'answer': 'eat more/exercise less',
  'score_y': 0.5534850358963013},
 {'content': "Do Bodies Really Revert to 'Preferred Weight'? I can offer another anecdotal example- I was 128lb from high school until having kids, then had large fluctuations since having kids. After each kid, it took some effort to diet/ exercise and get back to my former weight, but no matter how hard i try i really struggle to get below it. I would love to be 120lb (I'm 5'6) but my hunger cues just seem to go crazy below 128, which is where I'm at now. I achieved it briefly by intermittent fasting but was preoccupied with food and it didn't feel sustainable",
  'answer': 'intermitte

In [15]:
prediction["documents"][2].to_dict()["content"]

"muscle loss in deficit Yepp.\n\n0.7% - 1.5% of total body weight lost each week is a good rule of thumb for most people.\n\nHell, when I was big I was eating 2,500 calories a day, doing zero exercise, and was losing around 4-5lbs a week on average at the beginning, couple days where I'd lose 10lbs overnight that even stayed off"

In [16]:
pprint(prediction)

{'answers': [<Answer {'answer': 'eat more/exercise less', 'type': 'extractive', 'score': 0.5534850358963013, 'context': ' poster was asserting.\n\nThis is the questionable part:\n\n> Naturally skinny people have been proven to be able to eat more/exercise less and stay thin.', 'offsets_in_document': [{'start': 277, 'end': 299}], 'offsets_in_context': [{'start': 113, 'end': 135}], 'document_ids': ['a884b808bcd5cc6cb94c2a9340f1f580'], 'meta': {'_split_id': 0}}>,
             <Answer {'answer': 'intermittent fasting', 'type': 'extractive', 'score': 0.5146870613098145, 'context': "crazy below 128, which is where I'm at now. I achieved it briefly by intermittent fasting but was preoccupied with food and it didn't feel sustainable", 'offsets_in_document': [{'start': 472, 'end': 492}], 'offsets_in_context': [{'start': 69, 'end': 89}], 'document_ids': ['72b7b1d31ded896a99f189f573ef318e'], 'meta': {'_split_id': 0}}>,
             <Answer {'answer': 'diet', 'type': 'extractive', 'score': 0.35249

In [22]:
print_answers(
    prediction,
    details="medium" ## Choose from `minimum`, `medium` and `all`
)


Query: Is sourdough bread good for health?
Answers:
[   {   'answer': 'sourdough bread is healthy',
        'context': 'used by sourdough fermentation and a\n'
                   'consumer perception that sourdough bread is healthy(33),\n'
                   'studies evaluating the role of sourdough for appetite an',
        'score': 0.44866815209388733},
    {   'answer': 'sourdough bread induced a significantly\n'
                  'lower plasma glucose response',
        'context': '0, 60, 120, and 180 min.\n'
                   'In IGT subjects sourdough bread induced a significantly\n'
                   'lower plasma glucose response at 30 minutes (p = 0.048)\n'
                   'and a smaller',
        'score': 0.2772848308086395},
    {   'answer': 'leavened for 8 h using a\nstarter',
        'context': 'Sourdough bread was leavened for 8 h using a\n'
                   'starter containing autochthonous Saccharomyces cerevisiae\n'
                   'and several bacilli able