In [19]:
%%bash
# Install the required modules.
# NOTE: the %%bash cell magic has to be the very first line of the cell;
# with a comment above it IPython does not run the cell as a bash script.

pip install --upgrade pip
# Quote the requirement so the shell never tries to glob-expand the [extras] brackets.
pip install "farm-haystack[colab,inference]"



In [20]:
# The document corpus is stored in a Google Drive folder, so Drive has to be
# mounted under /content/drive before the file can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Path of the FAQ CSV inside the mounted Drive; the loading cell below reads it with pandas.
file_path = "/content/drive/MyDrive/NLP/UST/small_faq_covid.csv"

In [22]:
from haystack.document_stores import InMemoryDocumentStore
# Document store shared by the rest of the notebook: it is passed to the retriever
# and later filled via write_documents().
# NOTE(review): going by the class name the data lives in memory only, so it
# presumably has to be rebuilt after a kernel restart — confirm.
document_store = InMemoryDocumentStore()

In [23]:
from haystack.nodes import EmbeddingRetriever

# Retriever bound to the document store above. It is used twice further down:
# to embed the stored FAQ questions and as the retriever node of the FAQPipeline.
retriever = EmbeddingRetriever(
    document_store=document_store,
    # Sentence-transformers model used to produce the embeddings.
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    # NOTE(review): presumably falls back to CPU when no GPU is attached — confirm.
    use_gpu=True,
    # NOTE(review): presumably returns raw similarity scores instead of
    # rescaled ones — confirm against the Haystack docs.
    scale_score=False,
)

In [24]:
import pandas as pd

# Load the FAQ table; it is expected to have the columns "question" and "answer"
# plus optional custom metadata columns.
# Minimal cleaning is chained so no frame is mutated in place: missing values
# become empty strings, then the questions are whitespace-stripped.
df = pd.read_csv(file_path).fillna("")
df["question"] = df["question"].str.strip()
print(df.head())

# Create embeddings for our questions from the FAQs.
# In contrast to most other search use cases, the embeddings are not built from the
# document content but from the "question" text, because we want to match
# "incoming question" <-> "stored question".
questions = df["question"].tolist()
df["embedding"] = retriever.embed_queries(queries=questions).tolist()

# The question text is stored under the "content" column in the document records.
df = df.rename(columns={"question": "content"})

# Write one document per FAQ row to the DocumentStore.
document_store.write_documents(df.to_dict(orient="records"))


                                            question  \
0                       What is a novel coronavirus?   
1  Why is the disease being called coronavirus di...   
2  Why might someone blame or avoid individuals a...   
3  How can people help stop stigma related to COV...   
4                   What is the source of the virus?   

                                              answer  \
0  A novel coronavirus is a new coronavirus that ...   
1  On February 11, 2020 the World Health Organiza...   
2  People in the U.S. may be worried or anxious a...   
3  People can fight stigma and help, not hurt, ot...   
4  Coronaviruses are a large family of viruses. S...   

                                         answer_html  \
0  <p>A novel coronavirus is a new coronavirus th...   
1  <p>On February 11, 2020 the World Health Organ...   
2  <p>People in the U.S. may be worried or anxiou...   
3  <p>People can fight stigma and help, not hurt,...   
4  <p>Coronaviruses are a large family of viru

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [28]:
# Run a single query through the FAQ pipeline and print what comes back.
from haystack.pipelines import FAQPipeline
from haystack.utils import print_answers

# The pipeline reuses the retriever created earlier in the notebook.
pipe = FAQPipeline(retriever=retriever)

# Ask for only the single best-matching FAQ entry (top_k=1).
prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 1}})

# details="minimum" was the alternative setting tried here; "medium" is the one in use.
print_answers(prediction, details="medium")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Query: How is the virus spreading?'
'Answers:'
[   {   'answer': 'This virus was first detected in Wuhan City, Hubei '
                  'Province, China. The first infections were linked to a live '
                  'animal market, but the virus is now spreading from '
                  'person-to-person. It’s important to note that '
                  'person-to-person spread can happen on a continuum. Some '
                  'viruses are highly contagious (like measles), while other '
                  'viruses are less so.\n'
                  '\n'
                  'The virus that causes COVID-19 seems to be spreading easily '
                  'and sustainably in the community (“community spread”) in '
                  'some affected geographic areas. Community spread means '
                  'people have been infected with the virus in an area, '
                  'including some who are not sure how or where they became '
                  'infected.\n'
                  '