In [17]:
from haystack import Pipeline
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import (
    clean_wiki_text,
    convert_files_to_dicts,
    fetch_archive_from_http,
    print_documents,
)
from haystack.nodes import (
    PreProcessor,
    DensePassageRetriever,
    SentenceTransformersRanker,
)
from haystack.document_stores import FAISSDocumentStore

# Create a Document Store

In [18]:
# get the data
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
docs = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# docs = []

# load and clean documents with processor
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0
)
documents = processor.process(docs)

Found data stored in `data/article_txt_got`. Delete this first if you really want to fetch new data.
Converting data/article_txt_got/145_Elio_M._García_Jr._and_Linda_Antonsson.txt
Converting data/article_txt_got/368_Jaime_Lannister.txt
Converting data/article_txt_got/133_Game_of_Thrones__Season_5__soundtrack_.txt
Converting data/article_txt_got/515_The_Door__Game_of_Thrones_.txt
Converting data/article_txt_got/119_Walk_of_Punishment.txt
Converting data/article_txt_got/369_Samwell_Tarly.txt
Converting data/article_txt_got/356_Tales_of_Dunk_and_Egg.txt
Converting data/article_txt_got/195_World_of_A_Song_of_Ice_and_Fire.txt
Converting data/article_txt_got/25_Game_of_Thrones__Season_2__soundtrack_.txt
Converting data/article_txt_got/460_Battle_of_the_Bastards.txt
Converting data/article_txt_got/511_After_the_Thrones.txt
Converting data/article_txt_got/69_The_Red_Woman.txt
Converting data/article_txt_got/407_The_Long_Night__Game_of_Thrones_.txt
Converting data/article_txt_got/201_A_Game_of_

Converting data/article_txt_got/400_Winterfell__Game_of_Thrones_episode_.txt
Converting data/article_txt_got/373_Tywin_Lannister.txt
Converting data/article_txt_got/75_Blackwater__Game_of_Thrones_.txt
Converting data/article_txt_got/378_A_Game_of_Thrones__board_game_.txt
Converting data/article_txt_got/512_Home__Game_of_Thrones_.txt
Converting data/article_txt_got/80_A_Song_of_Ice_and_Fire_fandom.txt
Converting data/article_txt_got/229_Game_of_Thrones.txt
Converting data/article_txt_got/56_First_of_His_Name.txt
Converting data/article_txt_got/263_Tormund_Giantsbane.txt
Converting data/article_txt_got/513_Oathbreaker__Game_of_Thrones_.txt
Converting data/article_txt_got/487_Ramsay_Bolton.txt
Converting data/article_txt_got/365_A_Song_of_Ice_and_Fire_Roleplaying.txt
Converting data/article_txt_got/73_A_Man_Without_Honor.txt
Converting data/article_txt_got/208_Robb_Stark.txt
Converting data/article_txt_got/343_Catelyn_Stark.txt
Converting data/article_txt_got/232_Tommen_Baratheon.txt
Conv

### Semantic Search with FAISS

In [19]:
document_store = FAISSDocumentStore(
    similarity='dot_product',
    return_embedding=True,
    faiss_index_factory_str="Flat"
)
document_store.write_documents(documents, duplicate_documents='overwrite')

# don't save yet - this type of doc_store will need to be updated for a retriever

# Create a Document Retriever

This will take a search term as input and return relevant documents from the document store.

In [20]:
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)

Using devices: CPU
Number of GPUs: 0


In [21]:
# Important:
# Now that after we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it the existing doc embeddings which is very fast.
document_store.update_embeddings(retriever)

# save/load document store
document_store.save("my_dataset.faiss")
document_store = FAISSDocumentStore.load("my_dataset.faiss")

Updating embeddings for 3443 docs...
Updating Embedding:   0%|          | 0/3443 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/3456 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [32:01,  5.20 docs/s]           


# Create a Ranker (Optional)

The ranker helps to sort semantically similar content first.  It is more useful for non-semantic retrievers like BM25, but can still help with Dense Passage retrievers. 

In [22]:
# ranker is optional, but can improve the results of the retriever
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

Using devices: CPU
Number of GPUs: 0


# Create the Pipeline

In [23]:
p = Pipeline()
p.add_node(component=retriever_b, name="Retriever", inputs=["Query"])
# p.add_node(component=ranker, name="Ranker", inputs=["Retriever"])

In [24]:
# run a search
search_term = "international climate conferences"

nbr_results_to_return = int(25) # need to limit results, otherwise all docs in corpus will be returned
result = p.run(
    query=search_term,
    params={"top_k": nbr_results_to_return}
)

Exception: Exception while running node `Retriever` with input `{'root_node': 'Query', 'params': {'top_k': 25}, 'query': 'international climate conferences', 'node_id': 'Retriever'}`: unsupported operand type(s) for *: 'float' and 'NoneType', full stack trace: Traceback (most recent call last):
  File "/Users/nicholaslincoln/opt/anaconda3/lib/python3.8/site-packages/haystack/pipelines/base.py", line 337, in run
    node_output, stream_id = self.graph.nodes[node_id]["component"]._dispatch_run(**node_input)
  File "/Users/nicholaslincoln/opt/anaconda3/lib/python3.8/site-packages/haystack/nodes/base.py", line 233, in _dispatch_run
    output, stream = self.run(**run_inputs, **run_params)
  File "/Users/nicholaslincoln/opt/anaconda3/lib/python3.8/site-packages/haystack/nodes/retriever/base.py", line 219, in run
    output, stream = run_query_timed(query=query, filters=filters, top_k=top_k, index=index)
  File "/Users/nicholaslincoln/opt/anaconda3/lib/python3.8/site-packages/haystack/nodes/retriever/base.py", line 70, in wrapper
    ret = fn(*args, **kwargs)
  File "/Users/nicholaslincoln/opt/anaconda3/lib/python3.8/site-packages/haystack/nodes/retriever/base.py", line 235, in run_query
    documents = self.retrieve(query=query, filters=filters, top_k=top_k, index=index)
  File "/Users/nicholaslincoln/opt/anaconda3/lib/python3.8/site-packages/haystack/nodes/retriever/dense.py", line 207, in retrieve
    documents = self.document_store.query_by_embedding(query_emb=query_emb[0], top_k=top_k, filters=filters, index=index)
  File "/Users/nicholaslincoln/opt/anaconda3/lib/python3.8/site-packages/haystack/document_stores/memory.py", line 199, in query_by_embedding
    score = np.dot(query_emb, doc.embedding) / (
  File "<__array_function__ internals>", line 5, in dot
TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'


In [None]:
len(result['documents'])

In [None]:
len(docs)

In [None]:
result['documents']