# OpenSearch: support for all similarities

In [None]:
from haystack.document_stores import OpenSearchDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers
# Document store on the "document" index, configured with dot-product similarity.
document_store = OpenSearchDocumentStore(
    port=9201,
    index="document",
    similarity="dot_product",
)

# DPR retriever bound to the store above, using the single-nq-base
# question and context encoders.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=False,
    embed_title=True,
    use_fast_tokenizers=True,
)

# Indexing happens here (presumably by running the cells under "Indexing tools").
# Wrap the retriever in a document search pipeline for the dot-product queries.
dot_pipe = DocumentSearchPipeline(retriever)

In [None]:
# Re-create the store on the same "document" index, this time asking for cosine.
# NOTE(review): the recorded outputs of the dot_pipe and cos_pipe queries below
# are identical, so the requested similarity does not appear to change scoring
# on this index -- confirm against the "Full similarity support" section.
document_store = OpenSearchDocumentStore(
    port=9201,
    index="document",
    similarity="cosine",
)

# Same DPR configuration as for the dot-product store, bound to the new store.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=False,
    embed_title=True,
    use_fast_tokenizers=True,
)

# Pipeline used for the cosine queries.
cos_pipe = DocumentSearchPipeline(retriever)

In [6]:
# Run the query through dot_pipe, asking the retriever for the top 10 documents.
dot_prediction = dot_pipe.run(
    query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}}
)
# Display each returned document's metadata alongside its score.
[(doc.meta, doc.score) for doc in dot_prediction["documents"]]



[({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 0.6769540665167387),
 ({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 0.6745885570608239),
 ({'_split_id': 3, 'name': '214_Dothraki_language.txt'}, 0.672530182282669),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 0.6719121511193881),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6707789192234145),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 0.6680837806296385),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  0.6675130212722301),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  0.6673373773617681),
 ({'_split_id': 6, 'name': '214_Dothraki_language.txt'}, 0.6659702524455079),
 ({'_split_id': 3, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6654072191483222)]

In [8]:
# Run the same query through cos_pipe, again asking for the top 10 documents.
prediction = cos_pipe.run(
    query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}}
)
# Display each returned document's metadata alongside its score.
[(doc.meta, doc.score) for doc in prediction["documents"]]



[({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 0.6769540665167387),
 ({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 0.6745885570608239),
 ({'_split_id': 3, 'name': '214_Dothraki_language.txt'}, 0.672530182282669),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 0.6719121511193881),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6707789192234145),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 0.6680837806296385),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  0.6675130212722301),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  0.6673373773617681),
 ({'_split_id': 6, 'name': '214_Dothraki_language.txt'}, 0.6659702524455079),
 ({'_split_id': 3, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6654072191483222)]

## Indexing tools

In [None]:
# Remove everything currently held by the document store before indexing afresh.
document_store.delete_all_documents()
# Fetch the Game of Thrones wiki text archive into doc_dir.
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert the files in doc_dir to dicts, cleaning them with clean_wiki_text
# and requesting paragraph splitting (split_paragraphs=True).
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Now, let's write the dicts containing the documents to our document store.
document_store.write_documents(dicts)

In [None]:
# Important:
# Now that the DPR retriever is initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)

## Full similarity support

In [None]:
# Store on a separate "document_all_sim" index with full_similarity_support
# switched on; dot-product similarity is requested for this instance.
document_store = OpenSearchDocumentStore(
    port=9201,
    index="document_all_sim",
    full_similarity_support=True,
    similarity="dot_product",
)

In [None]:
from haystack.nodes import DensePassageRetriever
# DPR retriever bound to the current document_store.
# NOTE(review): use_gpu=True here, whereas the other retrievers in this
# notebook pass use_gpu=False -- confirm this difference is intentional.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)
# Indexing happens here (presumably by running the cells under "Indexing tools").

In [4]:
from haystack.pipelines import DocumentSearchPipeline
# Wrap the current retriever in a document search pipeline.
pipe = DocumentSearchPipeline(retriever)

In [None]:
# Run the query through pipe, asking the retriever for the top 10 documents.
prediction = pipe.run(
    query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}}
)

dot_product Results

In [6]:
# Display (metadata, score) for every document in `prediction`.
[(doc.meta, doc.score) for doc in prediction["documents"]]

[({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 0.6769568438386782),
 ({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 0.6745939791526845),
 ({'_split_id': 3, 'name': '214_Dothraki_language.txt'}, 0.6725383837194138),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 0.6719167804731334),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6707756398218706),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 0.6680890360325746),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  0.6675157289272265),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  0.667340485329317),
 ({'_split_id': 6, 'name': '214_Dothraki_language.txt'}, 0.6659787390071147),
 ({'_split_id': 3, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6653999165018826)]

## And now cosine similarity

In [None]:
# Second store object on the same "document_all_sim" index, again with
# full_similarity_support enabled, but requesting cosine similarity.
cos_document_store = OpenSearchDocumentStore(
    port=9201,
    index="document_all_sim",
    full_similarity_support=True,
    similarity="cosine",
)

# DPR retriever bound to the cosine store.
cos_retriever = DensePassageRetriever(
    document_store=cos_document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=False,
    embed_title=True,
    use_fast_tokenizers=True,
)

# Pipeline used for the cosine queries.
cos_pipe = DocumentSearchPipeline(cos_retriever)

In [None]:
# Run the same query through cos_pipe, asking for the top 10 documents.
cos_prediction = cos_pipe.run(
    query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}}
)

In [9]:
# Display (metadata, score) for every document in `cos_prediction`.
[(doc.meta, doc.score) for doc in cos_prediction["documents"]]

[({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 0.86852325),
 ({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 0.8657590150000001),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 0.8617915),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  0.8612226000000001),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 0.8594471),
 ({'_split_id': 6, 'name': '229_Game_of_Thrones.txt'}, 0.85899538),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  0.85796915),
 ({'_split_id': 2, 'name': '130_Game_of_Thrones_title_sequence.txt'},
  0.85729623),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'}, 0.8571682),
 ({'_split_id': 5, 'name': '450_Baelor.txt'}, 0.85715997)]

In [10]:
# Display (metadata, score) for every document in `prediction`, for comparison.
[(doc.meta, doc.score) for doc in prediction["documents"]]

[({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 0.6769568438386782),
 ({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 0.6745939791526845),
 ({'_split_id': 3, 'name': '214_Dothraki_language.txt'}, 0.6725383837194138),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 0.6719167804731334),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6707756398218706),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 0.6680890360325746),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  0.6675157289272265),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  0.667340485329317),
 ({'_split_id': 6, 'name': '214_Dothraki_language.txt'}, 0.6659787390071147),
 ({'_split_id': 3, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6653999165018826)]

# Misc

original flat impl

In [10]:
# Display (metadata, score) for every document in `prediction`.
[(doc.meta, doc.score) for doc in prediction["documents"]]

[({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 0.6769568438386782),
 ({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 0.6745939791526845),
 ({'_split_id': 3, 'name': '214_Dothraki_language.txt'}, 0.6725383837194138),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 0.6719167804731334),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6707756398218706),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 0.6680890360325746),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  0.6675157289272265),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  0.667340485329317),
 ({'_split_id': 6, 'name': '214_Dothraki_language.txt'}, 0.6659787390071147),
 ({'_split_id': 3, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6653999165018826)]

non-global flat impl

In [14]:
# Display (metadata, score) for every document in `prediction`.
[(doc.meta, doc.score) for doc in prediction["documents"]]

[({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 0.6769568438386782),
 ({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 0.6745939791526845),
 ({'_split_id': 3, 'name': '214_Dothraki_language.txt'}, 0.6725383837194138),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 0.6719167804731334),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6707756398218706),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 0.6680890360325746),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  0.6675157289272265),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  0.667340485329317),
 ({'_split_id': 6, 'name': '214_Dothraki_language.txt'}, 0.6659787390071147),
 ({'_split_id': 3, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6653999165018826)]

exact nonglobal flat impl

In [7]:
# Display (metadata, score) for every document in `prediction`.
[(doc.meta, doc.score) for doc in prediction["documents"]]

[({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 0.6769567782327921),
 ({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 0.6745940889111461),
 ({'_split_id': 3, 'name': '214_Dothraki_language.txt'}, 0.6725383528871421),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 0.6719166922952791),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6707757612814838),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 0.6680889783785924),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  0.6675157289272265),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  0.6673404298300241),
 ({'_split_id': 6, 'name': '214_Dothraki_language.txt'}, 0.6659787390071147),
 ({'_split_id': 3, 'name': '9_Game_of_Thrones_Tapestry.txt'},
  0.6653998719733076)]

exact nonglobal flat cosine

In [12]:
# Display (metadata, score) for every document in `prediction`.
[(doc.meta, doc.score) for doc in prediction["documents"]]

[({'_split_id': 0, 'name': '214_Dothraki_language.txt'}, 1.3216234999999998),
 ({'_split_id': 1, 'name': '214_Dothraki_language.txt'}, 1.31649075),
 ({'_split_id': 9, 'name': '87_Valar_Dohaeris.txt'}, 1.3090005),
 ({'_split_id': 0,
   'name': '504_List_of_A_Song_of_Ice_and_Fire_video_games.txt'},
  1.3079078),
 ({'_split_id': 2, 'name': '214_Dothraki_language.txt'}, 1.30449205),
 ({'_split_id': 6, 'name': '229_Game_of_Thrones.txt'}, 1.30360235),
 ({'_split_id': 0,
   'name': '469_Outline_of_A_Song_of_Ice_and_Fire_franchise.txt'},
  1.3016169),
 ({'_split_id': 2, 'name': '130_Game_of_Thrones_title_sequence.txt'},
  1.300287),
 ({'_split_id': 9, 'name': '9_Game_of_Thrones_Tapestry.txt'}, 1.3000393),
 ({'_split_id': 5, 'name': '450_Baelor.txt'}, 1.3000349500000001)]

In [11]:
# Fetch all documents from the store, requesting their embeddings as well.
retrieved_docs = document_store.get_all_documents(return_embedding=True)



In [12]:
# Inspect the first fetched document.
retrieved_docs[0]

<Document: {'content': "'''Joffrey Baratheon''' is a fictional character in the ''A Song of Ice and Fire'' series of epic fantasy novels by American author George R. R. Martin, and its television adaptation ''Game of Thrones''.\nIntroduced in 1996's ''A Game of Thrones'', Joffrey is the eldest son of Cersei Lannister from the continent of Westeros. He subsequently appeared in Martin's ''A Clash of Kings'' (1998) and ''A Storm of Swords'' (2000). He is characterized as a spoiled, sadistic bully and is a frequent abuser of Sansa Stark, to whom he becomes engaged in the first novel, and his uncle Tyrion, whom he enjoys ridiculing.\nJoffrey is portrayed by Irish actor Jack Gleeson in the HBO television adaptation, a role for which he has received significant critical attention and praise.", 'content_type': 'text', 'score': None, 'meta': {'_split_id': 0, 'name': '37_Joffrey_Baratheon.txt'}, 'embedding': array([-7.40480781e-01,  2.80568808e-01,  3.81777883e-02, -5.71293533e-01,
       -2.892

## About us

This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany.

We bring NLP to the industry via open source!  
Our focus: Industry specific language models & large scale QA systems.  
  
Some of our other work: 
- [German BERT](https://deepset.ai/german-bert)
- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
- [FARM](https://github.com/deepset-ai/FARM)

Get in touch:
[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)

By the way: [we're hiring!](https://www.deepset.ai/jobs)