<a href="https://colab.research.google.com/github/dk7860/AI-Semantic-Search-Mercor-Project/blob/main/AI_Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U pinecone-client
!pip install -U 'farm-haystack[pinecone]'>=1.8.0
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pinecone
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import re
from sentence_transformers import SentenceTransformer
from flask import Flask, request, jsonify, json
import warnings
warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm


## Initializing the Pinecone Document Store

In [3]:
# Mount Google Drive at /content/drive; the Pinecone config file read in the
# next cell is loaded from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load the Pinecone settings (api_key, env, index_name, metric) from Drive.
# The context manager closes the file handle as soon as parsing finishes
# instead of leaving it open for the lifetime of the kernel.
with open("/content/drive/MyDrive/pinecone_config.json") as config_file:
    config = json.load(config_file)

In [5]:
from haystack.document_stores import PineconeDocumentStore

# Gather the connection settings from the loaded config first, then hand them
# to the store in one go.
# NOTE(review): embedding_dim presumably has to match the output size of the
# retriever's embedding model - confirm if the model is ever swapped.
pinecone_settings = {
    "api_key": config['api_key'],
    "environment": config["env"],
    "index": config["index_name"],
    "similarity": config["metric"],
    "embedding_dim": 768,
}
document_store = PineconeDocumentStore(**pinecone_settings)

## Preparing and Indexing Documents
We will use the Wiki Snippets dataset (`vblagoje/wikipedia_snippets_streamed`), loaded in streaming mode.

In [6]:
from datasets import load_dataset

# Open the train split in streaming mode; the result is an iterable dataset
# that is consumed record by record by the cells below.
wiki_data = load_dataset('vblagoje/wikipedia_snippets_streamed', split='train', streaming=True)
wiki_data

<datasets.iterable_dataset.IterableDataset at 0x7f75c2f7fd00>

In [7]:
# Peek at the first record of the stream to see which fields each document
# carries (titles, paragraph/character offsets and the passage text).
next(iter(wiki_data))

{'wiki_id': 'Q7593707',
 'start_paragraph': 2,
 'start_character': 0,
 'end_paragraph': 6,
 'end_character': 511,
 'article_title': "St John the Baptist's Church, Atherton",
 'section_title': 'History',
 'passage_text': "St John the Baptist's Church, Atherton History There have been three chapels or churches on the site of St John the Baptist parish church. The first chapel at Chowbent was built in 1645 by John Atherton as a chapel of ease of Leigh Parish Church. It was sometimes referred to as the Old Bent Chapel. It was not consecrated and used by the Presbyterians as well as the Vicar of Leigh. In 1721 Lord of the manor Richard Atherton expelled the dissenters who subsequently built Chowbent Chapel. The first chapel was consecrated in 1723 by the Bishop of Sodor and"}

In [8]:
def _is_history_section(record):
    """Return True when the record's section title begins with 'History'."""
    return record['section_title'].startswith('History')


# Keep only the snippets whose section title starts with "History". This is a
# prefix match, so longer titles that merely begin with "History" pass too.
history = wiki_data.filter(_is_history_section)
history

<datasets.iterable_dataset.IterableDataset at 0x7f75c2e1f460>

## Initializing the Retriever

In [9]:
import torch
# Report whether PyTorch can see a CUDA device. The notebook still runs on
# CPU, but the embedding step below will be slower.
torch.cuda.is_available()

True

In [10]:
from haystack.nodes import EmbeddingRetriever

# Dense retriever bound to the Pinecone document store. The same object is
# used below both to embed documents before they are written and to serve
# queries inside the search / QA pipelines.
# NOTE(review): the model's embedding size presumably has to equal the
# embedding_dim the document store was created with - confirm when changing it.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    model_format="sentence_transformers"
)

To index the documents, we first create Haystack `Document` objects holding the content and metadata of each passage. We then iterate through the filtered dataset and write the documents to the document store in batches of 256, computing the embeddings for each batch just before it is written.

In [11]:
from haystack import Document
from tqdm.auto import tqdm  # progress bar

total_doc_count = 25000
batch_size = 256


def _embed_and_write(batch):
    """Embed every document in `batch` and write the batch to the document store.

    Does nothing for an empty batch, so it is safe to call for the final
    (possibly empty) remainder after the loop.
    """
    if not batch:
        return
    embeds = retriever.embed_documents(batch)
    for batch_doc, embedding in zip(batch, embeds):
        batch_doc.embedding = embedding
    document_store.write_documents(batch)


counter = 0
docs = []
for d in tqdm(history, total=total_doc_count):
    # create haystack document object with text content and doc metadata
    doc = Document(
        content=d["passage_text"],
        meta={
            "article_title": d["article_title"],
            'section_title': d['section_title']
        }
    )
    docs.append(doc)
    counter += 1
    if len(docs) >= batch_size:
        # write the docs every time `batch_size` docs have accumulated
        _embed_and_write(docs)
        docs.clear()
    if counter == total_doc_count:
        break

# Flush the final partial batch. total_doc_count is not a multiple of
# batch_size (25000 % 256 == 168), and the stream may also end early; without
# this step those trailing documents would never be embedded or written.
_embed_and_write(docs)
docs.clear()

  0%|          | 0/25000 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

In [20]:
# Sanity check after indexing: ask the document store how many embeddings it
# holds. NOTE(review): presumably this counts every vector in the index,
# including ones written by earlier runs - confirm before comparing it to
# total_doc_count.
document_store.get_embedding_count()

49915

In [21]:
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents

# Retrieval-only smoke test: fetch the four best-matching passages for a
# sample question and print them, before any answer generation is involved.
search_pipe = DocumentSearchPipeline(retriever)
result = search_pipe.run(
    query="When was the first electric power system built?", params={"Retriever": {"top_k": 4}}
)
print_documents(result)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: When was the first electric power system built?

{   'content': 'Electric power system History In 1881, two electricians built '
               "the world's first power system at Godalming in England. It was "
               'powered by two waterwheels and produced an alternating current '
               'that in turn supplied seven Siemens arc lamps at 250 volts and '
               '34 incandescent lamps at 40 volts. However, supply to the '
               'lamps was intermittent and in 1882 Thomas Edison and his '
               'company, The Edison Electric Light Company, developed the '
               'first steam-powered electric power station on Pearl Street in '
               'New York City. The Pearl Street Station initially powered '
               'around 3,000 lamps for 59 customers. The power station '
               'generated direct current and',
    'name': None}

{   'content': 'by a coal burning steam engine, and it started generating '
               'electr

### Initializing the Generator
For the generator we will load Haystack’s generic Seq2SeqGenerator.

In [22]:
from haystack.nodes import Seq2SeqGenerator

# Sequence-to-sequence generator that will write the answers; it is combined
# with the retriever in the generative QA pipeline below.
generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")

### Initializing a Generative QA Pipeline
Finally, we need to add the retriever and generator to Haystack's GenerativeQAPipeline, a ready-made pipeline for generative QA tasks.

In [23]:
from haystack.pipelines import GenerativeQAPipeline

# Combine the generator and the retriever into one pipeline; the "Retriever"
# and "Generator" keys in the params of the queries below address these two
# components.
pipe = GenerativeQAPipeline(generator, retriever)

### Asking Questions

In [50]:
from haystack.utils import print_answers

# Retrieve the 3 best passages, generate a single answer from them, and print
# only the answer text.
result = pipe.run(
    query="When did India got Independence?",
    params={"Retriever": {"top_k": 3}, "Generator": {"top_k": 1}},
)
print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Query: When did India got Independence?'
'Answers:'
[   {   'answer': 'India was a part of the British Empire until 1947, when it '
                  'was partitioned into two parts, Pakistan and Bangladesh. '
                  'The independence came through the Indian Independence Act '
                  '1947 (10 & 11 Geo 6 C 30), an Act of the Parliament of the '
                  'United Kingdom that partitioned British India into the two '
                  'new independent Dominions of India and Pakistan.'}]


In [51]:
# Same setup as before: 3 retrieved passages feeding 1 generated answer.
result = pipe.run(
    query="who was the first person on the moon?",
    params={"Retriever": {"top_k": 3}, "Generator": {"top_k": 1}},
)
print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Query: who was the first person on the moon?'
'Answers:'
[{'answer': 'The first man to walk on the moon was Neil Armstrong in 1969.'}]


In [52]:
# Biography-style question; 3 retrieved passages, 1 generated answer.
result = pipe.run(
    query="Who was Atal bihari vajpayee?",
    params={"Retriever": {"top_k": 3}, "Generator": {"top_k": 1}},
)
print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Query: Who was Atal bihari vajpayee?'
'Answers:'
[   {   'answer': 'Atal Bihari Vajpayee was the Prime Minister of India from '
                  '1996-2004. He was the leader of the Bharatiya Janata Party '
                  '(BJP), the largest political party in India.'}]


In [53]:
# Question about a recent topic; 3 retrieved passages, 1 generated answer.
result = pipe.run(
    query="where did COVID-19 originate?",
    params={"Retriever": {"top_k": 3}, "Generator": {"top_k": 1}},
)
print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Query: where did COVID-19 originate?'
'Answers:'
[   {   'answer': 'COVID-19 is a zoonotic disease, which means that it is a '
                  'virus that is transmitted from one animal to another. This '
                  'means that there is no way to know for sure where it came '
                  'from.'}]


In [65]:
# Two-part question; this time the generator is given 10 retrieved passages
# instead of 3, and still produces a single answer.
result = pipe.run(
    query="who created the Nobel prize and why?",
    params={"Retriever": {"top_k": 10}, "Generator": {"top_k": 1}},
)
print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Query: who created the Nobel prize and why?'
'Answers:'
[   {   'answer': 'The Nobel Prize was created by Alfred Nobel in his will in '
                  '1896. The idea was that he would use his fortune to create '
                  'a series of prizes for those who confer the "greatest '
                  'benefit on mankind" in physics, chemistry, physiology or '
                  'medicine, literature, and peace. His will was approved by '
                  'the Storting in Norway until 26 April 1897. The Nobel '
                  'Foundation was founded as a private organization on 29 June '
                  '1900. Nobel bequeathed 94% of his total assets to the Nobel '
                  'Foundation that now forms the economic base of the Nobel '
                  'Prize.'}]
