<a href="https://colab.research.google.com/github/dk74432/AI-Semantic-Search/blob/main/AI_Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U pinecone-client
!pip install -U 'farm-haystack[pinecone]'>=1.8.0
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pinecone-client
  Downloading pinecone_client-2.2.1-py3-none-any.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.2/177.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting loguru>=0.5.0 (from pinecone-client)
  Downloading loguru-0.7.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting dnspython>=2.0.0 (from pinecone-client)
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: loguru, dnspython, pinecone-client
Successfully installed dnspython-2.3.0 loguru-0.7.0 pinecone-client-2.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are in

In [2]:
import pinecone
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import re
from sentence_transformers import SentenceTransformer
from flask import Flask, request, jsonify, json
import warnings
warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm


## Initializing the Pinecone Document Store

In [3]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the Pinecone config file read
# in the next cell is stored under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load the Pinecone settings from Drive instead of hard-coding them in the
# notebook. The keys read later are 'api_key', 'env', 'index_name' and 'metric'.
# A context manager closes the file handle deterministically, rather than
# leaving it open until garbage collection.
with open("/content/drive/MyDrive/pinecone_config.json") as config_file:
    config = json.load(config_file)

In [5]:
from haystack.document_stores import PineconeDocumentStore

# Connection settings are taken from the JSON config loaded from Drive.
# NOTE(review): embedding_dim=768 presumably has to match the output size of
# the retriever's embedding model — confirm if the model is changed.
pinecone_settings = dict(
    api_key=config['api_key'],
    environment=config["env"],
    index=config["index_name"],
    similarity=config["metric"],
    embedding_dim=768,
)

document_store = PineconeDocumentStore(**pinecone_settings)

In [6]:
from datasets import load_dataset

# Open the Wikipedia snippets dataset in streaming mode: the result is an
# IterableDataset that is consumed record by record.
wiki_dataset_name = 'vblagoje/wikipedia_snippets_streamed'
wiki_data = load_dataset(wiki_dataset_name, split='train', streaming=True)
wiki_data

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

<datasets.iterable_dataset.IterableDataset at 0x7f920d9cb130>

In [7]:
# Peek at the first record of the stream to see which fields a document has
# (wiki_id, article_title, section_title, passage_text, ...).
record_stream = iter(wiki_data)
next(record_stream)

{'wiki_id': 'Q7593707',
 'start_paragraph': 2,
 'start_character': 0,
 'end_paragraph': 6,
 'end_character': 511,
 'article_title': "St John the Baptist's Church, Atherton",
 'section_title': 'History',
 'passage_text': "St John the Baptist's Church, Atherton History There have been three chapels or churches on the site of St John the Baptist parish church. The first chapel at Chowbent was built in 1645 by John Atherton as a chapel of ease of Leigh Parish Church. It was sometimes referred to as the Old Bent Chapel. It was not consecrated and used by the Presbyterians as well as the Vicar of Leigh. In 1721 Lord of the manor Richard Atherton expelled the dissenters who subsequently built Chowbent Chapel. The first chapel was consecrated in 1723 by the Bishop of Sodor and"}

In [8]:
def _is_history_section(d):
    """Return True when the record's section_title begins with 'History'."""
    return d['section_title'].startswith('History')


# Keep only the records whose section title starts with 'History'
# (a prefix match, so titles such as 'History of ...' are kept as well).
history = wiki_data.filter(_is_history_section)
history

<datasets.iterable_dataset.IterableDataset at 0x7f920d995c60>

## Initializing the Retriever

In [9]:
import torch

# Report whether CUDA is usable; embedding documents still works on CPU,
# only slower.
gpu_available = torch.cuda.is_available()
gpu_available

True

In [10]:
from haystack.nodes import EmbeddingRetriever

# Sentence-transformers model used to embed both the documents and the queries.
embedding_model_name = "flax-sentence-embeddings/all_datasets_v3_mpnet-base"

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model=embedding_model_name,
    model_format="sentence_transformers",
)

Downloading (…)e933c/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)cbe6ee933c/README.md:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

Downloading (…)e6ee933c/config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)33c/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)e933c/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)933c/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)cbe6ee933c/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)6ee933c/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [11]:
from haystack import Document
from tqdm.auto import tqdm  # progress bar

total_doc_count = 500
batch_size = 256


def _embed_and_write(batch):
    """Embed every document in `batch` with the retriever and write the batch to the document store."""
    embeds = retriever.embed_documents(batch)
    for batch_doc, embedding in zip(batch, embeds):
        batch_doc.embedding = embedding
    document_store.write_documents(batch)


counter = 0
docs = []
for d in tqdm(history, total=total_doc_count):
    # create haystack document object with text content and doc metadata
    doc = Document(
        content=d["passage_text"],
        meta={
            "article_title": d["article_title"],
            'section_title': d['section_title']
        }
    )
    docs.append(doc)
    counter += 1
    if counter % batch_size == 0:
        # write docs every time `batch_size` docs have accumulated
        _embed_and_write(docs)
        docs.clear()
    if counter == total_doc_count:
        break

# Flush the final partial batch. Without this, any documents left over when
# total_doc_count is not a multiple of batch_size (244 of 500 here) would be
# collected but never embedded or written to the index.
if docs:
    _embed_and_write(docs)
    docs.clear()

  0%|          | 0/500 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

In [12]:
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents

# Retrieval-only pipeline: run the query through the retriever and show the
# four best-matching documents.
search_pipe = DocumentSearchPipeline(retriever)

search_query = "When was the first electric power system built?"
search_params = {"Retriever": {"top_k": 4}}
result = search_pipe.run(query=search_query, params=search_params)

print_documents(result)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: When was the first electric power system built?

{   'content': 'Electric power system History In 1881, two electricians built '
               "the world's first power system at Godalming in England. It was "
               'powered by two waterwheels and produced an alternating current '
               'that in turn supplied seven Siemens arc lamps at 250 volts and '
               '34 incandescent lamps at 40 volts. However, supply to the '
               'lamps was intermittent and in 1882 Thomas Edison and his '
               'company, The Edison Electric Light Company, developed the '
               'first steam-powered electric power station on Pearl Street in '
               'New York City. The Pearl Street Station initially powered '
               'around 3,000 lamps for 59 customers. The power station '
               'generated direct current and',
    'name': None}

{   'content': 'by a coal burning steam engine, and it started generating '
               'electr

In [13]:
from haystack.nodes import Seq2SeqGenerator

# Sequence-to-sequence model that generates the answers for the QA pipeline.
generator_model_name = "vblagoje/bart_lfqa"
generator = Seq2SeqGenerator(model_name_or_path=generator_model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [14]:
from haystack.pipelines import GenerativeQAPipeline

# Combine the generator and the retriever into one question-answering pipeline.
pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)

In [15]:
# Ask the QA pipeline a question using only the single best retrieved document
# as context, and request one generated answer.
single_context_params = {
    "Retriever": {"top_k": 1},
    "Generator": {"top_k": 1},
}
result = pipe.run(
    query="When did India got Independence?",
    params=single_context_params,
)

result

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'query': 'When did India got Independence?',
 'answers': [<Answer {'answer': 'India became independent from the United Kingdom in 1947. The independence came through the Indian Independence Act 1947 (10 & 11 Geo 6 c 30), an Act of Parliament that partitioned British India into the two new independent Dominions of India (later Commonwealth of Nations) and Pakistan.', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_ids': ['73834f96219a2eb4296490da4fb79010'], 'meta': {'doc_scores': [0.768093616], 'content': ['Republic Day (India) History of Republic Day India achieved independence from British Raj on 15 August 1947 following the Indian independence movement. The independence came through the Indian Independence Act 1947 (10 & 11 Geo 6 c 30), an Act of the Parliament of the United Kingdom that partitioned British India into the two new independent Dominions of the British Commonwealth (later Commonwealth of Nations).

In [16]:
from haystack.utils import print_answers

# Same question as before, but the generator now receives the three best
# retrieved documents as context; print only the answer text.
three_context_params = {
    "Retriever": {"top_k": 3},
    "Generator": {"top_k": 1},
}
result = pipe.run(
    query="When did India got Independence?",
    params=three_context_params,
)

print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Query: When did India got Independence?'
'Answers:'
[   {   'answer': 'India was a part of the British Empire until 1947, when it '
                  'was partitioned into two parts, Pakistan and Bangladesh. '
                  'The independence came through the Indian Independence Act '
                  '1947 (10 & 11 Geo 6 C 30), an Act of the Parliament of the '
                  'United Kingdom that partitioned British India into the two '
                  'new independent Dominions of India and Pakistan.'}]


In [17]:
# Call the generator directly, bypassing the retriever: the only document
# supplied has empty content, so the answer is produced without any
# retrieved context.
empty_context = [Document(content="")]
result = generator.predict(
    query="who was the first person on the moon?",
    documents=empty_context,
    top_k=1,
)

print_answers(result, details="minimum")

'Query: who was the first person on the moon?'
'Answers:'
[{'answer': 'The first man to walk on the moon was Neil Armstrong.'}]
