# Lesson 1: Semantic Search

### Import packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils
import DLAIUtils

import os
import time
import torch

from tqdm.auto import tqdm

### Load the dataset

In [3]:
# Load rows 240000-289999 of the Quora "train" split (50,000 question pairs).
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally — confirm that is acceptable for this source.
dataset = load_dataset(
  "quora",
  split="train[240000:290000]",
  trust_remote_code=True,
)

Downloading data: 100%|██████████| 58.2M/58.2M [00:01<00:00, 39.2MB/s]
Generating train split: 100%|██████████| 404290/404290 [00:05<00:00, 78151.61 examples/s]


In [4]:
# Preview the first five records to see how each row is laid out.
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [14]:
# Flatten the question pairs: every record contributes the strings stored
# under its "text" key.
questions = []
for record in dataset["questions"]:
  questions.extend(record["text"])

# De-duplicate while keeping first-seen order. list(set(...)) returns the
# strings in an order that depends on string hashing, which varies between
# kernel sessions, so the subset sliced from `question` for indexing would
# change on every restart. dict.fromkeys gives the same unique strings in a
# reproducible order.
# `question` (singular) holds the unique strings; `questions` still contains
# repeats and is what the summary below reports on.
question = list(dict.fromkeys(questions))

print("\n".join(questions[:10]))
print("-" * 50)
print(f"Number of questions: {len(questions)}")

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


### Check CUDA and set up the model

We use the *all-MiniLM-L6-v2* sentence-transformers model, which maps sentences to a 384-dimensional dense vector space.

In [15]:
# Use the GPU when PyTorch reports CUDA as available; otherwise run on the CPU
# and say so.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
  print("Sorry no cuda.")

# Load the sentence-embedding model onto the selected device.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

Sorry no cuda.


In [17]:
# Sanity check: embed a single query and display the shape of the result.
query = "Which city is the most populated in the world?"
embedding = model.encode(query)
embedding.shape

(384,)

### Set up Pinecone

In [18]:
# Obtain the Pinecone API key through the DLAIUtils helper instead of
# hardcoding it in the notebook.
# NOTE(review): where the helper reads the key from is not visible here — confirm.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [24]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "dl-ai-quora"

# Drop an index of the same name if one already exists, so the creation
# below always starts from an empty index.
existing_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_names:
  pinecone.delete_index(INDEX_NAME)

print(f"Pinecone index: {INDEX_NAME}")

# The index dimension is read from the model so the two cannot disagree.
embedding_dim = model.get_sentence_embedding_dimension()
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
pinecone.create_index(
  name=INDEX_NAME,
  dimension=embedding_dim,
  metric="cosine",
  spec=serverless_spec,
)

# Handle used by the upsert and query cells.
index = pinecone.Index(INDEX_NAME)
print(index)

Pinecone index: dl-ai-quora
<pinecone.data.index.Index object at 0x30ffbb560>


### Create Embeddings and Upsert to Pinecone

In [26]:
# NOTE(review): json is not used in this cell; it belongs with the other
# imports at the top of the notebook.
import json

batch_size = 200
vector_limit = 10000

# Only the first `vector_limit` entries of `question` are embedded and indexed.
questions = question[:vector_limit]

for start in tqdm(range(0, len(questions), batch_size)):
  # min() keeps the last batch from running past the end of the list
  stop = min(start + batch_size, len(questions))
  batch = questions[start:stop]
  # each ID is the question's position in `questions`, as a string
  ids = [str(position) for position in range(start, stop)]
  # the original text travels with the vector as metadata
  metadatas = [{"text": text} for text in batch]
  # embed the whole batch in one call
  embeddings = model.encode(batch)
  # pair everything up as (id, embedding, metadata) triples and send them
  records = zip(ids, embeddings, metadatas)
  index.upsert(vectors=records)

100%|██████████| 50/50 [00:31<00:00,  1.60it/s]


In [27]:
# Check the index after the upsert: the reported vector count should match
# the number of questions sent.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

### Run your Query

In [36]:
def run_query(query, top_k=10):
  """Print the indexed questions that best match `query`.

  The query text is embedded with the notebook-level `model`, the
  notebook-level `index` is searched with that vector, and one line per
  match is printed as `<score>: <text>`, with the score passed through
  `round(..., 2)`.

  Args:
    query: Text to search for.
    top_k: Number of matches to request from the index. Defaults to 10,
      the value that was previously hard-coded.
  """
  embedding = model.encode(query).tolist()
  results = index.query(top_k=top_k, vector=embedding, include_metadata=True, include_values=False)
  for result in results["matches"]:
    # Single quotes inside the replacement fields: reusing the outer double
    # quote within an f-string is a SyntaxError before Python 3.12.
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

In [37]:
# Search the index with a question about the world's most populous city.
run_query("Which city has the highest population in the world?")

0.7: What is the most isolated city in the world, with over a million metro area inhabitants?
0.69: What is the most beautiful city in the world?
0.64: What country has the fastest growing population and why?
0.61: How's the world's population determined?
0.6: What percentage of the world's population lives in developed countries?
0.59: Which are the top 10 largest cities of India by area?
0.56: Which is the best city in India?
0.54: What are the largest slums in the world?
0.51: What do you think are the top 3 countries to live in?
0.5: Which is the largest state in India?


In [38]:
# A second search on an unrelated topic.
query = "How do I make chocolate cake?"
run_query(query)

0.77: How do I make a cake from scratch?
0.58: What's a good recipe for cake featuring Ciroc?
0.52: How do you make shepherd's pie?
0.46: How can one make the Mint Mojito coffee at home similar to the one at Phillz?
0.45: Where can I get very nice and original flavor cupcakes in Gold Coast?
0.45: How do you make love?
0.45: How do you make scrambled eggs without milk?
0.45: Is chocolate milk healthy?
0.44: Where can I buy best quality customized cupcakes in Gold Coast?
0.42: How do I make floral foam?
