<h1>Load dataset</h1>

In [None]:
pip install datasets==3.6.0

In [14]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
dataset = load_dataset("quora-competitions/quora", split="train[240000:290000]", trust_remote_code=True)

In [16]:
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [17]:
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
question = list(set(questions))
print('\n'.join(questions[:10]))
print('-' * 50)
print(f'Number of questions: {len(questions)}')

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


<h1>Check cuda and Setup the model</h1>
Note: "Checking cuda" refers to checking if you have access to GPUs (faster compute).



In [18]:
import torch

we use sentencetransformer to transform sentence into embeddings

We are going to use all-MiniLM-L6-v2 model from Huggingface that maps sentences to a 384 dimensional dense vector space 


In [19]:
from sentence_transformers import SentenceTransformer

In [20]:
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != 'cuda':
    print('Sorry no cuda')
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

Sorry no cuda


Create a simple question and turn that into embedding


In [21]:
query = "What is the capital of France?"
xq = model.encode(query)
xq.shape

(384,)

<h1>Setup Pinecone</h1>

In [8]:
from pinecone import Pinecone, ServerlessSpec

In [10]:
from dotenv import load_dotenv
import os

In [11]:
load_dotenv()  # loads variables from .env into environment

api_key = os.getenv("PINECONE_API_KEY")

# Now create Pinecone client using this key
pinecone = Pinecone(api_key=api_key)


In [12]:
INDEX_NAME =  "developer-quickstart-py"
#if the index name we just created in the pinecone list of indexes it will delete it
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)

developer-quickstart-py


In [22]:
#create our index
pinecone.create_index(
    name=INDEX_NAME,
    dimension=model.get_sentence_embedding_dimension(),
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1')
)
index = pinecone.Index(INDEX_NAME) #pointer to our pinecone index
print(index)

<pinecone.db_data.index.Index object at 0x00000214CAFEFE90>
