In [None]:
!pip install datasets==3.6.0

In [None]:
from datasets import load_dataset


In [None]:
# Load a 50,000-record slice (records 240000-289999) of the train split.
dataset = load_dataset(
    path='quora-competitions/quora',
    split='train[240000:290000]',
)

In [None]:
# Display the first five records of the loaded slice as the cell output.
dataset[:5]

In [None]:
# Flatten every record's list of question texts into one flat list
# (duplicates included).
questions = [text for record in dataset['questions'] for text in record['text']]

# De-duplicate while keeping first-seen order. list(set(...)) would give a
# different ordering on every kernel restart (string hashing is randomized per
# process), which makes the slice indexed later -- and the IDs assigned to each
# question -- non-reproducible. dict.fromkeys keeps insertion order.
# NOTE: the de-duplicated list is bound to `question` (singular) because the
# upsert cell further down reads that name.
question = list(dict.fromkeys(questions))

print('\n'.join(questions[:10]))
print('-' * 50)
# This count still includes duplicates; the unique count is reported separately.
print(f'Number of questions: {len(questions)}')
print(f'Number of unique questions: {len(question)}')


In [None]:
import torch

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU and
# tell the user.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
  print('sorry no cuda')

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Instantiate the 'all-MiniLM-L6-v2' sentence encoder on the selected device.
model = SentenceTransformer(
    model_name_or_path='all-MiniLM-L6-v2',
    device=device,
)

In [None]:
# Encode one sample query as a smoke test of the encoder.
query = 'Which city is the more populated in the world?'
xq = model.encode(sentences=query)
# Last expression of the cell: shows the shape of the resulting embedding.
xq.shape

In [None]:
!pip install pinecone

In [None]:
from pinecone import Pinecone, ServerlessSpec

# Take the Pinecone API key from the PINECONE_API_KEY environment variable so
# the secret does not live in the notebook. When the variable is unset or
# empty, fall back to the original placeholder so the cell behaves as before.
# NOTE: the name `pinecone` is kept because later cells call methods on it.
import os

pinecone = Pinecone(api_key=os.environ.get("PINECONE_API_KEY") or "YOUR_API_KEY")

In [None]:
import os
def create_dlai_index_name(index_name):
    """Build an index name by suffixing ``index_name`` with part of the OpenAI key.

    The key is read from Colab's ``userdata`` store when ``google.colab`` can
    be imported, and from the ``OPENAI_API_KEY`` environment variable
    otherwise, so the helper also works outside Colab.

    Args:
        index_name: Prefix of the generated name.

    Returns:
        ``index_name``, a hyphen, then the last 36 characters of the key,
        lower-cased and with underscores replaced by hyphens.

    Raises:
        ValueError: If the key lookup yields nothing (``None`` or empty).
    """
    try:
        from google.colab import userdata
    except ImportError:
        # Not running in Colab: use the process environment instead.
        openai_key = os.getenv("OPENAI_API_KEY")
    else:
        openai_key = userdata.get("OPENAI_API_KEY")

    if not openai_key:
        # Fail with a clear message instead of a TypeError on None[-36:].
        raise ValueError("OPENAI_API_KEY is not set; cannot derive an index name")

    suffix = openai_key[-36:].lower().replace("_", "-")
    return f'{index_name}-{suffix}'


In [None]:
INDEX_NAME = "developer-quickstart-py"

# If an index with this name already exists in the Pinecone project, delete it.
existing_index_names = [idx.name for idx in pinecone.list_indexes()]
if INDEX_NAME in existing_index_names:
    pinecone.delete_index(INDEX_NAME)

print(INDEX_NAME)

In [None]:
# Create a serverless cosine-metric index whose dimension matches the
# encoder's embedding size, then open a handle to it.
embedding_dim = model.get_sentence_embedding_dimension()
pinecone.create_index(
    name=INDEX_NAME,
    dimension=embedding_dim,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1'),
)

index = pinecone.Index(INDEX_NAME)
print(index)

In [None]:
from tqdm.auto import tqdm

In [None]:
import json

batch_size = 200
vector_limit = 10000

# Only the first `vector_limit` entries of the de-duplicated list are indexed.
questions = question[:vector_limit]

total = len(questions)
for start in tqdm(range(0, total, batch_size)):
    # Clamp the final batch to the end of the list.
    stop = min(start + batch_size, total)
    batch_texts = questions[start:stop]
    # IDs are the positions of the texts within `questions`, as strings.
    batch_ids = list(map(str, range(start, stop)))
    # Keep the original text as metadata so query results can display it.
    batch_meta = [{'text': text} for text in batch_texts]
    # Embed the whole batch in one call.
    batch_vectors = model.encode(batch_texts)
    # Upsert (id, vector, metadata) triples to Pinecone.
    index.upsert(vectors=zip(batch_ids, batch_vectors, batch_meta))

In [None]:
# Display the statistics Pinecone reports for the index as the cell output.
index.describe_index_stats()

In [None]:
# Small helper so queries can be repeated in later cells.
def run_query(query, top_k=10):
  """Embed ``query``, search the Pinecone index and print the matches.

  Relies on the notebook-level ``model`` (encoder) and ``index`` (Pinecone
  index handle).

  Args:
    query: Text to search for.
    top_k: Number of matches to request from the index. Defaults to 10.

  Returns:
    None. One line is printed per match: the score rounded to two decimals,
    then the stored ``text`` metadata.
  """
  # The query vector is passed to Pinecone as a plain list.
  embedding = model.encode(query).tolist()
  results = index.query(top_k=top_k, vector=embedding, include_metadata=True, include_values=False)
  for match in results['matches']:
    print(f"{round(match['score'], 2)}: {match['metadata']['text']}")

In [None]:
# Example search: prints the matches returned for a population question.
run_query('which city has the highest population in the world?')

In [None]:
# Example search on a different topic: prints the matches for a baking question.
run_query('how do i make chocolate cake?')