In [None]:
from pathlib import Path
import os
import pandas as pd
# it can be ServerlessSpec or PodSpec
from pinecone import Pinecone,PodSpec
from dotenv import load_dotenv, find_dotenv
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

## Set Up Variables

In [None]:
# Load settings from the nearest .env file (if any) into the process environment.
load_dotenv(find_dotenv())
# API key for the Pinecone client.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Path to the semicolon-separated CSV that holds the items to index.
filepath_csv = os.getenv("FILEPATH")

# Fail fast with a clear message: os.getenv returns None for unset variables,
# which would otherwise only surface later as a less obvious error.
_missing_settings = [
    name
    for name, value in (("PINECONE_API_KEY", PINECONE_API_KEY), ("FILEPATH", filepath_csv))
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )


## Prepare the data to be upserted

In [None]:
# Read the semicolon-separated CSV into a DataFrame.
w_df = pd.read_csv(filepath_csv, sep=";")
# Texts to embed: one entry per row, taken from the "item" column.
text_list = w_df.loc[:, "item"].tolist()
# Metadata stored next to each vector: one dict per row with the three location columns.
metadata_list = w_df.loc[:, ["estantería", "repisa", "caja"]].to_dict(orient="records")

## Set Up the Sentence Transformer Model

In [None]:
# Multilingual paraphrase model, chosen because it works with Spanish text;
# inference is pinned to the CPU.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    device="cpu",
)

## Set Up Pinecone

In [None]:
# Connect to Pinecone with the API key read from the environment.
pinecone = Pinecone(api_key=PINECONE_API_KEY)
index_name = "david-warehouse"

# Start from a clean slate: if an index with this name already exists, delete it
# so that it is recreated empty below.
existing_index_names = {existing.name for existing in pinecone.list_indexes()}
if index_name in existing_index_names:
    pinecone.delete_index(index_name)

# Serverless alternative (requires importing ServerlessSpec from pinecone):
#   spec=ServerlessSpec(cloud='aws', region='us-west-2')

# Pod-based index on the free starter environment. The dimension is taken from
# the embedding model so the index always matches the vectors it will receive.
embedding_dim = model.get_sentence_embedding_dimension()
pod_spec = PodSpec(environment="gcp-starter")
pinecone.create_index(
    name=index_name,
    dimension=embedding_dim,
    metric="cosine",
    spec=pod_spec,
)

# Handle used by the following cells for upserts and queries.
index = pinecone.Index(index_name)


In [None]:
# Smoke test: embed only the first item before running the full upsert.
first_item = text_list[0]
dense_vec = model.encode([first_item])


In [None]:
# Upsert data: embed the items in small batches and send each batch to Pinecone.
batch_size = 10
n_rows = len(w_df)  # loop-invariant, computed once
for i in tqdm(range(0, n_rows, batch_size)):
    # find end of batch (the last batch may be shorter)
    i_end = min(i + batch_size, n_rows)
    # vector IDs are the row positions as strings; run_query converts them
    # back to int to look the text up in text_list
    ids = [str(x) for x in range(i, i_end)]
    # texts and metadata for this batch
    entries = text_list[i:i_end]
    metadata = metadata_list[i:i_end]
    # create embeddings
    encodings = model.encode(entries)
    # Build a concrete list of (id, values, metadata) tuples. A bare zip() is a
    # one-shot iterator, and the embeddings are converted to plain float lists
    # so the payload does not depend on numpy types.
    records = [
        (vec_id, encoding.tolist(), meta)
        for vec_id, encoding, meta in zip(ids, encodings, metadata)
    ]
    # upsert to Pinecone
    index.upsert(vectors=records)

In [None]:
# Display the index statistics as a check after the upsert
# (presumably the reported vector count should equal len(w_df) — verify).
index.describe_index_stats()

In [None]:
def run_query(query, top_k=10):
    """Search the Pinecone index for the items most similar to ``query``.

    The query text is embedded with the notebook's ``model`` and sent to
    ``index``. One line is printed per match with its rounded score, its
    location metadata (estantería, repisa, caja) and the original item text,
    which is looked up in ``text_list`` using the match ID as a position.

    Args:
        query: Text to search for.
        top_k: Maximum number of matches to request (defaults to 10).

    Returns:
        The raw response returned by ``index.query``.
    """
    embedding = model.encode(query).tolist()
    results = index.query(
        top_k=top_k,
        vector=embedding,
        include_metadata=True,
        include_values=False,
    )
    for result in results['matches']:
        estanteria = result['metadata']['estantería']
        repisa = result['metadata']['repisa']
        caja = result['metadata']['caja']
        # IDs were written as stringified row positions, so they index text_list.
        text_idx = int(result["id"])
        print(f"{round(result['score'], 2)}: Estanteria {estanteria}, repisa {repisa}, caja {caja} - {text_list[text_idx]}")
    return results


In [None]:
# Example search for "mochila" (Spanish for "backpack"); the matches are printed
# by run_query and the raw response is kept in my_result.
my_result = run_query("mochila")