In [1]:
from tqdm import tqdm
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import os
import sys

# Treat the parent of the current working directory as the project root and
# append it to sys.path (once), presumably so the project-level `utils`
# import that follows can be resolved.
project_root = os.path.dirname(os.path.abspath(os.curdir))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils import set_api_key

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the pretrained sentence-transformers checkpoint used for embeddings.
model_name = "distilbert-base-nli-stsb-mean-tokens"

# Instantiate the encoder; every later cell embeds text through `model`.
model = SentenceTransformer(model_name_or_path=model_name)

In [7]:
# Resolve the Pinecone API key through the project's set_api_key helper
# (the recorded run reports that it was found in a .env file).
pinecone_key = set_api_key("PINECONE_API_KEY")

# NOTE(review): ssl_verify=False turns off TLS certificate verification for
# every request made with this client, including those carrying the API key.
# Confirm this is really required in this environment before reusing the cell.
pc = Pinecone(
    api_key=pinecone_key,
    ssl_verify=False,
)

API key found in .env file for PINECONE_API_KEY


In [8]:
# List the indexes visible to this client; the recorded run returned an empty
# list, i.e. no index existed yet at this point.
pc.list_indexes()



[]

In [11]:
from pinecone import ServerlessSpec

# Name of the demo index; kept in a variable so the existence check, the
# creation call and the final description all refer to the same index.
index_name = 'vector-demo'

# Creating an index whose name is already taken fails, which would break a
# fresh "Restart Kernel -> Run All" once the index exists. Only create it when
# it is missing so the cell can be re-run safely.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        metric='euclidean',
        # Must equal the length of the vectors stored in this index later on.
        dimension=768,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Display the index description whether it was just created or already existed.
pc.describe_index(index_name)



{
    "name": "vector-demo",
    "metric": "euclidean",
    "host": "vector-demo-wosks6d.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [12]:
# Demo corpus: ten short sentences about vector databases. Each record pairs a
# sequential id ("vector1" .. "vector10") with its sentence under "text".
data = [
    {"id": f"vector{position}", "text": sentence_text}
    for position, sentence_text in enumerate(
        (
            "I love using vector databases",
            "Vector databases are great for storing and retrieving vectors",
            "Using vector databases makes my life easier",
            "Vector databases are efficient for storing vectors",
            "I enjoy working with vector databases",
            "Vector databases are useful for many applications",
            "I find vector databases very helpful",
            "Vector databases can handle large amounts of data",
            "I think vector databases are the future of data storage",
            "Using vector databases has improved my workflow",
        ),
        start=1,
    )
]

The next cell iterates over each entry in the `data` list defined above, encodes the entry's text into a vector with the sentence-transformer model loaded earlier (`model`), and stores that vector together with the entry's id.

In [13]:
# Build one {'id', 'values'} record per sentence: the id is carried over from
# `data`, and the embedding returned by model.encode is converted with
# .tolist() (presumably so the values are plain Python numbers for upload).
vector_data = [
    {'id': record['id'], 'values': model.encode(record['text']).tolist()}
    for record in data
]

In [None]:
# Show the length of the first embedding rather than dumping every value; the
# recorded output (768) is this length, which is also the index dimension.
len(vector_data[0]["values"])

768