In [1]:
pip install -r requirements.txt

Collecting sentence-transformers==2.2.2 (from -r requirements.txt (line 1))
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/86.0 kB[0m [31m998.1 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m950.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pinecone-client==3.0.0dev4 (from -r requirements.txt (line 2))
  Downloading pinecone_client-3.0.0.dev4-py3-none-any.whl.metadata (8.1 kB)
Collecting pinecone-datasets==0.5.0rc11 (from -r requirements.txt (line 3))
  Downloading pinecone_datasets-0.5.0rc11-py3-none-any.whl.metadata 

In [1]:
import warnings
warnings.filterwarnings('ignore')
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec


import os
import time
import torch
from tqdm.auto import tqdm
import pandas as pd

## Load the dataset

In [2]:
# Read the shoe catalogue from the CSV file that sits next to this notebook.
df = pd.read_csv('shoes-clear.csv')
# Disabled: substitute a placeholder picture for rows without an image URL.
# df['imageURLs'].fillna('https://salonlfc.com/wp-content/uploads/2018/01/image-not-found-1-scaled-1150x647.png', inplace=True)

# print(df[['id','brand','colors','features','imageURLs']])
selected_columns = df[['id','brand','colors','features','imageURLs']]

# One free-text description per row ("<brand> <colors> <features>").
# The previous loop used list.extend(text), which adds every *character* of
# the string as a separate element; each row must contribute exactly one item.
shoes = [
    f"{brand} {colors} {features}"
    for brand, colors, features in zip(
        selected_columns['brand'],
        selected_columns['colors'],
        selected_columns['features'],
    )
]

# Sentence-embedding model used to vectorise text; the weights are downloaded
# on first use (see the progress bars in this cell's output).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Example of embedding a query with the same model:
# query= "Which shoe is best for running ? "
# xq = embedding_model.encode(query)
# xq.shape

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [3]:
# Connect to Pinecone. The API key is read from the environment so the secret
# never lives in the notebook or in version control.
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    raise RuntimeError(
        'PINECONE_API_KEY is not set; export it before running this cell.'
    )
pinecone = Pinecone(api_key=api_key)
INDEX_NAME = 'shoe'

# Start from a clean slate: drop an index of the same name left by an earlier run.
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    print(f'Deleting index {INDEX_NAME}...')
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)
print('Creating Index')
# The index dimension is taken from the embedding model so the two always agree.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=embedding_model.get_sentence_embedding_dimension(),
    metric='cosine',
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ),
)

index = pinecone.Index(INDEX_NAME)
batch_size = 10
vector_limit = 300
# Only the first `vector_limit` rows are indexed. Note that this rebinds `df`,
# so every cell executed after this one sees the truncated frame.
df = df.iloc[:vector_limit]

for i in tqdm(range(0, len(df), batch_size)):
    # Slice the current batch once; the last batch may be shorter.
    i_end = min(i + batch_size, len(df))
    batch = df.iloc[i:i_end]

    # Vector IDs are the DataFrame index labels, as strings.
    ids = [str(x) for x in batch.index]

    names = batch['name'].tolist()
    images = batch['imageURLs'].tolist()
    prices = batch['prices.amountMax'].tolist()

    # Text that gets embedded: name, image URL and max price joined by spaces.
    descriptions = [f"{name} {image} {price}" for name, image, price in zip(names, images, prices)]

    # Create embeddings
    xc = embedding_model.encode(descriptions)

    # Metadata stored alongside each vector.
    metadata = [{'name': name,'image':image,'price':price} for name,image,price in zip(names,images,prices)]

    # (id, vector, metadata) tuples, materialised so upsert receives a real list.
    records = list(zip(ids, xc, metadata))
    # Upsert to Pinecone
    index.upsert(vectors=records)
print('Done adding shoe embeddings to our Vector DB, you may now run the query')

Deleting index shoe...
shoe
Creating Index


  0%|          | 0/30 [00:00<?, ?it/s]

Done adding shoe embeddings to our Vector DB, you may now run the quer
