In [None]:
import pickle
import time
import pinecone
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd
import itertools
from pymongo.mongo_client import MongoClient
import urllib
from elasticsearch import Elasticsearch, RequestsHttpConnection
from elasticsearch.helpers import bulk
import os
from dotenv import load_dotenv
load_dotenv()

  from tqdm.autonotebook import tqdm


## Data Loading

In [2]:
with open('sephora-index.pkl', 'rb') as f:
    dataset = pickle.load(f)

In [3]:
print(f"The dataset has {len(dataset[0]['values'])} dimensions")

The dataset has 384 dimensions


In [4]:
dataset[0]

{'id': '0',
 'values': [0.0030438287649303675,
  0.004472009371966124,
  0.07097481191158295,
  0.08143029361963272,
  0.06849981844425201,
  -0.050113920122385025,
  0.053455207496881485,
  0.05649996176362038,
  -0.059897530823946,
  -0.07594893872737885,
  -0.07218535244464874,
  -0.05941902473568916,
  -0.04102000966668129,
  0.011415235698223114,
  -0.032470304518938065,
  -0.001875639776699245,
  0.00520893931388855,
  0.0221845805644989,
  0.013776568695902824,
  -0.062185246497392654,
  -2.2982243535807356e-05,
  0.0024249497801065445,
  0.01610706001520157,
  -0.026995105668902397,
  0.0351138599216938,
  -0.04917909950017929,
  0.04377542808651924,
  -0.044541969895362854,
  -0.00423040334135294,
  -0.0278280321508646,
  -0.014061996713280678,
  0.03423372656106949,
  -0.11560318619012833,
  0.00688050827011466,
  -0.03010465018451214,
  -0.08874853700399399,
  -0.07482282817363739,
  0.04685718193650246,
  -0.025644252076745033,
  0.03786584362387657,
  0.07492639869451523,


In [5]:
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,id,values,metadata
0,0,"[0.0030438287649303675, 0.004472009371966124, ...","{'rating': 5.0, 'skin_tone': 'light', 'eye_col..."
1,1,"[-0.03229909762740135, 0.03269119933247566, 0....","{'rating': 4.0, 'skin_tone': 'lightMedium', 'e..."
2,2,"[-0.10494845360517502, -0.0864892229437828, 0....","{'rating': 5.0, 'skin_tone': 'Unknown', 'eye_c..."
3,3,"[-0.0588916577398777, 0.05740262195467949, 0.0...","{'rating': 5.0, 'skin_tone': 'Unknown', 'eye_c..."
4,4,"[-0.07242435216903687, 0.05376826226711273, 0....","{'rating': 5.0, 'skin_tone': 'Unknown', 'eye_c..."


In [6]:
#need to check that id is str and embeddings have floats
print(type(df['id'][0]))
print(type(df['values'][0][0]))

<class 'str'>
<class 'float'>


## Pinecone set up

In [None]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
#using the starter here that's limited to 100k
PINECONE_ENV = os.getenv("PINECONE_ENV")

#Pinecone has documentation for a semantic search example
#That's here https://colab.research.google.com/github/pinecone-io/examples/blob/master/docs/semantic-search.ipynb#scrollTo=YUd1VGg6i108
#I'm pulling the initialization code from there
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

In [8]:
index_name = "sephora-data"

#going to delete index first
start_time = time.time()
pinecone.delete_index(index_name)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Delete operation took {elapsed_time} seconds")
# only create index if it doesn't exist
if index_name not in pinecone.list_indexes():
    # get the dimension of the embeddings from the first item in the dataset
    dimension = len(dataset[0]['values'])
    pinecone.create_index(
        name=index_name,
        dimension=dimension,
        metric='cosine'
    ) 
    # wait a moment for the index to be fully initialized
    time.sleep(1)

# now connect to the index
index = pinecone.GRPCIndex(index_name)

Delete operation took 15.468894004821777 seconds


In [9]:
def chunks(iterable, batch_size=100):
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

In [10]:
#going to time this specific operation for upserting
start_time = time.time()
for batch in chunks([(str(t), v, m) for t, v, m in zip(df['id'], df['values'], df['metadata'])]):
    index.upsert(vectors=batch)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Create operation took {elapsed_time} seconds")

Create operation took 207.00600266456604 seconds


In [11]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.874,
 'namespaces': {'': {'vector_count': 87400}},
 'total_vector_count': 87400}

In [12]:
#need to use the model we encoded the dataset with to encode our queries
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=device)

query = "What’s the best shampoo to buy for a brunette with a sensitive and dry scalp?"

#encoding the semantic question to a vector -- to compare to the embedded vectors in the dataset
xq = model.encode(query).tolist()

#this gets us the query
xc = index.query(xq, top_k=5, include_metadata=True)
xc


for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['product_name']} '\n' {result['metadata']['review_full']}")

0.56: Hyaluronic Acid 2% + B5 Hydrating Serum '
' Perfect cheap HA. Pro tip: put some drops in the ends of your hair before your hair dries. Thank me later
0.54: Pure Argan Milk Intensive Hydrating Treatment '
' Silky I’ve been in love with Josie’s products from the first time I’ve used them and this is another great product in my opinion. I wish it was a little less pricey as you are to use the Argan Oil after applying the Milk so those products together can get expensive. I started with a sample and my face glowed and I have now purchased the full size.
0.53: Ultra Repair Cream Intense Hydration '
' Best on the market This is the best!!! Soothing, moisturizing, even on sensitive skin. A must buy..
0.52: 100 percent Pure Argan Oil '
' Love this for my hair I use a few drops of this oil on my hair every morning prior to drying and it has helped keep it shiny, smooth and soft. I am a brunette who colors my hair about every 5-6 weeks and my hair was getting frizzy and dry. I tried morocc

## MongoDB

In [None]:
#we should be able to use the same dataset format here
#similarly pulling a lot of set up code from docs
#https://www.mongodb.com/developer/products/atlas/llm-accuracy-vector-search-unstructured-metadata/

username = os.getenv("MONGO_USERNAME")
password = os.getenv("MONGO_PASSWORD")
cluster_uri = os.getenv("MONGO_URI")

uri = "mongodb+srv://" + username + ":" + urllib.parse.quote(password) + cluster_uri

# Create a new client and connect to the server
client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [14]:
db_name = "benchmark"
collection_name = "sephora-data"

# delete all first
start_time = time.time()
client[db_name][collection_name].delete_many({})
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Delete operation took {elapsed_time} seconds")

Delete operation took 23.939353466033936 seconds


In [15]:
#it's possible to exhaust the server uploading in one batch here
batch_size = 5000
collection = client[db_name][collection_name]

start_time = time.time()
for i in range(0, len(dataset), batch_size):
    batch = dataset[i:i+batch_size]
    collection.insert_many(batch)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Create operation took {elapsed_time} seconds")

Create operation took 302.73352432250977 seconds


In [16]:
#using the same query 
#xq is the encoded query

collection = client[db_name][collection_name]
results = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": xq,
    "path": "values",
    "numCandidates": 100,
    "limit": 5,
    "index": "sephora-index",
      }}
])

for result in results:
    print(f"{result['metadata']['product_name']} '\n' {result['metadata']['review_full']}")

Pure Argan Milk Intensive Hydrating Treatment '
' Silky I’ve been in love with Josie’s products from the first time I’ve used them and this is another great product in my opinion. I wish it was a little less pricey as you are to use the Argan Oil after applying the Milk so those products together can get expensive. I started with a sample and my face glowed and I have now purchased the full size.
Ultra Repair Cream Intense Hydration '
' Best on the market This is the best!!! Soothing, moisturizing, even on sensitive skin. A must buy..
100 percent Pure Argan Oil '
' Love this for my hair I use a few drops of this oil on my hair every morning prior to drying and it has helped keep it shiny, smooth and soft. I am a brunette who colors my hair about every 5-6 weeks and my hair was getting frizzy and dry. I tried moroccan oil but didn’t care for the smell -This product allowed me to pull out the primary ingredient and its really made a noticeable change in my hair over the last three months

## Elastic

In [None]:
#elastic set up docs are here
#https://www.elastic.co/guide/en/elasticsearch/client/python-api/8.11/connecting.html

ELASTIC_API_KEY = os.getenv("ELASTIC_API_KEY")
ELASTIC_URL= os.getenv("ELASTIC_URL")


elastic_client = Elasticsearch(ELASTIC_URL, 
                               api_key=ELASTIC_API_KEY,
                               connection_class=RequestsHttpConnection)

elastic_client.info()

{'name': 'instance-0000000001',
 'cluster_name': 'd02bf6bd39e04de8a350db23d541862e',
 'cluster_uuid': 'sTnBfndyQPqgCwTg_PhcLQ',
 'version': {'number': '8.11.1',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '6f9ff581fbcde658e6f69d6ce03050f060d1fd0c',
  'build_date': '2023-11-11T10:05:59.421038163Z',
  'build_snapshot': False,
  'lucene_version': '9.8.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [21]:
def to_elastic():
    for item in dataset:
        yield {
            "_index": "sephora-data",
            "_id": item['id'],
            "_source": {
                "values": item['values'],
                "metadata": item['metadata']
            }
        }

elastic_data = [data for data in to_elastic()]
#deleting existing index first
start_time = time.time()
elastic_client.indices.delete(index='sephora-data', ignore=[400, 404])
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Delete operation took {elapsed_time} seconds")

Delete operation took 0.034224748611450195 seconds


In [22]:
start_time = time.time()
bulk(elastic_client, elastic_data)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Create operation took {elapsed_time} seconds")

Create operation took 597.3021042346954 seconds


In [23]:
#using xq again
body = {
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'values') + 1.0",
                "params": {"query_vector": xq}
            }
        }
    }
}

res = elastic_client.search(index="sephora-data", body=body)

for hit in res['hits']['hits']:
    print(f"Score: {hit['_score']}, ID: {hit['_id']}, Metadata: {hit['_source']['metadata']}")

Score: 1.5581014, ID: 60444, Metadata: {'rating': 4.0, 'skin_tone': 'light', 'eye_color': 'hazel', 'skin_type': 'combination', 'hair_color': 'black', 'product_name': 'Hyaluronic Acid 2% + B5 Hydrating Serum', 'brand_name': 'The Ordinary', 'price_usd': 15.7, 'review_full': 'Perfect cheap HA. Pro tip: put some drops in the ends of your hair before your hair dries. Thank me later'}
Score: 1.5425476, ID: 29702, Metadata: {'rating': 5.0, 'skin_tone': 'Unknown', 'eye_color': 'Unknown', 'skin_type': 'combination', 'hair_color': 'Unknown', 'product_name': 'Pure Argan Milk Intensive Hydrating Treatment', 'brand_name': 'Josie Maran', 'price_usd': 56.0, 'review_full': 'Silky I’ve been in love with Josie’s products from the first time I’ve used them and this is another great product in my opinion. I wish it was a little less pricey as you are to use the Argan Oil after applying the Milk so those products together can get expensive. I started with a sample and my face glowed and I have now purchase