In [None]:
import time
import pinecone
from sentence_transformers import SentenceTransformer
import torch
from pymongo.mongo_client import MongoClient
import urllib
from elasticsearch import Elasticsearch, RequestsHttpConnection
import os
from dotenv import load_dotenv
load_dotenv()

  from tqdm.autonotebook import tqdm


### Establish db connections

In [None]:
#Pinecone
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)
index_name = "sephora-data"
index = pinecone.GRPCIndex(index_name)


#MongoDB
username = os.getenv("MONGO_USERNAME")
password = os.getenv("MONGO_PASSWORD")
cluster_uri = os.getenv("MONGO_URI")
uri = "mongodb+srv://" + username + ":" + urllib.parse.quote(password) + cluster_uri
# Create a new client and connect to the server
mongo_client = MongoClient(uri)
db_name = "benchmark"
collection_name = "sephora-data"
collection = mongo_client[db_name][collection_name]


#Elastic
ELASTIC_API_KEY = os.getenv("ELASTIC_API_KEY")
ELASTIC_URL= os.getenv("ELASTIC_URL")
elastic_client = Elasticsearch(ELASTIC_URL, 
                               api_key=ELASTIC_API_KEY,
                               connection_class=RequestsHttpConnection)

### Establish model connection

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=device)

def embed_query(query: str) -> list:
    return model.encode(query).tolist()

## Questions to test

### Question 1: What’s the best shampoo to buy for a brunette with a sensitive and dry scalp?

For question 1 there will be two operations on each database  
- a natural language search (encoded as a vector)
- a more generic natural search with metadata filters. 

In [4]:
#pure natural language
q1 = "What's the best shampoo to buy for a brunette with a sensitive and dry scalp?"
f1 = "What's the best shampoo to buy with a sensitive and dry scalp?"
query_1 = embed_query(q1)
filtered_1 = embed_query(f1)

#### Pinecone q1

In [5]:
#pinecone 
start_time = time.time()
pine_q1 = index.query(query_1, top_k=5, include_metadata=True)
end_time = time.time()
print(f"Pinecone took {end_time - start_time} seconds on natural language query 1")

start_time = time.time()
pine_f1 = index.query(filtered_1, 
                    filter={
                        "hair_color" : "brown"},
                    top_k=5,
                    include_metadata=True
)
end_time = time.time()
print(f"Pinecone took {end_time - start_time} seconds on filtered query 1")

Pinecone took 1.8824293613433838 seconds on natural language query 1
Pinecone took 0.3021824359893799 seconds on filtered query 1


In [6]:
for result in pine_q1['matches']:
    print(f"{round(result['score'], 2)} '\n' \
            {result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

0.56 '
'             Hyaluronic Acid 2% + B5 Hydrating Serum '
'             black '
'             15.7 '
'             Perfect cheap HA. Pro tip: put some drops in the ends of your hair before your hair dries. Thank me later
0.54 '
'             Pure Argan Milk Intensive Hydrating Treatment '
'             Unknown '
'             56.0 '
'             Silky I’ve been in love with Josie’s products from the first time I’ve used them and this is another great product in my opinion. I wish it was a little less pricey as you are to use the Argan Oil after applying the Milk so those products together can get expensive. I started with a sample and my face glowed and I have now purchased the full size.
0.53 '
'             Ultra Repair Cream Intense Hydration '
'             Unknown '
'             38.0 '
'             Best on the market This is the best!!! Soothing, moisturizing, even on sensitive skin. A must buy..
0.52 '
'             100 percent Pure Argan Oil '
'             Unknown '
'  

In [7]:
for result in pine_f1['matches']:
    print(f"{round(result['score'], 2)} '\n' \
            {result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

0.55 '
'             Eye Revival Brightening Eye Cream '
'             brown '
'             55.0 '
'             I have heard so many great things about DryBar so I was so excited to try this shampoo. First best thing I noticed was that a little product goes a very long way. I only put a small amount for my hair and it built up so much suds. Perfect for on the go especially when you have a smaller size bottle.
0.55 '
'             Water Bank Blue Hyaluronic Cream Moisturizer '
'             brown '
'             40.0 '
'             favorite high end moisturizer especially for dry skin
0.54 '
'             100 percent Pure Argan Oil '
'             brown '
'             49.0 '
'             Good product I received this as a free gift from Influenster and super excited that I did! I struggle with dry scalp, so I put a little bit on the problem areas before bed and it helps out a bunch! The applicator is great as well and easy to use.
0.51 '
'             Lip Sleeping Mask Intense Hydra

#### Mongo q1

In [8]:
start_time = time.time()
mongo_q1 = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": query_1,
    "path": "values",
    "numCandidates": 100,
    "limit": 5,
    "index": "sephora-index",
      }}
])
end_time = time.time()
print(f"MongoDB took {end_time - start_time} seconds on natural language query 1")

start_time = time.time()
mongo_f1 = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": filtered_1,
    "path": "values",
    "numCandidates": 100,
    "limit": 5,
    "index": "sephora-index",
  }},
  {"$match": {"metadata.hair_color": "brown"}}
])
end_time = time.time()

print(f"MongoDB took {end_time - start_time} seconds on filtered query 1")

MongoDB took 0.2648637294769287 seconds on natural language query 1
MongoDB took 0.04560208320617676 seconds on filtered query 1


In [9]:
for result in mongo_q1:
    print(f"{result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

Pure Argan Milk Intensive Hydrating Treatment '
'             Unknown '
'             56.0 '
'             Silky I’ve been in love with Josie’s products from the first time I’ve used them and this is another great product in my opinion. I wish it was a little less pricey as you are to use the Argan Oil after applying the Milk so those products together can get expensive. I started with a sample and my face glowed and I have now purchased the full size.
Ultra Repair Cream Intense Hydration '
'             Unknown '
'             38.0 '
'             Best on the market This is the best!!! Soothing, moisturizing, even on sensitive skin. A must buy..
100 percent Pure Argan Oil '
'             Unknown '
'             49.0 '
'             Love this for my hair I use a few drops of this oil on my hair every morning prior to drying and it has helped keep it shiny, smooth and soft. I am a brunette who colors my hair about every 5-6 weeks and my hair was getting frizzy and dry. I tried moroccan 

In [10]:
for result in mongo_f1:
    print(f"{result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

Eye Revival Brightening Eye Cream '
'             brown '
'             55.0 '
'             I have heard so many great things about DryBar so I was so excited to try this shampoo. First best thing I noticed was that a little product goes a very long way. I only put a small amount for my hair and it built up so much suds. Perfect for on the go especially when you have a smaller size bottle.
Water Bank Blue Hyaluronic Cream Moisturizer '
'             brown '
'             40.0 '
'             favorite high end moisturizer especially for dry skin
100 percent Pure Argan Oil '
'             brown '
'             49.0 '
'             Good product I received this as a free gift from Influenster and super excited that I did! I struggle with dry scalp, so I put a little bit on the problem areas before bed and it helps out a bunch! The applicator is great as well and easy to use.


#### Elastic q1

In [11]:
body_q1 = {
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'values') + 1.0",
                "params": {"query_vector": query_1}
            }
        }
    },
    "size": 5
}

body_f1 = {
    "query": {
        "bool": {
            "must": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'values') + 1.0",
                        "params": {"query_vector": filtered_1}
                    }
                }
            },
            "filter": [
                {"term": {"metadata.hair_color.keyword": "brown"}}
            ]
        }
    },
    "size": 5
}

start_time = time.time()
elsatic_q1 = elastic_client.search(index="sephora-data", body=body_q1)
end_time = time.time()
print(f"Elastic took {end_time - start_time} seconds on natural language query 1")

start_time = time.time()
elsatic_f1 = elastic_client.search(index="sephora-data", body=body_f1)
end_time = time.time()
print(f"Elastic took {end_time - start_time} seconds on filtered query 1")



Elastic took 0.460951566696167 seconds on natural language query 1
Elastic took 0.06017112731933594 seconds on filtered query 1


In [12]:
for hit in elsatic_q1['hits']['hits']:
    print(f"Score: {hit['_score']}, '\n' \
           product name: {hit['_source']['metadata']['product_name']} '\n' \
           hair color: {hit['_source']['metadata']['hair_color']} '\n' \
           review: {hit['_source']['metadata']['review_full']}")

Score: 1.5581014, '
'            product name: Hyaluronic Acid 2% + B5 Hydrating Serum '
'            hair color: black '
'            review: Perfect cheap HA. Pro tip: put some drops in the ends of your hair before your hair dries. Thank me later
Score: 1.5425476, '
'            product name: Pure Argan Milk Intensive Hydrating Treatment '
'            hair color: Unknown '
'            review: Silky I’ve been in love with Josie’s products from the first time I’ve used them and this is another great product in my opinion. I wish it was a little less pricey as you are to use the Argan Oil after applying the Milk so those products together can get expensive. I started with a sample and my face glowed and I have now purchased the full size.
Score: 1.5285108, '
'            product name: Ultra Repair Cream Intense Hydration '
'            hair color: Unknown '
'            review: Best on the market This is the best!!! Soothing, moisturizing, even on sensitive skin. A must buy..
Score: 1

In [13]:
for hit in elsatic_f1['hits']['hits']:
    print(f"Score: {hit['_score']}, '\n' \
           product name: {hit['_source']['metadata']['product_name']} '\n' \
           review: {hit['_source']['metadata']['review_full']}")

Score: 1.5547749, '
'            product name: Eye Revival Brightening Eye Cream '
'            review: I have heard so many great things about DryBar so I was so excited to try this shampoo. First best thing I noticed was that a little product goes a very long way. I only put a small amount for my hair and it built up so much suds. Perfect for on the go especially when you have a smaller size bottle.
Score: 1.5529599, '
'            product name: Water Bank Blue Hyaluronic Cream Moisturizer '
'            review: favorite high end moisturizer especially for dry skin
Score: 1.5356183, '
'            product name: 100 percent Pure Argan Oil '
'            review: Good product I received this as a free gift from Influenster and super excited that I did! I struggle with dry scalp, so I put a little bit on the problem areas before bed and it helps out a bunch! The applicator is great as well and easy to use.
Score: 1.5105603, '
'            product name: Lip Sleeping Mask Intense Hydration

It's clear that these results include products other than shampoos -

- pinecone does not support regex matches.
- mongo db does but only after a vector search query, not before
- elastic should be able to support this 

In [14]:
elastic_f11 = {
    "query": {
        "bool": {
            "must": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'values') + 1.0",
                        "params": {"query_vector": filtered_1}
                    }
                }
            },
            "filter": [
                {"term": {"metadata.hair_color.keyword": "brown"}},
                {"match": {"metadata.review_full": "shampoo"}}
            ]
        }
    }
}


result = elastic_client.search(index="sephora-data", body=elastic_f11)

for hit in result['hits']['hits']:
    print(f"Score: {hit['_score']}, '\n' \
           product name: {hit['_source']['metadata']['product_name']} '\n' \
           review: {hit['_source']['metadata']['review_full']}")

Score: 1.5547749, '
'            product name: Eye Revival Brightening Eye Cream '
'            review: I have heard so many great things about DryBar so I was so excited to try this shampoo. First best thing I noticed was that a little product goes a very long way. I only put a small amount for my hair and it built up so much suds. Perfect for on the go especially when you have a smaller size bottle.
Score: 1.3699973, '
'            product name: Crème Radiance Gentle Cleansing Creamy-Foam Cleanser '
'            review: #1 Makeup brush shampoo Dont wash my face with it as it dries my face out, but it is the best makeup brush or beauty blender cleanser I have ever used.
Score: 1.3531227, '
'            product name: 100% Plant-Derived Hemi-Squalane '
'            review: I bought this because I had originally tried the retinol 1% in squalane and was so impressed with its moisturizing. Then I read a few reviews where people really liked it on hair as well. I have super fine curly hair 

### Question 2: What’s the worst makeup for hiding blemishes?

In [15]:
#question 2 we'll just look at natural language.
q2 = "What’s the worst makeup for hiding blemishes?"
query_2 = embed_query(q2)

#### Pinecone q2

In [16]:
start_time = time.time()
pine_q2 = index.query(query_2, top_k=5, include_metadata=True)
end_time = time.time()
print(f"Pinecone took {end_time - start_time} seconds on natural language query 2")

Pinecone took 0.08999848365783691 seconds on natural language query 2


In [17]:
for result in pine_q2['matches']:
    print(f"{round(result['score'], 2)} '\n' \
            {result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

0.7 '
'             Acne Control Clarifying Cream Cleanser '
'             Unknown '
'             39.0 '
'             Great This product is really convincing in removing blemishes.
0.68 '
'             Acne Solutions Clinical Clearing Gel '
'             Unknown '
'             32.0 '
'             Best Ever! This is my go to once I notice a blemish ready to start, or already formed! This does have a harsh sting at first, my recommendation is to just fan yourself for a few seconds! lol The outcome is worth a few seconds of agony!! :) If you’re determined to fight the blemish, and fast..what I do is apply it around 2 - 3 times a day. The next day the blemish is dramatically less noticeable! By the third/fourth day, it may still be there...but you’re the only one that notices it! Especially with a slight dab of concealer. This stuff is seriously fantastic! :)
0.66 '
'             Super Spot Remover Acne Treatment Gel '
'             black '
'             20.0 '
'             Awesome pr

#### Mongo q2

In [18]:
start_time = time.time()
mongo_q2 = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": query_2,
    "path": "values",
    "numCandidates": 100,
    "limit": 5,
    "index": "sephora-index",
      }}
])
end_time = time.time()
print(f"MongoDB took {end_time - start_time} seconds on natural language query 2")

MongoDB took 0.06300091743469238 seconds on natural language query 2


In [19]:
for result in mongo_q2:
    print(f"{result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

Acne Control Clarifying Cream Cleanser '
'             Unknown '
'             39.0 '
'             Great This product is really convincing in removing blemishes.
Acne Solutions Clinical Clearing Gel '
'             Unknown '
'             32.0 '
'             Best Ever! This is my go to once I notice a blemish ready to start, or already formed! This does have a harsh sting at first, my recommendation is to just fan yourself for a few seconds! lol The outcome is worth a few seconds of agony!! :) If you’re determined to fight the blemish, and fast..what I do is apply it around 2 - 3 times a day. The next day the blemish is dramatically less noticeable! By the third/fourth day, it may still be there...but you’re the only one that notices it! Especially with a slight dab of concealer. This stuff is seriously fantastic! :)
Super Spot Remover Acne Treatment Gel '
'             black '
'             20.0 '
'             Awesome product for spot blemishes!
Rapid Relief Acne Spot Treatment '
'

#### Elastic q2

In [20]:
body_q2 = {
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'values') + 1.0",
                "params": {"query_vector": query_2}
            }
        }
    },
    "size": 5
}

start_time = time.time()
elsatic_q2 = elastic_client.search(index="sephora-data", body=body_q2)
end_time = time.time()
print(f"Elastic took {end_time - start_time} seconds on natural language query 2")

Elastic took 0.07151508331298828 seconds on natural language query 2


In [21]:
for hit in elsatic_q2['hits']['hits']:
    print(f"Score: {hit['_score']}, '\n' \
           product name: {hit['_source']['metadata']['product_name']} '\n' \
           review: {hit['_source']['metadata']['review_full']}")

Score: 1.7048737, '
'            product name: Acne Control Clarifying Cream Cleanser '
'            review: Great This product is really convincing in removing blemishes.
Score: 1.679861, '
'            product name: Acne Solutions Clinical Clearing Gel '
'            review: Best Ever! This is my go to once I notice a blemish ready to start, or already formed! This does have a harsh sting at first, my recommendation is to just fan yourself for a few seconds! lol The outcome is worth a few seconds of agony!! :) If you’re determined to fight the blemish, and fast..what I do is apply it around 2 - 3 times a day. The next day the blemish is dramatically less noticeable! By the third/fourth day, it may still be there...but you’re the only one that notices it! Especially with a slight dab of concealer. This stuff is seriously fantastic! :)
Score: 1.6596596, '
'            product name: Super Spot Remover Acne Treatment Gel '
'            review: Awesome product for spot blemishes!
Score: 1

### Question 3: Find all reviews for Vitamin C serum that helps with dark spots.

In [22]:
q3 = "What vitamin C serum helps with dark spots?"
query_3 = embed_query(q3)

#### Pinecone q3

In [23]:
#to fetch all we'll set top_k to 10000 which is the max
start_time = time.time()
pine_q3 = index.query(query_3, top_k=10000)
end_time = time.time()
print(f"Pinecone took {end_time - start_time} seconds on natural language query 3")

Pinecone took 0.6796462535858154 seconds on natural language query 3


#### Mongo q3

In [24]:
start_time = time.time()
mongo_q3 = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": query_3,
    "path": "values",
    "numCandidates": 10000,
    "limit": 10000,
    "index": "sephora-index",
      }}
])
end_time = time.time()
print(f"MongoDB took {end_time - start_time} seconds on natural language query 3")

MongoDB took 0.4671671390533447 seconds on natural language query 3


#### Elastic q3

In [25]:
body_q3 = {
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'values') + 1.0",
                "params": {"query_vector": query_3}
            }
        }
    },
    "size": 10000
}

start_time = time.time()
elsatic_q3 = elastic_client.search(index="sephora-data", body=body_q3)
end_time = time.time()
print(f"Elastic took {end_time - start_time} seconds on natural language query 3")

Elastic took 4.800441741943359 seconds on natural language query 3


### Question 4: Given a positive review for a leave-in product for curly hair, what are the 5 most similar reviews?

In [26]:
#Let's find a similar review first then pass it in 
curly = embed_query("Best leave-in product curly hair")
mongo_curly = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": curly,
    "path": "values",
    "numCandidates": 100,
    "limit": 1,
    "index": "sephora-index",
      }}
])

for result in mongo_curly:
    print(f"{result['values']} '\n' \
            {result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")
  

[-0.11061471700668335, -0.09305941313505173, 0.06998290866613388, 0.04913369193673134, -0.07237361371517181, -0.010709412395954132, -0.030219286680221558, -0.016525983810424805, 0.013994907028973103, -0.09154359251260757, 0.0003219138307031244, 0.05123697966337204, 0.020507624372839928, -0.05676449090242386, -0.01618495024740696, 0.09382764995098114, 0.09126223623752594, 0.0865832269191742, -0.05141039937734604, 0.014926222153007984, -0.008923793211579323, -0.03260035812854767, 0.0016374229453504086, 0.03791555389761925, -0.01389245130121708, -0.001275827526114881, 0.06089259311556816, -0.026064421981573105, -0.03318677842617035, 0.008265053853392601, 0.009154515340924263, -0.03126682713627815, -0.02408178150653839, -0.07214127480983734, -0.041307780891656876, -0.005910374689847231, -0.060330357402563095, 0.03793245926499367, -0.014002078212797642, 0.03979220241308212, -0.014990572817623615, -0.08283542096614838, -0.06576273590326309, -0.02713456191122532, 0.058262817561626434, 0.00927

In [27]:
#have to run it again because the cursor is exhausted after printing once. 
mongo_curly = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": curly,
    "path": "values",
    "numCandidates": 100,
    "limit": 1,
    "index": "sephora-index",
      }}
])
#it's a list of embeddings within a list -- getting [0] ensures it's a single list
embedding_q4 = [result['values'] for result in mongo_curly][0]
embedding_q4

[-0.11061471700668335,
 -0.09305941313505173,
 0.06998290866613388,
 0.04913369193673134,
 -0.07237361371517181,
 -0.010709412395954132,
 -0.030219286680221558,
 -0.016525983810424805,
 0.013994907028973103,
 -0.09154359251260757,
 0.0003219138307031244,
 0.05123697966337204,
 0.020507624372839928,
 -0.05676449090242386,
 -0.01618495024740696,
 0.09382764995098114,
 0.09126223623752594,
 0.0865832269191742,
 -0.05141039937734604,
 0.014926222153007984,
 -0.008923793211579323,
 -0.03260035812854767,
 0.0016374229453504086,
 0.03791555389761925,
 -0.01389245130121708,
 -0.001275827526114881,
 0.06089259311556816,
 -0.026064421981573105,
 -0.03318677842617035,
 0.008265053853392601,
 0.009154515340924263,
 -0.03126682713627815,
 -0.02408178150653839,
 -0.07214127480983734,
 -0.041307780891656876,
 -0.005910374689847231,
 -0.060330357402563095,
 0.03793245926499367,
 -0.014002078212797642,
 0.03979220241308212,
 -0.014990572817623615,
 -0.08283542096614838,
 -0.06576273590326309,
 -0.02713

#### Pinecone q4

In [28]:
start_time = time.time()
pine_q4= index.query(embedding_q4, top_k=5, include_metadata=True)
end_time = time.time()
print(f"Pinecone took {end_time - start_time} seconds on natural language query 4")

Pinecone took 1.227785587310791 seconds on natural language query 4


In [29]:
for result in pine_q4['matches']:
    print(f"{round(result['score'], 2)} '\n' \
            {result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

1.0 '
'             100 percent Pure Argan Oil '
'             Unknown '
'             49.0 '
'             Curly Girl's Dream-come-true! I wish there were words for how much I love this! It’s so multipurpose and amazing!!!!I’m a user of the curly girl method (no shampoo, sulfates, etc) and this works AMAZINGLY with my hair routine! About once a month I shampoo my hair to remove build up (using Devacurl’s Low-poo) and follow it up with some deep conditioner and then apply this from my ears down. I have naturally curly, dry hair with some greys and this is AMAZING! My hair feels silky, springy and softer than ever! It’s a miracle in a bottle!On my skin is works miracles on my combination skin. My forehead is always intensely dry in winter and this has been a life saver! On my dry elbows-heaven!!I will keep repurchasing and recommending this product!
0.66 '
'             100 percent Pure Argan Oil '
'             Unknown '
'             49.0 '
'             Beyond Fabulous! This is the b

#### Mongo q4

In [30]:
start_time = time.time()
mongo_q4 = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": embedding_q4,
    "path": "values",
    "numCandidates": 100,
    "limit": 5,
    "index": "sephora-index",
      }}
])
end_time = time.time()
print(f"MongoDB took {end_time - start_time} seconds on natural language query 4")

MongoDB took 0.0670614242553711 seconds on natural language query 4


In [31]:
for result in mongo_q4:
    print(f"{result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

100 percent Pure Argan Oil '
'             Unknown '
'             49.0 '
'             Curly Girl's Dream-come-true! I wish there were words for how much I love this! It’s so multipurpose and amazing!!!!I’m a user of the curly girl method (no shampoo, sulfates, etc) and this works AMAZINGLY with my hair routine! About once a month I shampoo my hair to remove build up (using Devacurl’s Low-poo) and follow it up with some deep conditioner and then apply this from my ears down. I have naturally curly, dry hair with some greys and this is AMAZING! My hair feels silky, springy and softer than ever! It’s a miracle in a bottle!On my skin is works miracles on my combination skin. My forehead is always intensely dry in winter and this has been a life saver! On my dry elbows-heaven!!I will keep repurchasing and recommending this product!
100 percent Pure Argan Oil '
'             Unknown '
'             49.0 '
'             Beyond Fabulous! This is the best product out there for dry, curly hair

#### Elastic q4

In [32]:
body_q4 = {
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'values') + 1.0",
                "params": {"query_vector": embedding_q4}
            }
        }
    },
    "size": 5
}

start_time = time.time()
elsatic_q4 = elastic_client.search(index="sephora-data", body=body_q4)
end_time = time.time()
print(f"Elastic took {end_time - start_time} seconds on natural language query 4")

Elastic took 0.11104846000671387 seconds on natural language query 4


In [33]:
for hit in elsatic_q4['hits']['hits']:
    print(f"Score: {hit['_score']}, '\n' \
           product name: {hit['_source']['metadata']['product_name']} '\n' \
           review: {hit['_source']['metadata']['review_full']}")

Score: 2.0, '
'            product name: 100 percent Pure Argan Oil '
'            review: Curly Girl's Dream-come-true! I wish there were words for how much I love this! It’s so multipurpose and amazing!!!!I’m a user of the curly girl method (no shampoo, sulfates, etc) and this works AMAZINGLY with my hair routine! About once a month I shampoo my hair to remove build up (using Devacurl’s Low-poo) and follow it up with some deep conditioner and then apply this from my ears down. I have naturally curly, dry hair with some greys and this is AMAZING! My hair feels silky, springy and softer than ever! It’s a miracle in a bottle!On my skin is works miracles on my combination skin. My forehead is always intensely dry in winter and this has been a life saver! On my dry elbows-heaven!!I will keep repurchasing and recommending this product!
Score: 1.6559019, '
'            product name: 100 percent Pure Argan Oil '
'            review: Beyond Fabulous! This is the best product out there for dry

### Question 5: Sort positive reviews of lotions for oily skin by price descending.

For this one we will fetch all and sort

Pinecone does not support this so we will have to fetch then sort
Mongo was able to support it fine
elastic loses information about vector scores when a sort parameter is applied. 

In [34]:
q5 = "positive reviews of lotions for oily skin"
query_5 = embed_query(q5)

#### Pinecone q1

In [35]:
start_time = time.time()
pine_q5 = index.query(query_5, top_k=5000, include_metadata=True)
sorted_pine_q5 = sorted(pine_q5['matches'], key=lambda x: x['metadata']['price_usd'], reverse=True)
end_time = time.time()
print(f"Pinecone took {end_time - start_time} seconds on natural language query 5")

Pinecone took 1.5250303745269775 seconds on natural language query 5


In [36]:
for result in sorted_pine_q5[:5]:
    print(f"{round(result['score'], 2)} '\n' \
            {result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

0.6 '
'             The Concentrate Serum '
'             blonde '
'             425.0 '
'             Love the scent It is really good product. I received two small samples . I love the scent of it and the light texture. It’s not greasy and feels amazing on my skin . My skin feels smooth and soft . I have dry skin especially during cold weather and I feel this product works perfect for my skin type .
0.59 '
'             The Concentrate Serum '
'             brown '
'             425.0 '
'             Nice This is one of my favorite creams! So good and moisturizing! Such a good one! I recommend it to everyone who has normal dry skin, especially good during fall and winter. It makes my skin so smooth and healthy
0.63 '
'             Crème de la Mer Moisturizer '
'             black '
'             380.0 '
'             Maybe it helps other ppl (who has very dry skin) a lot, but for me I think this cream is a little bit too heavy. For the skin type is not too dry neither too oily I love

#### Mongo q5

In [37]:
start_time = time.time()
mongo_q5 = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": query_5,
    "path": "values",
    "numCandidates": 5000,
    "limit": 5000,
    "index": "sephora-index",
      }},
    {"$sort": {"metadata.price_usd": -1}}
])
end_time = time.time()
print(f"MongoDB took {end_time - start_time} seconds on natural language query 5")

MongoDB took 0.718332052230835 seconds on natural language query 5


In [38]:
for i, result in enumerate(mongo_q5):
    if i >= 5:
        break
    print(f"{result['metadata']['product_name']} '\n' \
            {result['metadata']['hair_color']} '\n' \
            {result['metadata']['price_usd']} '\n' \
            {result['metadata']['review_full']}")

The Concentrate Serum '
'             blonde '
'             425.0 '
'             Love the scent It is really good product. I received two small samples . I love the scent of it and the light texture. It’s not greasy and feels amazing on my skin . My skin feels smooth and soft . I have dry skin especially during cold weather and I feel this product works perfect for my skin type .
The Concentrate Serum '
'             brown '
'             425.0 '
'             Nice This is one of my favorite creams! So good and moisturizing! Such a good one! I recommend it to everyone who has normal dry skin, especially good during fall and winter. It makes my skin so smooth and healthy
Crème de la Mer Moisturizer '
'             black '
'             380.0 '
'             Maybe it helps other ppl (who has very dry skin) a lot, but for me I think this cream is a little bit too heavy. For the skin type is not too dry neither too oily I love weightless stuff better.
Crème de la Mer Moisturizer '
'     

#### Elastic q5

In [39]:
body_q5 = {
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'values') + 1.0",
                "params": {"query_vector": query_5}
            }
        }
    },
    "size": 5000
}
start_time = time.time()
elastic_q5 = elastic_client.search(index="sephora-data", body=body_q5)
sorted_hits = sorted(elastic_q5['hits']['hits'], key=lambda x: x['_source']['metadata']['price_usd'], reverse=True)
end_time = time.time()
print(f"Elastic took {end_time - start_time} seconds on natural language query 5")


Elastic took 2.470366954803467 seconds on natural language query 5


In [40]:
for hit in sorted_hits[:5]:
    print(f"Score: {hit['_score']}, '\n' \
           product name: {hit['_source']['metadata']['product_name']} '\n' \
           price: {hit['_source']['metadata']['price_usd']} '\n' \
           review: {hit['_source']['metadata']['review_full']}")

Score: 1.5966933, '
'            product name: The Concentrate Serum '
'            price: 425.0 '
'            review: Love the scent It is really good product. I received two small samples . I love the scent of it and the light texture. It’s not greasy and feels amazing on my skin . My skin feels smooth and soft . I have dry skin especially during cold weather and I feel this product works perfect for my skin type .
Score: 1.5862967, '
'            product name: The Concentrate Serum '
'            price: 425.0 '
'            review: Nice This is one of my favorite creams! So good and moisturizing! Such a good one! I recommend it to everyone who has normal dry skin, especially good during fall and winter. It makes my skin so smooth and healthy
Score: 1.6294103, '
'            product name: Crème de la Mer Moisturizer '
'            price: 380.0 '
'            review: Maybe it helps other ppl (who has very dry skin) a lot, but for me I think this cream is a little bit too heavy. For t

### Update times

I'm realizing that we have all of CRUD except update
- let's create 5 new vectors and update the first 5 ids

In [41]:
new_reviews = [
    "Wow, what an amazing product. Y'all have got to try this out. I use this with my favorite moisturizer and it works like a charm.",
    "DO NOT get this. Does not work for sensitive skin. I was feeling the burn.",
    "I would definitely recommend this for someone with combination oily skin. It keeps my skin less oily for longer throughout the day.",
    "This is over-priced for the amount you get. Will probably avoid in the future.",
    "They've changed their formula recently. I don't like it as much."
]

new_vectors = [embed_query(r) for r in new_reviews]
new_vectors[0][0]

-0.10167992860078812

#### Pinecone update

In [45]:
start_time = time.time()
for i, vector in enumerate(new_vectors):
    index.update(
    id=str(i),
    values=vector,
    set_metadata={'eye_color': 'hazel'}
    )
end_time = time.time()
print(f"Update operation took {end_time - start_time} seconds")

Update operation took 0.4371213912963867 seconds


#### Mongo Update

In [46]:
start_time = time.time()
for i, vector in enumerate(new_vectors):
    collection.update_one({"id": str(i)}, {"$set": {"values": vector, "metadata.eye_color": "hazel"}})
    time.sleep(1)
end_time = time.time()
print(f"Update operation took {end_time - start_time} seconds")

Update operation took 5.134519338607788 seconds


#### Elastic Update

In [47]:
start_time = time.time()
for i, vector in enumerate(new_vectors):
    elastic_client.update(
        index="sephora-data", 
        id=str(i), 
        body={"doc": {"values": vector, "metadata.eye_color": "hazel"}}
        )
end_time = time.time()
print(f"Update operation took {end_time - start_time} seconds")

Update operation took 0.16942667961120605 seconds
