In [135]:
!%pip install mistralai evaluate

zsh:fg:1: no job control in this shell.


In [182]:
import os
from dotenv import load_dotenv
load_dotenv()

prompts = {
    "review1": {
        "review_content": "Gets my clothes fresh and clean every time. No lingering odor with Tide.",
        "golden_answer": {
            "product category": "Powder Detergents for Laundry",
            "brand": "Tide",
            "other keywords": ['fresh', 'clean', 'no lingering odor']
        },
    },
    "review2": {
        "review_content": "Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.",
        "golden_answer": {
            "product category": "Thermal Mugs",
            "brand": "Contigo",
            "other keywords": ["hot for hours"]
        },
    },
}


In [183]:
from pydantic import BaseModel, Field
from typing import List, Union, Literal


class Product(BaseModel):
    product_category: Literal[
        "Powder Detergents for Laundry",
        "Thermal Mugs",
        "Dishwasher Detergents",
        "Sunscreens",
        "Nappies",
        "Others"
    ] = Field(
        description="The category of the product.",
        alias="product category"
    )
    # product_category: str = Field(
    #     description="The category of the product.",
    #     alias="product category"
    # )
    brand: str = Field(
        description="The brand of the product, or 'N/A' if not applicable.",
        alias="brand"
    )
    
    other_keywords: List[str] = Field(
        description="A list of other keywords associated with the product.",
        alias="other keywords"
    )

In [184]:
from tkinter.filedialog import Open
from mistralai import Mistral
from openai import OpenAI, api_key
from dotenv import load_dotenv
load_dotenv()
import os
from together import Together  

together_client = Together()

openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
client_mistral = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))

conversation_template = [
    {"role": "system", "content": "Given the user review of the product, extract the following information: product category, brand, and other keywords, which are associated with the product."}, 
    {"role": "system", "content": f"""Answer using only JSON format with the following JSON schema:

{Product.model_json_schema()}"""},]

def run_mistral(user_message, model="mistral-large-latest"):
    messages = conversation_template[:1] + [{"role": "user", "content": user_message}] + conversation_template[1:] 
    #messages = [{"role": "user", "content": user_message}]
    chat_response = client_mistral.chat.complete(
        model=model,
        messages=messages,
        response_format={"type": "json_object",
                         "json_schema": {"name": "product", "schema": Product.model_json_schema(),}},
    )
    return chat_response.choices[0].message.content


def run_openai(user_message, model="gpt-4o"):
    messages = [{"role": "user", "content": user_message}]
    chat_response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        response_format={"type": "json_schema",
                         "json_schema": {"name": "product", "schema": Product.model_json_schema(),}}
    )
    return chat_response.choices[0].message.content

# def run_together(user_message, model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"):
#     messages = [{"role": "user", "content": user_message}]
#     chat_response = together_client.chat.completions.create(
#         model=model,
#         messages=messages,
#         response_format={
#             "type": "json_object",
#             "schema": Product.model_json_schema(),
#         },
#     )
#     return chat_response.choices[0].message.content

# define prompt template
prompt_template = """
Extract information from the following reviews:
{content}

Return only json format with the following JSON schema:

{{
        "product category": {{
            "type": "string",
            "enum": ["Powder Detergents for Laundry", "Thermal Mugs", "Dishwasher Detergents", "Sunscreens", "Nappies", "Others"]
        }},
        "brand": {{
            "type": "string" or N/A
        }},
        "other keywords": {{
            "type": "array",
            "items": {{
                "type": "string"
            }}
        }},

}}
"""

reviews = [
    # Thermal Mugs
    "Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.",
    "The lid isn’t leak-proof, but it keeps drinks warm for a decent amount of time.",
    "Love the sleek design of my Zojirushi mug, and it fits perfectly in my car cup holder!",
    "It’s lightweight but keeps my drinks at the right temperature for hours with Hydro Flask.",
    "No more cold coffee! This Yeti thermal mug does the job.",
    "It’s easy to clean, and the thermal insulation works like a charm.",
    "The handle makes it easy to carry, and it doesn’t spill.",
    "Not great for keeping drinks cold, but excellent for hot beverages.",
    "I wish it were bigger, but it’s perfect for my morning tea.",
    "I accidentally dropped it, and it didn’t dent! Very sturdy.",
    "The rubber seal around the lid came loose after a few weeks. Disappointing.",
    "Great for both coffee and soup—keeps them warm for hours.",
    "The exterior stays cool, even when my drink is piping hot inside.",
    "I love the color options, and it’s great for on-the-go.",
    "Keeps ice water cold for hours, even in hot weather!",
    "It’s a little tricky to open one-handed, but overall, a great mug.",
    "The size is perfect for travel, and it keeps drinks hot all day.",
    "It doesn’t leak, even when I toss it in my bag. Highly recommend Contigo.",
    "The lid is a little tight, but the mug works well for keeping drinks warm.",
    "Very stylish and functional! I get compliments all the time.",
    "Keeps my coffee scalding hot for longer than any mug I’ve owned with Zojirushi.",
    "Great value for the price. Works just as well as more expensive brands.",
    "The mug is lightweight and easy to carry around.",
    "It fits perfectly under my single-serve coffee machine!",
    "Durable, sleek, and it does exactly what it’s supposed to.",

    # Dishwasher Detergents
    "My dishes come out sparkling clean every time with Cascade. Love this detergent!",
    "It works well on glass, but I’ve noticed spots on my silverware.",
    "Great for tough, greasy messes. Leaves no residue! Thanks, Finish.",
    "This detergent smells amazing and leaves my dishwasher fresh.",
    "It’s a little pricey, but my dishes have never looked better with Cascade Platinum.",
    "Gets rid of even the most stubborn baked-on food. Highly recommend Finish Quantum.",
    "Not the best on hard water stains, but otherwise it works great.",
    "My dishes have never been so spotless after a wash!",
    "It’s very effective, but I wish it came in a fragrance-free version.",
    "Cuts through grease like a dream. No more pre-rinsing with Cascade Complete.",
    "This detergent doesn’t leave any residue on plastic, which I love.",
    "My glasses come out clear and sparkling every single time.",
    "It doesn’t work well with my eco dishwasher. Dishes aren’t as clean.",
    "Very efficient—gets rid of food stains and smells with no problem.",
    "I noticed some streaks on my glassware, but overall it works well.",
    "Leaves my dishes spotless and my machine smelling fresh.",
    "A great, eco-friendly option that actually works!",
    "It’s a little hard on some of my delicate dishware.",
    "This is the only detergent that works on my hard water stains.",
    "No need to rewash dishes after using this—so efficient!",
    "Perfect for everyday use. My dishes are clean and shiny.",
    "Leaves a chemical smell, but it’s effective at cleaning.",
    "A bit expensive, but worth it for the spotless results.",
    "No more streaks or water spots! Best dishwasher detergent ever.",
    "My silverware and dishes look brand new after every wash.",

    # Sunscreens
    "Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.",
    "The scent is a little strong, but it protects well with Coppertone.",
    "Perfect for sensitive skin! No breakouts or irritation with La Roche-Posay.",
    "A bit thick to apply, but once it’s on, it stays all day.",
    "I love the lightweight formula of CeraVe, perfect for wearing under makeup.",
    "Doesn’t leave a white cast, even on darker skin tones.",
    "The spray bottle makes it super easy to apply on the go.",
    "This sunscreen saved me from burning on a beach vacation with Banana Boat!",
    "It’s waterproof, which is a must for pool days. Highly recommend Hawaiian Tropic.",
    "A bit pricey, but the protection it provides is worth every penny with Supergoop.",
    "This is my go-to sunscreen for both my face and body with Neutrogena.",
    "It’s a little greasy, but it gets the job done in strong sun.",
    "No weird scent, and it goes on smooth. Love this product!",
    "Perfect for outdoor activities—no sunburn, even after hours outside.",
    "It’s great for kids! No irritation, and it’s easy to apply with Blue Lizard.",
    "A little too heavy for my face, but works perfectly for the body.",
    "The texture is nice and light, not sticky at all.",
    "This sunscreen doesn’t clog my pores, which is a huge plus with EltaMD.",
    "I’ve tried a lot of sunscreens, and this one offers the best protection with La Roche-Posay.",
    "It leaves a slight sheen, but I love how protected my skin feels.",
    "This formula doesn’t dry out my skin like others do.",
    "It’s great under makeup—no pilling or greasiness.",
    "I wish it were more affordable, but it’s worth it for the protection.",
    "Very effective, even after swimming for hours.",
    "My skin stays soft and protected all day with Neutrogena sunscreen."

     # Powder Detergents for Laundry
    "Gets my clothes fresh and clean every time. No lingering odor with Tide.",
    "It dissolves well, even in cold water. My whites have never been brighter thanks to Ariel.",
    "A little pricey, but worth it for the excellent stain removal power of Persil.",
    "This powder leaves a residue on darker clothes. Not a fan of OMO.",
    "Great for sensitive skin! No itching or redness after using Seventh Generation.",
    "I love how eco-friendly this detergent is. It’s a big plus for me with Ecover.",
    "I don’t need fabric softener anymore—this leaves my clothes so soft!",
    "My laundry has never smelled so fresh, and it lasts for days with Gain.",
    "It’s not the best for heavy stains but works great for daily washes.",
    "Great value for the price. This box lasts forever! Thanks, Arm & Hammer.",
    "Perfect for my workout gear—gets rid of all the sweat smells.",
    "Leaves a bit of powder behind in the machine, but it cleans well.",
    "I’ve been using it for years, and Tide never disappoints.",
    "Not as effective in hard water areas, but still decent.",
    "My go-to detergent for all of my family’s laundry needs.",
    "I noticed some fading in my darker clothes after a few washes.",
    "It’s gentle on my baby’s clothes and skin with Dreft.",
    "Very effective at removing mud and grass stains from the kids’ clothes.",
    "I like the scent, but it might be too strong for some.",
    "No complaints so far! My clothes feel clean and fresh.",
    "Works just as well as liquid detergents but at a lower cost.",
    "A bit too perfumed for my taste, but it gets the job done.",
    "My clothes are noticeably softer and smell better than before.",
    "The box is hard to pour from, but the detergent works well.",
    "This is my new favorite detergent. So much better than the leading brand!",
]

In [185]:
import json


def compare_json_objects(obj1, obj2,keys_to_compare={'other keywords', 'brand', 'product category'}):
    total_fields = 0
    identical_fields = 0
    common_keys = set(obj1.keys()) & set(obj2.keys() & keys_to_compare)
    for key in common_keys:
        identical_fields += obj1[key] == obj2[key]
    #percentage_identical = (identical_fields / max(len(obj1.keys()), 1)) * 100
    percentage_identical = (identical_fields / max(len(keys_to_compare), 1)) * 100

    return percentage_identical

In [186]:
accuracy_rates = []
accuracy_rates_openai = []
# for each test case
for name in prompts:

    # define user message
    #user_message = #prompt_template.format(medical_notes=prompts[name]["medical_notes"])
    text=prompts[name]["review_content"]
    #print(text)
    #user_message = prompt_template.format(content=text)
    #print(user_message)
    #user_message = prompt_templateR.format(notes=promptsR[name]["review_content"])

    # run LLM
    response_raw = run_mistral(text)
    response_raw_openai = run_openai(text)
    # print(response_raw)

    response = json.loads(response_raw)
    response_openai = json.loads(response_raw_openai)
    # print(response)
    # calculate accuracy rate for this test case
    accuracy_rates.append(
        compare_json_objects(response, prompts[name]["golden_answer"],keys_to_compare={'product category','brand'})
    )
    accuracy_rates_openai.append(
        compare_json_objects(response_openai, prompts[name]["golden_answer"],keys_to_compare={'product category','brand'})
    )

# calculate accuracy rate across test cases
print(f"Mistral: {sum(accuracy_rates) / len(accuracy_rates)}")
print(f"OpenAI: {sum(accuracy_rates_openai) / len(accuracy_rates_openai)}")

Mistral: 100.0
OpenAI: 100.0


In [187]:
# prompt: create big json dictionary with subsequent reviews and LLM answer
import time

results = {}
results_openai = {}
for review in reviews:
  #print(review)
  user_message = prompt_template.format(content=review)
  response = json.loads(run_mistral(user_message))
  response_openai = json.loads(run_openai(user_message))
  results[review] = response
  results_openai[review] = response_openai
  #print(response)
  # time.sleep(1)
  # print()

import json
with open('results.json', 'w') as f:
  json.dump(results, f, indent=4)

with open('results_openai.json', 'w') as f:
  json.dump(results_openai, f, indent=4)

In [188]:
product_offers = [
    # Thermal Mugs
    "Contigo Workday Travel Mug – Keeps Coffee Hot for Hours!",
    "Zojirushi Sleek Travel Mug – Perfect Fit for Car Holders",
    "Hydro Flask Lightweight Insulation Mug – Stay Warm for Hours",
    "Yeti Thermal Mug – No More Cold Coffee!",
    "Contigo All-Day Heat Retention Mug – Ideal for Travel",
    "Contigo Leak-Proof Mug – Toss in Your Bag with Confidence",
    "Zojirushi Scalding Hot Coffee Mug – Best Insulation Yet",

    # Dishwasher Detergents
    "Cascade Sparkling Clean Detergent – Your Dishes Will Shine",
    "Finish Detergent – Tough on Grease, No Residue Left",
    "Cascade Platinum Detergent – Pricey, But Worth It for Results",
    "Finish Quantum Detergent – Stubborn Food Stains Gone",
    "Cascade Complete Detergent – No Pre-Rinsing Needed for Grease",

    # Sunscreens
    "Neutrogena Daily Sunscreen – Quick Absorption, No Grease",
    "Coppertone Suncream – Strong Scent, Strong Protection",
    "La Roche-Posay Sensitive Skin Sunscreen – No Breakouts",
    "CeraVe Lightweight Sunscreen – Perfect Under Makeup",
    "Banana Boat Beach-Saver Sunscreen – No Burns, Just Fun",
    "Hawaiian Tropic Waterproof Sunscreen – Pool Day Essential",
    "Supergoop Premium Sunscreen – Worth Every Penny",
    "Neutrogena Face & Body Sunscreen – All-Purpose Protection",
    "EltaMD Pore-Friendly Sunscreen – Protection Without Clogging",
    "La Roche-Posay Suncream – The Best in Sun Protection",
    "Neutrogena Sunscreen – Soft, Protected Skin All Day",

    # Powder Detergents for Laundry
    "Tide Powder Detergent – Fresh, Clean Clothes Every Time",
    "Ariel Powder Detergent – Whites Brighter, Even in Cold Water",
    "Persil Powder Detergent – Powerful Stain Removal",
    "OMO Powder Detergent – Leaves Residue on Dark Clothes",
    "Seventh Generation Powder Detergent – Great for Sensitive Skin",
    "Ecover Eco-Friendly Detergent – Perfect for the Eco-Conscious",
    "Gain Powder Detergent – Fresh-Smelling Laundry for Days",
    "Arm & Hammer Powder Detergent – Great Value, Lasts Forever",
    "Dreft Baby Powder Detergent – Gentle on Baby Clothes"
]

In [189]:
import time

products = {}
products_openai = {}

for title in product_offers:
  # print(title)
  user_message = prompt_template.format(content=title)
  response = json.loads(run_mistral(user_message))
  response_openai = json.loads(run_openai(user_message))
  products[title] = response
  products_openai[title] = response_openai
  # print(response)
  # time.sleep(1)
  # print()

import json
with open('products.json', 'w') as f:
  json.dump(products, f, indent=4)

with open('products_openai.json', 'w') as f:
  json.dump(products_openai, f, indent=4)


In [190]:
# prompt: write code for comparing one review from results with one product from products and add in these comparison extracted keywords with lower weight additionally to categories and brand comparison

import re


def compare_json_objects_with_keywords(obj1, obj2, keys_to_compare={'other keywords', 'brand', 'product category'}):
    identical_fields = 0
    common_keys = set(obj1.keys()) & set(obj2.keys() & keys_to_compare)
    for key in common_keys:
        if key == 'other keywords':
            if isinstance(obj1.get(key), list) and isinstance(obj2.get(key), list):
                common_keywords = set(obj1.get(key)) & set(obj2.get(key))
                identical_fields += len(common_keywords) * 0.3  # Keywords have lower weight
            else:
              identical_fields += 0
        else:
          identical_fields += obj1[key] == obj2[key]
    #percentage_identical = (identical_fields / max(len(obj1.keys()), 1)) * 100
    percentage_identical = (identical_fields / max(len(keys_to_compare), 1)) * 100

    return percentage_identical


# Example usage (assuming 'results' and 'products' are dictionaries)
review = list(results.keys())[50]  # Get the first review
review_data = results[review]

product = list(products.keys())[20]  # Get the first product
product_data = products[product]
print(review_data)
print(product_data)
similarity_score = compare_json_objects_with_keywords(review_data, product_data,keys_to_compare={'brand', 'product category'})
print(f"Similarity between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {similarity_score}")

review_openai = list(results_openai.keys())[50]  # Get the first review
review_data_openai = results_openai[review_openai]

product_openai = list(products_openai.keys())[20]  # Get the first product
product_data_openai = products_openai[product_openai]
similarity_score_openai = compare_json_objects_with_keywords(review_data_openai, product_data_openai,keys_to_compare={'brand', 'product category'})
print(f"Similarity between review:\n '{review_openai}'\n and product:\n '{product_openai}'\n is as follow:\n {similarity_score_openai}")

{'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['absorbs quickly', 'greasy residue', 'daily use']}
{'product category': 'Sunscreens', 'brand': 'EltaMD', 'other keywords': ['Pore-Friendly', 'Protection', 'Clogging']}
Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
 and product:
 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 50.0
Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
 and product:
 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 50.0


In [191]:
# prompt: add comparison when if product has different cateogry there is no similarity, but if it is the same it is about 30%, if has the same category and brand, similarity is about 75%

def compare_json_objects_with_keywords(obj1, obj2, keys_to_compare={'other keywords', 'brand', 'product category'}):
    if obj1.get('product category') != obj2.get('product category'):
        return 0  # No similarity if categories don't match

    similarity_score = 0
    common_keys = set(obj1.keys()) & set(obj2.keys() & keys_to_compare)

    if obj1.get('product category') == obj2.get('product category'):
        similarity_score += 0.3  # Base similarity for matching categories

    if obj1.get('brand') == obj2.get('brand'):
        similarity_score += 0.4  # Additional similarity for matching brands

    for key in common_keys:
        if key == 'other keywords':
            if isinstance(obj1.get(key), list) and isinstance(obj2.get(key), list):
                common_keywords = set(obj1.get(key)) & set(obj2.get(key))
                similarity_score += len(common_keywords) * 0.05  # Keywords have lower weight
            else:
                similarity_score += 0


    return min(100, round(similarity_score * 100))


In [192]:
# Example usage (assuming 'results' and 'products' are dictionaries)
review = list(results.keys())[50]  # Get the first review
review_data = results[review]

product = list(products.keys())[20]  # Get the first product
product_data = products[product]
print(review_data)
print(product_data)


similarity_score = compare_json_objects_with_keywords(review_data, product_data,keys_to_compare={'brand', 'product category'})
print(f"Similarity between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {similarity_score}")

review_openai = list(results_openai.keys())[50]  # Get the first review
review_data_openai = results_openai[review_openai]

product_openai = list(products_openai.keys())[20]  # Get the first product
product_data_openai = products_openai[product_openai]
similarity_score_openai = compare_json_objects_with_keywords(review_data_openai, product_data_openai,keys_to_compare={'brand', 'product category'})
print(f"Similarity between review:\n '{review_openai}'\n and product:\n '{product_openai}'\n is as follow:\n {similarity_score_openai}")


{'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['absorbs quickly', 'greasy residue', 'daily use']}
{'product category': 'Sunscreens', 'brand': 'EltaMD', 'other keywords': ['Pore-Friendly', 'Protection', 'Clogging']}
Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
 and product:
 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 30
Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
 and product:
 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 30


In [193]:
!pip install bert_score



In [201]:
# prompt: comapre review and product keyword by bertscore

from bert_score import score

def compare_keywords_bert_score(review_keywords, product_keywords):
  """Compares keywords using BERTScore."""

  if not review_keywords or not product_keywords:
    return 0

  max_len = max(len(review_keywords), len(product_keywords))
  review_keywords = (review_keywords * (max_len // len(review_keywords) + 1))[:max_len]
  product_keywords = (product_keywords * (max_len // len(product_keywords) + 1))[:max_len]

  # Calculate BERTScore
  P, R, F1 = score(
    review_keywords,
    product_keywords,
    lang="en",
    model_type="bert-base-uncased",
    verbose=False
  )

  return F1.mean().item()


# Example usage
review_keywords = results[review].get("other keywords", [])
product_keywords = products[product].get("other keywords", [])

review_keywords_openai = results_openai[review].get("other keywords", [])
product_keywords_openai = products_openai[product].get("other keywords", [])

print(review_keywords)
print(product_keywords)

print(review_keywords_openai)
print(product_keywords_openai)
similarity_score_bert = compare_keywords_bert_score(review_keywords, product_keywords)



similarity_score_bert_openai = compare_keywords_bert_score(review_keywords_openai, product_keywords_openai)
print(
    f"Similarity (BERTScore) between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {similarity_score_bert}"
)

print(
    f"Similarity (BERTScore) between review:\n '{review_openai}'\n and product:\n '{product_openai}'\n is as follow:\n {similarity_score_bert_openai}"
)

['absorbs quickly', 'greasy residue', 'daily use']
['Pore-Friendly', 'Protection', 'Clogging']
['Absorbs quickly', 'non-greasy', 'daily use']
['Pore-Friendly', 'Protection', 'Clogging', 'Skin care', 'UV protection']
Similarity (BERTScore) between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
 and product:
 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 0.44779595732688904
Similarity (BERTScore) between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
 and product:
 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 0.4194450378417969


In [203]:
review =  'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
product = 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
review_data = {'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['spf50', 'daily use']}
product_data = {'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['spf50','Pore-Friendly', 'Protection', 'daily use']}

similarity_score = compare_json_objects_with_keywords(review_data, product_data,keys_to_compare={'other keywords','brand', 'product category'})
print(f"Similarity between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {similarity_score}")

similarity_score_bert = compare_keywords_bert_score(review_keywords, product_keywords)

print(
    f"Similarity (BERTScore) between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {similarity_score_bert}"
)


Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
 and product:
 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 80
Similarity (BERTScore) between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.'
 and product:
 'EltaMD Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 0.44779595732688904


In [204]:
# prompt: propose other similarity score based on categories, brands and review_keywords; other measure then above, produce sth different ; combine also comaprison between full title of product name and concatenated string from category, brand and keywords from review, print every subsequent result, copare strings with bert score not tiff

def compare_product_review_similarity(review_data, product_data):
  """
  Calculates a similarity score between a review and a product based on
  categories, brands, and keywords, including a comparison of full product
  title with review information using BERTScore.
  """

  similarity_score = 0

  # Category Matching (Highest weight)
  if review_data.get('product category') == product_data.get('product category'):
    similarity_score += 0.5

  # Brand Matching (Medium weight)
  if review_data.get('brand') == product_data.get('brand'):
    similarity_score += 0.3

  # String Comparison (BERTScore) between Product Title and Review Data
  review_info_string = " ".join(
      [
          review_data.get("product category"),
          review_data.get("brand"),
          " ".join(review_data.get("other keywords")),
      ]
  )

  product_info_string = " ".join(
      [
          product_data.get("product category"),
          product_data.get("brand"),
          " ".join(product_data.get("other keywords")),
      ]
  )

  P, R, F1 = score(
      [product_info_string],
      [review_info_string],
      lang="en",
      model_type="bert-base-uncased",
      verbose=False
  )

  print(F1.mean().item())
  similarity_score += F1.mean().item() * 0.2

  return round(min(1, similarity_score) * 100)


# Example Usage (same as before)
review = 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
product = 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
review_data = {'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['spf50', 'daily use']}
product_data = {'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['spf50','Pore-Friendly', 'Protection', 'daily use']}

similarity_score_new = compare_product_review_similarity(review_data, product_data)

print(
    f"New Similarity between review:\n '{review}'\n with data'{review_data}'\n and product:\n '{product}'\n with data'{product_data}''\n is as follow:\n {similarity_score_new}"
)


0.8770545125007629
New Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
 with data'{'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['spf50', 'daily use']}'
 and product:
 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
 with data'{'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['spf50', 'Pore-Friendly', 'Protection', 'daily use']}''
 is as follow:
 98


# Added code

In [208]:
import numpy as np
#!pip install sentence-transformers
from sentence_transformers import SentenceTransformer

def compare_keywords_cosine_similarity(review_keywords, product_keywords):
    if not review_keywords or not product_keywords:
        return 0.0

    model = SentenceTransformer('all-MiniLM-L6-v2')  # Choose an appropriate model

    # Compute embeddings
    review_embeddings = model.encode(review_keywords)
    product_embeddings = model.encode(product_keywords)

    # Average the embeddings
    avg_review_embedding = np.mean(review_embeddings, axis=0)
    avg_product_embedding = np.mean(product_embeddings, axis=0)

    # Compute cosine similarity
    similarity = np.dot(avg_review_embedding, avg_product_embedding) / (
        np.linalg.norm(avg_review_embedding) * np.linalg.norm(avg_product_embedding)
    )

    return similarity


# Example usage
review = 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
product = 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
review_data = {'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['spf50', 'daily use']}
product_data = {'product category': 'Sunscreens', 'brand': 'Neutrogena', 'other keywords': ['spf50','Pore-Friendly', 'Protection', 'daily use']}

similarity_score_cosine = compare_keywords_cosine_similarity(review_data.get("other keywords", []), product_data.get("other keywords", []))
print(
    f"Cosine Similarity between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {similarity_score_cosine}"
)





modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Cosine Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
 and product:
 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 0.8082792162895203


In [209]:
def compare_keywords_jaccard_similarity(review_keywords, product_keywords):
    set_review = set(review_keywords)
    set_product = set(product_keywords)
    intersection = set_review.intersection(set_product)
    union = set_review.union(set_product)
    if not union:
        return 0.0
    return len(intersection) / len(union)

jaccard_similarity = compare_keywords_jaccard_similarity(review_data.get("other keywords", []), product_data.get("other keywords", []))
print(
    f"Jaccard Similarity between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {jaccard_similarity}"
)

Jaccard Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
 and product:
 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 0.5


In [211]:
!pip install Levenshtein
import Levenshtein

def compare_keywords_levenshtein_distance(review_keywords, product_keywords):
    total_distance = 0
    comparisons = 0
    for r_keyword in review_keywords:
        for p_keyword in product_keywords:
            distance = Levenshtein.distance(r_keyword, p_keyword)
            total_distance += distance
            comparisons += 1
    if comparisons == 0:
        return 0.0
    # Normalize the distance
    average_distance = total_distance / comparisons
    max_length = max(len(''.join(review_keywords)), len(''.join(product_keywords)))
    similarity = 1 - (average_distance / max_length)
    return similarity

levenshtein_similarity = compare_keywords_levenshtein_distance(review_data.get("other keywords", []), product_data.get("other keywords", []))
print(
    f"Levenshtein Similarity between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {levenshtein_similarity}"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting Levenshtein
  Downloading levenshtein-0.26.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (11 kB)
Downloading levenshtein-0.26.0-cp310-cp310-macosx_11_0_arm64.whl (157 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-macosx_11_0_arm64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.26.0 rapidfuzz-3.10.1
Levenshtein Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
 and product:
 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 0.7871621621621622


In [212]:
def compare_keywords_overlap_coefficient(review_keywords, product_keywords):
    set_review = set(review_keywords)
    set_product = set(product_keywords)
    intersection = set_review.intersection(set_product)
    min_size = min(len(set_review), len(set_product))
    if min_size == 0:
        return 0.0
    return len(intersection) / min_size

overlap_coefficient = compare_keywords_overlap_coefficient(review_data.get("other keywords", []), product_data.get("other keywords", []))
print(
    f"Overlap Coefficient between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {overlap_coefficient}"
)


Overlap Coefficient between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
 and product:
 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 1.0


In [213]:
from nltk.translate.bleu_score import sentence_bleu

def compare_keywords_bleu_score(review_keywords, product_keywords):
    if not review_keywords or not product_keywords:
        return 0.0
    score = sentence_bleu([product_keywords], review_keywords)
    return score

bleu_score = compare_keywords_bleu_score(review_data.get("other keywords", []), product_data.get("other keywords", []))
print(
    f"BLEU Score between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {bleu_score}"
)


BLEU Score between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
 and product:
 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 6.702145341854094e-232


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [215]:
!pip install rouge
from rouge import Rouge

def compare_keywords_rouge_score(review_keywords, product_keywords):
    if not review_keywords or not product_keywords:
        return 0.0
    rouge = Rouge()
    scores = rouge.get_scores(' '.join(review_keywords), ' '.join(product_keywords))
    return scores[0]['rouge-l']['f']

rouge_score = compare_keywords_rouge_score(review_data.get("other keywords", []), product_data.get("other keywords", []))
print(
    f"ROUGE Score between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {rouge_score}"
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
ROUGE Score between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
 and product:
 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 0.7499999953125


In [219]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compare_keywords_tfidf_cosine_similarity(review_keywords, product_keywords):
    corpus = [' '.join(review_keywords), ' '.join(product_keywords)]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return similarity[0][0]

tfidf_cosine_similarity = compare_keywords_tfidf_cosine_similarity(review_data.get("other keywords", []), product_data.get("other keywords", []))
print(
    f"TF-IDF Cosine Similarity between review:\n '{review}'\n and product:\n '{product}'\n is as follow:\n {tfidf_cosine_similarity}"
)


TF-IDF Cosine Similarity between review:
 'Absorbs quickly and doesn’t leave a greasy residue. Great for daily use SPf50 with Neutrogena.'
 and product:
 'Neutrogena SPF50 Pore-Friendly Sunscreen – Protection Without Clogging'
 is as follow:
 0.5797386715376658
