Author: Izabela Telejko

In [1]:
import spacy
import openai
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from together import Together

  from tqdm.autonotebook import tqdm, trange


In [7]:
# Two hand-labelled sample reviews used to evaluate the extractors below.
# Each entry pairs the raw review text ("review_content") with the expected
# ("golden") product category, brand and keywords.
prompts = {
    "review1": {
        "review_content": "Gets my clothes fresh and clean every time. No lingering odor with Tide.",
        "golden_answer": {
            "product category": "Powder Detergents for Laundry",
            "brand": "Tide",
            "other keywords": ['fresh', 'clean', 'no lingering odor']
        },
    },
    "review2": {
        "review_content": "Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.",
        "golden_answer": {
            "product category": "Thermal Mugs",
            "brand": "Contigo",
            "other keywords": ["hot for hours"]
        },
    },
}
# Review corpus: 25 reviews for each of the four product categories, grouped
# by the section comments below (100 entries in total).
reviews = [
    # Thermal Mugs
    "Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.",
    "The lid isn’t leak-proof, but it keeps drinks warm for a decent amount of time.",
    "Love the sleek design of my Zojirushi mug, and it fits perfectly in my car cup holder!",
    "It’s lightweight but keeps my drinks at the right temperature for hours with Hydro Flask.",
    "No more cold coffee! This Yeti thermal mug does the job.",
    "It’s easy to clean, and the thermal insulation works like a charm.",
    "The handle makes it easy to carry, and it doesn’t spill.",
    "Not great for keeping drinks cold, but excellent for hot beverages.",
    "I wish it were bigger, but it’s perfect for my morning tea.",
    "I accidentally dropped it, and it didn’t dent! Very sturdy.",
    "The rubber seal around the lid came loose after a few weeks. Disappointing.",
    "Great for both coffee and soup—keeps them warm for hours.",
    "The exterior stays cool, even when my drink is piping hot inside.",
    "I love the color options, and it’s great for on-the-go.",
    "Keeps ice water cold for hours, even in hot weather!",
    "It’s a little tricky to open one-handed, but overall, a great mug.",
    "The size is perfect for travel, and it keeps drinks hot all day.",
    "It doesn’t leak, even when I toss it in my bag. Highly recommend Contigo.",
    "The lid is a little tight, but the mug works well for keeping drinks warm.",
    "Very stylish and functional! I get compliments all the time.",
    "Keeps my coffee scalding hot for longer than any mug I’ve owned with Zojirushi.",
    "Great value for the price. Works just as well as more expensive brands.",
    "The mug is lightweight and easy to carry around.",
    "It fits perfectly under my single-serve coffee machine!",
    "Durable, sleek, and it does exactly what it’s supposed to.",

    # Dishwasher Detergents
    "My dishes come out sparkling clean every time with Cascade. Love this detergent!",
    "It works well on glass, but I’ve noticed spots on my silverware.",
    "Great for tough, greasy messes. Leaves no residue! Thanks, Finish.",
    "This detergent smells amazing and leaves my dishwasher fresh.",
    "It’s a little pricey, but my dishes have never looked better with Cascade Platinum.",
    "Gets rid of even the most stubborn baked-on food. Highly recommend Finish Quantum.",
    "Not the best on hard water stains, but otherwise it works great.",
    "My dishes have never been so spotless after a wash!",
    "It’s very effective, but I wish it came in a fragrance-free version.",
    "Cuts through grease like a dream. No more pre-rinsing with Cascade Complete.",
    "This detergent doesn’t leave any residue on plastic, which I love.",
    "My glasses come out clear and sparkling every single time.",
    "It doesn’t work well with my eco dishwasher. Dishes aren’t as clean.",
    "Very efficient—gets rid of food stains and smells with no problem.",
    "I noticed some streaks on my glassware, but overall it works well.",
    "Leaves my dishes spotless and my machine smelling fresh.",
    "A great, eco-friendly option that actually works!",
    "It’s a little hard on some of my delicate dishware.",
    "This is the only detergent that works on my hard water stains.",
    "No need to rewash dishes after using this—so efficient!",
    "Perfect for everyday use. My dishes are clean and shiny.",
    "Leaves a chemical smell, but it’s effective at cleaning.",
    "A bit expensive, but worth it for the spotless results.",
    "No more streaks or water spots! Best dishwasher detergent ever.",
    "My silverware and dishes look brand new after every wash.",

    # Sunscreens
    "Absorbs quickly and doesn’t leave a greasy residue. Great for daily use with Neutrogena.",
    "The scent is a little strong, but it protects well with Coppertone.",
    "Perfect for sensitive skin! No breakouts or irritation with La Roche-Posay.",
    "A bit thick to apply, but once it’s on, it stays all day.",
    "I love the lightweight formula of CeraVe, perfect for wearing under makeup.",
    "Doesn’t leave a white cast, even on darker skin tones.",
    "The spray bottle makes it super easy to apply on the go.",
    "This sunscreen saved me from burning on a beach vacation with Banana Boat!",
    "It’s waterproof, which is a must for pool days. Highly recommend Hawaiian Tropic.",
    "A bit pricey, but the protection it provides is worth every penny with Supergoop.",
    "This is my go-to sunscreen for both my face and body with Neutrogena.",
    "It’s a little greasy, but it gets the job done in strong sun.",
    "No weird scent, and it goes on smooth. Love this product!",
    "Perfect for outdoor activities—no sunburn, even after hours outside.",
    "It’s great for kids! No irritation, and it’s easy to apply with Blue Lizard.",
    "A little too heavy for my face, but works perfectly for the body.",
    "The texture is nice and light, not sticky at all.",
    "This sunscreen doesn’t clog my pores, which is a huge plus with EltaMD.",
    "I’ve tried a lot of sunscreens, and this one offers the best protection with La Roche-Posay.",
    "It leaves a slight sheen, but I love how protected my skin feels.",
    "This formula doesn’t dry out my skin like others do.",
    "It’s great under makeup—no pilling or greasiness.",
    "I wish it were more affordable, but it’s worth it for the protection.",
    "Very effective, even after swimming for hours.",
    # The trailing comma matters: without it this string is implicitly
    # concatenated with the first laundry review below.
    "My skin stays soft and protected all day with Neutrogena sunscreen.",

    # Powder Detergents for Laundry
    "Gets my clothes fresh and clean every time. No lingering odor with Tide.",
    "It dissolves well, even in cold water. My whites have never been brighter thanks to Ariel.",
    "A little pricey, but worth it for the excellent stain removal power of Persil.",
    "This powder leaves a residue on darker clothes. Not a fan of OMO.",
    "Great for sensitive skin! No itching or redness after using Seventh Generation.",
    "I love how eco-friendly this detergent is. It’s a big plus for me with Ecover.",
    "I don’t need fabric softener anymore—this leaves my clothes so soft!",
    "My laundry has never smelled so fresh, and it lasts for days with Gain.",
    "It’s not the best for heavy stains but works great for daily washes.",
    "Great value for the price. This box lasts forever! Thanks, Arm & Hammer.",
    "Perfect for my workout gear—gets rid of all the sweat smells.",
    "Leaves a bit of powder behind in the machine, but it cleans well.",
    "I’ve been using it for years, and Tide never disappoints.",
    "Not as effective in hard water areas, but still decent.",
    "My go-to detergent for all of my family’s laundry needs.",
    "I noticed some fading in my darker clothes after a few washes.",
    "It’s gentle on my baby’s clothes and skin with Dreft.",
    "Very effective at removing mud and grass stains from the kids’ clothes.",
    "I like the scent, but it might be too strong for some.",
    "No complaints so far! My clothes feel clean and fresh.",
    "Works just as well as liquid detergents but at a lower cost.",
    "A bit too perfumed for my taste, but it gets the job done.",
    "My clothes are noticeably softer and smell better than before.",
    "The box is hard to pour from, but the detergent works well.",
    "This is my new favorite detergent. So much better than the leading brand!",
]

#### Using spaCy

In [10]:
# Load the spaCy pipeline once at module level; extract_product_data() calls
# it for every review it processes.
nlp = spacy.load("en_core_web_sm")

def extract_product_data(text):
    """Extract product category, brand, and other attributes from a review.

    The guess is driven by spaCy named entities: the last ``ORG`` entity in
    the text becomes the brand and the last ``PRODUCT``/``WORK_OF_ART``
    entity becomes the category. Every adjective or noun that is not one of
    the brand's own words is kept as a keyword.

    Args:
        text: Raw review text.

    Returns:
        dict with the keys "product category" and "brand" (each ``None``
        when no matching entity was found) and "other keywords" (token
        texts in document order).
    """
    doc = nlp(text)
    category, brand = None, None
    brand_words = set()

    for ent in doc.ents:
        if ent.label_ == "ORG":
            brand = ent.text
            # Remember the brand as individual lower-cased tokens so the
            # keyword filter below can compare whole words.
            brand_words = {token.text.lower() for token in ent}
        elif ent.label_ in {"PRODUCT", "WORK_OF_ART"}:
            category = ent.text

    # Whole-word comparison: a substring test against the brand string would
    # also discard unrelated tokens that merely occur inside the brand name.
    attributes = [
        token.text
        for token in doc
        if token.pos_ in {"ADJ", "NOUN"} and token.text.lower() not in brand_words
    ]

    return {
        "product category": category,
        "brand": brand,
        "other keywords": attributes
    }

# Run the spaCy baseline over each labelled sample and print what it found.
for key in prompts:
    extracted_data = extract_product_data(prompts[key]["review_content"])
    print(f"{key} - Extracted Data: {extracted_data}")

review1 - Extracted Data: {'product category': 'Tide', 'brand': None, 'other keywords': ['clothes', 'fresh', 'clean', 'time', 'odor']}
review2 - Extracted Data: {'product category': None, 'brand': None, 'other keywords': ['coffee', 'hot', 'hours', 'long', 'workdays', 'Thanks']}


#### Using LLM

Prompt:

Analyze the following product review and identify:
- Product category
- Brand
- Relevant keywords or phrases

Review: [review]

Answer format:
Category: [category]
Brand: [brand]
Keywords: [keyword1, keyword2, ...]

In [6]:
# NOTE: this code requires an OpenAI API key, which is not free, so the
# results used further down were generated by hand in the OpenAI UI with
# GPT-4o mini instead (the request in this cell names "gpt-3.5-turbo").
# 'XXX' is a placeholder; supply a real key before running the cell.

openai.api_key = 'XXX'

def extract_product_data_llm(text):
    """Ask the OpenAI chat API to label one review and return the raw reply.

    The prompt asks for a category, a brand and keywords in a fixed
    ``Category:/Brand:/Keywords:`` layout; the reply text is returned
    unparsed.
    """
    # NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface;
    # confirm the installed openai version before running.
    prompt = "".join([
        "Analyze the following product review and identify:\n",
        "- Product category\n",
        "- Brand\n",
        "- Relevant keywords or phrases\n\n",
        f"Review: {text}\n\n",
        "Answer format:\n",
        "Category: [category]\nBrand: [brand]\nKeywords: [keyword1, keyword2, ...]",
    ])

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # Only one completion is requested (n=1), so take the first choice.
    return completion.choices[0].message['content']

# Query the model once for each of the two labelled sample reviews.
review_1_output, review_2_output = [
    extract_product_data_llm(prompts[key]["review_content"])
    for key in ("review1", "review2")
]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


review1 - Extracted Data: {'product category': '[category]', 'brand': '[brand]', 'other keywords': ['']}
review2 - Extracted Data: {'product category': '[category]', 'brand': '[brand]', 'other keywords': ['[']}


In [18]:
# Answers copied from the OpenAI UI (GPT-4o mini): one "Label: value" line
# per field, joined with newlines.
review_1_output = "\n".join((
    'Category: Laundry Detergent',
    'Brand: Tide',
    'Keywords: fresh, clean, no lingering odor',
))
review_2_output = "\n".join((
    'Category: Drinkware',
    'Brand: Contigo',
    'Keywords: coffee, hot, hours, long workdays',
))

In [29]:
def extract_product_data_llm(generated_text):
    """Parse an LLM answer in ``Category:/Brand:/Keywords:`` layout.

    Each line is matched by its leading label; lines with any other prefix
    are ignored. Surrounding whitespace (including a trailing ``\\r``) is
    tolerated, only the leading label is removed from a line, and keywords
    are split on commas with or without a following space.

    Args:
        generated_text: Raw model answer, one labelled field per line.

    Returns:
        dict with the keys "product category" and "brand" (``None`` when
        the corresponding line is absent) and "keywords" (list of
        non-empty, stripped keyword strings; empty when none were given).
    """
    category, brand, keywords = None, None, []
    for raw_line in generated_text.splitlines():
        line = raw_line.strip()
        if line.startswith("Category:"):
            # Slice off the prefix only; str.replace would also delete the
            # label if it reappeared inside the value.
            category = line[len("Category:"):].strip()
        elif line.startswith("Brand:"):
            brand = line[len("Brand:"):].strip()
        elif line.startswith("Keywords:"):
            listed = line[len("Keywords:"):]
            # Drop blanks so an empty "Keywords:" line yields [] and not [''].
            keywords = [kw.strip() for kw in listed.split(",") if kw.strip()]

    return {
        "product category": category,
        "brand": brand,
        "keywords": keywords
    }

# Parse both pasted answers first, attach them to their sample entry, then
# print each review next to its parsed fields.
extracted_review1 = extract_product_data_llm(review_1_output)
extracted_review2 = extract_product_data_llm(review_2_output)
prompts["review1"]["generated_answer"] = extracted_review1
prompts["review2"]["generated_answer"] = extracted_review2

print('Review 1: ', prompts["review1"]["review_content"])
print('Extracted data:')
print(extracted_review1)
print('\nReview 2: ', prompts["review2"]["review_content"])
print('Extracted data:')
print(extracted_review2)

Review 1:  Gets my clothes fresh and clean every time. No lingering odor with Tide.
Extracted data:
{'product category': 'Laundry Detergent', 'brand': 'Tide', 'keywords': ['fresh', 'clean', 'no lingering odor']}

Review 2:  Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.
Extracted data:
{'product category': 'Drinkware', 'brand': 'Contigo', 'keywords': ['coffee', 'hot', 'hours', 'long workdays']}


#### Similarity between products

In [30]:
# Sentence-embedding model used to compare review texts with category names.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Keep the labelled sample texts under their own name. Rebinding `reviews`
# here would replace the full review corpus defined at the top of the
# notebook, which the later gemma cells still iterate over and index into.
prompt_reviews = [data["review_content"] for data in prompts.values()]
product_categories = [data['generated_answer']["product category"] for data in prompts.values()]
review_embeddings = embedding_model.encode(prompt_reviews)
category_embeddings = embedding_model.encode(product_categories)
# Rows are sample reviews, columns are the categories generated for them.
similarity_matrix = cosine_similarity(review_embeddings, category_embeddings)

for i, review in enumerate(prompt_reviews):
    print(f"Review: {review}")
    for j, category in enumerate(product_categories):
        print(f"  Product Category: {category} - Similarity Score: {similarity_matrix[i][j]:.4f}")
    print('\n')

Review: Gets my clothes fresh and clean every time. No lingering odor with Tide.
  Product Category: Laundry Detergent - Similarity Score: 0.6488
  Product Category: Drinkware - Similarity Score: 0.1604


Review: Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.
  Product Category: Laundry Detergent - Similarity Score: 0.1680
  Product Category: Drinkware - Similarity Score: 0.2287




Prompt:

Please examine the product review provided below and extract the following information:

* Product category
* Brand
* Relevant keywords or phrases

Review: [review]

Response Format:
Category: 
Brand:
Keywords:

In [None]:
# Reset the labelled samples before trying the second prompt wording.
# Re-creating the dict drops the "generated_answer" entries that were
# attached to each sample for the first prompt.
prompts = {
    "review1": {
        "review_content": "Gets my clothes fresh and clean every time. No lingering odor with Tide.",
        "golden_answer": {
            "product category": "Powder Detergents for Laundry",
            "brand": "Tide",
            "other keywords": ['fresh', 'clean', 'no lingering odor']
        },
    },
    "review2": {
        "review_content": "Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.",
        "golden_answer": {
            "product category": "Thermal Mugs",
            "brand": "Contigo",
            "other keywords": ["hot for hours"]
        },
    },
}

In [None]:
# this code requires an OpenAI API key, which is not free, so instead I used OpenAI UI to generate the results with GPT-4o mini

openai.api_key = 'XXX'


def request_product_data_llm(text):
    """Send the second prompt wording to the OpenAI chat API.

    Deliberately NOT named ``extract_product_data_llm``: that name already
    belongs to the parser for ``Category:/Brand:/Keywords:`` answers, and
    redefining it here would make the later
    ``extract_product_data_llm(review_1_output)`` calls hit the API instead
    of parsing the answer text.

    Args:
        text: Raw review text to be labelled.

    Returns:
        The unparsed reply text of the first (and only) completion.
    """
    # NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface;
    # confirm the installed openai version before running.
    prompt = (
        "Please examine the product review provided below and extract the following information:\n"
        "* Product category\n"
        "* Brand\n"
        "* Relevant keywords or phrases\n\n"
        f"Review: {text}\n\n"
        "Response Format:\n"
        "Category: \nBrand: \nKeywords: "
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.7,
    )

    generated_text = response.choices[0].message['content']
    return generated_text


review_1_output = request_product_data_llm(prompts["review1"]["review_content"])
review_2_output = request_product_data_llm(prompts["review2"]["review_content"])

In [34]:
# Answers to the second prompt wording, copied from the OpenAI UI
# (GPT-4o mini): one "Label: value" line per field, joined with newlines.
review_1_output = "\n".join((
    'Category: Laundry Detergent',
    'Brand: Tide',
    'Keywords: fresh, clean, no lingering odor',
))
review_2_output = "\n".join((
    'Category: Coffee Mug',
    'Brand: Contigo',
    'Keywords: Keeps coffee hot, hours, long workdays',
))

In [35]:
# Parse the answers to the second prompt, store them on the sample entries,
# then report each review together with its parsed fields.
extracted_review1 = extract_product_data_llm(review_1_output)
extracted_review2 = extract_product_data_llm(review_2_output)
prompts["review1"]["generated_answer"] = extracted_review1
prompts["review2"]["generated_answer"] = extracted_review2

print('Review 1: ', prompts["review1"]["review_content"])
print('Extracted data:')
print(extracted_review1)
print('\nReview 2: ', prompts["review2"]["review_content"])
print('Extracted data:')
print(extracted_review2)

Review 1:  Gets my clothes fresh and clean every time. No lingering odor with Tide.
Extracted data:
{'product category': 'Laundry Detergent', 'brand': 'Tide', 'keywords': ['fresh', 'clean', 'no lingering odor']}

Review 2:  Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.
Extracted data:
{'product category': 'Coffee Mug', 'brand': 'Contigo', 'keywords': ['Keeps coffee hot', 'hours', 'long workdays']}


In [37]:
# similarities

# Take the review texts from `prompts` in the same pass as the categories,
# so row i of the matrix always belongs to the sample whose generated
# category is column i, whatever the module-level `reviews` currently holds.
labelled_samples = list(prompts.values())
prompt_reviews = [data["review_content"] for data in labelled_samples]
product_categories = [data['generated_answer']["product category"] for data in labelled_samples]
review_embeddings = embedding_model.encode(prompt_reviews)
category_embeddings = embedding_model.encode(product_categories)
similarity_matrix = cosine_similarity(review_embeddings, category_embeddings)

for i, review in enumerate(prompt_reviews):
    print(f"Review: {review}")
    for j, category in enumerate(product_categories):
        print(f"  Product Category: {category} - Similarity Score: {similarity_matrix[i][j]:.4f}")
    print('\n')

Review: Gets my clothes fresh and clean every time. No lingering odor with Tide.
  Product Category: Laundry Detergent - Similarity Score: 0.6488
  Product Category: Coffee Mug - Similarity Score: 0.0557


Review: Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.
  Product Category: Laundry Detergent - Similarity Score: 0.1680
  Product Category: Coffee Mug - Similarity Score: 0.4863




#### google/gemma-2b-it

In [44]:
# Together API client used by get_responses() for its chat-completion calls.
# 'XXX' is a placeholder; supply a real key before running the cell.
client = Together(api_key='XXX')

def get_responses(reviews):
    """Ask google/gemma-2b-it to label every review; return the raw answers.

    One chat-completion request is sent per review, in order. The result
    holds each stripped reply text at the same position as its review.
    """

    def ask_model(review):
        # Same instruction for every review; only the review text varies.
        prompt = (
            "Analyze the following product review and identify:\n"
            "- Product category\n"
            "- Brand\n"
            "- Relevant keywords or phrases\n\n"
            f"Review: {review}\n\n"
            "Answer format:\n"
            "Category: [category]\n"
            "Brand: [brand]\n"
            "Keywords: [keyword1, keyword2, ...]"
        )
        completion = client.chat.completions.create(
            model="google/gemma-2b-it",
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content.strip()

    return [ask_model(review) for review in reviews]

# Query the model for every review, then dump each raw answer followed by a
# separator line.
product_info = get_responses(reviews)

for raw_answer in product_info:
    print(raw_answer, "---", sep="\n")

**Category:** Coffee
**Brand:** Contigo
**Keywords:** hot coffee, long workdays, Contigo
---
**Category:** Product Review
**Brand:** N/A
**Keywords:** leak-proof, warm, drink
---
**Category:** Home & Kitchen
**Brand:** Zojirushi
**Keywords:** mug, car cup holder, sleek design
---
**Category:** Personal care
**Brand:** Hydro Flask
**Keywords:** Hydro Flask, temperature, drinks, hours
---
**Category:** Beverage
**Brand:** Yeti
**Keywords:** Cold coffee, thermal mug, Yeti, coffee mug
---
**Category:** Cleaning
**Brand:** [brand name]
**Keywords:** Thermal insulation, cleaning
---
**Category:** Handheld
**Brand:** [Brand name]
**Keywords:** handle, spill, easy to carry
---
**Category:** Beverage
**Brand:** [Brand name]
**Keywords:** cold drinks, hot beverages
---
**Category:** Tea
**Brand:** Unknown
**Keywords:** Morning tea, bigger
---
**Category:** Not specified
**Brand:** Not specified
**Keywords:** accident, dent, sturdy
---
**Category:** Product Review
**Brand:** N/A
**Keywords:** loo

In [45]:
def extract_product_info(reviews_info, source_reviews=None):
    """Turn raw model answers into structured records.

    Fields are found by their label (``Category``, ``Brand``, ``Keywords``)
    and not by line position, with or without markdown bold markers, so
    both ``**Category:** Mug`` and ``Category: Mug`` are understood and
    extra or missing lines do not raise. When a label occurs more than
    once, its first occurrence is used.

    Args:
        reviews_info: Raw answer strings, one per review.
        source_reviews: Review texts the answers belong to, in the same
            order. Defaults to the module-level ``reviews`` list.

    Returns:
        One dict per answer with the keys 'Review', 'Category', 'Brand'
        (``None`` when the answer lacks that field) and 'Keywords' (list of
        non-empty, stripped keyword strings).

    Raises:
        IndexError: if ``source_reviews`` is shorter than ``reviews_info``.
    """
    if source_reviews is None:
        source_reviews = reviews

    extracted_info = []
    for i, answer in enumerate(reviews_info):
        fields = {}
        for line in answer.splitlines():
            label, separator, value = line.partition(":")
            if not separator:
                continue  # free text without a "label:" prefix
            # Strip the bold markers and padding around both parts.
            fields.setdefault(label.strip(" \t*").lower(), value.strip(" \t*"))

        keywords = fields.get("keywords", "")
        extracted_info.append({
            'Review': source_reviews[i],
            'Category': fields.get("category"),
            'Brand': fields.get("brand"),
            'Keywords': [kw.strip() for kw in keywords.split(',') if kw.strip()]
        })

    return extracted_info

# Parse every raw gemma answer and print the structured records one per line.
extracted_product_info = extract_product_info(product_info)
for record in extracted_product_info:
    print(record)

{'Review': 'Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.', 'Category': 'Coffee', 'Brand': 'Contigo', 'Keywords': ['hot coffee', 'long workdays', 'Contigo']}
{'Review': 'The lid isn’t leak-proof, but it keeps drinks warm for a decent amount of time.', 'Category': 'Product Review', 'Brand': 'N/A', 'Keywords': ['leak-proof', 'warm', 'drink']}
{'Review': 'Love the sleek design of my Zojirushi mug, and it fits perfectly in my car cup holder!', 'Category': 'Home & Kitchen', 'Brand': 'Zojirushi', 'Keywords': ['mug', 'car cup holder', 'sleek design']}
{'Review': 'It’s lightweight but keeps my drinks at the right temperature for hours with Hydro Flask.', 'Category': 'Personal care', 'Brand': 'Hydro Flask', 'Keywords': ['Hydro Flask', 'temperature', 'drinks', 'hours']}
{'Review': 'No more cold coffee! This Yeti thermal mug does the job.', 'Category': 'Beverage', 'Brand': 'Yeti', 'Keywords': ['Cold coffee', 'thermal mug', 'Yeti', 'coffee mug']}
{'Review': 'It

In [47]:
# Sentence-embedding model used by calculate_similarity() below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Reference product categories; the section comments inside the `reviews`
# list use these same four names.
categories = [
    "Thermal Mugs",
    "Dishwasher Detergents",
    "Sunscreens",
    "Powder Detergents for Laundry",
]

def calculate_similarity(reviews, categories):
    """Return the cosine similarity of each review text to each category.

    ``reviews`` is a sequence of records with a 'Review' text entry and
    ``categories`` a list of category names. Review texts and category
    names are embedded in a single ``encode`` call; the result has one row
    per review and one column per category.
    """
    review_texts = [record['Review'] for record in reviews]
    split_at = len(review_texts)

    # One encode call for everything; the leading vectors belong to the
    # reviews and the trailing ones to the categories.
    embeddings = embedding_model.encode(review_texts + categories)
    return cosine_similarity(embeddings[:split_at], embeddings[split_at:])

# Score every parsed review against the reference categories and list the
# scores review by review.
similarity_matrix = calculate_similarity(extracted_product_info, categories)

for record, scores in zip(extracted_product_info, similarity_matrix):
    print(f"Similarity scores for review '{record['Review']}':")
    for category_name, score in zip(categories, scores):
        print(f" - with category '{category_name}': {score:.4f}")
    print("---")

Similarity scores for review 'Keeps my coffee hot for hours—just what I need for long workdays. Thanks, Contigo.':
 - with category 'Thermal Mugs': 0.4807
 - with category 'Dishwasher Detergents': 0.0946
 - with category 'Sunscreens': 0.2474
 - with category 'Powder Detergents for Laundry': 0.1782
---
Similarity scores for review 'The lid isn’t leak-proof, but it keeps drinks warm for a decent amount of time.':
 - with category 'Thermal Mugs': 0.3309
 - with category 'Dishwasher Detergents': 0.0158
 - with category 'Sunscreens': 0.0828
 - with category 'Powder Detergents for Laundry': -0.0818
---
Similarity scores for review 'Love the sleek design of my Zojirushi mug, and it fits perfectly in my car cup holder!':
 - with category 'Thermal Mugs': 0.5887
 - with category 'Dishwasher Detergents': 0.1484
 - with category 'Sunscreens': 0.0673
 - with category 'Powder Detergents for Laundry': 0.0551
---
Similarity scores for review 'It’s lightweight but keeps my drinks at the right temperatu