Author: Szymon Pawlonka

- identify product data (categories, brands, and other attributes) with an LLM
- define a similarity measure between products from the dataset and reviews

using different models, different prompts, and a real-world dataset

In [3]:
import json
import os
import time
from typing import Dict, List, Optional

import dotenv
import kaggle
import pandas as pd
from bert_score import score
from mistralai import Mistral
from openai import OpenAI
from tqdm import tqdm

# Load variables from a local .env file so the API keys are not hard-coded in the notebook.
dotenv.load_dotenv()

# os.getenv returns None when a variable is unset, so both clients are built even if a key is missing.
# NOTE(review): the Mistral key is read from the generic name "API_KEY" while OpenAI uses
# "OPENAI_API_KEY" — confirm this matches the .env file.
mistral = Mistral(api_key=os.getenv("API_KEY"))
gpt = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

## Prepare dataset

In [39]:
# Fetch both Kaggle datasets and unzip each into its own local folder.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm before a fresh run.
kaggle.api.dataset_download_files('promptcloud/ebay-product-dataset', path='ebay-product-dataset', unzip=True)
kaggle.api.dataset_download_files('wojtekbonicki/ebay-reviews', path='ebay-reviews', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/promptcloud/ebay-product-dataset
Dataset URL: https://www.kaggle.com/datasets/wojtekbonicki/ebay-reviews


In [42]:
# Read the raw product export and collapse every pair of consecutive commas into one.
# NOTE(review): this also removes genuinely empty CSV fields, which may shift columns — confirm intended.
with open('ebay-product-dataset/marketing_sample_for_ebay_com-ebay_com_product__20210101_20210331__30k_data.csv', 'r') as raw_file:
    products_csv_string = raw_file.read().replace(',,', ',')

# Persist the cleaned text next to the raw file; later cells read this copy.
with open('ebay-product-dataset/ebay_products_cleaned.csv', 'w') as output_file:
    output_file.write(products_csv_string)

In [2]:
# Load the cleaned product listings and the review corpus.
products_df = pd.read_csv('ebay-product-dataset/ebay_products_cleaned.csv')
reviews_df = pd.read_csv('ebay-reviews/ebay_reviews_cleaned.csv')

# Product titles are the only product field used downstream.
products = products_df['Title']

# Filter only meaningful reviews: keep those with more than 20 space-separated tokens.
# The .str accessor propagates missing reviews as NaN, and NaN > 20 is False, so empty rows
# are dropped instead of raising AttributeError the way `x.split` does on a float NaN.
reviews = reviews_df['review'][reviews_df['review'].str.split(' ').str.len() > 20]

## Prompts

In [10]:
# Bare instruction plus the expected JSON schema, with no role and no example.
# `{review}` is filled in with str.format by the model wrappers; the doubled braces render as literal braces.
# NOTE(review): the schema shown to the model has a trailing comma after the list, which is not
# valid JSON — confirm models are not echoing it back.
SIMPLE_PROMPT = """
Extract the information from the following review:
{review}

Return the data in JSON format with these fields:
{{
    "product_category": "string",
    "brand": "string",
    "other_keywords": [
        "string"
    ],
}}

If the information is not available, return an empty string.
"""

# Role-play variant: same schema as SIMPLE_PROMPT, but the model is first cast as a review-analysis expert.
# `{review}` is filled in with str.format; the doubled braces render as literal braces.
PLAYROLE_PROMPT = """
You are an expert at analyzing customer reviews to classify products. Given the following review, extract information about the product:

Review:
{review}

Return the data in JSON format with these fields:
{{
    "product_category": "string",
    "brand": "string",
    "other_keywords": [
        "string"
    ],
}}

If the information is not available, return an empty string.
"""

# One-shot variant: neutral instruction, the expected JSON schema, and one worked example.
# `{review}` is filled in with str.format; the doubled braces render as literal braces.
# The schema key is "other_keywords" (with an underscore) so that it agrees with the example
# below, with the other prompts, and with the key the similarity functions look up.
PROMPT_WITH_EXAMPLE = """
Analyze the following customer review and extract structured information.

Review:
{review}

Return the data in JSON format with these fields:
{{
    "product_category": "string",
    "brand": "string",
    "other_keywords": [
        "string"
    ],
}}

Example output:
{{
    "product_category": "Thermal Mugs",
    "brand": "BrandX",
    "other_keywords": ["spill-proof", "keeps hot", "easy to clean"]
}}

If the information is not available, return an empty string.
"""


# Role-play plus one-shot variant: expert persona, JSON schema, and one worked example.
# `{review}` is filled in with str.format; the doubled braces render as literal braces.
PLAYROLE_PROMPT_WITH_EXAMPLE = """
You are an expert at analyzing customer reviews to classify products. Given the following review, extract information about the product: 

Review:
{review}

Return the data in JSON format with these fields:
{{
    "product_category": "string",
    "brand": "string",
    "other_keywords": [
        "string"
    ],
}}

Example output:
{{
    "product_category": "Thermal Mugs",
    "brand": "BrandX",
    "other_keywords": ["spill-proof", "keeps hot", "easy to clean"]
}}

If the information is not available, return an empty string.
"""


In [11]:
# Registry of every prompt variant under comparison, keyed by the name used in the result files.
# Insertion order is the order in which the prompts are run for each input.
ALL_PROMPTS = dict(
    simple_prompt=SIMPLE_PROMPT,
    playrole_prompt=PLAYROLE_PROMPT,
    prompt_with_example=PROMPT_WITH_EXAMPLE,
    playrole_prompt_with_example=PLAYROLE_PROMPT_WITH_EXAMPLE,
)

## Extracting product category and keywords

In [110]:
class MistralWrapper:
    """Adapter that asks a Mistral chat model to extract product info as JSON.

    Exposes the same `extract_info(prompt, review)` method as `GPTWrapper`, so
    either can be passed wherever a wrapper is expected.
    """

    def __init__(self, mistral: Mistral, model="mistral-large-latest") -> None:
        """Store the client and the model name used for every request."""
        self.mistral = mistral
        self.model = model

    def extract_info(self, prompt: str, review: str) -> dict:
        """Fill `prompt` with `review`, query the model and parse its JSON answer.

        Args:
            prompt: Template containing a `{review}` placeholder.
            review: Text substituted into the template.

        Returns:
            The parsed JSON payload, or `{}` when the model returned no content.

        Raises:
            json.JSONDecodeError: If the returned content is not valid JSON.
        """
        messages = [{"role": "user", "content": prompt.format(review=review)}]
        chat_response = self.mistral.chat.complete(
            model=self.model,
            messages=messages,
            response_format={"type": "json_object"},
        )

        content = chat_response.choices[0].message.content
        # Treat missing content (None) like an empty string; json.loads(None)
        # would otherwise raise TypeError.
        if not content:
            return {}

        return json.loads(content)
    
class GPTWrapper:
    """Adapter that asks an OpenAI chat model to extract product info as JSON.

    Exposes the same `extract_info(prompt, review)` method as `MistralWrapper`.
    """

    def __init__(self, gpt: OpenAI, model='gpt-4o') -> None:
        """Store the client and the model name used for every request."""
        self.gpt = gpt
        self.model = model

    def extract_info(self, prompt: str, review: str) -> dict:
        """Fill `prompt` with `review`, query the model and parse its JSON answer.

        Args:
            prompt: Template containing a `{review}` placeholder.
            review: Text substituted into the template.

        Returns:
            The parsed JSON payload.

        Raises:
            json.JSONDecodeError: If the answer is not valid JSON, including an
                empty answer. The offending text is printed first.
            Exception: Whatever the client raises on a failed API call,
                propagated unchanged.
        """
        # The request stays outside the try block so an API failure surfaces as
        # itself rather than as an UnboundLocalError from the error handler.
        response = self.gpt.chat.completions.create(
            messages=[{"role": "system", "content": prompt.format(review=review)}],
            model=self.model,
            n=1,
            temperature=0.0,
        )

        raw_content = response.choices[0].message.content or ""
        # No JSON response mode is requested, so strip Markdown code fences
        # that may wrap the answer.
        cleaned = raw_content.replace('```json\n', '').replace('```', '')

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            print(cleaned)
            raise

In [116]:
def extract_key_data(
    data: List[str],
    output_file: str,
    wrapper,
    prompts: Optional[Dict[str, str]] = None,
    delay: float = 0.5,
):
    """Run every prompt over every input text and checkpoint the results as JSON.

    Results are grouped by prompt name. If `output_file` already exists, its
    content is loaded first and new results are appended to it; nothing is
    de-duplicated, so re-running on the same inputs adds the entries again.

    Args:
        data: Texts to analyse (reviews or product titles).
        output_file: Path of the JSON file that is rewritten after each
            successful extraction.
        wrapper: Object exposing `extract_info(prompt, text)`.
        prompts: Mapping of prompt name to template; defaults to `ALL_PROMPTS`.
        delay: Seconds to wait before each request to avoid rate limiting.
    """
    if prompts is None:
        prompts = ALL_PROMPTS

    prompt_to_extracted_infos: Dict[str, List[dict]] = {}
    if os.path.exists(output_file):
        with open(output_file) as existing_file:
            prompt_to_extracted_infos = json.load(existing_file)

    for item in data:
        for prompt_name, prompt in prompts.items():
            if delay > 0:
                time.sleep(delay)  # To avoid rate limiting

            extracted_info = wrapper.extract_info(prompt, item)
            # Anything that is not a JSON object (e.g. a bare list) is skipped.
            if not isinstance(extracted_info, dict):
                continue

            # Keep the original text so results can be traced back to their input.
            extracted_info['source'] = item
            prompt_to_extracted_infos.setdefault(prompt_name, []).append(extracted_info)

            # Rewrite the file after every extraction so an interrupted run
            # keeps what it has already paid for.
            with open(output_file, "w") as checkpoint_file:
                json.dump(prompt_to_extracted_infos, checkpoint_file, indent=4)

In [4]:
# Number of reviews, and of product titles, drawn for the experiment.
sample_n = 20

In [72]:
# Draw fixed-seed samples so repeated runs feed the same texts to the models.
sample_reviews = reviews.sample(n=sample_n, random_state=0).tolist()
sample_products = products.sample(n=sample_n, random_state=0).tolist()

In [None]:
# Output files for the Mistral extractions (one for reviews, one for product titles).
extracted_info_reviews_mistral_path = "extracted_info_reviews_mistral.json"
extracted_info_products_mistral_path = "extracted_info_products_mistral.json"

# Run every prompt over both samples; the same review-oriented prompts are applied to product titles.
mistral_wrapper = MistralWrapper(mistral, model="mistral-large-latest")
extract_key_data(sample_reviews, extracted_info_reviews_mistral_path, mistral_wrapper)
extract_key_data(sample_products, extracted_info_products_mistral_path, mistral_wrapper)

In [117]:
# Output files for the GPT extractions (one for reviews, one for product titles).
extracted_info_reviews_gpt_path = "extracted_info_reviews_gpt.json"
extracted_info_products_gpt_path = "extracted_info_products_gpt.json"

# GPTWrapper is built with its default model; the same samples are used as for Mistral.
gpt_wrapper = GPTWrapper(gpt)
extract_key_data(sample_reviews, extracted_info_reviews_gpt_path, gpt_wrapper)
extract_key_data(sample_products, extracted_info_products_gpt_path, gpt_wrapper)

# Similarity

In [49]:
class ReviewProductPair:
    """A candidate match between one review and one product for a given prompt.

    Holds the raw texts, the structured data extracted from each, and the
    similarity score computed between the two extractions.
    """

    def __init__(
            self,
            prompt_type: str,
            review: str,
            product: str,
            review_extracted: dict,
            product_extracted: dict,
            similarity: float
        ) -> None:
        self.prompt_type = prompt_type
        self.review = review
        self.product = product
        self.review_extracted = review_extracted
        self.product_extracted = product_extracted
        self.similarity = similarity

    def to_dict(self) -> dict:
        """Serialise the pair; the similarity is stored under the key "bert_score"."""
        return {
            "prompt_type": self.prompt_type,
            "review": self.review,
            "product": self.product,
            "review_extracted": self.review_extracted,
            "product_extracted": self.product_extracted,
            "bert_score": self.similarity
        }

    @staticmethod
    def from_dict(d: dict) -> "ReviewProductPair":
        """Rebuild a pair from the output of `to_dict`.

        Reads the score from "bert_score", the key `to_dict` writes. The
        previous "similarity" key is still accepted as a fallback.

        Raises:
            KeyError: If a required field is missing.
        """
        similarity = d['bert_score'] if 'bert_score' in d else d['similarity']
        return ReviewProductPair(
            d['prompt_type'],
            d['review'],
            d['product'],
            d['review_extracted'],
            d['product_extracted'],
            similarity
        )

In [135]:
def compute_similarity_score(
    output_file: str,
    category_score_func,
    keyword_score_func,
    reviews_extracted: dict,
    products_extracted: dict,
    top_k: int = 5,
):
    """Match each review to its closest products and save the scored pairs.

    For every prompt type and every review, all products extracted with the
    same prompt are ranked by `category_score_func`. The `top_k` best are then
    scored with `keyword_score_func` and recorded as `ReviewProductPair`s. The
    accumulated pairs are rewritten to `output_file` after each review.

    Args:
        output_file: Path of the JSON file receiving the list of pairs.
        category_score_func: Callable `(review_info, product_info) -> float`
            used to shortlist candidates.
        keyword_score_func: Callable `(review_info, product_info) -> float`
            whose result becomes the pair's similarity.
        reviews_extracted: Mapping of prompt name to extracted review dicts.
        products_extracted: Mapping of prompt name to extracted product dicts.
        top_k: Number of best-ranked products kept per review.

    Raises:
        KeyError: If a prompt name from `ALL_PROMPTS` is missing from either mapping.
    """
    product_review_pairs = []

    for prompt_type in ALL_PROMPTS:
        for review_extracted in tqdm(reviews_extracted[prompt_type]):
            # Score a copy of each product. Writing the score into the shared
            # input dict would let later reviews overwrite the value that
            # already-recorded pairs still reference.
            candidates = [
                {**product_extracted, 'bert_score': category_score_func(review_extracted, product_extracted)}
                for product_extracted in products_extracted[prompt_type]
            ]
            candidates.sort(key=lambda candidate: candidate['bert_score'], reverse=True)

            for product_candidate in candidates[:top_k]:
                product_review_pairs.append(
                    ReviewProductPair(
                        prompt_type,
                        review_extracted['source'],
                        product_candidate['source'],
                        review_extracted,
                        product_candidate,
                        keyword_score_func(review_extracted, product_candidate),
                    )
                )

            # Checkpoint after each review so a long run can be inspected or interrupted.
            with open(output_file, "w") as pairs_file:
                json.dump([pair.to_dict() for pair in product_review_pairs], pairs_file, indent=4)

## BERTScore similarity

In [141]:
def _keyword_list(extracted_info: dict) -> list:
    """Return the `other_keywords` entry as a list, or [] when absent or malformed."""
    keywords = extracted_info.get('other_keywords')
    if isinstance(keywords, str):
        # A bare string would otherwise be sliced and scored character by character.
        return [keywords] if keywords.strip() else []
    if isinstance(keywords, (list, tuple)):
        return list(keywords)
    return []


def compare_keywords_bert_score(extratced_info_review: dict, extracted_info_product: dict) -> float:
    """Score how similar the extracted keywords of a review and a product are.

    Keywords are paired by position: the longer list is truncated to the length
    of the shorter one, and the i-th review keyword is compared with the i-th
    product keyword. The result is the mean of the F1 scores returned by `score`.

    Args:
        extratced_info_review: Extracted review data; `other_keywords` is read.
        extracted_info_product: Extracted product data; `other_keywords` is read.

    Returns:
        The mean F1 score, or 0.0 when either side has no usable keywords.
    """
    review_keywords = _keyword_list(extratced_info_review)
    product_keywords = _keyword_list(extracted_info_product)
    if not review_keywords or not product_keywords:
        return 0.0

    min_keywords = min(len(review_keywords), len(product_keywords))

    _, _, F1 = score(
        review_keywords[:min_keywords],
        product_keywords[:min_keywords],
        lang="en",
        model_type="bert-base-uncased",
        verbose=False,
    )

    return F1.mean().item()

def compare_product_category_bert_score(extratced_info_review: dict, extracted_info_product: dict) -> float:
    """Score how similar the extracted product categories of a review and a product are.

    Args:
        extratced_info_review: Extracted review data; `product_category` is read.
        extracted_info_product: Extracted product data; `product_category` is read.

    Returns:
        The F1 score returned by `score` for the two category strings, or 0.0
        when either category is missing, blank, or not a string.
    """
    review_category = extratced_info_review.get('product_category')
    product_category = extracted_info_product.get('product_category')

    # Model output is not guaranteed to follow the schema; anything that is not
    # non-blank text is treated as "category not available".
    for category in (review_category, product_category):
        if not isinstance(category, str) or not category.strip():
            return 0.0

    _, _, F1 = score(
        [review_category],
        [product_category],
        lang="en",
        model_type="bert-base-uncased",
        verbose=False,
    )

    return F1.mean().item()

### For Mistral

In [None]:
# Reload the Mistral extractions from disk; context managers close the files deterministically.
with open(extracted_info_reviews_mistral_path) as reviews_file:
    reviews_extracted = json.load(reviews_file)
with open(extracted_info_products_mistral_path) as products_file:
    products_extracted = json.load(products_file)

In [43]:
# Shortlist products by category similarity, then score the shortlist by keyword similarity (Mistral extractions).
compute_similarity_score(
    output_file="product_review_pairs_bert_score_mistral.json",
    category_score_func=compare_product_category_bert_score,
    keyword_score_func=compare_keywords_bert_score,
    reviews_extracted=reviews_extracted,
    products_extracted=products_extracted,
)

100%|██████████| 20/20 [03:43<00:00, 11.20s/it]
100%|██████████| 20/20 [03:46<00:00, 11.33s/it]
100%|██████████| 20/20 [03:50<00:00, 11.51s/it]
100%|██████████| 20/20 [04:04<00:00, 12.24s/it]


In [13]:
# Reload the scored Mistral pairs; the context manager closes the file deterministically.
with open("product_review_pairs_bert_score_mistral.json") as pairs_file:
    product_review_pairs = json.load(pairs_file)

In [15]:
# Rank pairs from most to least similar; "bert_score" is the pair-level similarity stored by to_dict.
product_review_pairs.sort(key=lambda pair: pair['bert_score'], reverse=True)

# Show the ten best matches with only the columns needed to judge them by eye.
df_product_review_pairs = pd.DataFrame(product_review_pairs)
df_product_review_pairs.loc[:, ['prompt_type', 'review', 'product', 'bert_score']].head(10)

Unnamed: 0,prompt_type,review,product,bert_score
0,playrole_prompt,great job guys bought beats hear phone 15000 d...,Bluedio Hi Wireless Bluetooth 5.0 Earphone For...,0.658965
1,playrole_prompt_with_example,great job guys bought beats hear phone 15000 d...,Bluedio Hi Wireless Bluetooth 5.0 Earphone For...,0.658965
2,playrole_prompt_with_example,like new stuff fix like latest stuff dont mind...,Jlab Metal Neon Aluminum Earbuds with Mic & Tr...,0.63797
3,prompt_with_example,canon eos 60d you’re upgrading rebel budget ca...,Jlab Metal Neon Aluminum Earbuds with Mic & Tr...,0.620535
4,playrole_prompt_with_example,great job guys bought beats hear phone 15000 d...,Bass Microbud EP-2800-2 Pink Aluminum Earbuds ...,0.606276
5,playrole_prompt,like new stuff fix like latest stuff dont mind...,Jlab Metal Neon Aluminum Earbuds with Mic & Tr...,0.595019
6,playrole_prompt,like new stuff fix like latest stuff dont mind...,2x bracelet de remplacement apparence concise ...,0.589014
7,playrole_prompt_with_example,excellent camera excellent camera works advert...,Jlab Metal Neon Aluminum Earbuds with Mic & Tr...,0.585991
8,playrole_prompt,cost benefit note 9 sensation came 201819those...,Beats Solo 2 Headphones,0.584436
9,prompt_with_example,cost benefit note 9 sensation came 201819those...,Beats Solo 2 Headphones,0.584436


### For GPT

In [119]:
# Reload the GPT extractions from disk; context managers close the files deterministically.
with open(extracted_info_reviews_gpt_path) as reviews_file:
    reviews_extracted = json.load(reviews_file)
with open(extracted_info_products_gpt_path) as products_file:
    products_extracted = json.load(products_file)

In [142]:
# Shortlist products by category similarity, then score the shortlist by keyword similarity (GPT extractions).
compute_similarity_score(
    output_file="product_review_pairs_bert_score_gpt.json",
    category_score_func=compare_product_category_bert_score,
    keyword_score_func=compare_keywords_bert_score,
    reviews_extracted=reviews_extracted,
    products_extracted=products_extracted,
)

100%|██████████| 19/19 [03:06<00:00,  9.84s/it]
100%|██████████| 20/20 [03:32<00:00, 10.64s/it]
100%|██████████| 20/20 [03:15<00:00,  9.79s/it]
100%|██████████| 20/20 [03:36<00:00, 10.83s/it]


In [11]:
# Reload the scored GPT pairs; the context manager closes the file deterministically.
with open("product_review_pairs_bert_score_gpt.json") as pairs_file:
    product_review_pairs = json.load(pairs_file)

In [12]:
# Rank pairs from most to least similar; "bert_score" is the pair-level similarity stored by to_dict.
product_review_pairs.sort(key=lambda pair: pair['bert_score'], reverse=True)

# Show the ten best matches with only the columns needed to judge them by eye.
df_product_review_pairs = pd.DataFrame(product_review_pairs)
df_product_review_pairs.loc[:, ['prompt_type', 'review', 'product', 'bert_score']].head(10)

Unnamed: 0,prompt_type,review,product,bert_score
0,simple_prompt,great job guys bought beats hear phone 15000 d...,Bluedio Hi Wireless Bluetooth 5.0 Earphone For...,0.658965
1,playrole_prompt,great job guys bought beats hear phone 15000 d...,Bluedio Hi Wireless Bluetooth 5.0 Earphone For...,0.658965
2,playrole_prompt_with_example,great job guys bought beats hear phone 15000 d...,Bluedio Hi Wireless Bluetooth 5.0 Earphone For...,0.658965
3,simple_prompt,actually read instructions everything read rev...,Fossil Gen 4 Authentic Digital Dial Smart Watc...,0.577589
4,playrole_prompt,actually read instructions everything read rev...,Fossil Gen 4 Authentic Digital Dial Smart Watc...,0.577589
5,playrole_prompt,like new stuff fix like latest stuff dont mind...,Fossil Gen 4 Authentic Digital Dial Smart Watc...,0.573572
6,simple_prompt,great job guys bought beats hear phone 15000 d...,Jlab Metal Neon Aluminum Earbuds with Mic & Tr...,0.563792
7,playrole_prompt,cost benefit note 9 sensation came 201819those...,Fossil Gen 4 Authentic Digital Dial Smart Watc...,0.559724
8,simple_prompt,great keyboard fat fingers skinny phone wife l...,Amazon Echo Dot (4th Gen.) Smart Speaker - Cha...,0.559537
9,playrole_prompt_with_example,great keyboard fat fingers skinny phone wife l...,Fossil Gen 4 Authentic Digital Dial Smart Watc...,0.548094
