In [None]:
from google.cloud import bigquery
import os
import pandas as pd
import time

os.environ["TOKENIZERS_PARALLELISM"] = "false"

client = bigquery.Client()

project_id = 'ingka-tugc-infra-prod'
dataset_id = 'eu_ai_content'
table_id = 'reviews'


table_ref = f'{project_id}.{dataset_id}.{table_id}'

# Query to get all the data - 1.17GB to process

article_id = '40598766'

query_all = f"""
    SELECT concat(r.title, '. ', r.text) as review_text
    FROM {table_ref} r
    WHERE franchise='set-11' AND content_lang_code = 'en' AND art_id = '{article_id}'
"""

query_job = client.query(query_all)

reviews = [row['review_text'] for row in query_job]
print(f"Processing {len(reviews)} reviews")

Processing 1016 reviews


In [None]:
from openai import AzureOpenAI
from utils.getSecret import get_secret

project = "923326131319"
secret  = "derai-azure"
api_key = get_secret(project, secret)

llm_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/",
)

model = "gpt-4o" 

In [None]:
from utils.generateTopics import get_topics

topics = get_topics(reviews, llm_client, model)

In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Define the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the topics
topic_embeddings = model.encode(topics, convert_to_numpy=True)

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(topic_embeddings)

# Set a similarity threshold
similarity_threshold = 0.8

# Identify and remove similar topics
unique_topics = topics.copy()
for i in range(len(topics)):
    for j in range(i + 1, len(topics)):
        if similarity_matrix[i, j] > similarity_threshold:
            if topics[j] in unique_topics:
                unique_topics.remove(topics[j])
                
deleted_topics_count = len(topics) - len(unique_topics)
print("Number of deleted topics:", deleted_topics_count)
print("Unique topics:", unique_topics)

  from .autonotebook import tqdm as notebook_tqdm


Number of deleted topics: 0
Unique topics: ['Color', 'Quality', 'Price', 'Fabric', 'Delivery', 'Fit', 'Washability', 'Decor']


# Topic Identification Evaluation - Are those 8 topics good?

In [None]:
from openai import AzureOpenAI
prompt = f"""You are an AI review analyst. Your task is to analyze a collection of reviews and assess the following topics: "{unique_topics}" 
            Go through all of the reviews and check that each topic is mentioned either explicitly or implicitly. 
            Once you read all the reviews, provide a rating for each topic (from 0 to 5) considering the relevance of the topic to the reviews.
            If a topic is not mentioned in the reviews, you should rate it as 0. If it is mentioned in a lot of reviews, you should rate it as 5.
            Reviews: "{', '.join(reviews)}" 
            Answer first with an array of scoring of the type 'Topic':'Rating' (e.g.: Quality: 5, Price: 5, ...) and after that, on a new line, 
            with a table containing the topics, their ratings and a brief justification on why that score has been assigned. 
            Remember that the score only indicated how much a topic is relevant for that group of reviews. 
            Do not add any additional text to the answer."""

ai_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/",
)
result = ai_client.chat.completions.create(
    model="o1-mini",
    messages=[
        {"role": "user", "content": prompt},
    ],
)

print(result.choices[0].message.content)

```
Color: 5, Quality: 5, Price: 5, Fabric: 4, Delivery: 3, Fit: 3, Washability: 5, Decor: 5

| Topic      | Rating | Justification                                                   |
|------------|--------|-----------------------------------------------------------------|
| Color      | 5      | Mentioned frequently and prominently in reviews                |
| Quality    | 5      | Consistently discussed with high relevance                      |
| Price      | 5      | Frequently mentioned as a key factor                            |
| Fabric     | 4      | Often discussed, but slightly less emphasis compared to others  |
| Delivery   | 3      | Mentioned several times, moderate relevance                     |
| Fit        | 3      | Mentioned occasionally, relevant but not dominant               |
| Washability| 5      | Highly emphasized in many reviews                               |
| Decor      | 5      | Frequently discussed in context of matching or enhancing decor   |
```


# Topic assignment - Labeling each review with 0 to N topics

## deberta-v3-base-finetuned

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import logging

model_dir = 'artifacts/deberta-v3-base-finetuned:v11'  

device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()

start = time.time()

from utils.assignTopics import get_reviews_labels_deBERTa

print("Assigning topics to reviews...")
results = []
for review in reviews:
    try:
        assigned = get_reviews_labels_deBERTa(tokenizer, model, device, review, topics, threshold=0.3)
        results.append([review, assigned, 1])
    except Exception as e:
        msg = str(e)
        if "content_filter" in msg or "ResponsibleAIPolicyViolation" in msg:
            logging.error(f"Content filter triggered for review: {review} – skipping.")
        else:
            logging.error(f"Error processing review: {review} – {e}")
        results.append([review, [], 1])

df_results = pd.DataFrame(results, columns=["sequence", "labels", "scores"])

# Save to CSV
df_results.to_csv("csv/deberta_finetuned.csv", index=False)

deberta_finetuned_time = time.time() - start

print("Results saved to deberta_finetuned.csv, time taken:", deberta_finetuned_time)

Assigning topics to reviews...
Results saved to deberta_finetuned.csv, time taken: 52.4697151184082


## deberta-pairwise

In [None]:
model_dir = 'artifacts/deberta-v3-pairwise-finetuned:v1'  

device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()

start = time.time()

from utils.assignTopics import get_reviews_labels_deBERTa

print("Assigning topics to reviews...")
results = []
for review in reviews:
    try:
        assigned = get_reviews_labels_deBERTa(tokenizer, model, device, review, topics, threshold=0.3)
        results.append([review, assigned, 1])
    except Exception as e:
        msg = str(e)
        if "content_filter" in msg or "ResponsibleAIPolicyViolation" in msg:
            logging.error(f"Content filter triggered for review: {review} – skipping.")
        else:
            logging.error(f"Error processing review: {review} – {e}")
        results.append([review, [], 1])

df_results = pd.DataFrame(results, columns=["sequence", "labels", "scores"])

# Save to CSV
df_results.to_csv("csv/deberta_finetuned.csv", index=False)

deberta_finetuned_time = time.time() - start

print("Results saved to deberta_finetuned.csv, time taken:", deberta_finetuned_time)

## nli-deberta-v3-base

In [8]:
start = time.time()

from transformers import AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("cross-encoder/nli-deberta-v3-base", use_fast=False)
classifier = pipeline("zero-shot-classification", model='cross-encoder/nli-deberta-v3-base', tokenizer=tokenizer)

results = []
for review in reviews:
    result = classifier(review, topics, multi_label=True)
    results.append(result)

# Transform results into a DataFrame
df_results = pd.DataFrame(results)

# Save DataFrame to CSV
df_results.to_csv('csv/nli-deberta-v3.csv', index=False)

nlidebertav3_time = time.time() - start

print("Results saved to nli-deberta-v3.csv, time taken:", nlidebertav3_time)

Device set to use mps:0


Results saved to nli-deberta-v3.csv, time taken: 190.302903175354


## bart-large-mnli

In [9]:
start = time.time()

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
results = []
for review in reviews:
    result = classifier(review, topics, multi_label=True)
    results.append(result)

# Transform results into a DataFrame
df_results = pd.DataFrame(results)

# Save DataFrame to CSV
df_results.to_csv('csv/bart-large-mnli.csv', index=False)

bartlargemnli_time = time.time() - start

print("Results saved to bart-large-mnli.csv, time taken:", bartlargemnli_time)

Device set to use mps:0


Results saved to bart-large-mnli.csv, time taken: 194.8539719581604


## Flan-T5-base-finetuned

In [10]:
from transformers import T5ForConditionalGeneration

model_dir = 'artifacts/flan-t5-base-finetuned:v9'  # adjust if your artifact lives in a subdirectory

# -----------------------------------------------------------------------------
# Load tokenizer & Flan-T5 for classification
# -----------------------------------------------------------------------------
device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = T5ForConditionalGeneration.from_pretrained(model_dir)
model.to(device)
model.eval()

# -----------------------------------------------------------------------------
# Assign topics to each review via Flan-T5
# -----------------------------------------------------------------------------

start = time.time()

from assignTopics import get_reviews_labels_flanT5

print("Assigning topics to reviews...")
results = []
for review in reviews:
    try:
        assigned = get_reviews_labels_flanT5(
            tokenizer, model, device, review, topics
        )
        results.append([review, assigned, 1])
    except Exception as e:
        msg = str(e)
        if "content_filter" in msg or "ResponsibleAIPolicyViolation" in msg:
            logging.error(f"Content filter triggered for review: {review} – skipping.")
        else:
            logging.error(f"Error processing review: {review} – {e}")
        results.append([review, [], 1])


# Transform results into a DataFrame
df_results = pd.DataFrame(results, columns=["sequence", "labels", "scores"])

# Check and split labels if they contain ';'
df_results['labels'] = df_results['labels'].apply(lambda x: x.split(';') if ';' in x else x)

# Save DataFrame to CSV
df_results.to_csv('csv/flan_t5_finetuned.csv', index=False)

flan_t5_finetuned_time = time.time() - start

print("Results saved to flan_t5_finetuned.csv, time taken:", flan_t5_finetuned_time)

Assigning topics to reviews...
Results saved to flan_t5_finetuned.csv, time taken: 279.5624740123749


## Flan-T5-small-finetuned

In [None]:
model_dir = 'artifacts/flan-t5-small-finetuned:v8'  # adjust if your artifact lives in a subdirectory

# -----------------------------------------------------------------------------
# Load tokenizer & Flan-T5 for classification
# -----------------------------------------------------------------------------
device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = T5ForConditionalGeneration.from_pretrained(model_dir)
model.to(device)
model.eval()

# -----------------------------------------------------------------------------
# Assign topics to each review via Flan-T5
# -----------------------------------------------------------------------------

start = time.time()

from utils.assignTopics import get_reviews_labels_flanT5

print("Assigning topics to reviews...")
results = []
for review in reviews:
    try:
        assigned = get_reviews_labels_flanT5(
            tokenizer, model, device, review, topics
        )
        results.append([review, assigned, 1])
    except Exception as e:
        msg = str(e)
        if "content_filter" in msg or "ResponsibleAIPolicyViolation" in msg:
            logging.error(f"Content filter triggered for review: {review} – skipping.")
        else:
            logging.error(f"Error processing review: {review} – {e}")
        results.append([review, [], 1])


# Transform results into a DataFrame
df_results = pd.DataFrame(results, columns=["sequence", "labels", "scores"])

# Check and split labels if they contain ';'
df_results['labels'] = df_results['labels'].apply(lambda x: x.split(';') if ';' in x else x)

# Save DataFrame to CSV
df_results.to_csv('csv/flan_t5_small_finetuned.csv', index=False)

flan_t5_small_finetuned_time = time.time() - start

print("Results saved to flan_t5_small_finetuned.csv, time taken:", flan_t5_small_finetuned_time)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdavide-zanutto[0m ([33mdigital-ethics-responsible-ai[0m) to [32mhttps://wandb.mlops.ingka.com[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact flan-t5-small-finetuned:v8, 1180.77MB. 20 files... 
[34m[1mwandb[0m:   20 of 20 files downloaded.  
Done. 0:1:15.8


## LLM Zero-shot - GPT-4o

In [None]:
def get_reviews_labels(review, topics):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful customer reviews expert that identifies the main topics in a review.\n"
                "Provide the output as a comma-separated list of topics.\n"
            ),
        },
        {
            "role": "user",
            "content": (
                "Read the following review and associate the topics mentioned implicitly or explicitly in the review.\n"
                "Only answer with the topics that are mentioned in the review. Example: ['price', 'quality']. \n"
                "If you cannot identify any topics, just return '[]' \n"
                "Do not generate any new topic, just use the ones provided.\n"
                f"Review: '{review}' \n"
                f"Topics: '{topics}' \n"
                f"Topics mentioned within the review:"
            ),
        },
    ]

    response = ' '
    model = "gpt-4o-mini" 
    # Generate the topic word using the language model
    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=30,
        temperature=0.4,
        n=1,
        stop=None,
    )

    # Extract and return the topic word
    return response.choices[0].message.content.strip()

In [12]:
start = time.time()

results = []
for review in reviews:
    result = get_reviews_labels(review, topics)
    results.append([review, result, 1])

# Transform results into a DataFrame
df = pd.DataFrame(results, columns=["sequence", "labels", "scores"])

# Save DataFrame to CSV
df.to_csv('csv/LLM.csv', index=False)

LLM_time = time.time() - start

print("Results saved to LLM.csv, time taken:", LLM_time)

Results saved to LLM.csv, time taken: 448.77764892578125


In [None]:
import json
import datetime


# Load the results from the three CSV files
df_deberta_finetuned = pd.read_csv('csv/deberta_finetuned.csv')
df_nli_deberta_v3 = pd.read_csv('csv/nli-deberta-v3.csv')
df_bart_large_mnli = pd.read_csv('csv/bart-large-mnli.csv')
df_flan_t5_finetuned = pd.read_csv('csv/flan_t5_finetuned.csv')
df_flan_t5_small_finetuned = pd.read_csv('csv/flan_t5_small_finetuned.csv')
llm = pd.read_csv('csv/LLM.csv')

# Initialize a dictionary to hold the reviews and their corresponding topics
reviews_topics = []
# Add the topics to the dictionary
reviews_topics.append({"Identified topics": topics})
# Add the time taken by each method
time_taken = {
    "deberta_finetuned_time": deberta_finetuned_time,
    "nli_deberta_v3_time": nlidebertav3_time,
    "bart_large_mnli_time": bartlargemnli_time,
    "flan_t5_finetuned_time": flan_t5_finetuned_time,
    "flan_t5_small_finetuned_time": flan_t5_small_finetuned_time,
    "LLM_time": LLM_time}
reviews_topics.append({"Time taken": time_taken})
for i, review in enumerate(reviews):
    review_entry = {
        "review": review,
        "deberta_finetuned_topics": {
            label: 1 for label in  eval(df_deberta_finetuned.iloc[i]['labels'])
        },
        "nli_deberta_v3_topics": {
            label: score for label, score in zip(eval(df_nli_deberta_v3.iloc[i]['labels']), eval(df_nli_deberta_v3.iloc[i]['scores'])) if score > 0.9
        },
        "bart_large_mnli_topics": {
            label: score for label, score in zip(eval(df_bart_large_mnli.iloc[i]['labels']), eval(df_bart_large_mnli.iloc[i]['scores'])) if score > 0.9
        },
        "flan_t5_finetuned_topics": {
            label: 1 for label in  eval(df_flan_t5_finetuned.iloc[i]['labels'])        
        },
        "flan_t5_small_finetuned_topics": {
            label: 1 for label in  eval(df_flan_t5_small_finetuned.iloc[i]['labels'])        
        },
        "LLM_topics": {
            label: 1 for label in  eval(llm.iloc[i]['labels'])
        }   
    }
    reviews_topics.append(review_entry)

# ct stores current time
ct = datetime.datetime.now()
json_name = f'outputs/comparison_output_{ct}.json'
# Write the dictionary to a JSON file
with open(json_name, 'w') as json_file:
    json.dump(reviews_topics, json_file, indent=4)

print("Json output saved!")

Json output saved!


In [35]:
import json
import ast
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score

def normalize_label(label: str) -> str:
    return label.strip().lower().replace(" ", "")

# 1. Load the JSON (skip the first two metadata entries)
with open(json_name, 'r') as f:
    data = json.load(f)
entries = data[2:]

# 2. Build a mapping from review text → prediction‐entry
review_to_entry = { e['review']: e for e in entries }

# 3. Load ground‐truth, parse lists
df_truth = pd.read_csv('csv/ground_truth_40598766.csv')
df_truth['topics'] = df_truth['topics'].apply(ast.literal_eval)

# 4. Prepare containers
y_true, model_preds = [], {
    'deberta_finetuned': [],
    'nli_deberta_v3':    [],
    'bart_large_mnli':   [],
    'flan_t5_finetuned': [],
    'LLM':               []
}

# 5. Align on review text
missing = []
for _, row in df_truth.iterrows():
    review = row['review']
    if review not in review_to_entry:
        missing.append(review)
        continue
    
    entry = review_to_entry[review]
    # normalize & collect ground truth
    true_norm = [normalize_label(t) for t in row['topics']]
    y_true.append(true_norm)
    
    # collect each model’s normalized predictions
    for model_key, json_key in [
        ('deberta_finetuned','deberta_finetuned_topics'),
        ('nli_deberta_v3',   'nli_deberta_v3_topics'),
        ('bart_large_mnli',  'bart_large_mnli_topics'),
        ('flan_t5_finetuned','flan_t5_finetuned_topics'),
        ('LLM','LLM_topics')
    ]:
        preds = list(entry[json_key].keys())
        model_preds[model_key].append([normalize_label(t) for t in preds])

if missing:
    print("WARNING: no JSON entry for these reviews:\n", "\n".join(missing))

# 6. Binarize all labels against the full identified topic set
identified = data[0]['Identified topics']
norm_identified = [normalize_label(t) for t in identified]
mlb = MultiLabelBinarizer(classes=norm_identified)
y_true_bin = mlb.fit_transform(y_true)

# 7. Compute metrics
for model, pred_lists in model_preds.items():
    y_pred_bin = mlb.transform(pred_lists)
    p = precision_score(y_true_bin, y_pred_bin, average='micro', zero_division=0)
    r = recall_score   (y_true_bin, y_pred_bin, average='micro', zero_division=0)
    f = f1_score       (y_true_bin, y_pred_bin, average='micro', zero_division=0)
    print(f"{model:20s}  Precision: {p:.3f}  Recall: {r:.3f}  F1: {f:.3f}")


deberta_finetuned     Precision: 0.834  Recall: 0.720  F1: 0.773
nli_deberta_v3        Precision: 0.461  Recall: 0.824  F1: 0.591
bart_large_mnli       Precision: 0.603  Recall: 0.502  F1: 0.548
flan_t5_finetuned     Precision: 0.832  Recall: 0.821  F1: 0.826
LLM                   Precision: 0.843  Recall: 0.881  F1: 0.861




In [18]:
import json
import pandas as pd
import ast

def load_ground_truth(csv_file):
    df = pd.read_csv(csv_file)
    # Convert the topics column (a string) into an actual list.
    df['topics'] = df['topics'].apply(ast.literal_eval)
    return df

def load_predictions(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Filter out entries that do not contain a 'review' key
    predictions = [entry for entry in data if "review" in entry]

    return predictions  # List of reviews with topics

def compute_precision_recall_f1(gt_topics, pred_topics):
    # True positives: the intersection of predicted and ground truth topics.
    true_positives = len(gt_topics & pred_topics)
    
    precision = true_positives / len(pred_topics) if pred_topics else 0.0
    recall = true_positives / len(gt_topics) if gt_topics else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
    
    return precision, recall, f1

def safe_literal_eval(val):
    try:
        if isinstance(val, str):
            return ast.literal_eval(val)
        return val  # If it's already a list, return as is.
    except (ValueError, SyntaxError):
        print(f"Warning: Could not parse topics: {val}")
        return []  # Return an empty list if parsing fails.

# Load the CSV file
gt_df = pd.read_csv("csv/ground_truth_40598766.csv")

# Apply the safe conversion
gt_df['topics'] = gt_df['topics'].apply(safe_literal_eval)


predictions = load_predictions(json_name)

# List the models that we want to evaluate (must match keys in the JSON)
models = [
    "deberta_finetuned_topics",
    "nli_deberta_v3_topics",
    "bart_large_mnli_topics",
    "flan_t5_finetuned_topics",
    "LLM_topics"
]

# Initialize an accumulator for metrics per model.
metrics = {model: {"precision": [], "recall": [], "f1": []} for model in models}

# Check that the number of reviews is the same in both files.
if len(gt_df) != len(predictions):
    print("len(gt_df),", len(gt_df))
    print("len(predictions),", len(predictions))
    raise ValueError("The number of reviews in the CSV and JSON files do not match!")

# Iterate over the reviews (assumed aligned by index)
for idx, pred_entry in enumerate(predictions):
    # For consistency, we compare topics in lowercase with whitespace stripped.
    gt_topics = {topic.lower().strip() for topic in gt_df.iloc[idx]["topics"]}
    
    for model in models:
        # In the JSON predictions, if a model has produced topics,
        # we take the keys (ignoring the scores) as the predicted topics.
        pred_model_dict = pred_entry.get(model, {})
        pred_topics = {topic.lower().strip() for topic in pred_model_dict.keys()}
        
        precision, recall, f1 = compute_precision_recall_f1(gt_topics, pred_topics)
        
        metrics[model]["precision"].append(precision)
        metrics[model]["recall"].append(recall)
        metrics[model]["f1"].append(f1)

# Compute the average scores for each model and print them
for model in models:
    avg_precision = sum(metrics[model]["precision"]) / len(metrics[model]["precision"])
    avg_recall = sum(metrics[model]["recall"]) / len(metrics[model]["recall"])
    avg_f1 = sum(metrics[model]["f1"]) / len(metrics[model]["f1"])
    print(f"Model: {model}")
    print(f"  Precision: {avg_precision:.3f}")
    print(f"  Recall:    {avg_recall:.3f}")
    print(f"  F1 Score:  {avg_f1:.3f}")
    print("-" * 30)

Model: deberta_finetuned_topics
  Precision: 0.336
  Recall:    0.252
  F1 Score:  0.259
------------------------------
Model: nli_deberta_v3_topics
  Precision: 0.325
  Recall:    0.470
  F1 Score:  0.345
------------------------------
Model: bart_large_mnli_topics
  Precision: 0.327
  Recall:    0.255
  F1 Score:  0.258
------------------------------
Model: flan_t5_finetuned_topics
  Precision: 0.383
  Recall:    0.311
  F1 Score:  0.306
------------------------------
Model: LLM_topics
  Precision: 0.288
  Recall:    0.313
  F1 Score:  0.270
------------------------------
