In [1]:
from google.cloud import bigquery
import os
import pandas as pd
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import logging

# Turn off parallelism in the Hugging Face tokenizers library; set here so it is
# in the environment before any tokenizer is created in later cells.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

client = bigquery.Client()

project_id = 'ingka-tugc-infra-prod'
dataset_id = 'eu_ai_content'
table_id = 'reviews'


table_ref = f'{project_id}.{dataset_id}.{table_id}'

# Query to get all the data - 1.17GB to process

article_id = '40598766'

# The table name has to be interpolated (identifiers cannot be bound as query
# parameters), but the article id is a value and is passed as a named parameter
# instead of being spliced into the SQL text.
# NOTE(review): CONCAT yields NULL when title or text is NULL, so such rows would
# show up as None in `reviews` - confirm the table never has NULLs there.
query_all = f"""
    SELECT concat(r.title, '. ', r.text) as review_text
    FROM {table_ref} r
    WHERE franchise='set-11' AND content_lang_code = 'en' AND art_id = @article_id
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("article_id", "STRING", article_id),
    ]
)

query_job = client.query(query_all, job_config=job_config)

# Iterating the job waits for the query and yields one row per review.
reviews = [row['review_text'] for row in query_job]
print(f"Processing {len(reviews)} reviews")

  from .autonotebook import tqdm as notebook_tqdm


Processing 1016 reviews


In [2]:
from openai import AzureOpenAI
from utils.getSecret import get_secret

# The Azure OpenAI API key is fetched through the project helper `get_secret`.
# presumably `project` is a GCP project number and `secret` a secret name -
# TODO confirm against utils.getSecret.
project = "923326131319"
secret  = "derai-azure"
api_key = get_secret(project, secret)

# Client reused by the topic-generation cell and by the LLM zero-shot labeling cell.
llm_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/",
)

# Model name handed to get_topics in the next cell. Later cells rebind the name
# `model` to loaded model objects, so this string is only valid until then.
model = "gpt-4o" 

In [3]:
from utils.generateTopics import get_topics

# Build the topic list from the reviews with the project helper `get_topics`
# (presumably one LLM call through `llm_client` - confirm in utils.generateTopics).
# `model` must still be the "gpt-4o" string here: later cells rebind `model`, so
# re-running this cell after them would pass a different object.
topics = get_topics(reviews, llm_client, model)

In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Define the embedding model.
# It gets its own name so that `model` (the LLM name passed to get_topics) is not
# overwritten by this cell.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Set a similarity threshold: two topics whose cosine similarity exceeds it are
# treated as duplicates.
similarity_threshold = 0.8

if topics:
    # Encode the topics
    topic_embeddings = embedding_model.encode(topics, convert_to_numpy=True)

    # Compute the cosine similarity matrix
    similarity_matrix = cosine_similarity(topic_embeddings)

    # Greedy de-duplication by position: a topic is kept only if it is not too
    # similar to any topic that was already kept. Working on indices (rather than
    # removing by value) keeps exactly one copy of repeated topics, and a topic is
    # never dropped because of a neighbour that was itself dropped.
    kept_indices = []
    for candidate in range(len(topics)):
        if all(
            similarity_matrix[kept, candidate] <= similarity_threshold
            for kept in kept_indices
        ):
            kept_indices.append(candidate)
    unique_topics = [topics[index] for index in kept_indices]
else:
    # Nothing to embed or compare when no topics were generated.
    unique_topics = []

# NOTE(review): the labeling cells further down pass `topics`, not `unique_topics`,
# so this de-duplication currently only affects the printout below.
deleted_topics_count = len(topics) - len(unique_topics)
print("Number of deleted topics:", deleted_topics_count)
print("Unique topics:", unique_topics)

Number of deleted topics: 0
Unique topics: ['Quality', 'Price', 'Color', 'Fabric', 'Value', 'Size', 'Washability', 'Zipper']


# Topic Identification Evaluation - Are those 8 topics good?

In [None]:
# Disabled cell: everything below is wrapped in one string literal, so running
# this cell does not build the prompt or call the API.
# NOTE(review): inside the string the prompt assignment has lost its own quotes
# (`prompt = fYou are ...`), so it must be repaired before the code is re-enabled.
# The ratings stored beneath this cell presumably come from an earlier live run.
"""from openai import AzureOpenAI
prompt = fYou are an AI review analyst. Your task is to analyze a collection of reviews and assess the following topics: "{unique_topics}" 
            Go through all of the reviews and check that each topic is mentioned either explicitly or implicitly. 
            Once you read all the reviews, provide a rating for each topic (from 0 to 5) considering the relevance of the topic to the reviews.
            If a topic is not mentioned in the reviews, you should rate it as 0. If it is mentioned in a lot of reviews, you should rate it as 5.
            Reviews: "{', '.join(reviews)}" 
            Answer first with an array of scoring of the type 'Topic':'Rating' (e.g.: Quality: 5, Price: 5, ...) and after that, on a new line, 
            with a table containing the topics, their ratings and a brief justification on why that score has been assigned. 
            Remember that the score only indicated how much a topic is relevant for that group of reviews. 
            Do not add any additional text to the answer.

ai_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/",
)
result = ai_client.chat.completions.create(
    model="o1-mini",
    messages=[
        {"role": "user", "content": prompt},
    ],
)

print(result.choices[0].message.content)"""

```json
{
  "Quality": 4,
  "Price": 5,
  "Color": 5,
  "Fabric": 4,
  "Value": 5,
  "Size": 3,
  "Washability": 4,
  "Zipper": 3
}
```

| Topic       | Rating | Justification                                                                                      |
|-------------|--------|----------------------------------------------------------------------------------------------------|
| Quality     | 4      | Frequently mentioned with both positive and negative comments, indicating high relevance.          |
| Price       | 5      | Dominantly discussed as a key factor, mostly positively, making it highly relevant.                |
| Color       | 5      | Extensively discussed in terms of variety, accuracy, and impact on decor.                         |
| Fabric      | 4      | Regularly mentioned regarding texture, durability, and material quality.                          |
| Value       | 5      | Often highlighted as excellent or great, closely tied with price and quality.       

# Topic assignment - Labeling each review with 0 to N topics

## deberta-v3-base-finetuned

In [None]:
# The timer spans the whole cell: checkpoint load, per-review inference and CSV write.
start = time.time()

model_dir = 'artifacts/deberta-v3-base-finetuned:v11'

# Load tokenizer and classifier from the local checkpoint, move the model to the
# GPU when CUDA is available (CPU otherwise) and switch it to evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()

from utils.assignTopics import get_reviews_labels_deBERTa

print("Assigning topics to reviews...")
results = []
for review in reviews:
    try:
        labels = get_reviews_labels_deBERTa(
            tokenizer, model, device, review, topics, threshold=0.3
        )
    except Exception as e:
        # A failing review is logged and recorded with an empty label list so the
        # output keeps one row per review.
        error_text = str(e)
        filter_markers = ("content_filter", "ResponsibleAIPolicyViolation")
        if any(marker in error_text for marker in filter_markers):
            logging.error(f"Content filter triggered for review: {review} – skipping.")
        else:
            logging.error(f"Error processing review: {review} – {e}")
        labels = []
    # The third field is a constant 1 stored under the "scores" column.
    results.append([review, labels, 1])

# One row per review, written to CSV for the comparison cell.
df_results = pd.DataFrame(results, columns=["sequence", "labels", "scores"])
df_results.to_csv("csv/deberta_finetuned.csv", index=False)

deberta_finetuned_time = time.time() - start

print("Results saved to deberta_finetuned.csv, time taken:", deberta_finetuned_time)

Assigning topics to reviews...
Results saved to deberta_finetuned.csv, time taken: 46.330167055130005


## deberta-pairwise

In [None]:
# The timer spans the whole cell: checkpoint load, per-review inference and CSV write.
start = time.time()

model_dir = 'artifacts/deberta-v3-pairwise-finetuned:v1'

# Load tokenizer and classifier from the local checkpoint, move the model to the
# GPU when CUDA is available (CPU otherwise) and switch it to evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()

from utils.assignTopics import get_review_labels_deBERTa_pairwise

print("Assigning topics to reviews...")
results = []
for review in reviews:
    try:
        labels = get_review_labels_deBERTa_pairwise(
            tokenizer, model, device, review, topics, threshold=0.5
        )
    except Exception as e:
        # A failing review is logged and recorded with an empty label list so the
        # output keeps one row per review.
        error_text = str(e)
        filter_markers = ("content_filter", "ResponsibleAIPolicyViolation")
        if any(marker in error_text for marker in filter_markers):
            logging.error(f"Content filter triggered for review: {review} – skipping.")
        else:
            logging.error(f"Error processing review: {review} – {e}")
        labels = []
    # The third field is a constant 1 stored under the "scores" column.
    results.append([review, labels, 1])

# One row per review, written to CSV for the comparison cell.
df_results = pd.DataFrame(results, columns=["sequence", "labels", "scores"])
df_results.to_csv("csv/deberta_pairwise_finetuned.csv", index=False)

deberta_pairwise_finetuned_time = time.time() - start

print("Results saved to deberta_pairwise_finetuned.csv, time taken:", deberta_pairwise_finetuned_time)

Assigning topics to reviews...
Results saved to deberta_pairwise_finetuned.csv, time taken: 123.74009490013123


## nli-deberta-v3-base

In [8]:
# The timer covers pipeline construction as well as inference.
start = time.time()

from transformers import AutoTokenizer, pipeline

# Zero-shot pipeline on the NLI cross-encoder, using the slow (non-fast) tokenizer.
nli_checkpoint = "cross-encoder/nli-deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(nli_checkpoint, use_fast=False)
classifier = pipeline("zero-shot-classification", model=nli_checkpoint, tokenizer=tokenizer)

# One pipeline call per review; multi_label=True is passed so the topics are
# scored as a multi-label problem.
results = [classifier(review, topics, multi_label=True) for review in reviews]

# The pipeline outputs become DataFrame rows, saved for the comparison cell.
df_results = pd.DataFrame(results)
df_results.to_csv('csv/nli-deberta-v3.csv', index=False)

nlidebertav3_time = time.time() - start

print("Results saved to nli-deberta-v3.csv, time taken:", nlidebertav3_time)

Device set to use mps:0


Results saved to nli-deberta-v3.csv, time taken: 177.73452305793762


## bart-large-mnli

In [9]:
# The timer covers pipeline construction as well as inference.
start = time.time()

# `pipeline` is imported by the nli-deberta cell above, which must run first.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# One pipeline call per review; multi_label=True is passed so the topics are
# scored as a multi-label problem.
results = [classifier(review, topics, multi_label=True) for review in reviews]

# The pipeline outputs become DataFrame rows, saved for the comparison cell.
df_results = pd.DataFrame(results)
df_results.to_csv('csv/bart-large-mnli.csv', index=False)

bartlargemnli_time = time.time() - start

print("Results saved to bart-large-mnli.csv, time taken:", bartlargemnli_time)

Device set to use mps:0


Results saved to bart-large-mnli.csv, time taken: 178.6922528743744


## Flan-T5-base-finetuned

In [None]:
from transformers import T5ForConditionalGeneration

# The timer starts before the checkpoint is loaded; it is stopped further down
# in this cell, after the CSV has been written.
start = time.time()

model_dir = 'artifacts/flan-t5-base-finetuned:v9'

# Load tokenizer and Flan-T5 from the local checkpoint, move the model to the GPU
# when CUDA is available (CPU otherwise) and switch it to evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model.to(device)
model.eval()

from utils.assignTopics import get_reviews_labels_flanT5

# Assign topics to each review via Flan-T5.
print("Assigning topics to reviews...")
results = []
for review in reviews:
    try:
        labels = get_reviews_labels_flanT5(tokenizer, model, device, review, topics)
    except Exception as e:
        # A failing review is logged and recorded with an empty label list so the
        # output keeps one row per review.
        error_text = str(e)
        filter_markers = ("content_filter", "ResponsibleAIPolicyViolation")
        if any(marker in error_text for marker in filter_markers):
            logging.error(f"Content filter triggered for review: {review} – skipping.")
        else:
            logging.error(f"Error processing review: {review} – {e}")
        labels = []
    # The third field is a constant 1 stored under the "scores" column.
    results.append([review, labels, 1])

# One row per review.
df_results = pd.DataFrame(results, columns=["sequence", "labels", "scores"])

# Check and split labels if they contain ';'
def split_semicolon_elements(label_list):
    """
    Flatten a list of labels, splitting string elements on ';'.

    Every string element is split on ';' and each piece is stripped of
    surrounding whitespace. Empty pieces (from a trailing or doubled ';', or
    from a blank string) are discarded so they never become labels.
    Non-string elements are passed through unchanged.

    Args:
        label_list: iterable of labels, typically strings.

    Returns:
        A new flat list of labels in their original order.
    """
    new_labels = []
    for elem in label_list:
        if not isinstance(elem, str):
            # leave non-string elements untouched
            new_labels.append(elem)
            continue
        # a string without ';' splits into a single piece, so one path covers both cases
        stripped_parts = (part.strip() for part in elem.split(';'))
        new_labels.extend(part for part in stripped_parts if part)
    return new_labels


# Flatten every row's label list, splitting any 'a; b' strings into separate labels.
df_results['labels'] = df_results['labels'].map(split_semicolon_elements)

# Write the results for the comparison cell.
df_results.to_csv('csv/flan_t5_finetuned.csv', index=False)

# Stop the timer that was started before the checkpoint was loaded.
flan_t5_finetuned_time = time.time() - start

print("Results saved to flan_t5_finetuned.csv, time taken:", flan_t5_finetuned_time)

Assigning topics to reviews...
Results saved to flan_t5_finetuned.csv, time taken: 284.03163599967957


## Flan-T5-small-finetuned

In [None]:
# The timer spans the whole cell: checkpoint load, per-review inference and CSV write.
start = time.time()

model_dir = 'artifacts/flan-t5-small-finetuned:v8'

# Load tokenizer and Flan-T5 from the local checkpoint, move the model to the GPU
# when CUDA is available (CPU otherwise) and switch it to evaluation mode.
# `T5ForConditionalGeneration` is imported by the Flan-T5-base cell above.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model.to(device)
model.eval()

from utils.assignTopics import get_reviews_labels_flanT5

# Assign topics to each review via Flan-T5.
print("Assigning topics to reviews...")
results = []
for review in reviews:
    try:
        labels = get_reviews_labels_flanT5(tokenizer, model, device, review, topics)
    except Exception as e:
        # A failing review is logged and recorded with an empty label list so the
        # output keeps one row per review.
        error_text = str(e)
        filter_markers = ("content_filter", "ResponsibleAIPolicyViolation")
        if any(marker in error_text for marker in filter_markers):
            logging.error(f"Content filter triggered for review: {review} – skipping.")
        else:
            logging.error(f"Error processing review: {review} – {e}")
        labels = []
    # The third field is a constant 1 stored under the "scores" column.
    results.append([review, labels, 1])

# One row per review.
df_results = pd.DataFrame(results, columns=["sequence", "labels", "scores"])

# Flatten 'a; b' style labels with the helper defined in the Flan-T5-base cell.
df_results['labels'] = df_results['labels'].map(split_semicolon_elements)

df_results.to_csv('csv/flan_t5_small_finetuned.csv', index=False)

flan_t5_small_finetuned_time = time.time() - start

print("Results saved to flan_t5_small_finetuned.csv, time taken:", flan_t5_small_finetuned_time)

Assigning topics to reviews...
Results saved to flan_t5_small_finetuned.csv, time taken: 87.50752091407776


## LLM Zero-shot - GPT-4o-mini

In [None]:
from utils.assignTopics import get_reviews_labels_LLM_0shot

# The timer covers every API call plus the CSV write.
start = time.time()

results = []
for review in reviews:
    try:
        result = get_reviews_labels_LLM_0shot(review, topics, llm_client, "gpt-4o-mini")
    except Exception as e:
        msg = str(e)
        if "content_filter" in msg or "ResponsibleAIPolicyViolation" in msg:
            # One review rejected by the service's content filter must not abort
            # the whole run: log it and record the review with no labels, as the
            # other labeling cells do.
            logging.error(f"Content filter triggered for review: {review} – skipping.")
            result = []
        else:
            # Anything else (auth, quota, network, ...) would affect every call,
            # so let it surface instead of silently writing empty labels.
            raise
    # The third field is a constant 1 stored under the "scores" column.
    results.append([review, result, 1])

# Transform results into a DataFrame
df = pd.DataFrame(results, columns=["sequence", "labels", "scores"])

# Save DataFrame to CSV
df.to_csv('csv/LLM.csv', index=False)

LLM_time = time.time() - start

print("Results saved to LLM.csv, time taken:", LLM_time)

Results saved to LLM.csv, time taken: 521.2778120040894


In [14]:
import ast
import datetime
import json
import os

# Minimum zero-shot score for a label to count as assigned; used for the two
# pipeline-based models, whose CSVs carry a score per label.
ZERO_SHOT_SCORE_THRESHOLD = 0.9

# Load the results from the seven CSV files
df_deberta_finetuned = pd.read_csv('csv/deberta_finetuned.csv')
df_nli_deberta_v3 = pd.read_csv('csv/nli-deberta-v3.csv')
df_deberta_pairwise_finetuned = pd.read_csv('csv/deberta_pairwise_finetuned.csv')
df_bart_large_mnli = pd.read_csv('csv/bart-large-mnli.csv')
df_flan_t5_finetuned = pd.read_csv('csv/flan_t5_finetuned.csv')
df_flan_t5_small_finetuned = pd.read_csv('csv/flan_t5_small_finetuned.csv')
llm = pd.read_csv('csv/LLM.csv')

# Rows are matched to `reviews` purely by position, so a CSV left over from a run
# on a different review set would be misaligned without any error. Fail early.
for csv_label, frame in {
    "deberta_finetuned": df_deberta_finetuned,
    "nli_deberta_v3": df_nli_deberta_v3,
    "deberta_pairwise_finetuned": df_deberta_pairwise_finetuned,
    "bart_large_mnli": df_bart_large_mnli,
    "flan_t5_finetuned": df_flan_t5_finetuned,
    "flan_t5_small_finetuned": df_flan_t5_small_finetuned,
    "LLM": llm,
}.items():
    if len(frame) != len(reviews):
        raise ValueError(
            f"{csv_label}: CSV has {len(frame)} rows but there are {len(reviews)} reviews; "
            "re-run that model's cell before building the comparison."
        )


def _assigned_topics(frame, i):
    """Return {label: 1} for every label stored in row `i` of `frame`."""
    # The CSV stores the Python list as text; literal_eval parses it without
    # executing arbitrary code (unlike eval).
    return {label: 1 for label in ast.literal_eval(frame.iloc[i]['labels'])}


def _scored_topics(frame, i, threshold=ZERO_SHOT_SCORE_THRESHOLD):
    """Return {label: score} for the labels in row `i` whose score exceeds `threshold`."""
    row = frame.iloc[i]
    labels = ast.literal_eval(row['labels'])
    scores = ast.literal_eval(row['scores'])
    return {label: score for label, score in zip(labels, scores) if score > threshold}


# Initialize the list that holds the metadata entries and one entry per review
reviews_topics = []
# First entry: the identified topics
reviews_topics.append({"Identified topics": topics})
# Second entry: the time taken by each method
time_taken = {
    "deberta_finetuned_time": deberta_finetuned_time,
    "nli_deberta_v3_time": nlidebertav3_time,
    "deberta_pairwise_finetuned_time": deberta_pairwise_finetuned_time,
    "bart_large_mnli_time": bartlargemnli_time,
    "flan_t5_finetuned_time": flan_t5_finetuned_time,
    "flan_t5_small_finetuned_time": flan_t5_small_finetuned_time,
    "LLM_time": LLM_time}
reviews_topics.append({"Time taken": time_taken})
for i, review in enumerate(reviews):
    review_entry = {
        "review": review,
        "deberta_finetuned_topics": _assigned_topics(df_deberta_finetuned, i),
        "nli_deberta_v3_topics": _scored_topics(df_nli_deberta_v3, i),
        "deberta_pairwise_finetuned_topics": _assigned_topics(df_deberta_pairwise_finetuned, i),
        "bart_large_mnli_topics": _scored_topics(df_bart_large_mnli, i),
        "flan_t5_finetuned_topics": _assigned_topics(df_flan_t5_finetuned, i),
        "flan_t5_small_finetuned_topics": _assigned_topics(df_flan_t5_small_finetuned, i),
        "LLM_topics": _assigned_topics(llm, i),
    }
    reviews_topics.append(review_entry)

# ct stores current time
ct = datetime.datetime.now()
# Format the timestamp explicitly: the default str() form contains spaces and
# colons, which are awkward in shells and not valid in filenames on every platform.
os.makedirs('outputs', exist_ok=True)
json_name = f"outputs/comparison_output_{ct.strftime('%Y-%m-%d_%H-%M-%S')}.json"
# Write the list to a JSON file
with open(json_name, 'w') as json_file:
    json.dump(reviews_topics, json_file, indent=4)

print("Json output saved!")

Json output saved!


In [15]:
import json
import ast
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score

def normalize_label(label: str) -> str:
    """Return `label` trimmed, lower-cased and with every space character removed."""
    trimmed_lower = label.strip().lower()
    return "".join(char for char in trimmed_lower if char != " ")

# 1. Load the JSON (skip the first two metadata entries: topics and timings)
with open(json_name, 'r') as f:
    data = json.load(f)
entries = data[2:]

# 2. Build a mapping from review text → prediction-entry
review_to_entry = {e['review']: e for e in entries}

# 3. Load ground-truth, parse the stringified topic lists
df_truth = pd.read_csv('csv/ground_truth_40598766.csv')
df_truth['topics'] = df_truth['topics'].apply(ast.literal_eval)

# 4. Prepare containers; each model's predictions live under "<model key>_topics" in the JSON
model_keys = [
    'deberta_finetuned',
    'nli_deberta_v3',
    'deberta_pairwise_finetuned',
    'bart_large_mnli',
    'flan_t5_finetuned',
    'flan_t5_small_finetuned',
    'LLM',
]
y_true = []
model_preds = {model_key: [] for model_key in model_keys}

# 5. Align on review text
missing = []
for _, row in df_truth.iterrows():
    review = row['review']
    if review not in review_to_entry:
        missing.append(review)
        continue

    entry = review_to_entry[review]
    # normalize & collect ground truth
    y_true.append([normalize_label(t) for t in row['topics']])

    # collect each model's normalized predictions (the dict keys are the labels)
    for model_key in model_keys:
        preds = entry[f"{model_key}_topics"]
        model_preds[model_key].append([normalize_label(t) for t in preds])

if missing:
    # str() guards against non-string cells (e.g. NaN from an empty CSV field)
    print("WARNING: no JSON entry for these reviews:\n", "\n".join(str(m) for m in missing))

if not y_true:
    raise ValueError("No ground-truth review matched a JSON entry; cannot compute metrics.")

# 6. Binarize all labels against the full identified topic set
identified = data[0]['Identified topics']
norm_identified = [normalize_label(t) for t in identified]
mlb = MultiLabelBinarizer(classes=norm_identified)
y_true_bin = mlb.fit_transform(y_true)

# 7. Compute micro-averaged metrics.
# The loop variable is deliberately not called `model`, so the notebook-level
# `model` object used by other cells is left untouched.
name_width = max(len(model_key) for model_key in model_preds)
for model_name, pred_lists in model_preds.items():
    y_pred_bin = mlb.transform(pred_lists)
    p = precision_score(y_true_bin, y_pred_bin, average='micro', zero_division=0)
    r = recall_score   (y_true_bin, y_pred_bin, average='micro', zero_division=0)
    f = f1_score       (y_true_bin, y_pred_bin, average='micro', zero_division=0)
    print(f"{model_name:{name_width}s}  Precision: {p:.3f}  Recall: {r:.3f}  F1: {f:.3f}")


deberta_finetuned     Precision: 0.652  Recall: 0.827  F1: 0.729
nli_deberta_v3        Precision: 0.393  Recall: 0.820  F1: 0.532
deberta_pairwise_finetuned  Precision: 0.702  Recall: 0.934  F1: 0.802
bart_large_mnli       Precision: 0.515  Recall: 0.555  F1: 0.534
flan_t5_finetuned     Precision: 0.683  Recall: 0.832  F1: 0.750
flan_t5_small_finetuned  Precision: 0.657  Recall: 0.825  F1: 0.731
LLM                   Precision: 0.649  Recall: 0.921  F1: 0.761




In [17]:
# Wall-clock seconds recorded by each model's cell, listed in report order.
timing_report = (
    ("deberta_finetuned", deberta_finetuned_time),
    ("nli_deberta_v3", nlidebertav3_time),
    ("deberta_pairwise_finetuned", deberta_pairwise_finetuned_time),
    ("bart_large_mnli", bartlargemnli_time),
    ("flan_t5_finetuned", flan_t5_finetuned_time),
    ("flan_t5_small_finetuned", flan_t5_small_finetuned_time),
    ("LLM", LLM_time),
)

print("Time taken for each model:")
for label, seconds in timing_report:
    print(f"{label}: {seconds:.2f} seconds")

Time taken for each model:
deberta_finetuned: 46.33 seconds
nli_deberta_v3: 177.73 seconds
deberta_pairwise_finetuned: 123.74 seconds
bart_large_mnli: 178.69 seconds
flan_t5_finetuned: 284.03 seconds
flan_t5_small_finetuned: 87.51 seconds
LLM: 521.28 seconds
