In [12]:
from google.cloud import bigquery
from openai import AzureOpenAI
import os
import json

# NOTE(review): presumably set to silence tokenizer parallelism warnings from the
# embedding stack used later in the notebook — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

client = bigquery.Client()

project_id = 'ingka-tugc-infra-prod'
dataset_id = 'eu_ai_content'
table_id = 'reviews'

table_ref = f'{project_id}.{dataset_id}.{table_id}'

# First 5 articles with most reviews

articles = ['20351884', '40346924', '10305741', '00324518', '10360134']

article_id = articles[3]

# The table reference cannot be bound as a query parameter, so it is still
# interpolated; the article id is passed as a named parameter instead of being
# pasted into the SQL text.
query = f"""
    SELECT concat(title, '. ', text) as review_text
    FROM {table_ref}
    WHERE franchise='set-11' AND content_lang_code = 'en' AND art_id = @article_id
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("article_id", "STRING", article_id),
    ]
)

query_job = client.query(query, job_config=job_config)

# CONCAT yields NULL when title or text is NULL; skip those rows so that the
# later length-based filtering never sees None.
reviews = [row['review_text'] for row in query_job if row['review_text'] is not None]

print(f"Processing {len(reviews)} reviews")

Processing 3729 reviews


In [13]:
# Cache the fetched reviews on disk so later runs can skip the BigQuery call.

with open('reviews.json', 'w') as f:
    f.write(json.dumps(reviews))

In [14]:
# Restore the cached reviews from disk instead of re-running the query.

with open('reviews.json', 'r') as f:
    reviews = json.loads(f.read())

print(f"Loaded {len(reviews)} reviews")

Loaded 3729 reviews


In [15]:
from bertopic import BERTopic
from collections import Counter


# Keep only reviews that are at least 20 characters long; the filtered list
# replaces `reviews` for everything that follows.
processed_reviews = list(filter(lambda text: len(text) >= 20, reviews))
reviews = processed_reviews

# Build a BERTopic model with default settings and fit it on the reviews.
topic_model = BERTopic()
# Only the first element of fit_transform's result (the per-review topic) is kept.
primary_topics, _ = topic_model.fit_transform(reviews)

In [16]:
# Number of primary-topic assignments returned by fit_transform.
len(primary_topics)

3626

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_topic_similarity(topic_model, topic_A, topic_B, use_ctfidf=False):
    """Return the cosine similarity between two topics of a fitted topic model.

    Parameters
    ----------
    topic_model : object
        Must expose ``topic_embeddings_``, ``c_tf_idf_`` and ``_outliers``
        (in this notebook: a fitted BERTopic model).
    topic_A, topic_B : int
        Zero-based topic IDs. The outlier topic (-1) and any other negative
        ID are rejected.
    use_ctfidf : bool, default False
        Compare rows of ``c_tf_idf_`` instead of rows of ``topic_embeddings_``.

    Returns
    -------
    The cosine similarity of the two topic vectors.

    Raises
    ------
    ValueError
        If either topic ID is negative or beyond the available rows.
    """
    embeddings = topic_model.c_tf_idf_ if use_ctfidf else topic_model.topic_embeddings_

    # Drop the leading outlier row(s) so that row k corresponds to topic k.
    if topic_model._outliers:
        embeddings = embeddings[topic_model._outliers:]

    # shape[0] works for NumPy arrays and SciPy sparse matrices alike, whereas
    # len() raises TypeError on sparse matrices.
    n_topics = embeddings.shape[0]
    for topic_id in (topic_A, topic_B):
        # Negative IDs would otherwise silently wrap around to the last rows.
        if not 0 <= topic_id < n_topics:
            raise ValueError("Topic IDs must be within the range of available embeddings.")

    # Extract topic embeddings as single-row 2-D inputs for cosine_similarity.
    embedding_A = embeddings[topic_A].reshape(1, -1)
    embedding_B = embeddings[topic_B].reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix here; unwrap the scalar.
    similarity_score = cosine_similarity(embedding_A, embedding_B)[0][0]
    return similarity_score

In [None]:
# Minimum cosine similarity at which two top topics are merged.
threshold = 0.7

# Derive the current top 8 non-outlier topics here so this cell does not depend
# on `top_topics` having been created by a cell further down the notebook.
topics = topic_model.get_topic_freq()
valid_topics = topics[topics.Topic != -1].sort_values(by="Count", ascending=False)
top_topics = valid_topics.head(8).Topic.tolist()

i = 0
while i < len(top_topics):
    j = i + 1
    while j < len(top_topics):
        sim = compute_topic_similarity(topic_model, top_topics[i], top_topics[j], use_ctfidf=False)

        if sim > threshold:
            print(f"Similarity between {top_topics[i]} and {top_topics[j]}: {sim}")
            topic_model.merge_topics(reviews, [top_topics[i], top_topics[j]])
            # The top-8 list is recomputed from the model after every merge.
            topics = topic_model.get_topic_freq()
            valid_topics = topics[topics.Topic != -1].sort_values(by="Count", ascending=False)
            top_topics = valid_topics.head(8).Topic.tolist()
            print(f"New top topics: {top_topics}")
            # Restart the scan: with the increment below the next pair is (0, 1).
            i = 0
            j = 0
        j += 1
    i += 1

In [None]:

# Get overall topic distributions and token-level breakdowns.
# topic_distr: overall distribution per review
# topic_token_distr: token-level breakdown per review (returned as NumPy arrays)
topic_distr, topic_token_distr = topic_model.approximate_distribution(reviews, calculate_tokens=True)

# Set a threshold for what constitutes a "high contribution"
threshold = 0.6  # Adjust this value as needed

# Prepare a list to hold the multi-topic assignments for each review.
multi_topic_assignments = []

for i, review in enumerate(reviews):
    # Start with the primary topic assigned by BERTopic
    assigned_topics = [primary_topics[i]]

    # Get the token-level contributions for this review
    token_contribs = topic_token_distr[i]

    # Check if token_contribs is 1D or 2D:
    if token_contribs.ndim == 1:
        # Each element corresponds directly to a topic's contribution
        for topic_id, contrib in enumerate(token_contribs):
            if contrib >= threshold and topic_id not in assigned_topics:
                assigned_topics.append(topic_id)
    elif token_contribs.ndim == 2:
        # Each row is a token window distribution; columns correspond to topics
        n_topics = token_contribs.shape[1]
        for topic_id in range(n_topics):
            # Check if any token window has a contribution that meets/exceeds the threshold
            if (token_contribs[:, topic_id] >= threshold).any() and topic_id not in assigned_topics:
                assigned_topics.append(topic_id)
    else:
        print(f"Unexpected dimensionality for review {i}: {token_contribs.ndim}")

    multi_topic_assignments.append(assigned_topics)

# -----------------------
# Compute the top 8 topics overall
# -----------------------

# Count how many times each topic appears across all reviews
topic_counts = Counter()
for topics_assigned in multi_topic_assignments:
    topic_counts.update(topics_assigned)

# Rank topics by frequency, drop the outlier topic (-1) first and only then
# keep the 8 most frequent. This works whether or not -1 is among the most
# frequent topics (no KeyError, never more than 8 topics).
ranked_topics = [topic for topic, _ in topic_counts.most_common() if topic != -1]
# Stored as a set for faster lookups
top_8_topics = set(ranked_topics[:8])
print(f"Top 8 topics: {top_8_topics}")
# -----------------------
# Filter the multi-topic assignments to only include topics in the top 8
filtered_multi_topic_assignments = []
for topics_assigned in multi_topic_assignments:
    filtered = [topic for topic in topics_assigned if topic in top_8_topics]
    filtered_multi_topic_assignments.append(filtered)

# Print the reviews with their final (filtered) topics
for idx, topics_assigned in enumerate(filtered_multi_topic_assignments):
    print(f"Review {idx + 1}:")
    print(f"Text: {reviews[idx]}")
    print(f"Topics: {topics_assigned}")
    print("-" * 50)


In [None]:
# Refresh the topic overview table from the model.
topic_info = topic_model.get_topic_info()

# Report the model-generated name of every selected topic.
for topic_id in top_8_topics:
    name_matches = topic_info.loc[topic_info.Topic == topic_id, 'Name']
    topic_name = name_matches.values[0]
    print(f"Topic ID: {topic_id}, Topic Name: {topic_name}")

In [None]:
from bertopic import BERTopic

# Keep only reviews that are at least 20 characters long.
processed_reviews = list(filter(lambda text: len(text) >= 20, reviews))
reviews = processed_reviews

## Limiting the number of topics with nr_topics does not work
nr_topics_before = 'Auto'
nr_topics_after = 'auto'

# Fit a default BERTopic model on the filtered reviews.
topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(reviews)

# Further reduce topics if needed
# topic_model.reduce_topics(reviews, nr_topics=nr_topics_after)

# `topics` is rebound from the per-review assignments to the result of get_topics().
topics = topic_model.get_topics()

In [None]:
# Fetch the topic overview table from the model and preview its first 10 rows.
topic_info = topic_model.get_topic_info()
topic_info.head(10)

In [None]:
# Calculate the per-review topic distributions and the token-level
# distributions in one call; a separate call without calculate_tokens is not
# needed because its result would be overwritten immediately.
topic_distr, topic_token_distr = topic_model.approximate_distribution(reviews, calculate_tokens=True)

# Visualize the token-level distributions for one example review (index 3)
df = topic_model.visualize_approximate_distribution(reviews[3], topic_token_distr[3])
df

In [None]:
# Display the model's bar-chart visualization.
topic_model.visualize_barchart()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_topic_similarity(topic_model, topic_A, topic_B, use_ctfidf=False):
    """Return the cosine similarity between two topics of a fitted topic model.

    NOTE(review): a function with this name is also defined in an earlier cell
    of this notebook; whichever definition ran last is the one in effect.

    Parameters
    ----------
    topic_model : object
        Must expose ``topic_embeddings_``, ``c_tf_idf_`` and ``_outliers``
        (in this notebook: a fitted BERTopic model).
    topic_A, topic_B : int
        Zero-based topic IDs. The outlier topic (-1) and any other negative
        ID are rejected.
    use_ctfidf : bool, default False
        Compare rows of ``c_tf_idf_`` instead of rows of ``topic_embeddings_``.

    Returns
    -------
    The cosine similarity of the two topic vectors.

    Raises
    ------
    ValueError
        If either topic ID is negative or beyond the available rows.
    """
    embeddings = topic_model.c_tf_idf_ if use_ctfidf else topic_model.topic_embeddings_

    # Drop the leading outlier row(s) so that row k corresponds to topic k.
    if topic_model._outliers:
        embeddings = embeddings[topic_model._outliers:]

    # shape[0] works for NumPy arrays and SciPy sparse matrices alike, whereas
    # len() raises TypeError on sparse matrices.
    n_topics = embeddings.shape[0]
    for topic_id in (topic_A, topic_B):
        # Negative IDs would otherwise silently wrap around to the last rows.
        if not 0 <= topic_id < n_topics:
            raise ValueError("Topic IDs must be within the range of available embeddings.")

    # Extract topic embeddings as single-row 2-D inputs for cosine_similarity.
    embedding_A = embeddings[topic_A].reshape(1, -1)
    embedding_B = embeddings[topic_B].reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix here; unwrap the scalar.
    similarity_score = cosine_similarity(embedding_A, embedding_B)[0][0]
    return similarity_score

In [17]:
# Display the model's heatmap visualization.
topic_model.visualize_heatmap()

In [None]:
# Topic frequencies without the outlier topic (-1), most frequent first.
topics = topic_model.get_topic_freq()
valid_topics = topics.loc[topics.Topic != -1].sort_values(by="Count", ascending=False)

# The 8 most frequent topics are the starting selection.
top_topics = valid_topics.Topic.head(8).tolist()

# Print each selected topic with its name from topic_info.
for topic_id in top_topics:
    name_matches = topic_info.loc[topic_info.Topic == topic_id, 'Name']
    topic_name = name_matches.values[0]
    print(f"Topic ID: {topic_id}, Topic Name: {topic_name}")

In [None]:
# Minimum cosine similarity at which two top topics are merged.
threshold = 0.7

# Scan all pairs (i < j) of the current top topics in order. As soon as one
# pair is similar enough it is merged, the top-8 list is rebuilt from the
# model, and the scan starts over from the first pair. The loop ends once a
# complete scan finds nothing to merge.
merged_any = True
while merged_any:
    merged_any = False
    for i in range(len(top_topics)):
        for j in range(i + 1, len(top_topics)):
            sim = compute_topic_similarity(topic_model, top_topics[i], top_topics[j], use_ctfidf=False)

            if sim > threshold:
                print(f"Similarity between {top_topics[i]} and {top_topics[j]}: {sim}")
                topic_model.merge_topics(reviews, [top_topics[i], top_topics[j]])
                topics = topic_model.get_topic_freq()
                valid_topics = topics[topics.Topic != -1].sort_values(by="Count", ascending=False)
                top_topics = valid_topics.head(8).Topic.tolist()
                print(f"New top topics: {top_topics}")
                merged_any = True
                break
        if merged_any:
            break

In [None]:
# Re-fetch the topic overview table from the model and preview its first 10 rows.
topic_info = topic_model.get_topic_info()
topic_info.head(10)

In [None]:
# Inspect what the model returns for topic 1.
topic_model.get_topic(1)

In [None]:
# Display the model's bar-chart visualization again.
topic_model.visualize_barchart()

In [None]:
# Current topic frequencies and overview table from the model.
topics = topic_model.get_topic_freq()
topic_info = topic_model.get_topic_info()
valid_topics = topics.loc[topics.Topic != -1].sort_values(by="Count", ascending=False)

# The 8 most frequent non-outlier topics.
top_topics = valid_topics.Topic.head(8).tolist()

print(top_topics)

# Print every top topic ID followed by its name.
for i in top_topics:
    print(f"Top topic: {i}")
    name_matches = topic_info.loc[topic_info.Topic == i, 'Name']
    top_topic_name = name_matches.values[0]
    print(f"Name: {top_topic_name}\n")

In [None]:
# Alias the current top topics as `topic_ids` (passed to get_reviews below) and display them.
topic_ids = top_topics
topic_ids

In [None]:
def get_reviews(reviews, topic_model, topic_ids):
    """Group reviews by the topic the model predicts for them.

    Parameters:
    - reviews: sequence of review texts handed to ``topic_model.transform``.
    - topic_model: object whose ``transform(reviews)`` result has the
      per-review topic assignments as its first element.
    - topic_ids: iterable of topic IDs to collect reviews for.

    Returns:
    - dict mapping each requested topic ID to the list of reviews (in their
      original order) whose predicted topic equals that ID; IDs without any
      matching review map to an empty list.
    """
    predicted_topics = topic_model.transform(reviews)[0]

    grouped = {}
    for wanted_topic in topic_ids:
        matches = []
        for text, predicted in zip(reviews, predicted_topics):
            if predicted == wanted_topic:
                matches.append(text)
        grouped[wanted_topic] = matches

    return grouped

In [None]:
# Group the reviews by predicted topic for the selected topic IDs.
topic_reviews = get_reviews(reviews, topic_model, topic_ids)

In [None]:
# Display the complete topic -> reviews mapping (every grouped review is shown).
topic_reviews

In [None]:
from dotenv import load_dotenv

# Load environment variables via python-dotenv, then read the API key.
load_dotenv()
# os.getenv returns None when API_KEY is not set.
api_key = os.getenv("API_KEY")

In [None]:
# Azure OpenAI client used by get_topic_keyword. The key comes from the
# environment; the API version and endpoint are fixed here.
llm_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/",
)

# Value passed as `model` to chat.completions.create.
# NOTE(review): on Azure this is presumably the deployment name — confirm.
model = "gpt-4o-mini" 

In [None]:
def get_topic_keyword(cluster_words):
    """Ask the chat model for a short topic label describing a cluster of words.

    Uses the notebook-level ``llm_client`` and ``model``.

    Parameters:
    - cluster_words: the cluster to label; it is interpolated into the prompt
      with its default string formatting.

    Returns:
    - The model's reply with surrounding whitespace removed, or an empty
      string when the reply carries no text content.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert summarizer that identifies and generates a concise, broad topic word for each cluster of words.\n"
                "The topic word should capture the essence of all the words in the cluster.\n"
                "Merge similar or related words into a single, broader category.\n"
                "Use singular words unless a plural form is necessary.\n"
                "Use only one word. 2 or 3 words can be used only when they are part of a composite word and are better to represent the idea of the topic (e.g.: ease of use).\n"
                "If you identify a verb as a topic, use the noun version (e.g., use 'order' instead of 'ordering').\n"
                "Generalize the topic word; for example, if you encounter 'saleswoman' or 'salesman', abstract it to 'staff'.\n"
                "Provide the output as a single word. Always start with the first letter capitalized"
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read the following cluster of words carefully and generate a single topic word that captures the essence of all the words.\n"
                "The topic word should be broad and general, capturing the essence of the cluster's main points without being overly specific or redundant.\n"
                "The topics could be either nouns that refers to a certain characteristic of the product of spefic features or parts of the product (e.g.: click & collect, email redeem, etc.)\n"
                f"Topic: {cluster_words}\n"
                "Topic word(s):"
            ),
        },
    ]

    # Generate the topic word using the language model
    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=5,
        temperature=0.4,
        n=1,
        stop=None,
    )

    # The message content may be None; treat that as an empty label instead of
    # failing on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
import csv
def create_csv_file(topic_reviews, output_file):
    """
    Create a CSV file with reviews and their associated topic keywords.

    For every topic the keyword is generated once with get_topic_keyword and
    then written next to each of that topic's reviews. Relies on the
    notebook-level ``topic_model`` and ``topic_info``.

    Parameters:
    - topic_reviews: Dictionary with topic IDs as keys and lists of reviews as values.
    - output_file: Path to the output CSV file. Missing parent directories are created.
    """
    # Make sure the target folder exists; open() does not create it.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII review text is written the same way on every platform.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['review', 'topics']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()

        for topic_id, reviews_list in topic_reviews.items():
            topic_details = topic_model.get_topic(topic_id)
            topic_name = topic_info[topic_info.Topic == topic_id]['Name'].values[0]
            topic_keyword = get_topic_keyword(topic_details)
            print(f"Topic ID: {topic_id}, Topic Name: {topic_name} --> Topic Keyword: {topic_keyword}")
            for review in reviews_list:
                # The topics column holds a list-like string with the single keyword.
                writer.writerow({'review': review, 'topics': f"['{topic_keyword}']"})

    print(f"CSV file saved to {output_file}")

In [None]:
# Create the CSV file
csv_output_file = f'csv/BERTopicTop8_{article_id}.csv'
create_csv_file(topic_reviews, csv_output_file)

# File name handed to the Streamlit app in the next cell.
# NOTE(review): unlike csv_output_file this name has no 'csv/' prefix —
# presumably app.py resolves it relative to that folder; confirm.
streamlit_csv = f'BERTopicTop8_{article_id}.csv'

## Run the demo

In [None]:
# Start the Streamlit app from the shell, passing the CSV file name as a script argument after `--`.
!streamlit run app.py -- "$streamlit_csv"