In [7]:
from google.cloud import bigquery
from openai import AzureOpenAI
import os
import json

os.environ["TOKENIZERS_PARALLELISM"] = "false"

client = bigquery.Client()

project_id = 'ingka-tugc-infra-prod'
dataset_id = 'eu_ai_content'
table_id = 'reviews'

table_ref = f'{project_id}.{dataset_id}.{table_id}'

# First 5 articles with most reviews

articles = ['20351884', '40346924', '10305741', '00324518', '10360134']

article_id = articles[0]

query = f"""
    SELECT concat(title, '. ', text) as review_text
    FROM {table_ref}
    WHERE franchise='set-11' AND content_lang_code = 'en' AND art_id = '{article_id}'
"""

query_job = client.query(query)

reviews = [row['review_text'] for row in query_job]

print(f"Processing {len(reviews)} reviews")

Processing 3724 reviews


In [4]:
# Save reviews to a JSON file to avoid querying every time
import json

reviews = []

with open('reviews.json', 'w') as f:
    json.dump(reviews, f)

In [5]:
# Load reviews from JSON file

with open('reviews.json', 'r') as f:
    reviews = json.load(f)

print(f"Loaded {len(reviews)} reviews")

Loaded 0 reviews


In [8]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk


processed_reviews = [review for review in reviews if len(review) >= 20]
reviews = processed_reviews
## Limiting the number of topics with nr_topics does not work
nr_topics_before = 'Auto'
topic_model = BERTopic()

# Fit the model on the reviews
topics, probabilities = topic_model.fit_transform(reviews)

nr_topics_after = 'auto'

# Further reduce topics if needed
# topic_model.reduce_topics(reviews, nr_topics=nr_topics_after)

topics = topic_model.get_topics()

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [9]:
topic_info = topic_model.get_topic_info()
topic_info.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1187,-1_the_and_to_for,"[the, and, to, for, it, in, of, my, this, great]",[Kallax. We were looking for the best storage ...
1,0,196,0_shelf_shelves_great_this,"[shelf, shelves, great, this, are, it, to, the...","[Love this Shelf!. I love this shelf, it was e..."
2,1,191,1_easy_assemble_to_together,"[easy, assemble, to, together, put, looks, sim...","[Easy to assemble. Easy to assemble, Easy to a..."
3,2,154,2_closet_my_organizer_clothes,"[closet, my, organizer, clothes, perfect, for,...",[Great for organizing!. I out this unit in my ...
4,3,93,3_kallax_storage_of_for,"[kallax, storage, of, for, units, the, in, my,...","[Kallax is a great base, but need more accesso..."
5,4,87,4_easy_product_assemble_to,"[easy, product, assemble, to, great, together,...",[Love it! So easy to. Love it! So easy to put ...
6,5,85,5_shelving_unit_great_very,"[shelving, unit, great, very, and, units, they...","[Shelving unit. Great product, good quality, v..."
7,6,81,6_tv_stand_as_it,"[tv, stand, as, it, use, this, perfect, its, o...","[Love it. I Love it. I use it as a TV stand., ..."
8,7,73,7_vinyl_records_record_collection,"[vinyl, records, record, collection, albums, f...",[Vinyl Record Shelving. finally having a place...
9,8,58,8_versatile_one_can_in,"[versatile, one, can, in, use, different, and,...","[versatile shelf. versatile shelf, Amazing ver..."


In [10]:
number_of_topics = len(topics)
number_of_topics

58

In [11]:
topic_model.visualize_barchart()

In [12]:
topic_model.visualize_topics()

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_topic_similarity(topic_model, topic_A, topic_B, use_ctfidf=False):
    if use_ctfidf:
        embeddings = topic_model.c_tf_idf_
    else:
        embeddings = topic_model.topic_embeddings_

    # Adjust for outliers if necessary
    if topic_model._outliers:
        embeddings = embeddings[topic_model._outliers:]

    # Check indices and ensure they match the embeddings
    if topic_A >= len(embeddings) or topic_B >= len(embeddings):
        raise ValueError("Topic IDs must be within the range of available embeddings.")

    # Extract topic embeddings
    embedding_A = embeddings[topic_A].reshape(1, -1)
    embedding_B = embeddings[topic_B].reshape(1, -1)

    # Compute cosine similarity
    similarity_score = cosine_similarity(embedding_A, embedding_B)[0][0]
    return similarity_score

In [14]:
topic_model.visualize_heatmap()

In [15]:
topics = topic_model.get_topic_freq()
valid_topics = topics[topics.Topic != -1].sort_values(by="Count", ascending=False)

# Start by selecting the first 8 topics
top_topics = valid_topics.head(8).Topic.tolist()

for topic_id in top_topics:
    topic_name = topic_info[topic_info.Topic == topic_id]['Name'].values[0]
    print(f"Topic ID: {topic_id}, Topic Name: {topic_name}")

Topic ID: 0, Topic Name: 0_shelf_shelves_great_this
Topic ID: 1, Topic Name: 1_easy_assemble_to_together
Topic ID: 2, Topic Name: 2_closet_my_organizer_clothes
Topic ID: 3, Topic Name: 3_kallax_storage_of_for
Topic ID: 4, Topic Name: 4_easy_product_assemble_to
Topic ID: 5, Topic Name: 5_shelving_unit_great_very
Topic ID: 6, Topic Name: 6_tv_stand_as_it
Topic ID: 7, Topic Name: 7_vinyl_records_record_collection


In [16]:
threshold = 0.7

i = 0
while i < len(top_topics):
    j = i + 1
    while j < len(top_topics):
        sim = compute_topic_similarity(topic_model, top_topics[i], top_topics[j], use_ctfidf=False)

        if sim > threshold:
            print(f"Similarity between {top_topics[i]} and {top_topics[j]}: {sim}")
            topic_model.merge_topics(reviews, [top_topics[i], top_topics[j]])
            topics = topic_model.get_topic_freq()
            valid_topics = topics[topics.Topic != -1].sort_values(by="Count", ascending=False)
            top_topics = valid_topics.head(8).Topic.tolist()
            print(f"New top topics: {top_topics}")
            i = 0
            j = 0
        j += 1
    i += 1

Similarity between 0 and 2: 0.7069297432899475
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]
Similarity between 0 and 4: 0.8670609615216309
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]
Similarity between 0 and 6: 0.8106400857375843
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]
Similarity between 0 and 2: 0.7007503000189184
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]
Similarity between 0 and 4: 0.7187660165985583
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]
Similarity between 0 and 4: 0.8405592221650158
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]
Similarity between 0 and 6: 0.7383412204888669
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]
Similarity between 1 and 2: 0.7795648713387489
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]
Similarity between 3 and 4: 0.8302508444436915
New top topics: [0, 1, 2, 3, 4, 5, 6, 7]


In [17]:
topic_info = topic_model.get_topic_info()
topic_info.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1187,-1_the_and_to_for,"[the, and, to, for, it, in, of, my, this, great]",[Kallax. We were looking for the best storage ...
1,0,764,0_storage_shelf_for_my,"[storage, shelf, for, my, great, and, the, to,...",[Great Shelf Unit. Very easy to put together a...
2,1,278,1_easy_assemble_to_together,"[easy, assemble, to, together, put, looks, and...",[Easy to assemble and looks. Easy to assemble ...
3,2,101,2_product_good_great_very,"[product, good, great, very, happy, recommend,...","[Great product. Great product, great product. ..."
4,3,81,3_tv_stand_as_it,"[tv, stand, as, it, use, this, perfect, its, a...","[Love it. I Love it. I use it as a TV stand., ..."
5,4,50,4_unit_this_looks_great,"[unit, this, looks, great, units, these, have,...",[Great unit. Looks good. Very. Great unit. Loo...
6,5,48,5_kallax_units_the_fittings,"[kallax, units, the, fittings, love, versatile...",[Kallax is ideal for my requirements. I have a...
7,6,47,6_ikea_is_quality_to,"[ikea, is, quality, to, not, the, it, from, at...",[My god. I bought this about a month ago and I...
8,7,46,7_sturdy_very_strong_and,"[sturdy, very, strong, and, built, constructio...",[Very sturdy and very easy. Very sturdy and ve...
9,8,44,8_what_exactly_needed_looking,"[what, exactly, needed, looking, just, needs, ...","[Exactly what I needed. Exactly what I needed,..."


In [18]:
topic_model.get_topic(1)

[('easy', 0.059759778719918895),
 ('assemble', 0.04316517930024521),
 ('to', 0.03607787863667898),
 ('together', 0.03232533761646141),
 ('put', 0.028529638208410084),
 ('looks', 0.021501725939270033),
 ('and', 0.019199097318613488),
 ('assembly', 0.018809944187871307),
 ('it', 0.017998551761603605),
 ('great', 0.01650178851496449)]

In [19]:
topic_model.visualize_barchart()

In [20]:
topics = topic_model.get_topic_freq()
topic_info = topic_model.get_topic_info()
valid_topics = topics[topics.Topic != -1].sort_values(by="Count", ascending=False)

# Start by selecting the first 8 topics
top_topics = valid_topics.head(8).Topic.tolist()

print(top_topics)

for i in top_topics:
    print(f"Top topic: {i}")
    top_topic_name = topic_info[topic_info.Topic == i]['Name'].values[0]
    print(f"Name: {top_topic_name}\n")

[0, 1, 2, 3, 4, 5, 6, 7]
Top topic: 0
Name: 0_storage_shelf_for_my

Top topic: 1
Name: 1_easy_assemble_to_together

Top topic: 2
Name: 2_product_good_great_very

Top topic: 3
Name: 3_tv_stand_as_it

Top topic: 4
Name: 4_unit_this_looks_great

Top topic: 5
Name: 5_kallax_units_the_fittings

Top topic: 6
Name: 6_ikea_is_quality_to

Top topic: 7
Name: 7_sturdy_very_strong_and



In [21]:
topic_ids = top_topics
topic_ids

[0, 1, 2, 3, 4, 5, 6, 7]

In [22]:
def get_reviews(reviews, topic_model, topic_ids):

    topic_reviews = {}

    topic_assignments = topic_model.transform(reviews)[0]

    for topic_id in topic_ids:
        topic_reviews[topic_id] = [review for review, assigned_topic in zip(reviews, topic_assignments) if assigned_topic == topic_id]
        
    return topic_reviews

In [23]:
topic_reviews = get_reviews(reviews, topic_model, topic_ids)

In [24]:
topic_reviews

{0: ['Large size cubes. Bought this for our son’s room & it works for its intended purpose. Our oldest boys ( 31 & 29 ) grew up with ikea furniture in their room & we are keeping the tradition going.',
  'Wonderful for storage of vinyl records. This unit works great as a music station in our living room.  The bookshelf stereo, record player, and snack tray sit on top, records go in the top 4 cubbies, speakers in the bottom , along with drawers for CDs and tapes.  \nThe unit is very attractive, sturdy, and fits the space well.\nIt was very easy to put together (but installing the drawers was not!). ',
  "Great Shelving For Organizing Just About Anything!. We love these Kallax unit shelves very much and have them in different areas in our house serving different functions. The main one we use the most is a set up of a 2x4 side by side with a 4x4 with a 1x4 across the top of those, and we use them to store our boardgames. The white looks really sharp and makes the board game covers pop. W

In [25]:
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("API_KEY")

In [26]:
llm_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/",
)

model = "gpt-4o-mini" 

In [27]:
def get_topic_keyword(cluster_words):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert summarizer that identifies and generates a concise, broad topic word for each cluster of words.\n"
                "The topic word should capture the essence of all the words in the cluster.\n"
                "Merge similar or related words into a single, broader category.\n"
                "Use singular words unless a plural form is necessary.\n"                
                "Use only one word. 2 or 3 words can be used only when they are part of a composite word and are better to represent the idea of the topic (e.g.: ease of use).\n"
                "If you identify a verb as a topic, use the noun version (e.g., use 'order' instead of 'ordering').\n"
                "Generalize the topic word; for example, if you encounter 'saleswoman' or 'salesman', abstract it to 'staff'.\n"
                "Provide the output as a single word. Always start with the first letter capitalized"
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read the following cluster of words carefully and generate a single topic word that captures the essence of all the words.\n"
                "The topic word should be broad and general, capturing the essence of the cluster's main points without being overly specific or redundant.\n"
                "The topics could be either nouns that refers to a certain characteristic of the product of spefic features or parts of the product (e.g.: click & collect, email redeem, etc.)\n"
                f"Topic: {cluster_words}\n"
                "Topic word(s):"
            ),
        },
    ]

    response = ' '
    
    # Generate the topic word using the language model
    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=5,
        temperature=0.4,
        n=1,
        stop=None,
    )

    # Extract and return the topic word
    return response.choices[0].message.content.strip()

In [28]:
import csv
def create_csv_file(topic_reviews, output_file):
    """
    Create a CSV file with reviews and their associated topic keywords.

    Parameters:
    - reviews: List of all reviews.
    - topic_reviews: Dictionary with topic IDs as keys and lists of reviews as values.
    - output_file: Path to the output CSV file.
    """
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['review', 'topics']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        
        for topic_id, reviews_list in topic_reviews.items():
            topic_details = topic_model.get_topic(topic_id)
            topic_name = topic_info[topic_info.Topic == topic_id]['Name'].values[0]
            topic_keyword = get_topic_keyword(topic_details)
            print(f"Topic ID: {topic_id}, Topic Name: {topic_name} --> Topic Keyword: {topic_keyword}")
            for review in reviews_list:
                writer.writerow({'review': review, 'topics': f"['{topic_keyword}']"})

    print(f"CSV file saved to {output_file}")

In [29]:
# Create the CSV file
csv_output_file = f'csv/BERTopicTop8_{article_id}.csv'
create_csv_file(topic_reviews, csv_output_file)

streamlit_csv = f'BERTopicTop8_{article_id}.csv'

Topic ID: 0, Topic Name: 0_storage_shelf_for_my --> Topic Keyword: Storage
Topic ID: 1, Topic Name: 1_easy_assemble_to_together --> Topic Keyword: Assembly
Topic ID: 2, Topic Name: 2_product_good_great_very --> Topic Keyword: Quality
Topic ID: 3, Topic Name: 3_tv_stand_as_it --> Topic Keyword: Television
Topic ID: 4, Topic Name: 4_unit_this_looks_great --> Topic Keyword: Quality
Topic ID: 5, Topic Name: 5_kallax_units_the_fittings --> Topic Keyword: Storage
Topic ID: 6, Topic Name: 6_ikea_is_quality_to --> Topic Keyword: Quality
Topic ID: 7, Topic Name: 7_sturdy_very_strong_and --> Topic Keyword: Durability
CSV file saved to csv/BERTopicTop8_20351884.csv


## Run the demo

In [31]:
!streamlit run app.py -- "$streamlit_csv"

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.178.207:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
^C
[34m  Stopping...[0m
Exception ignored in: <module 'threading' from '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/threading.py'>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/threading.py", line 1594, in _shutdown
    atexit_call()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/concurrent/futures/thread.py", line 31, in _python_exit
    t.join()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/threading.py", line 1149, in join
    self._wait_for_tstate_lock()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib