In [1]:
from google.cloud import bigquery
from openai import AzureOpenAI
import os

# Set before any tokenizer-backed library is used.
# NOTE(review): presumably silences the Hugging Face tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Uses whatever default credentials/project the environment provides (nothing is passed explicitly).
client = bigquery.Client()

project_id = 'ingka-tugc-infra-prod'
dataset_id = 'eu_ai_content'
table_id = 'reviews'

# Fully-qualified `project.dataset.table` reference used in the FROM clause below.
table_ref = f'{project_id}.{dataset_id}.{table_id}'

# Query to get all the data - 1.17GB to process

# Candidate article IDs; only the first one is analysed in this run.
articles = ['20351884', '40346924', '10305741', '00324518', '10360134']

article_id = articles[0]

# Title and body are concatenated into one text per review.
# NOTE(review): `article_id` is interpolated straight into the SQL text. That is fine for the
# hard-coded list above, but switch to BigQuery query parameters if it ever comes from user input.
# NOTE(review): if `title` or `text` can be NULL the concat result is presumably NULL, which would
# surface as `None` in `reviews` and break the later `.lower()` call — confirm against the table.
query = f"""
    SELECT concat(title, '. ', text) as review_text
    FROM {table_ref}
    WHERE franchise='set-11' AND content_lang_code = 'en' AND art_id = '{article_id}'
"""

query_job = client.query(query)

# Iterating the job yields the result rows; keep only the concatenated text column.
reviews = [row['review_text'] for row in query_job]

print(f"Processing {len(reviews)} reviews")

Processing 3723 reviews


In [2]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Union of the NLTK and scikit-learn English stop-word lists.
# NOTE(review): NLTK presumably needs its 'stopwords' and tokenizer data downloaded beforehand — confirm.
stop_words = set(stopwords.words('english')).union(set(ENGLISH_STOP_WORDS))

# Lower-case, tokenize, keep only alphanumeric tokens that are not stop words, and re-join with spaces.
processed_reviews = [' '.join([word for word in word_tokenize(review.lower()) if word.isalnum() and word not in stop_words]) for review in reviews]

## Limiting the number of topics with nr_topics does not work
# NOTE(review): `nr_topics_before` is assigned but never passed to BERTopic() in this cell.
nr_topics_before = 'Auto'
topic_model = BERTopic()

# Fit the model on the reviews
topics, probabilities = topic_model.fit_transform(processed_reviews)

# NOTE(review): only referenced by the commented-out reduce_topics call below.
nr_topics_after = 'auto'

# Further reduce topics if needed
# topic_model.reduce_topics(reviews, nr_topics=nr_topics_after)

# Rebinds `topics`: the value returned by fit_transform above is discarded and replaced by the
# result of get_topics(). Later cells (`len(topics)`) rely on this second meaning.
topics = topic_model.get_topics()

  from .autonotebook import tqdm as notebook_tqdm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [3]:
# Join the `Name` column of the topic-info table into one '; '-separated string for a quick overview.
topic_info = topic_model.get_topic_info()
all_topic_names = '; '.join(topic_info['Name'])
all_topic_names

'-1_storage_room_unit_bought; 0_shelf_shelving_shelves_unit; 1_tv_stand_use_using; 2_assemble_easy_assembly_good; 3_closet_clothes_organizer_perfect; 4_kallax_love_delivery_versatile; 5_bookcase_books_bookshelf_bookcases; 6_kallax_shelf_shelves_shelving; 7_vinyl_records_record_collection; 8_holes_dowels_wooden_screws; 9_toy_toys_kid_kids; 10_gloss_white_grey_high; 11_storage_options_solution_needs; 12_works_exactly_needed_purpose; 13_quality_good_price_reasonable; 14_vertical_horizontal_vertically_horizontally; 15_kallax_storage_units_dependable; 16_sturdy_nice_strong_construction; 17_fits_space_suits_perfect; 18_love_sturdiness_second_cute; 19_kallax_studio_units_items; 20_furniture_piece_affordable_steady; 21_good_ok_bad_say; 22_looks_easy_set_contemporary; 23_daughter_kids_room_rooms; 24_ikea_easiest_quality_furniture; 25_cubes_cube_storage_best; 26_excellent_great_awesome_fantastic; 27_product_excellent_great_wounderful; 28_value_money_good_price; 29_bedroom_room_living_addition; 3

In [4]:
# Counts the entries of `topics` as bound by `topic_model.get_topics()` in the fitting cell.
# NOTE(review): `topics` is rebound again by the adjust_topics cell further down, so this cell
# gives a different number if it is re-run after that one.
number_of_topics = len(topics)
number_of_topics

61

In [5]:
# Bar chart view of the fitted topics (rendered figure is the cell output).
topic_model.visualize_barchart()

In [6]:
# Topic overview plot for the fitted model (rendered figure is the cell output).
topic_model.visualize_topics()

In [7]:
# Heatmap view of the fitted topics (rendered figure is the cell output).
topic_model.visualize_heatmap()

In [8]:
# Build the topic hierarchy from the same preprocessed documents the model was fitted on.
# The resulting table (Parent_ID / Child_*_ID / Topics / Distance columns) drives every later cell.
hierarchical_topics = topic_model.hierarchical_topics(processed_reviews)

100%|██████████| 59/59 [00:00<00:00, 668.81it/s]


In [9]:
# Plot the hierarchy computed in the previous cell.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [10]:
# Disabled: `topics_over_time` is not computed in any cell shown in this notebook,
# so the call below cannot run as-is.
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [11]:
# Text rendering of the hierarchy; printed (not displayed) so the tree's line breaks are kept.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─good_product_easy_assemble_great
│    ├─assemble_easy_sturdy_looks_nice
│    │    ├─nice_thanks_product_item_neat
│    │    │    ├─■──nice_thanks_neat_really_great ── Topic: 56
│    │    │    └─■──nice_product_item_finish_color ── Topic: 58
│    │    └─assemble_easy_sturdy_looks_good
│    │         ├─assemble_easy_sturdy_looks_good
│    │         │    ├─sturdy_strong_assemble_construction_easy
│    │         │    │    ├─■──sturdy_nice_strong_construction_bracing ── Topic: 16
│    │         │    │    └─■──sturdy_assemble_easy_strong_assembled ── Topic: 33
│    │         │    └─assemble_easy_looks_assembly_good
│    │         │         ├─assemble_easy_assembly_good_looks
│    │         │         │    ├─■──assemble_storage_easy_sleek_quick ── Topic: 38
│    │         │         │    └─■──assemble_easy_assembly_good_instructions ── Topic: 2
│    │         │         └─install_easy_looks_build_super
│    │         │              ├─■──looks_easy_set_contemporary_good ── Topic: 22
│    │   

In [12]:
# Inspect the raw hierarchy table; the first row is the root merge (highest Parent_ID in the output).
hierarchical_topics

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
58,118,great_easy_good_kallax_love,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",116,good_product_easy_assemble_great,117,kallax_shelf_love_perfect_great,1.626265
57,117,kallax_shelf_love_perfect_great,"[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 17...",111,kallax_shelf_tv_love_unit,113,closet_perfect_room_toys_works,1.469752
56,116,good_product_easy_assemble_great,"[2, 11, 13, 16, 21, 22, 26, 27, 28, 31, 33, 34...",115,assemble_easy_sturdy_looks_nice,114,good_product_quality_excellent_great,1.332535
55,115,assemble_easy_sturdy_looks_nice,"[2, 16, 22, 33, 34, 38, 51, 56, 58]",60,nice_thanks_product_item_neat,103,assemble_easy_sturdy_looks_good,1.307416
54,114,good_product_quality_excellent_great,"[11, 13, 21, 26, 27, 28, 31, 40, 41, 42, 49, 5...",105,good_product_quality_value_price,109,happy_storage_functional_great_ago,1.284173
53,113,closet_perfect_room_toys_works,"[3, 9, 12, 17, 23, 29, 32, 35, 36, 39, 44, 46,...",112,toys_room_sewing_toy_craft,110,closet_perfect_clothes_exactly_works,1.270379
52,112,toys_room_sewing_toy_craft,"[9, 23, 29, 32, 35, 39, 44, 46, 48, 50]",108,sewing_craft_room_table_cutting,101,toys_toy_kids_room_daughter,1.231213
51,111,kallax_shelf_tv_love_unit,"[0, 1, 4, 5, 6, 7, 8, 10, 14, 15, 18, 19, 20, ...",107,tv_stand_ikea_vinyl_furniture,104,kallax_shelf_shelving_shelves_unit,1.177161
50,110,closet_perfect_clothes_exactly_works,"[3, 12, 17, 36, 53, 55]",63,closet_clothes_organizer_perfect_walk,99,perfect_exactly_works_expected_fits,1.135177
49,109,happy_storage_functional_great_ago,"[11, 26, 40, 41, 42, 49, 52, 54, 57]",102,storage_functional_great_unit_versatile,93,happy_ago_month_boxes_bought,1.124278


In [13]:
def get_topics_at_depth(df, depth, include_leaves=False):
    """Return the nodes located exactly `depth` levels below the root of a topic tree.

    Parameters
    ----------
    df : pandas.DataFrame
        Hierarchy table with one row per merge and the columns ``Parent_ID``,
        ``Parent_Name``, ``Child_Left_ID``, ``Child_Left_Name``,
        ``Child_Right_ID``, ``Child_Right_Name`` and ``Distance``.
        The FIRST row is taken to be the root of the tree.
    depth : int
        Number of levels below the root (the root itself is level 0).
        Values <= 0 yield an empty list.
    include_leaves : bool, default False
        When True, a node that has no children and sits ABOVE `depth` is
        returned as well, so the result covers every branch of the tree.
        With the default (False) such branches are omitted.

    Returns
    -------
    list of tuple
        ``(node_id, node_name, distance)`` in breadth-first order, where
        `distance` is the ``Distance`` of the row in which the node appears
        as a child (it is not accumulated along the path). Shallow leaves
        added by `include_leaves` follow the same convention.
        An empty `df` yields an empty list.
    """
    from collections import deque

    # Nothing to walk; without this guard `df.iloc[0]` raises IndexError.
    if len(df) == 0:
        return []

    # parent id -> [(left child), (right child)], each carrying the row's distance.
    children_of = {}
    row_columns = (
        df['Parent_ID'],
        df['Child_Left_ID'], df['Child_Left_Name'],
        df['Child_Right_ID'], df['Child_Right_Name'],
        df['Distance'],
    )
    for parent_id, left_id, left_name, right_id, right_name, distance in zip(*row_columns):
        children_of[parent_id] = [
            (left_id, left_name, distance),
            (right_id, right_name, distance),
        ]

    root_row = df.iloc[0]

    # Breadth-first walk; queue entries are (id, name, distance, level).
    queue = deque([(root_row['Parent_ID'], root_row['Parent_Name'], None, 0)])
    result = []

    while queue:
        node_id, node_name, node_distance, node_level = queue.popleft()
        children = children_of.get(node_id, [])

        if not children:
            # Only nodes above the requested depth are ever queued, so this is a
            # leaf that ends its branch early.
            if include_leaves:
                result.append((node_id, node_name, node_distance))
            continue

        child_level = node_level + 1
        for child_id, child_name, child_distance in children:
            if child_level == depth:
                result.append((child_id, child_name, child_distance))
            elif child_level < depth:
                queue.append((child_id, child_name, child_distance, child_level))

    return result

In [14]:
# Cut the hierarchy three levels below the root and list the nodes found there.
# Each tuple is (id, name, distance), as returned by get_topics_at_depth.
topics_at_depth = get_topics_at_depth(hierarchical_topics, 3)
for topic in topics_at_depth:
    print(f"ID: {topic[0]}, Name: {topic[1]}, Distance: {topic[2]}")

ID: 60, Name: nice_thanks_product_item_neat, Distance: 1.3074162661036748
ID: 103, Name: assemble_easy_sturdy_looks_good, Distance: 1.3074162661036748
ID: 105, Name: good_product_quality_value_price, Distance: 1.2841732492718585
ID: 109, Name: happy_storage_functional_great_ago, Distance: 1.2841732492718585
ID: 107, Name: tv_stand_ikea_vinyl_furniture, Distance: 1.177160861935259
ID: 104, Name: kallax_shelf_shelving_shelves_unit, Distance: 1.177160861935259
ID: 112, Name: toys_room_sewing_toy_craft, Distance: 1.270378767403591
ID: 110, Name: closet_perfect_clothes_exactly_works, Distance: 1.270378767403591


In [15]:
def adjust_topics(df, topics, threshold):
    """Rebalance a set of (id, name, distance) topics around a distance threshold.

    For every input topic whose distance is below `threshold`, the topic and one
    other topic with an identical distance are replaced by the first topic's
    parent; afterwards the topic with the largest distance in the working set is
    replaced by its two children.

    Parameters
    ----------
    df : hierarchy table read row by row; uses the columns ``Parent_ID``,
        ``Parent_Name``, ``Child_Left_ID``, ``Child_Left_Name``,
        ``Child_Right_ID``, ``Child_Right_Name`` and ``Distance``.
    topics : iterable of hashable ``(id, name, distance)`` tuples.
    threshold : value compared against each tuple's third element.

    Returns
    -------
    list of ``(id, name, distance)`` tuples. The list is built from a set, so
    its order is not the input order and duplicates are collapsed.

    NOTE(review): "same distance" appears to be used as a stand-in for "sibling"
    (both children of a row are stored with that row's Distance) — confirm that
    unrelated nodes can never share a distance value.
    """
    from collections import defaultdict

    # Build child->parent and parent->children maps
    child_to_parent = {}
    parent_to_children = defaultdict(list)
    for _, row in df.iterrows():
        p_id, p_name, p_dist = row['Parent_ID'], row['Parent_Name'], row['Distance']
        cl_id, cl_name = row['Child_Left_ID'], row['Child_Left_Name']
        cr_id, cr_name = row['Child_Right_ID'], row['Child_Right_Name']
        
        # Both directions carry the Distance of the row that links parent and children.
        child_to_parent[cl_id] = (p_id, p_name, p_dist)
        child_to_parent[cr_id] = (p_id, p_name, p_dist)
        parent_to_children[p_id].append((cl_id, cl_name, p_dist))
        parent_to_children[p_id].append((cr_id, cr_name, p_dist))

    # Start with the current topics in a set
    final_topics = set(topics)
    
    # Below-threshold topics
    # Computed once from the input, so topics added during the loop are never re-examined.
    below_threshold = [t for t in topics if t[2] < threshold]

    # For each below-threshold topic, pair it with another topic of the same distance,
    # remove both, then add the parent. Then remove the highest-distance topic and add its children.
    for bt_id, bt_name, bt_dist in below_threshold:
        # Skip topics already removed by an earlier iteration (e.g. as someone's partner).
        if (bt_id, bt_name, bt_dist) not in final_topics:
            continue

        # Find another topic with the same distance
        same_dist_candidates = [
            t for t in final_topics
            if t[2] == bt_dist and t != (bt_id, bt_name, bt_dist)
        ]
        if not same_dist_candidates:
            continue

        # Remove the below-threshold topic and its same-distance candidate
        # The candidate list comes from a set, so which one is "first" is arbitrary
        # when several topics share the distance.
        same_dist_topic = same_dist_candidates[0]
        final_topics.remove((bt_id, bt_name, bt_dist))
        final_topics.remove(same_dist_topic)

        # Add the parent of the below-threshold topic
        # Falls back to re-adding the topic itself when it has no recorded parent.
        parent = child_to_parent.get(bt_id, (bt_id, bt_name, bt_dist))
        final_topics.add(parent)

        # Find the highest-distance topic, remove it, and add its children
        if final_topics:
            highest_topic = max(final_topics, key=lambda x: x[2])
            final_topics.remove(highest_topic)
            h_id, h_name, h_dist = highest_topic
            # A topic with no recorded children is removed without replacement.
            for ch_id, ch_name, ch_dist in parent_to_children.get(h_id, []):
                final_topics.add((ch_id, ch_name, ch_dist))

    return list(final_topics)

In [16]:
# Rebinds `topics` once more: from here on it is the list of (id, name, distance) tuples
# that get_reviews and create_csv_file read.
# With threshold 1 and every distance in `topics_at_depth` above 1 (see the output of the
# previous cell), no topic is below the threshold, so only the ordering changes.
topics = adjust_topics(hierarchical_topics, topics_at_depth, 1)
topics

[('103', 'assemble_easy_sturdy_looks_good', 1.3074162661036748),
 ('109', 'happy_storage_functional_great_ago', 1.2841732492718585),
 ('105', 'good_product_quality_value_price', 1.2841732492718585),
 ('60', 'nice_thanks_product_item_neat', 1.3074162661036748),
 ('112', 'toys_room_sewing_toy_craft', 1.270378767403591),
 ('107', 'tv_stand_ikea_vinyl_furniture', 1.177160861935259),
 ('110', 'closet_perfect_clothes_exactly_works', 1.270378767403591),
 ('104', 'kallax_shelf_shelving_shelves_unit', 1.177160861935259)]

In [17]:
# Per-topic summary table (Topic, Count, Name, Representation, Representative_Docs).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1116,-1_storage_room_unit_bought,"[storage, room, unit, bought, great, love, per...",[great addition bedroom area bedroom shelf fit...
1,0,259,0_shelf_shelving_shelves_unit,"[shelf, shelving, shelves, unit, great, sturdy...",[great looking shelf shelf easy assemble looks...
2,1,137,1_tv_stand_use_using,"[tv, stand, use, using, perfect, entertainment...",[nice tv stand product nice worked tv stand ha...
3,2,134,2_assemble_easy_assembly_good,"[assemble, easy, assembly, good, instructions,...","[easy assemble easy assemble, easy assemble ea..."
4,3,106,3_closet_clothes_organizer_perfect,"[closet, clothes, organizer, perfect, walk, wa...",[perfect closet storage happy bought easy asse...
...,...,...,...,...,...
56,55,12,55_expected_expectations_meets_exactly,"[expected, expectations, meets, exactly, expec...","[expected expected, exactly expected exactly e..."
57,56,12,56_nice_thanks_neat_really,"[nice, thanks, neat, really, great, , , , , ]","[nice nice, nice nice, nice nice]"
58,57,11,57_functional_multifunctional_predictably_minimal,"[functional, multifunctional, predictably, min...","[functional attractive functional attractive, ..."
59,58,10,58_nice_product_item_finish,"[nice, product, item, finish, color, piece, aw...","[nice product nice product, nice product nice ..."


In [18]:
# IDs that appear as a parent in the hierarchy, cast to int so they compare with int topic IDs.
parent_ids = set(hierarchical_topics['Parent_ID'].astype(int))
parent_ids

{60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118}

In [19]:
def get_reviews(reviews, topic_model, topics, hierarchy=None):
    """Group reviews under the requested topics, expanding merged topics to their members.

    Parameters
    ----------
    reviews : sequence of str
        Documents to assign; passed unchanged to ``topic_model.transform``.
    topic_model : object
        Fitted model whose ``transform(reviews)`` returns a sequence whose
        first element holds one topic ID per review.
    topics : iterable of tuple
        ``(id, name, ...)`` tuples; only the ID (first element) is used and
        it is converted with ``int``.
    hierarchy : pandas.DataFrame, optional
        Hierarchy table with ``Parent_ID`` and ``Topics`` columns. When
        omitted, the notebook-level ``hierarchical_topics`` is used, which
        keeps existing three-argument calls working.

    Returns
    -------
    dict
        ``{topic_id (int): [review, ...]}``. For an ID that appears as a
        ``Parent_ID`` in the hierarchy, the list holds every review assigned
        to any topic in that row's ``Topics``; otherwise it holds the reviews
        assigned to that exact topic ID.
    """
    if hierarchy is None:
        # Backward-compatible default: rely on the table built earlier in the notebook.
        hierarchy = hierarchical_topics

    # Build the parent -> member-topics lookup once instead of re-casting the
    # column for every requested topic. First occurrence wins.
    members_by_parent = {}
    for parent_id, member_topics in zip(hierarchy['Parent_ID'], hierarchy['Topics']):
        members_by_parent.setdefault(int(parent_id), member_topics)

    topic_assignments = topic_model.transform(reviews)[0]

    topic_reviews = {}
    for topic in topics:
        topic_id = int(topic[0])
        if topic_id in members_by_parent:
            child_topics = members_by_parent[topic_id]
            # Trace of which original topics are folded into this merged topic.
            print(child_topics)
            wanted = set(child_topics)
        else:
            wanted = {topic_id}

        topic_reviews[topic_id] = [
            review
            for review, assigned_topic in zip(reviews, topic_assignments)
            if assigned_topic in wanted
        ]

    return topic_reviews

In [20]:
# Group the preprocessed reviews under the adjusted topics; the lists printed below are the
# member topic IDs get_reviews folds into each merged topic.
topic_reviews = get_reviews(processed_reviews, topic_model, topics)

[2, 16, 22, 33, 34, 38, 51]
[11, 26, 40, 41, 42, 49, 52, 54, 57]
[13, 21, 27, 28, 31]
[56, 58]
[9, 23, 29, 32, 35, 39, 44, 46, 48, 50]
[1, 7, 8, 10, 14, 18, 20, 24, 25, 43, 45, 47, 59]
[3, 12, 17, 36, 53, 55]
[0, 4, 5, 6, 15, 19, 30, 37]


In [21]:
# Inspect the grouping: {topic_id: [preprocessed review text, ...]}.
topic_reviews

{103: ['products rated products rated easy assemble tools useful far happy purchase',
  'great product nice sturdy clean design right size holds wanted use home gym store equipment',
  'easy easy',
  'ok assembly diagrams ok assembly diagrams clearer',
  'nice storage ease assembly nice color',
  'looks great super easy looks great super easy assemble added legs gave finished look',
  'looks nice easy assemble looks nice easy assemble',
  'excellent built item excellent built item sturdy',
  'great purchase friend instructions really helpful easy assemble product looks stylish',
  'great space saving unit easy assemble built better hardware store models',
  'sturdy designed bought product really liked simple confident design cheered fun',
  'easy assemble perfect storage easy assemble perfect storage studio',
  'good ratio good ratio easy assemble',
  'excellent products easy excellent products easy assemble',
  'good quality good workmanship lined perfectly easy assemble looks like hi

In [22]:
from dotenv import load_dotenv
import os

# Read variables from a local .env file (if present) so the key never appears in the notebook.
load_dotenv()
# `api_key` is None when API_KEY is not set; the client cell below receives it as-is.
api_key = os.getenv("API_KEY")

In [23]:
# Azure OpenAI client used by get_topic_keyword; the key comes from the environment (previous cell).
llm_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/",
)

# Passed as `model=` on every chat-completion call.
# NOTE(review): with Azure this is presumably the deployment name rather than the model family — confirm.
model = "gpt-4o-mini" 

In [24]:
def get_topic_keyword(cluster_words):
    """Ask the LLM for a short, general topic label describing a cluster of words.

    Parameters
    ----------
    cluster_words : str or iterable of str
        Either the individual words of the cluster, or a single
        underscore-joined topic name such as ``'assemble_easy_sturdy'``
        (the form create_csv_file passes in). A string is split on ``'_'``
        so the prompt lists whole words rather than single characters.

    Returns
    -------
    str
        The model's answer with surrounding whitespace removed, or ``''``
        when the response carries no text content.

    Notes
    -----
    Uses the notebook-level ``llm_client`` and ``model``. The reply is capped
    at 5 tokens (``max_tokens=5``).
    """
    # Joining a plain string would interleave ", " between its characters
    # ("a, s, s, e, ..."), so split topic names into words first.
    if isinstance(cluster_words, str):
        cluster_words = [word for word in cluster_words.split('_') if word]

    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert summarizer that identifies and generates a concise, broad topic word for each cluster of words.\n"
                "The topic word should capture the essence of all the words in the cluster.\n"
                "Merge similar or related words into a single, broader category.\n"
                "Use singular words unless a plural form is necessary.\n"
                "Use only one word. 2 or 3 words can be used only when they are part of a composite word and are better to represent the idea of the topic (e.g.: ease of use).\n"
                "If you identify a verb as a topic, use the noun version (e.g., use 'order' instead of 'ordering').\n"
                "Generalize the topic word; for example, if you encounter 'saleswoman' or 'salesman', abstract it to 'staff'.\n"
                "Provide the output as a single word."
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read the following cluster of words carefully and generate a single topic word that captures the essence of all the words.\n"
                "The topic word should be broad and general, capturing the essence of the cluster's main points without being overly specific or redundant.\n"
                "The topics could be either nouns that refers to a certain characteristic of the product of spefic features or parts of the product (e.g.: click & collect, email redeem, etc.)\n"
                f"Cluster: {', '.join(cluster_words)}\n"
                "Topic word(s):"
            ),
        },
    ]

    # Generate the topic word using the language model
    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=5,
        temperature=0.4,
        n=1,
        stop=None,
    )

    # Extract and return the topic word; content can be absent, so avoid calling .strip() on None.
    content = response.choices[0].message.content
    return content.strip() if content else ''

In [29]:
# Re-check the adjusted (id, name, distance) topics that create_csv_file looks names up in.
topics

[('103', 'assemble_easy_sturdy_looks_good', 1.3074162661036748),
 ('109', 'happy_storage_functional_great_ago', 1.2841732492718585),
 ('105', 'good_product_quality_value_price', 1.2841732492718585),
 ('60', 'nice_thanks_product_item_neat', 1.3074162661036748),
 ('112', 'toys_room_sewing_toy_craft', 1.270378767403591),
 ('107', 'tv_stand_ikea_vinyl_furniture', 1.177160861935259),
 ('110', 'closet_perfect_clothes_exactly_works', 1.270378767403591),
 ('104', 'kallax_shelf_shelving_shelves_unit', 1.177160861935259)]

In [25]:
import csv
def create_csv_file(topic_reviews, output_file):
    """
    Create a CSV file with reviews and their associated topic keywords.

    One row is written per review with the columns ``review`` and ``topics``;
    ``topics`` holds a one-element Python list literal such as ``['assembly']``.

    Parameters:
    - topic_reviews: Dictionary with topic IDs as keys and lists of reviews as values.
    - output_file: Path to the output CSV file. Missing parent directories are created.

    Topic names are looked up in the notebook-level ``topics`` list of
    ``(id, name, ...)`` tuples, and each name is turned into a keyword with
    ``get_topic_keyword`` (one LLM call per topic).

    Raises:
    - KeyError: if a topic ID in ``topic_reviews`` has no entry in ``topics``.
    """
    # ID -> name lookup built once; the first entry wins, as with a linear search.
    names_by_id = {}
    for topic in topics:
        names_by_id.setdefault(int(topic[0]), topic[1])

    # Resolve every keyword BEFORE touching the file, so a missing topic or a
    # failed LLM call cannot leave a half-written CSV behind.
    keyword_by_id = {}
    for topic_id in topic_reviews:
        if topic_id not in names_by_id:
            raise KeyError(f"Topic ID {topic_id!r} not found in `topics`")
        keyword_by_id[topic_id] = get_topic_keyword(names_by_id[topic_id])

    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII review text is written the same way on every platform.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['review', 'topics']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for topic_id, reviews_list in topic_reviews.items():
            # repr() gives the same "['keyword']" text for ordinary keywords and
            # stays a valid list literal when the keyword contains a quote.
            topics_cell = repr([keyword_by_id[topic_id]])
            for review in reviews_list:
                writer.writerow({'review': review, 'topics': topics_cell})

    print(f"CSV file saved to {output_file}")

In [26]:
# Create the CSV file
# Create the CSV file
csv_output_file = f'csv/BERTopic_{article_id}.csv'
create_csv_file(topic_reviews, csv_output_file)

# File name handed to the Streamlit app in the next cell. It has no 'csv/' prefix, unlike
# `csv_output_file`.
# NOTE(review): presumably app.py resolves this name inside the csv/ folder itself — confirm.
streamlit_csv = f'BERTopic_{article_id}.csv'

CSV file saved to csv/BERTopic_20351884.csv


## Run the demo

In [27]:
# Launch the demo app; everything after `--` is passed to app.py, here the CSV file name.
# The command blocks the cell until it is interrupted (the recorded run ends with ^C).
!streamlit run app.py -- "$streamlit_csv"

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.178.85:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
Reading data from BERTopic_20351884.csv
Reading data from BERTopic_20351884.csv
Reading data from BERTopic_20351884.csv
Reading data from BERTopic_20351884.csv
Reading data from BERTopic_20351884.csv
Reading data from BERTopic_20351884.csv
^C
[34m  Stopping...[0m
