In [5]:
from google.cloud import bigquery
from openai import AzureOpenAI
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

client = bigquery.Client()

project_id = 'ingka-online-analytics-prod'
dataset_id = 'app_data_v2'
table_id = 'app_surveys'

table_ref = f'{project_id}.{dataset_id}.{table_id}'

## Query to test with a fixed number of reviews per day

start_date = '2025-02-20'
end_date = '2025-02-28'

query = f"""
        SELECT
            date, 
            answer_translated
        FROM {table_ref}
        WHERE date BETWEEN '{start_date}' AND '{end_date}'
            AND answer_translated IS NOT NULL AND rating != 0
        ORDER BY date DESC
    """

query_job = client.query(query)

reviews = [row['answer_translated'] for row in query_job]
timestamps = [row['date'] for row in query_job]

In [6]:
## Identify and remove non-english reviews
### For 6 months of data, this takes around 10 minutes 

from langdetect import detect

print("Reviews before processing: ", len(reviews))

filtered_reviews = []
filtered_timestamps = []
removed_reviews = []

for review, timestamp in zip(reviews, timestamps):
    try:
        if detect(review) == 'en' and len(review.split()) > 1 and len(review) >= 10:
            filtered_reviews.append(review)
            filtered_timestamps.append(timestamp)
        else:
            removed_reviews.append(review)
    except:
        removed_reviews.append(review)

print("Removed reviews:")
for review in removed_reviews:
    print(review)

reviews = filtered_reviews
timestamps = filtered_timestamps

print("Reviews after processing: ", len(reviews))

Reviews before processing:  10387
Removed reviews:



Bed linen not available. HATTEN in love 

Bed linen not available. HATTEN in love 

Bed linen not available. HATTEN in love 

Bed linen not available. HATTEN in love 

Bed linen not available. HATTEN in love 

Bed linen not available. HATTEN in love 

not paid
not paid
not paid
not paid







Convenient
Could you put Apple Pau
Before
very good
scan & go
Everything. Easy shopping.
leuk assortiment 
mijn tegoed bonnen waren niet meer geldig
works great
snelle afhandeling 
all
Before!
Before 
quick checkout.
scan
staff
top
all
all
Toppie
Everything 
Okay, bright painting.
Speed
Topppp
Vv
Vv
Vv
Everything 
👍🏼
👍🏼
Fast checkout 

Ok
Super












whole 
super 
pokladny s IKEA kartou

speed 
Beautiful
fast
fast
Very well 
Super convenient with no queues 
very kind paola
Super convenient with no queues 
Yes
Tng ina mo
All 
bravi
.
Tutto ok
Optimal 
beautiful simple
Optimal 
Cassa
break ok
break ok
break ok
break ok
broken
break bend
Tu

In [7]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

## Limiting the number of topics with nr_topics does not work
nr_topics_before = 'Auto'
topic_model = BERTopic()

# Fit the model on the reviews
topics, probabilities = topic_model.fit_transform(reviews)

nr_topics_after = 'auto'

# Further reduce topics if needed
# topic_model.reduce_topics(reviews, nr_topics=nr_topics_after)

topics = topic_model.get_topics()

In [8]:
topic_info = topic_model.get_topic_info()
all_topic_names = '; '.join(topic_info['Name'])
all_topic_names

'-1_to_the_for_was; 0_shipping_expensive_costs_delivery; 1_ikea_love_furniture_best; 2_staff_friendly_helpful_nice; 3_stock_out_stocks_in; 4_app_properly_stuck_use; 5_delivery_says_available_tells; 6_app_scanning_scan_with; 7_efficient_fast_functional_efficiently; 8_thank_always_three_stuff; 9_apple_pay_payment_applepay; 10_checkout_quick_easy_quicker; 11_worked_works_perfectly_everything; 12_fun_was_great_excited; 13_voucher_vouchers_redeem_10; 14_payment_transaction_speed_putting; 15_quality_products_convenience_satisfied; 16_register_cash_registers_open; 17_cashiers_cashier_kind_anti; 18_wifi_internet_connection_reception; 19_ikea_home_490_family; 20_use_easy_interface_teach; 21_found_needed_find_looking; 22_fast_easy_cup_quickly; 23_quick_easy_and_english; 24_click_collect_chargeable_charge; 25_ordering_order_arranged_process; 26_march_date_5th_arrive; 27_queue_queues_skipped_queuing; 28_app_easy_use_stocked; 29_tidy_clean_child_food; 30_perfect_everything_yes_just; 31_went_quickly

In [9]:
number_of_topics = len(topics)
number_of_topics

164

In [10]:
topic_model.visualize_barchart()

In [11]:
topic_model.visualize_topics()

In [12]:
topic_model.visualize_heatmap()

In [13]:
hierarchical_topics = topic_model.hierarchical_topics(reviews)

100%|██████████| 162/162 [00:00<00:00, 947.45it/s]


In [14]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [15]:
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [16]:
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─the_to_is_and_app
│    ├─delivery_stock_shipping_available_order
│    │    ├─stock_available_says_are_order
│    │    │    ├─cart_orders_large_order_place
│    │    │    │    ├─large_orders_limited_recent_order
│    │    │    │    │    ├─■──limited_orders_large_recent_years ── Topic: 109
│    │    │    │    │    └─■──large_orders_order_lamp_absurd ── Topic: 108
│    │    │    │    └─cart_basket_cant_place_order
│    │    │    │         ├─cart_cant_place_order_shopping
│    │    │    │         │    ├─■──place_order_cant_an_detailed ── Topic: 76
│    │    │    │         │    └─cart_shopping_into_cant_pity
│    │    │    │         │         ├─■──cart_into_cant_shopping_get ── Topic: 105
│    │    │    │         │         └─■──cart_pity_over_displayed_again ── Topic: 125
│    │    │    │         └─basket_lists_list_favorites_disappearing
│    │    │    │              ├─■──favorites_disappearing_list_favourites_lists ── Topic: 156
│    │    │    │              └─■──basket_transfer_lists

In [17]:
hierarchical_topics

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
161,324,and_fast_the_everything_is,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",322,the_to_is_and_app,323,fast_everything_frederic_super_perfect,2.083153
160,323,fast_everything_frederic_super_perfect,"[7, 12, 21, 22, 23, 30, 31, 37, 39, 41, 47, 51...",312,everything_ok_went_perfect_super,320,fast_frederic_efficient_easy_quick,1.823304
159,322,the_to_is_and_app,"[0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15...",306,delivery_stock_shipping_available_order,321,app_the_staff_at_very,1.822189
158,321,app_the_staff_at_very,"[1, 2, 4, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17,...",316,app_ikea_payment_pay_the,319,cash_staff_kindness_registers_cashier,1.582511
157,320,fast_frederic_efficient_easy_quick,"[7, 22, 23, 37, 47, 51, 69, 74, 77, 78, 79, 84...",318,fast_frederic_efficient_practical_top,309,quick_convenient_easy_and_super,1.554386
...,...,...,...,...,...,...,...,...
4,167,simple_fast_agile_simpler_brilliantly,"[115, 127]",127,simple_fast_brilliantly_simpler_agile,115,simple_insanely_very_top_all,0.252750
3,166,quick_easy_and_glad_english,"[23, 159]",23,quick_easy_and_english_dirty,159,quick_glad_easy_was_im,0.244515
2,165,convenient_quick_its_jams_congestion,"[84, 113]",84,convenient_quick_jams_its_traffic,113,convenient_congestion_was_reduces_it,0.176250
1,164,ok_everything_alright_is_color,"[62, 68]",68,ok_alright_everything_fastmebte_tight,62,ok_everything_color_is_far,0.156394


In [18]:
def get_topics_at_depth(df, depth):
    from collections import deque
    
    # Build adjacency list with stored distance (no accumulation)
    adjacency = {}
    for _, row in df.iterrows():
        adjacency[row['Parent_ID']] = [
            (row['Child_Left_ID'], row['Child_Left_Name'], row['Distance']),
            (row['Child_Right_ID'], row['Child_Right_Name'], row['Distance'])
        ]
    
    root_id = df.iloc[0]['Parent_ID']
    root_name = df.iloc[0]['Parent_Name']
    
    # BFS
    queue = deque([(root_id, root_name, 0)])  # (id, name, depth)
    result = []
    
    while queue:
        node_id, node_name, curr_depth = queue.popleft()
        children = adjacency.get(node_id, [])
        
        for child_id, child_name, child_distance in children:
            child_depth = curr_depth + 1
            if child_depth == depth:
                result.append((child_id, child_name, child_distance))
            elif child_depth < depth:
                queue.append((child_id, child_name, child_depth))
    
    return result

In [19]:
topics_at_depth = get_topics_at_depth(hierarchical_topics, 3)
for topic in topics_at_depth:
    print(f"ID: {topic[0]}, Name: {topic[1]}, Distance: {topic[2]}")

ID: 298, Name: stock_available_says_are_order, Distance: 1.264975013262404
ID: 282, Name: shipping_delivery_expensive_costs_collect, Distance: 1.264975013262404
ID: 316, Name: app_ikea_payment_pay_the, Distance: 1.5825106371482802
ID: 319, Name: cash_staff_kindness_registers_cashier, Distance: 1.5825106371482802
ID: 214, Name: went_well_everything_smoothly_quickly, Distance: 1.3361152980435336
ID: 304, Name: everything_ok_perfect_super_great, Distance: 1.3361152980435336
ID: 318, Name: fast_frederic_efficient_practical_top, Distance: 1.5543861181025675
ID: 309, Name: quick_convenient_easy_and_super, Distance: 1.5543861181025675


In [20]:
def adjust_topics(df, topics, threshold):
    from collections import defaultdict

    # Build child->parent and parent->children maps
    child_to_parent = {}
    parent_to_children = defaultdict(list)
    for _, row in df.iterrows():
        p_id, p_name, p_dist = row['Parent_ID'], row['Parent_Name'], row['Distance']
        cl_id, cl_name = row['Child_Left_ID'], row['Child_Left_Name']
        cr_id, cr_name = row['Child_Right_ID'], row['Child_Right_Name']
        
        child_to_parent[cl_id] = (p_id, p_name, p_dist)
        child_to_parent[cr_id] = (p_id, p_name, p_dist)
        parent_to_children[p_id].append((cl_id, cl_name, p_dist))
        parent_to_children[p_id].append((cr_id, cr_name, p_dist))

    # Start with the current topics in a set
    final_topics = set(topics)
    
    # Below-threshold topics
    below_threshold = [t for t in topics if t[2] < threshold]

    # For each below-threshold topic, pair it with another topic of the same distance,
    # remove both, then add the parent. Then remove the highest-distance topic and add its children.
    for bt_id, bt_name, bt_dist in below_threshold:
        if (bt_id, bt_name, bt_dist) not in final_topics:
            continue

        # Find another topic with the same distance
        same_dist_candidates = [
            t for t in final_topics
            if t[2] == bt_dist and t != (bt_id, bt_name, bt_dist)
        ]
        if not same_dist_candidates:
            continue

        # Remove the below-threshold topic and its same-distance candidate
        same_dist_topic = same_dist_candidates[0]
        final_topics.remove((bt_id, bt_name, bt_dist))
        final_topics.remove(same_dist_topic)

        # Add the parent of the below-threshold topic
        parent = child_to_parent.get(bt_id, (bt_id, bt_name, bt_dist))
        final_topics.add(parent)

        # Find the highest-distance topic, remove it, and add its children
        if final_topics:
            highest_topic = max(final_topics, key=lambda x: x[2])
            final_topics.remove(highest_topic)
            h_id, h_name, h_dist = highest_topic
            for ch_id, ch_name, ch_dist in parent_to_children.get(h_id, []):
                final_topics.add((ch_id, ch_name, ch_dist))

    return list(final_topics)

In [21]:
topics = adjust_topics(hierarchical_topics, topics_at_depth, 1)
topics

[('304', 'everything_ok_perfect_super_great', 1.3361152980435336),
 ('319', 'cash_staff_kindness_registers_cashier', 1.5825106371482802),
 ('309', 'quick_convenient_easy_and_super', 1.5543861181025675),
 ('316', 'app_ikea_payment_pay_the', 1.5825106371482802),
 ('318', 'fast_frederic_efficient_practical_top', 1.5543861181025675),
 ('214', 'went_well_everything_smoothly_quickly', 1.3361152980435336),
 ('282', 'shipping_delivery_expensive_costs_collect', 1.264975013262404),
 ('298', 'stock_available_says_are_order', 1.264975013262404)]

In [22]:
def get_subtopics_for_topics(df, topics, threshold):
    """
    For each topic in 'topics', find subtopics by going up to 2 levels down a binary tree:
      1) If the topic's direct children (level 1) have distance < threshold, return those 2 children.
      2) Otherwise, go one more level (level 2) and return those 4 descendants

    Returns a dict: { "topic_id:topic_name": [ (child_id, child_name, distance), ... ] }
    """
    from collections import defaultdict, deque

    parent_to_children = defaultdict(list)
    for _, row in df.iterrows():
        p_id = row['Parent_ID']
        parent_to_children[p_id].append((row['Child_Left_ID'], row['Child_Left_Name'], row['Distance']))
        parent_to_children[p_id].append((row['Child_Right_ID'], row['Child_Right_Name'], row['Distance']))

    def collect_descendants(root_id, max_level=2):
        queue = deque([(root_id, 0)])
        levels_nodes = defaultdict(list)
        while queue:
            node_id, lvl = queue.popleft()
            for (cid, cname, cdist) in parent_to_children.get(node_id, []):
                levels_nodes[lvl + 1].append((cid, cname, cdist))
                if lvl + 1 < max_level:
                    queue.append((cid, lvl + 1))

        for level in range(1, max_level + 1):
            nodes = levels_nodes.get(level, [])
            if not nodes:
                return []
            if any(n[2] < threshold for n in nodes) or level == max_level:
                return nodes
        return []

    result = {}
    for (t_id, t_name, t_dist) in topics:
        subtopics = collect_descendants(t_id)
        result[(t_id, t_name)] = subtopics
    return result

In [23]:
subtopics = get_subtopics_for_topics(hierarchical_topics, topics, 1)

In [24]:
subtopics

{('304',
  'everything_ok_perfect_super_great'): [('163',
   'perfect_everything_yes_just_understandable',
   1.2199028094397024), ('292',
   'everything_super_great_found_fun',
   1.2199028094397024), ('68',
   'ok_alright_everything_fastmebte_tight',
   0.15639409081416045), ('62',
   'ok_everything_color_is_far',
   0.15639409081416045)],
 ('319',
  'cash_staff_kindness_registers_cashier'): [('181',
   'express_checkouts_kindness_availability_closed',
   1.1744683880863551), ('187',
   'cash_registers_register_kindness_availability',
   1.1744683880863551), ('299',
   'staff_cashier_thanks_friendly_worked',
   1.3901313685263195), ('301',
   'satisfied_shopping_quality_experience_selection',
   1.3901313685263195)],
 ('309',
  'quick_convenient_easy_and_super'): [('178',
   'easy_super_nice_quick_hurtigt',
   0.8132084998403942), ('166',
   'quick_easy_and_glad_english',
   0.8132084998403942), ('84',
   'convenient_quick_jams_its_traffic',
   0.1762496012160888), ('113',
   'conven

In [25]:
def get_leaves(topic_structure, hierarchical_topics):
    """
    For each subtopic, get its ID and retrieve the 'Topics' attribute from the hierarchical_topics dataframe.

    Parameters:
    - topic_structure: Dictionary containing topics and their subtopics.
    - hierarchical_topics: DataFrame containing hierarchical topic information.

    Returns:
    - Dictionary with subtopic IDs as keys and their 'Topics' attributes as values.
    """
    subtopic_topics = {}

    for main_topic, subtopics in topic_structure.items():
        for subtopic in subtopics:
            subtopic_id = subtopic[0]
            # Find the row in the dataframe with the matching subtopic ID
            row = hierarchical_topics[hierarchical_topics['Parent_ID'] == subtopic_id]
            if not row.empty:
                subtopic_topics[subtopic_id] = row.iloc[0]['Topics']
            else:
                subtopic_topics[subtopic_id] = [int(subtopic_id)]

    return subtopic_topics

In [26]:
subtopic_topics = get_leaves(subtopics, hierarchical_topics)
print(subtopic_topics)

{'163': [30, 85], '292': [12, 21, 39, 41, 97, 98, 144], '68': [68], '62': [62], '181': [55, 102], '187': [16, 46, 58], '299': [2, 11, 17, 34, 35, 49, 57, 59, 70, 99, 103, 120, 141, 146], '301': [8, 15, 40, 48, 65, 91, 116, 157], '178': [37, 128], '166': [23, 159], '84': [84], '113': [113], '313': [9, 13, 14, 18, 25, 27, 29, 33, 36, 42, 52, 56, 60, 63, 66, 67, 80, 82, 87, 89, 90, 92, 95, 100, 106, 111, 118, 121, 122, 126, 129, 131, 134, 138, 140, 142, 147, 148, 150, 151, 153, 158, 161], '199': [45, 117, 130], '300': [1, 4, 10, 20, 28, 32, 44, 83, 94, 123, 132, 143, 149, 152], '281': [6, 38, 43, 50, 54, 64, 72, 81, 86, 110], '162': [162], '186': [69, 104, 137], '311': [7, 22, 47, 51, 74, 78, 79, 88, 135, 139, 155], '188': [77, 115, 127], '136': [136], '175': [31, 53], '24': [24], '133': [133], '253': [0, 19, 26, 75, 101], '257': [73, 119, 145, 160], '198': [108, 109], '280': [76, 105, 112, 125, 156], '261': [3, 5, 61, 114, 124, 154], '252': [71, 93, 96, 107]}


In [27]:
def get_reviews_by_subtopic(subtopic_topics, topic_model, documents):
    """
    Get reviews associated with each subtopic ID in the subtopic_topics dictionary.

    Parameters:
    - subtopic_topics: Dictionary with subtopic IDs as keys and list of topic IDs as values.
    - topic_model: Trained BERTopic model.
    - documents: List of all input documents to the BERTopic model.

    Returns:
    - Dictionary with subtopic IDs as keys and list of reviews as values.
    """
    subtopic_reviews = {}

    # Get topic assignments for each document
    topic_assignments = topic_model.transform(documents)[0]

    for subtopic_id, topic_ids in subtopic_topics.items():
        # Filter documents based on the topic IDs
        associated_docs = [doc for doc, assigned_topic in zip(documents, topic_assignments) if assigned_topic in topic_ids]
        subtopic_reviews[subtopic_id] = associated_docs

    for subtopic_id, reviews in subtopic_reviews.items():
        print(f"Subtopic ID {subtopic_id} has {len(reviews)} reviews.")

    return subtopic_reviews

In [28]:
subtopic_reviews = get_reviews_by_subtopic(subtopic_topics, topic_model, reviews)

Subtopic ID 163 has 58 reviews.
Subtopic ID 292 has 218 reviews.
Subtopic ID 68 has 25 reviews.
Subtopic ID 62 has 26 reviews.
Subtopic ID 181 has 48 reviews.
Subtopic ID 187 has 112 reviews.
Subtopic ID 299 has 482 reviews.
Subtopic ID 301 has 253 reviews.
Subtopic ID 178 has 49 reviews.
Subtopic ID 166 has 53 reviews.
Subtopic ID 84 has 21 reviews.
Subtopic ID 113 has 16 reviews.
Subtopic ID 313 has 1018 reviews.
Subtopic ID 199 has 62 reviews.
Subtopic ID 300 has 529 reviews.
Subtopic ID 281 has 299 reviews.
Subtopic ID 162 has 10 reviews.
Subtopic ID 186 has 56 reviews.
Subtopic ID 311 has 294 reviews.
Subtopic ID 188 has 52 reviews.
Subtopic ID 136 has 13 reviews.
Subtopic ID 175 has 66 reviews.
Subtopic ID 24 has 42 reviews.
Subtopic ID 133 has 13 reviews.
Subtopic ID 253 has 317 reviews.
Subtopic ID 257 has 62 reviews.
Subtopic ID 198 has 33 reviews.
Subtopic ID 280 has 81 reviews.
Subtopic ID 261 has 235 reviews.
Subtopic ID 252 has 81 reviews.


In [29]:
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("API_KEY")

In [30]:
llm_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/",
)

model = "gpt-4o-mini" 

In [31]:
def get_topic_keyword(cluster_words):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert summarizer that identifies and generates a concise, broad topic word for each cluster of words.\n"
                "The topic word should capture the essence of all the words in the cluster.\n"
                "Merge similar or related words into a single, broader category.\n"
                "Use singular words unless a plural form is necessary.\n"                
                "Use only one word. 2 or 3 words can be used only when they are part of a composite word and are better to represent the idea of the topic (e.g.: ease of use).\n"
                "If you identify a verb as a topic, use the noun version (e.g., use 'order' instead of 'ordering').\n"
                "Generalize the topic word; for example, if you encounter 'saleswoman' or 'salesman', abstract it to 'staff'.\n"
                "Provide the output as a single word."
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read the following cluster of words carefully and generate a single topic word that captures the essence of all the words.\n"
                "The topic word should be broad and general, capturing the essence of the cluster's main points without being overly specific or redundant.\n"
                "The topics could be either nouns that refers to a certain characteristic of the product of spefic features or parts of the product (e.g.: click & collect, email redeem, etc.)\n"
                f"Cluster: {', '.join(cluster_words)}\n"
                "Topic word(s):"
            ),
        },
    ]

    response = ' '
    
    # Generate the topic word using the language model
    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=5,
        temperature=0.4,
        n=1,
        stop=None,
    )

    # Extract and return the topic word
    return response.choices[0].message.content.strip()

In [32]:
def get_subtopic_keyword(topic_keyword, cluster_words):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert summarizer that identifies and generates a concise, broad subtopic word for each cluster of words.\n"
                "The topic word should capture the essence of all the words in the cluster.\n"
                "The words you choose can be specific, since they are a specialization of a broader topic word.\n" 
                "Use singular words unless a plural form is necessary.\n"                
                "Use only one word unless 2 or 3 words are better to represent the idea of the subtopic.\n"
                "If you identify a verb as a subtopic, use the noun version (e.g., use 'order' instead of 'ordering').\n"
                "Generalize the topic word; for example, if you encounter 'saleswoman' or 'salesman', abstract it to 'staff'.\n"
                f"Provide the output as: '{topic_keyword} - <Subtopic word>'."
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read the following cluster of words carefully and generate a single subtopic word that captures the essence of all the words.\n"
                "The subtopic is a specification of the broader topic, therefore it should be about an aspect that the customers mention and that is related to the broader topic.\n"
                "The topics could be either nouns that refers to a certain characteristic of the product of spefic features or parts of the product (e.g.: click & collect, email redeem, etc.)\n"
                f"The broader topic word is: {topic_keyword}\n"
                f"Cluster: {', '.join(cluster_words)}\n"
                "Topic word(s):"
            ),
        },
    ]

    response = ' '

    # Generate the topic word using the language model
    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=10,
        temperature=0.4,
        n=1,
        stop=None,
    )

    # Extract and return the topic word
    return response.choices[0].message.content.strip()

In [33]:
def get_review_summary_short(reviews, llm_client, model, selected_subtopic):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a skilled summarizer specializing in customer feedback analysis.\n"
                "Your role is to identify and concisely summarize the main themes, sentiments, and frequently mentioned points in customer reviews.\n"
                "The reviews provided are related to an IKEA service and may discuss various aspects such as product quality, delivery, customer service, payment, or store experience.\n" 
                "The summary you generate will be used by coworkers to understand in a few words what the reviews are talking about.\n"           
                "Provide the output as a short text summary with no more than 70 words. Do not exceed this limit.\n"
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read carefully the following customer reviews and generate a summary of the main aspects that customers are discussing.\n"
                "The summary should be as concise as possible, only reporting the main aspects.\n"
                f"The summary should focus on this particular topic: {selected_subtopic}. Ensure that all aspects of the text directly relate to this topic, without introducing unrelated information.\n"
                "In case an aspect is mentioned in many reviews, the summary should include 'Many customers' to highlight that it is a common positive/negative point.\n"
                "Focus on the most significant details that are repeated or impactful.\n"
                "I will provide you with the reviews and you will generate the summary.\n"
                f"Reviews: {reviews}\n"
                "Summary:\n"
            ),
        },
    ]

    # Generate the topic word using the language model
    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=100,
        temperature=0.5,
        n=1,
        stop=None,
    )

    # Extract and return the topic word
    return response.choices[0].message.content.strip()


def get_review_summary_long(reviews, llm_client, model, selected_subtopic):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a skilled summarizer specializing in customer feedback analysis.\n"
                "Your role is to identify and concisely summarize the main themes, sentiments, and frequently mentioned points in customer reviews.\n"
                "The reviews provided are related to an IKEA service and may discuss various aspects such as product quality, delivery, customer service, payment, or store experience.\n" 
                "The summaries you generate will be used by coworkers to understand comprehensively the main positive and negative aspects of the reviews.\n"           
                "If a group of reviews does not contain any positive aspects, you can skip the positive points section.\n"           
                "If a group of reviews does not contain any negative aspects, you can skip the negative points section.\n"           
                "Provide the output in the following format: \n"
                "<b>Positive points:</b>\n • Point 1 \n • Point 2 \n ...\n"
                "<b>Negative points:</b>\n • Point 1 \n • Point 2 \n ...\n"
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read carefully the following customer reviews and generate summaries of the main aspects that customers are discussing.\n"
                "The summary should be comprehensive, touching the main aspects mentioned by customer reviews.\n"
                f"The summary should focus on this particular topic: {selected_subtopic}. Ensure that all aspects of the text directly relate to this topic, without introducing unrelated information.\n"
                "In case an aspect is mentioned in many reviews, the summary should include 'Many customers' to highlight that it is a common positive/negative point.\n"
                f"Reviews: {reviews}\n"
                "Summary:\n"
            ),
        },
    ]

    # Generate the topic word using the language model
    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=250,
        temperature=0.5,
        n=1,
        stop=None,
    )

    # Extract and return the topic word
    return response.choices[0].message.content.strip()


In [None]:
import json

def create_json_structure(subtopics_structure, subtopic_reviews, output_file):
    """
    Create a JSON structure with topics, subtopics, and reviews, and save it to a file.

    Parameters:
    - subtopics_structure: Dictionary containing topics and their subtopics.
    - subtopic_reviews: Dictionary with subtopic IDs as keys and list of reviews as values.
    - output_file: Path to the output JSON file.
    """
    json_structure = {}

    for main_topic, subtopics in subtopics_structure.items():
        topic_name = main_topic[1]
        topic_keyword = get_topic_keyword(topic_name)
        subtopic_ids = [subtopic[0] for subtopic in subtopics]
        merged_reviews = [review for subtopic_id in subtopic_ids for review in subtopic_reviews.get(subtopic_id, [])]
        topic_short_summary = get_review_summary_short(merged_reviews, llm_client, model, topic_keyword)
        topic_long_summary = get_review_summary_long(merged_reviews, llm_client, model, topic_keyword)
        json_structure[main_topic[1]] = {
            "Keyword": topic_keyword,
            "Short summary": topic_short_summary,
            "Long summary": topic_long_summary,
            "Subtopics": {}
        }

        print(f"Processing main topic: {main_topic[0]} - {topic_name}")

        for subtopic in subtopics:
            subtopic_id = subtopic[0]
            subtopic_name = subtopic[1]
            subtopic_keyword = get_subtopic_keyword(topic_keyword, subtopic_name)
            reviews = subtopic_reviews.get(subtopic_id, [])
            subtopic_short_summary = get_review_summary_short(reviews, llm_client, model, subtopic_keyword)
            subtopic_long_summary = get_review_summary_long(reviews, llm_client, model, subtopic_keyword)
            
            json_structure[main_topic[1]]["Subtopics"][subtopic_name] = {
                "Subtopic_keyword": subtopic_keyword,
                "Short summary": subtopic_short_summary,
                "Long summary": subtopic_long_summary,
                "Reviews": reviews
            }

            print(f"  Subtopic ID: {subtopic_id} - {subtopic_name}")
            print(f"    Keyword: {subtopic_keyword}")
            print(f"    Number of reviews: {len(reviews)}")

    with open(output_file, 'w') as f:
        json.dump(json_structure, f, indent=4)

    print(f"JSON structure saved to {output_file}")

In [35]:
output_file = 'output.json'
create_json_structure(subtopics, subtopic_reviews, output_file)

Processing main topic: 304 - everything_ok_perfect_super_great
  Subtopic ID: 163 - perfect_everything_yes_just_understandable
    Keyword: perfection - clarity
    Number of reviews: 58
  Subtopic ID: 292 - everything_super_great_found_fun
    Keyword: perfection - service
    Number of reviews: 218
  Subtopic ID: 68 - ok_alright_everything_fastmebte_tight
    Keyword: perfection - alignment
    Number of reviews: 25
  Subtopic ID: 62 - ok_everything_color_is_far
    Keyword: perfection - color
    Number of reviews: 26
Processing main topic: 319 - cash_staff_kindness_registers_cashier
  Subtopic ID: 181 - express_checkouts_kindness_availability_closed
    Keyword: staff - checkout kindness
    Number of reviews: 48
  Subtopic ID: 187 - cash_registers_register_kindness_availability
    Keyword: staff - kindness
    Number of reviews: 112
  Subtopic ID: 299 - staff_cashier_thanks_friendly_worked
    Keyword: staff - cashier friendliness
    Number of reviews: 482
  Subtopic ID: 301 - s

## Run the demo

In [36]:
!streamlit run app.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.178.207:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
^C
