In [None]:
from google.cloud import bigquery
from openai import AzureOpenAI
import os

# Turn off parallelism in the HuggingFace `tokenizers` library before any model
# is loaded (presumably to silence its fork-related warnings — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# BigQuery client; no project or credentials are passed explicitly here.
client = bigquery.Client()

# Location of the survey table queried below.
project_id = 'ingka-online-analytics-prod'
dataset_id = 'app_data_v2'
table_id = 'app_surveys'

# `project.dataset.table` reference interpolated into every query string.
table_ref = f'{project_id}.{dataset_id}.{table_id}'

## Query to test with a fixed number of reviews per day
## NOTE(review): `query_test` is only defined here; this cell does not execute it.

# Overall cap (LIMIT) and per-day cap (row_num filter) used by the test query.
num_reviews = 1000
num_reviews_per_day = 300

# The window is partitioned by `date` AND ordered by `date`, so every row in a
# partition ties on the ordering key: the query does not pin down WHICH reviews
# of a day receive row_num <= num_reviews_per_day.
query_test = f"""
    WITH ranked_reviews AS (
        SELECT 
            date, 
            answer_translated,
            ROW_NUMBER() OVER (PARTITION BY date ORDER BY date DESC) as row_num
        FROM {table_ref}
        WHERE answer_translated IS NOT NULL AND rating != 0
    )
    SELECT *
    FROM ranked_reviews
    WHERE row_num <= {num_reviews_per_day}
    ORDER BY date DESC
    LIMIT {num_reviews}
"""

## With 6 months of data, the number of reviews will be between 2M and 3M.
### Of these, only around 200k have a non-null answer_translated.

def build_recent_reviews_query(months, table=None):
    """Return the SQL selecting rated, translated reviews from the last ``months`` months.

    The three look-back queries used in this notebook differ only in the
    interval length, so they are generated from this single template.

    Parameters:
    - months: Look-back window in whole months (positive integer).
    - table: Fully-qualified table reference; defaults to the notebook-level
      ``table_ref``.

    Returns:
    - The query text, newest reviews first.

    Raises:
    - ValueError: if ``months`` is smaller than 1.
    """
    # Coerce to int so only a number can ever be interpolated into the SQL.
    months = int(months)
    if months < 1:
        raise ValueError(f"months must be a positive integer, got {months}")

    source_table = table_ref if table is None else table
    return f"""
    SELECT
        date, 
        answer_translated
    FROM {source_table}
    WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL {months} MONTH) AND current_date()
        AND answer_translated IS NOT NULL AND rating != 0
    ORDER BY date DESC
"""


query_1_month = build_recent_reviews_query(1)
query_3_months = build_recent_reviews_query(3)
query_6_months = build_recent_reviews_query(6)

# Submit the 3-month query.
query_job = client.query(query_3_months)

# Read the result set in ONE pass. Iterating the job once per column fetched
# the rows twice and relied on both passes returning them in the same order to
# keep `reviews[i]` and `timestamps[i]` paired.
reviews = []
timestamps = []
for row in query_job.result():
    reviews.append(row['answer_translated'])
    timestamps.append(row['date'])

In [None]:
## Identify and remove non-English reviews
### For 6 months of data, this takes around 10 minutes

from langdetect import DetectorFactory, detect

# langdetect is randomised unless a seed is set; pin it so the same review is
# classified the same way on every run.
DetectorFactory.seed = 0

MIN_REVIEW_WORDS = 2       # keep reviews with more than one whitespace-separated word
MIN_REVIEW_CHARS = 10      # ... and at least this many characters
MAX_REMOVED_PREVIEW = 20   # how many rejected reviews to echo for a sanity check

print("Reviews before processing: ", len(reviews))

filtered_reviews = []
filtered_timestamps = []
removed_reviews = []

for review, timestamp in zip(reviews, timestamps):
    try:
        # Cheap length checks run first so the (slow) language detector is only
        # called on reviews that could be kept anyway.
        keep = (
            len(review) >= MIN_REVIEW_CHARS
            and len(review.split()) >= MIN_REVIEW_WORDS
            and detect(review) == 'en'
        )
    except Exception:
        # Best effort: anything the detector cannot handle is treated as
        # "not English". Unlike a bare `except:`, this does not swallow
        # KeyboardInterrupt, so the long loop can still be stopped.
        keep = False

    if keep:
        filtered_reviews.append(review)
        filtered_timestamps.append(timestamp)
    else:
        removed_reviews.append(review)

# Show the count and a small sample only; printing every rejected review can
# flood the notebook output.
print(f"Removed reviews: {len(removed_reviews)} (showing up to {MAX_REMOVED_PREVIEW})")
for review in removed_reviews[:MAX_REMOVED_PREVIEW]:
    print(review)

# Downstream cells read the filtered data under the original names.
reviews = filtered_reviews
timestamps = filtered_timestamps

print("Reviews after processing: ", len(reviews))

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# String timestamps in the same format that is passed to topics_over_time below.
formatted_timestamps = [ts.strftime("%Y-%m-%d") for ts in timestamps]

# Union of NLTK's and scikit-learn's English stop-word lists.
stop_words = set(stopwords.words('english')).union(set(ENGLISH_STOP_WORDS))


def _strip_stop_words(text):
    """Lower-case ``text`` and keep only its alphanumeric, non-stop-word tokens."""
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token.isalnum() and token not in stop_words)


# NOTE(review): `processed_reviews` is built here, but the model below is
# fitted on the raw `reviews` and nothing in this notebook reads it — confirm
# whether it is still needed.
processed_reviews = [_strip_stop_words(review) for review in reviews]

## Limiting the number of topics with nr_topics does not work
nr_topics_before = 'Auto'
topic_model = BERTopic()

# Fit the model on the reviews. The per-document assignments get their own
# name so they are not lost when `topics` is bound to the topic dictionary
# at the end of this cell.
doc_topics, probabilities = topic_model.fit_transform(reviews)

nr_topics_after = 'auto'

# Further reduce topics if needed
# topic_model.reduce_topics(reviews, nr_topics=nr_topics_after)

topics_over_time = topic_model.topics_over_time(reviews, formatted_timestamps, datetime_format="%Y-%m-%d", nr_bins=10)

# Topic representations as returned by the fitted model (read by later cells).
topics = topic_model.get_topics()

In [None]:
# Per-topic summary table of the fitted model; a later cell uses it to look up
# topic names by topic id.
topic_info = topic_model.get_topic_info()
# All topic names joined into one '; '-separated string for a quick overview.
all_topic_names = '; '.join(topic_info['Name'])
all_topic_names

In [None]:
# Number of entries in `topics`.
# NOTE(review): if BERTopic's outlier topic (-1) is among them it is counted
# here as well — confirm that is intended.
number_of_topics = len(topics)
number_of_topics

In [None]:
# BERTopic built-in plot: bar charts of the top-scoring words per topic.
topic_model.visualize_barchart()

In [None]:
# BERTopic built-in plot: 2-D map of the topics (intertopic distance).
topic_model.visualize_topics()

In [None]:
# BERTopic built-in plot: topic-similarity heatmap, limited to 8 topics.
topic_model.visualize_heatmap(top_n_topics = 8)

In [80]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by the topic-similarity helpers defined below
# to embed the words of a topic.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_topic_embedding(topic_id, model=None, encoder=None):
    """Return the mean embedding of the words that represent a topic.

    Parameters:
    - topic_id: Id of the topic inside ``model``.
    - model: Fitted BERTopic model that owns ``topic_id``. Defaults to the
      notebook-level ``topic_model``; pass a sub-model explicitly when the id
      belongs to one, because ids are only meaningful within their own model.
    - encoder: Object with an ``encode(list_of_str)`` method. Defaults to the
      notebook-level ``sentence_model``.

    Returns:
    - 1-D array: element-wise mean of the word embeddings.

    Raises:
    - ValueError: if ``model`` has no words for ``topic_id`` (BERTopic's
      ``get_topic`` returns ``False`` for an unknown id).
    """
    model = topic_model if model is None else model
    encoder = sentence_model if encoder is None else encoder

    word_scores = model.get_topic(topic_id)
    if not word_scores:
        raise ValueError(f"Topic {topic_id!r} has no word representation in the given model")

    words = [word for word, _ in word_scores]
    return np.mean(encoder.encode(words), axis=0)


def topic_cosine_similarity(t1, t2, model=None, encoder=None):
    """Cosine similarity between the mean word embeddings of two topics.

    Parameters:
    - t1, t2: Topic ids; both are looked up in the same ``model``.
    - model, encoder: Forwarded to ``get_topic_embedding``; the defaults keep
      the original behaviour (notebook-level ``topic_model`` / ``sentence_model``).

    Returns:
    - Scalar similarity of the two topic embeddings.
    """
    v1 = get_topic_embedding(t1, model=model, encoder=encoder)
    v2 = get_topic_embedding(t2, model=model, encoder=encoder)
    return cosine_similarity([v1], [v2])[0][0]

In [None]:
# Topic frequencies without the outlier topic (-1), most frequent first.
topics = topic_model.get_topic_freq()
valid_topics = topics[topics.Topic != -1].sort_values(by="Count", ascending=False)

N_TOP_TOPICS = 8   # how many main topics to keep
threshold = 0.75   # pairs more similar than this count as duplicates


def select_distinct_topics(candidates, n_top, max_similarity, similarity):
    """Pick up to ``n_top`` topics such that no selected pair is too similar.

    Starts from the first ``n_top`` candidates. While any selected pair scores
    above ``max_similarity``, the member of the first such pair with the
    higher topic id is dropped and the next unused candidate (if any) is
    appended, then the scan restarts.

    Parameters:
    - candidates: Topic ids in order of preference.
    - n_top: Size of the initial selection.
    - max_similarity: Pairs scoring strictly above this value are duplicates.
    - similarity: Callable ``(topic_a, topic_b) -> score``.

    Returns:
    - List of selected topic ids.
    """
    selected = list(candidates[:n_top])
    next_pos = n_top
    scores = {}  # (a, b) -> score, so pairs are not re-embedded after a restart

    def _too_similar(a, b):
        if (a, b) not in scores:
            scores[(a, b)] = similarity(a, b)
        return scores[(a, b)] > max_similarity

    while True:
        # First clashing pair in (i, j) scan order, or None when all are distinct.
        clash = next(
            (
                (i, j)
                for i in range(len(selected))
                for j in range(i + 1, len(selected))
                if _too_similar(selected[i], selected[j])
            ),
            None,
        )
        if clash is None:
            return selected

        i, j = clash
        # Discard the one with the highest ID
        selected.pop(i if selected[i] > selected[j] else j)
        # Pull the next topic from the candidates, if available
        if next_pos < len(candidates):
            selected.append(candidates[next_pos])
            next_pos += 1


top_topics = select_distinct_topics(
    valid_topics.Topic.tolist(), N_TOP_TOPICS, threshold, topic_cosine_similarity
)

top_topics

In [None]:
new_models = {}        # main topic id -> BERTopic model fitted on that topic's reviews
divided_reviews = []   # one list of reviews per main topic, in top_topics order

# Topic assigned to every review by the main model. This does not depend on
# the topic being processed, so it is computed once instead of once per topic.
doc_topic_ids = list(topic_model.get_document_info(reviews)['Topic'])

for topic_id in top_topics:
    # Collect reviews for this specific topic
    topic_docs = [
        review
        for review, assigned_topic in zip(reviews, doc_topic_ids)
        if assigned_topic == topic_id
    ]
    divided_reviews.append(topic_docs)
    print(f"Topic {topic_id} has {len(topic_docs)} documents")
    # Create a new BERTopic model and fit only these documents. A dedicated
    # name is used so the notebook-level `model` variable is left untouched.
    sub_model = BERTopic()
    sub_model.fit_transform(topic_docs)
    new_models[topic_id] = sub_model

new_models

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a .env file, then read the key that is handed to the
# AzureOpenAI client below. os.getenv returns None when API_KEY is not set.
load_dotenv()
api_key = os.getenv("API_KEY")

In [83]:
# Azure OpenAI client used by the keyword-generation helpers defined below.
llm_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-07-01-preview",
    azure_endpoint="https://derai-vision.openai.azure.com/", 
)

# Model name for chat-completion requests (presumably the Azure deployment
# name — confirm).
# NOTE(review): `model` is a very generic global name for a notebook that also
# creates BERTopic models; a more specific name would be safer.
model = "gpt-4o-mini" 

In [84]:
def get_topic_keyword(cluster_words, model_name="gpt-4o-mini", client=None):
    """Ask the LLM for a short, broad label describing a cluster of topic words.

    Parameters:
    - cluster_words: Iterable of ``(word, score)`` pairs; each pair is rendered
      into the prompt as ``word (score)`` with four decimals.
    - model_name: Value sent as ``model`` in the chat-completion request. It is
      an explicit argument so the call does not depend on whatever the
      notebook-level name ``model`` happens to hold when the cell is run.
    - client: Object exposing ``chat.completions.create``; defaults to the
      notebook-level ``llm_client``.

    Returns:
    - The label with surrounding whitespace removed, or ``""`` if the response
      carries no text.
    """
    cluster_description = ', '.join(f'{word} ({prob:.4f})' for word, prob in cluster_words)

    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert summarizer that identifies and generates a concise, broad topic word for each cluster of words.\n"
                "The topic word should capture the essence of all the words in the cluster.\n"
                "Merge similar or related words into a single, broader category.\n"
                "Use singular words unless a plural form is necessary.\n"
                "Use only one word. 2 or 3 words can be used only when they are part of a composite word and are better to represent the idea of the topic (e.g.: ease of use).\n"
                "If you identify a verb as a topic, use the noun version (e.g., use 'order' instead of 'ordering').\n"
                "Generalize the topic word; for example, if you encounter 'saleswoman' or 'salesman', abstract it to 'staff'.\n"
                "Provide the output as a single word."
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read the following cluster of words carefully and generate a single topic word that captures the essence of all the words.\n"
                "The topic word should be broad and general, capturing the essence of the cluster's main points without being overly specific or redundant.\n"
                "The topics could be either nouns that refer to a certain characteristic of the product or specific features or parts of the product (e.g.: click & collect, email redeem, etc.)\n"
                "The probabilities indicate how important a certain word is in the topic.\n"
                f"Cluster: {cluster_description}\n"
                "Topic word(s):"
            ),
        },
    ]

    active_client = llm_client if client is None else client

    # Generate the topic word using the language model
    response = active_client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=5,
        temperature=0.4,
        n=1,
        stop=None,
    )

    # Extract and return the topic word; `content` can be None, in which case
    # an empty label is returned instead of failing on `.strip()`.
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
# Topic name -> LLM-generated keyword, one entry per selected main topic
# (inserted in top_topics order).
topic_keywords = {}

for topic in top_topics:
    # Name of this topic as listed in the main model's summary table.
    matching_names = topic_info.loc[topic_info['Topic'] == topic, 'Name']
    topic_name = matching_names.values[0]
    # The topic's (word, score) pairs are what the LLM summarises.
    topic_words = topic_model.get_topic(topic)
    topic_keywords[topic_name] = get_topic_keyword(topic_words)

print(topic_keywords)

In [None]:
threshold = 0.8       # subtopic pairs more similar than this count as duplicates
N_TOP_SUBTOPICS = 4   # subtopics kept per main topic
topic_infos = []      # summary table of every sub-model, in new_models order
final_subtopics = []  # selected subtopic ids per sub-model, same order


def _subtopic_similarity(sub_model, first_id, second_id):
    """Cosine similarity of two topics of ``sub_model`` (mean word embeddings).

    The ids are resolved in ``sub_model`` itself. ``topic_cosine_similarity``
    cannot be used here because it looks ids up in the main ``topic_model``,
    where the same number denotes a different topic.
    """
    vectors = []
    for topic_id in (first_id, second_id):
        words = [word for word, _ in sub_model.get_topic(topic_id)]
        vectors.append(np.mean(sentence_model.encode(words), axis=0))
    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]


# `sub_model` (not `model`) so the notebook-level `model` name is not rebound.
for sub_model in new_models.values():
    topic_infos.append(sub_model.get_topic_info())
    subtopics = sub_model.get_topic_freq()
    print(subtopics)
    valid_subtopics = subtopics[subtopics.Topic != -1].sort_values(by="Count", ascending=False)
    top_subtopics = valid_subtopics.head(N_TOP_SUBTOPICS).Topic.tolist()
    index = N_TOP_SUBTOPICS  # Next topic candidate

    while True:
        changed = False
        for a in range(len(top_subtopics)):
            for b in range(a + 1, len(top_subtopics)):
                sim = _subtopic_similarity(sub_model, top_subtopics[a], top_subtopics[b])
                if sim > threshold:
                    # Discard the one with the highest ID
                    discard_index = a if top_subtopics[a] > top_subtopics[b] else b
                    top_subtopics.pop(discard_index)
                    # Pull the next topic from valid_subtopics, if available
                    if index < len(valid_subtopics):
                        new_topic = int(valid_subtopics.iloc[index].Topic)
                        top_subtopics.append(new_topic)
                        index += 1
                    changed = True
                    break
            if changed:
                break
        # A full scan without any clash means the selection is final.
        if not changed:
            break

    final_subtopics.append(top_subtopics)

print(f"Final subtopics for all models: {final_subtopics}")

In [None]:
# Selected subtopic ids: one list per sub-model, in new_models order.
final_subtopics

In [88]:
import json 
def get_subtopic_keyword(topic_keyword, cluster_words, model_name="gpt-4o-mini", client=None):
    """Ask the LLM for a subtopic label that specialises a broader topic keyword.

    Parameters:
    - topic_keyword: Keyword of the parent topic; it is inserted into the prompt
      and into the requested output format ``'<topic_keyword> - <Subtopic word>'``.
    - cluster_words: Iterable of ``(word, score)`` pairs; each pair is rendered
      into the prompt as ``word (score)`` with four decimals.
    - model_name: Value sent as ``model`` in the chat-completion request.
    - client: Object exposing ``chat.completions.create``; defaults to the
      notebook-level ``llm_client``.

    Returns:
    - The model's answer with surrounding whitespace removed, or ``""`` if the
      response carries no text.
    """
    cluster_description = ', '.join(f'{word} ({prob:.4f})' for word, prob in cluster_words)

    # NOTE(review): the third user sentence reads "of the product of spefic
    # features" — probably meant "or specific features". The prompt text is
    # left unchanged here because it is sent to the model verbatim.
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert summarizer that identifies and generates a concise, broad subtopic word for each cluster of words.\n"
                "The topic word should capture the essence of all the words in the cluster.\n"
                "The words you choose can be specific, since they are a specialization of a broader topic word.\n"
                "Use singular words unless a plural form is necessary.\n"
                "Use only one word unless 2 or 3 words are better to represent the idea of the subtopic.\n"
                "If you identify a verb as a subtopic, use the noun version (e.g., use 'order' instead of 'ordering').\n"
                "Generalize the topic word; for example, if you encounter 'saleswoman' or 'salesman', abstract it to 'staff'.\n"
                f"Provide the output as: '{topic_keyword} - <Subtopic word>'."
            ),
        },
        {
            "role": "user",
            "content": (
                "Please read the following cluster of words carefully and generate a single subtopic word that captures the essence of all the words.\n"
                "The subtopic is a specification of the broader topic, therefore it should be about an aspect that the customers mention and that is related to the broader topic.\n"
                "The topics could be either nouns that refers to a certain characteristic of the product of spefic features or parts of the product (e.g.: click & collect, email redeem, etc.)\n"
                "The probabilities indicate how important a certain word is in the topic.\n"
                f"The broader topic word is: {topic_keyword}\n"
                f"Cluster: {cluster_description}\n"
                "Topic word(s):"
            ),
        },
    ]

    active_client = llm_client if client is None else client

    # Generate the topic word using the language model
    response = active_client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.4,
        n=1,
        stop=None,
    )

    # Extract and return the topic word; `content` can be None, in which case
    # an empty label is returned instead of failing on `.strip()`.
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
# (topic name, topic keyword) -> {subtopic name -> subtopic keyword}
subtopic_keywords = {}

# Materialised once: position i is read together with top_topics[i],
# topic_infos[i] and final_subtopics[i] in the loop below.
topic_keyword_items = list(topic_keywords.items())

for i, subtopics in enumerate(final_subtopics):
    # The (name, keyword) pair of the main topic doubles as the outer key.
    parent_key = topic_keyword_items[i]
    parent_topic_name = parent_key[1]
    print("parent_topic_name: ", parent_topic_name)
    parent_topic_id = top_topics[i]
    current_topic_info = topic_infos[i]

    # Sub-model that was fitted on the reviews of this main topic.
    subtopic_model = new_models[parent_topic_id]

    keywords_for_parent = {}
    subtopic_keywords[parent_key] = keywords_for_parent

    # One LLM call per selected subtopic.
    for subtopic_id in subtopics:
        name_rows = current_topic_info.loc[current_topic_info['Topic'] == subtopic_id, 'Name']
        subtopic_name = name_rows.values[0]
        subtopic_word_probs = subtopic_model.get_topic(subtopic_id)
        keywords_for_parent[subtopic_name] = get_subtopic_keyword(parent_topic_name, subtopic_word_probs)

subtopic_keywords

In [90]:
import json

def create_json_structure(subtopics_structure, output_file):
    """
    Build a nested topic/subtopic keyword structure and save it as JSON.

    Parameters:
    - subtopics_structure: Dictionary whose keys are (topic name, topic keyword)
      pairs and whose values are dictionaries mapping subtopic name to subtopic
      keyword.
    - output_file: Path to the output JSON file (overwritten if it exists).

    The written document has the shape
    ``{topic name: {"keyword": ..., "subtopics": {subtopic name: {"subtopic_keyword": ...}}}}``.
    Progress is printed while the structure is built; nothing is returned.
    """
    json_structure = {}

    for main_topic, subtopics in subtopics_structure.items():
        topic_name = main_topic[0]
        topic_keyword = main_topic[1]

        print(f"Processing main topic: {topic_name} - {topic_keyword}")

        subtopic_entries = {}
        for subtopic_name, subtopic_keyword in subtopics.items():
            subtopic_entries[subtopic_name] = {
                "subtopic_keyword": subtopic_keyword,
                # "reviews": reviews # Reviews for each subtopic need to be fetched and added here
            }

            print(f"  Subtopic ID: {subtopic_name} - {subtopic_keyword}")
            print(f"    Keyword: {subtopic_keyword}")

        json_structure[topic_name] = {
            "keyword": topic_keyword,
            "subtopics": subtopic_entries,
        }

    # Explicit encoding so the file does not depend on the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(json_structure, f, indent=4)

    print(f"JSON structure saved to {output_file}")

In [None]:
# Number of per-topic review lists collected while the sub-models were fitted.
len(divided_reviews)

In [None]:
# Write the topic/subtopic keyword hierarchy to disk.
# NOTE(review): presumably app.py (launched below) reads this file — confirm.
output_file = 'top8.json'
create_json_structure(subtopic_keywords, output_file)

## Run the demo

In [None]:
# Launch the Streamlit demo through a shell escape.
# NOTE(review): this likely keeps the cell busy until the server is stopped — confirm.
!streamlit run app.py