In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from restaurentpy.data import ReviewData
from cleantext import clean

# 1. Read Data ----

In [None]:
# Load the review exports (files matching 'xlsx') from the BigPlate folder.
# NOTE(review): the folder is an absolute, machine-specific path — consider a configurable data dir.
REVIEW_DIR = '/Volumes/Macintosh HD/AI World/Review Data/BigPlate/'
df = ReviewData(path=REVIEW_DIR, pat='xlsx').etl_review()

# Cleaned copy of each review with emoji removed; this is the text the topic model is fitted on.
# NOTE(review): the column is called 'translate_review' but no translation happens in this cell.
df['translate_review'] = df['review_text'].map(lambda text: clean(text, no_emoji=True))

# 2. Bertopic ----

In [None]:
# Topic keywords are produced by the KeyBERT-inspired representation model.
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model)

# NOTE(review): no random seed is configured in this notebook, so topic ids may differ
# between runs — confirm before relying on a fixed topic number further down.
print('Training topic model for reviews...')
review_texts = df['translate_review'].tolist()
topics, ini_probs = topic_model.fit_transform(review_texts)

# Keep each review's assigned topic and the value returned alongside it next to the text.
df['topic'], df['topic_prob'] = topics, ini_probs

# One row per topic; later cells read the 'Topic' and 'Representation' columns.
topics_info = topic_model.get_topic_info()

In [None]:
# Topic to inspect by hand.
topic_number = 3

# Up to four distinct reviews of that topic, highest 'topic_prob' first.
top_reviews = (
    df.loc[df['topic'] == topic_number]
    .sort_values('topic_prob', ascending=False)
    .drop_duplicates(subset=['review_text'])['review_text']
    .head(4)
)
print('Reviews: \n', '\n'.join(str(review) for review in top_reviews.values))

# Keywords stored for the same topic in the topic-info table.
topic_keywords = topics_info.loc[topics_info['Topic'] == topic_number, 'Representation'].values[0]
print('\nKey words: ', ', '.join(str(word) for word in topic_keywords))

In [None]:
# System preamble that opens every labelling prompt: it uses Llama-2-style chat tags
# (<s>[INST] <<SYS>> ... <</SYS>>) and tells the model it is an assistant for labeling topics.
# NOTE(review): the final prompt is sent to the 'llama3' model via ollama.generate —
# confirm these Llama-2-style tags are appropriate for that model.
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

In [None]:
# One-shot example: a sample request (bulleted documents, keyword list, instruction)
# followed by the expected answer after [/INST] ("bad food"), so the model sees the
# reply format before the real request is appended.
example_prompt = """
I have a topic that contains the following documents (from customers reviews):
- Not good. Fries were stale.
- There burgers has gotten worse than ever but their fries are still good.
- Fries were too salty.

The topic is described by the following keywords: 'fries, fry, burger, burgers, fried, cold, frozen, cooked, greasy, warm'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

[/INST] bad food
"""

# Shared prefix for every labelling request: system preamble + worked example.
base_prompt = system_prompt + example_prompt

In [None]:
import ollama

# Maximum number of distinct reviews per topic to quote in each labelling prompt
qty_comments_prompt = 10

def get_short_labels(df, topics_info, qty_comments_prompt, base_prompt,
                     model='llama3', generate=None):
    """Ask an LLM for a short label for each topic and store it in ``topics_info``.

    For every row of ``topics_info`` whose ``Topic`` is greater than -1, a prompt is
    built from ``base_prompt``, the top reviews of that topic and the topic's keywords;
    the model's answer is written to the ``short_label`` column of that row.

    Parameters
    ----------
    df : DataFrame
        Reviews with the columns ``topic``, ``topic_prob`` and ``translate_review``.
    topics_info : DataFrame
        Topic table with the columns ``Topic`` and ``Representation`` (an iterable of
        keywords per topic). It is modified in place: a ``short_label`` column is
        added/updated.
    qty_comments_prompt : int
        Maximum number of distinct reviews (highest ``topic_prob`` first) quoted per prompt.
    base_prompt : str
        Text placed in front of every per-topic request.
    model : str, optional
        Model name forwarded to ``generate``. Defaults to ``'llama3'``.
    generate : callable, optional
        Called as ``generate(model=..., prompt=...)`` and expected to return a mapping
        with a ``'response'`` entry. Defaults to ``ollama.generate``.

    Returns
    -------
    DataFrame
        The same ``topics_info`` object that was passed in.
    """
    if generate is None:
        # Looked up lazily so the default keeps using the notebook-level ``ollama`` import.
        generate = ollama.generate

    for topic in topics_info.loc[topics_info['Topic'] > -1, 'Topic']:
        print(topic)
        is_topic_row = topics_info['Topic'] == topic

        # Distinct reviews of this topic, highest probability first, capped at the requested count.
        top_reviews = (
            df[df['topic'] == topic]
            .sort_values('topic_prob', ascending=False)
            .drop_duplicates(subset=['translate_review'])['translate_review']
            .head(qty_comments_prompt)
        )
        # Every review gets its own "- " bullet (including the first one), matching the
        # layout of the worked example in the base prompt.
        comments_list = '\n'.join(f'- {review}' for review in top_reviews.values)

        # Key words list from topic
        key_list = ', '.join(
            str(word) for word in topics_info.loc[is_topic_row, 'Representation'].values[0]
        )

        # The main prompt with the list of reviews and list of topic key words
        main_prompt = f"""
[INST]
I have a topic that contains the following documents (from customers reviews):
{comments_list}

The topic is described by the following keywords: '{key_list}'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
        """
        # Build input prompt
        prompt = base_prompt + main_prompt

        output = generate(model=model, prompt=prompt)
        # Strip surrounding whitespace so identical labels are not split apart
        # when they are counted or compared later.
        label = str(output['response']).strip()
        topics_info.loc[is_topic_row, 'short_label'] = label

        print('Output: ', label)
    return topics_info

In [None]:
# Generate an LLM short label for every topic with id > -1; the result carries a 'short_label' column
topics_info_output = get_short_labels(
    df=df,
    topics_info=topics_info,
    qty_comments_prompt=qty_comments_prompt,
    base_prompt=base_prompt,
)

In [None]:
# How many topics received each generated label
topics_info_output['short_label'].value_counts()