In [1]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from restaurentpy.data import ReviewData
from cleantext import clean

  from .autonotebook import tqdm as notebook_tqdm
Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


# 1. Read Data ----

In [2]:
df = ReviewData(path='/Volumes/Macintosh HD/AI World/Review Data/BigPlate/', pat='xlsx').etl_review()
df['translate_review'] = df['review_text'].apply(lambda x: clean(x, no_emoji=True))

# 2. Bertopic ----

In [3]:
representation_model = KeyBERTInspired()

topic_model = BERTopic(representation_model = representation_model)

print('Training topic model for reviews...')
topics, ini_probs = topic_model.fit_transform(list(df.translate_review.values))

df['topic'] = topics
df['topic_prob'] = ini_probs

topics_info = topic_model.get_topic_info()

Training topic model for reviews...


In [4]:
topic_number = 3

print( 'Reviews: \n' ,'\n'.join([str(elem) for elem in \
                                df[df['topic']==topic_number]\
                                    .sort_values('topic_prob', ascending=False)\
                                    .drop_duplicates(subset=['review_text'])['review_text']\
                                        .head(4).values])
)

print( '\nKey words: ' ,', '.join([str(elem) for elem in \
    topics_info.loc[topics_info['Topic']==topic_number, 'Representation'].values[0]]))

Reviews: 
 Weru good friendly service by Ifthikar
Tasty food
Recommend
Food & service very good
(Ifthikar)
Good service given by Ifthikar, Quality food ! Would recommend.
Good quality food good service and affordable prices. Highly recommend. Crew member Iftikar provided good service

Key words:  ifthikars, ifthikar, ifthkar, ithikar, foods, meal, food, restaurant, service, served


In [15]:
topics_info.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,569,-1_restaurant_dining_meal_dinner,"[restaurant, dining, meal, dinner, burger, foo...",[placed in marine drive and parking your vehic...
1,0,157,0_akram_akrams_foods_service,"[akram, akrams, foods, service, services, food...","[good food\ngood service (akram), akram good s..."
2,1,147,1_akram_akramservices_akrams_akramhe,"[akram, akramservices, akrams, akramhe, akramg...","[good service akram, good service akram, good ..."
3,2,141,2_rajendran_mrrajendran_rajendrans_mrrajendrans,"[rajendran, mrrajendran, rajendrans, mrrajendr...",[the food is very good and mr rajendran's serv...
4,3,98,3_ifthikars_ifthikar_ifthkar_ithikar,"[ifthikars, ifthikar, ifthkar, ithikar, foods,...","[good good food good service ifthikar, food & ..."
5,4,95,4_nan___,"[nan, , , , , , , , , ]","[nan, nan, nan]"
6,5,92,5_plate_plates_bigplate_dishes,"[plate, plates, bigplate, dishes, dinner, rest...",[i had a wonderful dining experience at big pl...
7,6,86,6_foods_food_burger_seafood,"[foods, food, burger, seafood, rice, chicken, ...",[food had great value and taste. we ordered ch...
8,7,78,7_shawarma_shawarmas_shawrma_beef,"[shawarma, shawarmas, shawrma, beef, restauran...","[so, i and my friend are fellow foodies. we we..."
9,8,73,8_restaurant_waiter_customers_food,"[restaurant, waiter, customers, food, staff, m...","[i came with my friends to take our dinner , a..."


In [5]:
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

In [8]:
# Example prompt 
example_prompt = """
I have a topic that contains the following documents (from customers reviews):
- Not good. Fries were stale.
- There burgers has gotten worse than ever but their fries are still good.
- Fries were too salty.

The topic is described by the following keywords: 'fries, fry, burger, burgers, fried, cold, frozen, cooked, greasy, warm'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

[/INST] bad food
"""

base_prompt = system_prompt + example_prompt

In [11]:
import ollama

# Quantity of reviews
qty_comments_prompt = 10

def get_short_labels(df, topics_info, qty_comments_prompt, base_prompt):
    for topic in topics_info.loc[topics_info['Topic'] > -1, 'Topic']:
        print(topic)
        # X reviews with the higher probability. It doesn't consider the duplicated values
        comments_list = '\n- '.join([str(elem) for elem in \
                                df[df['topic']==topic].sort_values('topic_prob', ascending=False)\
                                    .drop_duplicates(subset=['translate_review'])['translate_review']\
                                        .head(qty_comments_prompt).values])
        
        # Key words list from topic        
        key_list = ', '.join([str(elem) for elem in topics_info.loc[topics_info['Topic']==topic, 'Representation'].values[0]])

        # The main prompt with the list of reviews and list of topic key words
        main_prompt = f"""
[INST]
I have a topic that contains the following documents (from customers reviews):
{comments_list}

The topic is described by the following keywords: '{key_list}'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
        """
        # Build input prompt
        prompt = base_prompt + main_prompt
       
        output = ollama.generate(model='llama3', prompt=prompt)
        topics_info.loc[topics_info['Topic']==topic, 'short_label'] = output['response']

        print('Output: ', output['response'])
    return topics_info

In [18]:
# Training topic model with Negative customer reviews
topics_info_output = get_short_labels(df, topics_info.head(5), qty_comments_prompt, base_prompt)

0
Output:  [/INST] good restaurant
1
Output:  [/INST] good customer service
2
Output:  [/INST] good dining experience at mr. rajendran's restaurant
3
Output:  [/INST] positive customer reviews


In [None]:
topics_info_output.short_label.value_counts()