# **InsightMind - Topic Modeling Pipeline**

This data pipeline starts from preloaded Google Maps data scraped with Apify, then follows these steps:


*   Data Processing
*   BERTopic Model
*   Representation model for Keyword extraction
*   Plain-English descriptions for topics



In [None]:
# Libraries
import pandas as pd
import os
import numpy as np
import tqdm
import datetime

# Read data

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV exports below can be read from it.
drive.mount('/content/drive')

# --- Batch 1: reviews, list and info files ---
file_path1 = '/content/drive/My Drive/notebooks/data/reviews.csv'
file_path2 = '/content/drive/My Drive/notebooks/data/list1.csv'
file_path3 = '/content/drive/My Drive/notebooks/data/info.csv'
reviews_df = pd.read_csv(file_path1)
list_df = pd.read_csv(file_path2)
info_df = pd.read_csv(file_path3)

# --- Batch 2: reviews, list and info files ---
file_path12 = '/content/drive/My Drive/notebooks/data/reviews2.csv'
file_path22 = '/content/drive/My Drive/notebooks/data/list2.csv'
file_path32 = '/content/drive/My Drive/notebooks/data/info2.csv'
reviews_df2 = pd.read_csv(file_path12)
list_df2 = pd.read_csv(file_path22)
info_df2 = pd.read_csv(file_path32)

# --- Batch 3: reviews and list files only ---
file_path13 = '/content/drive/My Drive/notebooks/data/reviews3.csv'
file_path23 = '/content/drive/My Drive/notebooks/data/list3.csv'
reviews_df3 = pd.read_csv(file_path13)
list_df3 = pd.read_csv(file_path23)

# --- Batch 4: reviews and list files only ---
file_path14 = '/content/drive/My Drive/notebooks/data/reviews4.csv'
file_path24 = '/content/drive/My Drive/notebooks/data/list4.csv'
reviews_df4 = pd.read_csv(file_path14)
list_df4 = pd.read_csv(file_path24)


Mounted at /content/drive


# Clean and Process the data

In [None]:
# Column subsets taken from each raw export.
review_cols = ['title', 'publishedAtDate', 'stars', 'text']
list_cols = ['title', 'categoryName', 'address']

reviews = reviews_df[review_cols].copy()
reviews2 = reviews_df2[review_cols].copy()
reviews3 = reviews_df3[review_cols].copy()
reviews4 = reviews_df4[review_cols].copy()

cs_list = list_df[list_cols].copy()
cs_list2 = list_df2[list_cols].copy()
cs_list3 = list_df3[list_cols].copy()
cs_list4 = list_df4[list_cols].copy()
info = info_df[['title', 'neighborhood', 'location/lat', 'location/lng']].copy()

# Stack the four review batches in one concat call.
combined_reviews = pd.concat([reviews, reviews2, reviews3, reviews4], ignore_index=True)

# Stack the four list exports. The full frames are concatenated here (not the
# cs_list* subsets), which keeps the 'location/lat' / 'location/lng' columns
# that the merge in the next cell selects.
combined_list = pd.concat([list_df, list_df2, list_df3, list_df4], ignore_index=True)

# Row count before de-duplication, used to report how many rows were dropped.
initial_rows = combined_reviews.shape[0]

# A review is a duplicate when both its publication timestamp and its text match
# an earlier row; the first occurrence is kept.
combined_reviews_deduplicated = combined_reviews.drop_duplicates(subset=['publishedAtDate', 'text'])

# NOTE(review): list rows are de-duplicated on 'title' alone, so rows that share
# a title collapse to the first one — confirm this is intended for places with
# several locations under the same name.
combined_list_deduplicated = combined_list.drop_duplicates(subset=['title'])

rows_after_deduplication = combined_reviews_deduplicated.shape[0]
duplicated_records_count = initial_rows - rows_after_deduplication

# The message names the actual de-duplication key (it previously said 'text' only).
print(f"Number of duplicated records based on 'publishedAtDate' and 'text' columns: {duplicated_records_count}")

# Working copy used by the cells that follow.
reviews_def = combined_reviews_deduplicated.copy()

Number of duplicated records based on 'text' column: 3959


In [None]:
# Attach category and coordinates to every review and normalise column names.
# The chain starts from combined_reviews_deduplicated (which reviews_def was
# copied from) rather than from reviews_def itself, so re-running this cell
# rebuilds the frame instead of merging the lookup columns in a second time.
place_lookup = combined_list_deduplicated[['title', 'categoryName', 'location/lat', 'location/lng']]

reviews_def = (
    combined_reviews_deduplicated
    # validate guards the assumption that each title appears once in the lookup.
    .merge(place_lookup, on='title', how='left', validate='many_to_one')
    # Keep only the calendar date, formatted as YYYY-MM-DD text.
    .assign(publishedAtDate=lambda df: pd.to_datetime(df['publishedAtDate']).dt.strftime('%Y-%m-%d'))
    # Reviews without text cannot be used for topic modelling.
    .dropna(subset=['text'])
    .rename(columns={
        'publishedAtDate': 'review_date',
        'categoryName': 'category',
        'location/lat': 'lat',
        'location/lng': 'long',
    })
)

In [None]:
# Show full review text instead of truncating it in the rendered table.
pd.set_option('display.max_colwidth', None)

# Compare the *length* of the text with 1000; comparing the string column itself
# with an integer raises TypeError.
long_reviews = reviews_def[reviews_def['text'].str.len() >= 1000]

# sample() raises when asked for more rows than exist, so cap the request.
long_reviews.sample(min(5, len(long_reviews)))

Unnamed: 0,title,review_date,stars,text,category,lat,long,text_length
3124,Balzac's Powerhouse,2024-02-05,5,"Es un lugar el cual se tiene que ir ya qué es una cafetería original de canadá por lo tanto es una parada obligatoria qué hacer, el lugar en cuanto a diseño es muy original muy estético y acogedor lo cual lo hace excelente para pasar un buen rato solo o acompañado el servicio y atención es muy buena y La calidad de los productos también son muy buenas",Cafe,43.66964,-79.448545,353


In [None]:
# Number of reviews per star rating (value_counts orders by frequency, highest first).
star_counts = reviews_def['stars'].value_counts()
display(star_counts)

Unnamed: 0_level_0,count
stars,Unnamed: 1_level_1
5,4880
1,682
4,544
3,306
2,265


In [None]:
# List the distinct business categories present after the merge.
reviews_def['category'].unique()

array(['Coffee shop', 'Cafe', 'Bakery', 'Coffee roasters',
       'Gourmet grocery store', 'Churreria', 'Espresso bar', 'Bagel shop',
       'Bicycle Shop', 'Creperie', 'Vintage clothing store', 'Bistro',
       'Chocolate factory', 'Cake shop', 'Venezuelan restaurant',
       'Ice cream shop', 'Pizza restaurant', 'Donut shop', 'Coffee store'],
      dtype=object)

In [None]:
# Exclude reviews titled 'Tim Hortons' or 'Starbucks', and keep only the
# coffee-related categories listed below.
excluded_titles = ['Tim Hortons', 'Starbucks']
coffee_categories = ['Coffee shop', 'Cafe', 'Coffee roasters', 'Espresso bar', 'Coffee store']

title_allowed = ~reviews_def['title'].isin(excluded_titles)
category_allowed = reviews_def['category'].isin(coffee_categories)
reviews_def = reviews_def[title_allowed & category_allowed]

In [None]:
# Count the distinct values in the 'title' column of the filtered reviews
# (nunique leaves missing values out of the count by default).
unique_title_count = reviews_def['title'].nunique()

print(f"Total number of unique titles: {unique_title_count}")

Total number of unique titles: 204


In [None]:
# Inclusive bounds, in characters, on the review text kept for modelling.
MIN_TEXT_LENGTH = 50
MAX_TEXT_LENGTH = 1200

# .assign returns a new frame, so the column is not written onto the
# boolean-filtered reviews_def produced by the previous cell (the pattern that
# triggers pandas' SettingWithCopyWarning).
reviews_def = reviews_def.assign(text_length=reviews_def['text'].str.len())

# between() is inclusive on both ends, matching ">= 50" and "<= 1200".
length_ok = reviews_def['text_length'].between(MIN_TEXT_LENGTH, MAX_TEXT_LENGTH)
reviews_def = reviews_def[length_ok].reset_index(drop=True)

In [None]:
import re
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space, then trim the ends of each review.
collapsed_text = reviews_def['text'].str.replace(r'\s+', ' ', regex=True)
reviews_def['text'] = collapsed_text.str.strip()

# Function to remove emojis/emoticons
# Built once when the cell runs instead of on every call.
# Each matched emoji may be followed by variation selector-16 (U+FE0F) or a
# zero-width joiner (U+200D); those invisible code points are removed with it so
# they are not left behind in the cleaned text.
_EMOJI_PATTERN = re.compile(
    "(?:"
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002700-\U000027BF"  # dingbats
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U0001FA70-\U0001FAFF"  # extended symbols (like 🫶🏽)
    "\U00002500-\U00002BEF"  # box drawing, geometric shapes, arrows, misc symbols
    "]"
    "[\uFE0F\u200D]*"
    ")+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any character in the emoji ranges above, and
        without the variation selectors / zero-width joiners that directly
        follow such a character. All other characters are left untouched.

    Raises:
        TypeError: If ``text`` is not a string (for example NaN).
    """
    return _EMOJI_PATTERN.sub('', text)

# Run the emoji stripper over every review and store the cleaned text back.
reviews_def['text'] = reviews_def['text'].map(remove_emojis)

In [None]:
# Split by star rating: 4-5 stars form the positive set, 1-3 stars the negative set.
is_positive = reviews_def['stars'].isin([4, 5])
is_negative = reviews_def['stars'].isin([1, 2, 3])

positive_reviews = reviews_def.loc[is_positive].reset_index(drop=True)
negative_reviews = reviews_def.loc[is_negative].reset_index(drop=True)

In [None]:
# Report how many reviews with text ended up in each split.
n_positive = positive_reviews['text'].count()
n_negative = negative_reviews['text'].count()
print("positive reviews: ", n_positive, "   Negative reviews: ", n_negative)

positive reviews:  4601    Negative reviews:  1172


## BERTopic

In [None]:
# Install BERTopic and its dependencies
!pip install bertopic umap-learn hdbscan -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m143.4/153.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from umap import UMAP
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
from hdbscan import HDBSCAN

In [None]:
# Dimensionality reduction: 5 components, cosine metric, random_state fixed at 42.
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Density-based clustering on the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=8,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Unigram and bigram counts with English stop words removed.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Sentence-embedding model used to encode the reviews.
sentence_model = SentenceTransformer("all-MiniLM-L12-v2")

In [None]:
# Encode the positive reviews. encode() is documented to take a string or a list
# of strings, so pass a plain list instead of a pandas Series (whose integer
# indexing is label-based and only matches positions while the index is 0..n-1).
embeddings_p = sentence_model.encode(positive_reviews['text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/144 [00:00<?, ?it/s]

In [None]:
# Encode the negative reviews. encode() is documented to take a string or a list
# of strings, so pass a plain list instead of a pandas Series (whose integer
# indexing is label-based and only matches positions while the index is 0..n-1).
embeddings_n = sentence_model.encode(negative_reviews['text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

## Representation Model

In [None]:
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired, PartOfSpeech
import spacy

# KeyBERT-inspired representation with default settings.
main_representation_model = KeyBERTInspired()
# Part-of-speech representation using the spaCy "en_core_web_sm" model, 15 words per topic.
pos = PartOfSpeech("en_core_web_sm", top_n_words=15)
# KeyBERT-inspired representation returning 15 words per topic.
keybert_model = KeyBERTInspired(top_n_words=15)
# Maximal Marginal Relevance with diversity 0.5, returning 5 words per topic.
mmr = MaximalMarginalRelevance(diversity=0.5, top_n_words = 5)

# The three models above collected in a list, in this order.
# NOTE(review): BERTopic documents a list as a chain applied in sequence — confirm.
pos_key_mmr = [pos, keybert_model, mmr]

In [None]:
# Named topic representations handed to BERTopic; each key becomes a column in
# get_topic_info().
# NOTE(review): the recorded get_topic_info() output shows "main" as its own
# column next to the default "Representation" column, i.e. it is treated as one
# more aspect. BERTopic's documentation uses the capitalised key "Main" for the
# primary representation — confirm which behaviour is intended before renaming,
# since that would change topic names and remove the "main" column.
representation_model = {
    "main": main_representation_model,
    "part_of_speech": pos,
    "keyb": keybert_model,
    "mix": pos_key_mmr
}

In [None]:
from sklearn.base import clone

# Topic model for the positive reviews, fitted on the precomputed embeddings.
# UMAP, HDBSCAN and the vectorizer are cloned (same parameters, unfitted) so this
# model owns its fitted components. The negative-review model is built from the
# same umap_model / hdbscan_model / vectorizer_model objects, and fitting it
# would otherwise refit the instances this model holds.
topic_model_p = BERTopic(
    embedding_model=sentence_model,
    umap_model=clone(umap_model),
    hdbscan_model=clone(hdbscan_model),
    vectorizer_model=clone(vectorizer_model),
    calculate_probabilities=True,
    representation_model=representation_model
    )

# Documents are passed as a plain list of strings, in the same order as embeddings_p.
topics_p, probs_p = topic_model_p.fit_transform(positive_reviews['text'].tolist(), embeddings_p)

In [None]:
from sklearn.base import clone

# Topic model for the negative reviews, fitted on the precomputed embeddings.
# UMAP, HDBSCAN and the vectorizer are cloned (same parameters, unfitted) so
# fitting this model does not refit the umap_model / hdbscan_model /
# vectorizer_model objects that the positive-review model was built from.
topic_model_n = BERTopic(
    embedding_model=sentence_model,
    umap_model=clone(umap_model),
    hdbscan_model=clone(hdbscan_model),
    vectorizer_model=clone(vectorizer_model),
    calculate_probabilities=True,
    representation_model=representation_model
    )

# Documents are passed as a plain list of strings, in the same order as embeddings_n.
topics_n, probs_n = topic_model_n.fit_transform(negative_reviews['text'].tolist(), embeddings_n)

In [None]:
# Limit displayed cell text to 150 characters so the list-valued columns stay readable.
pd.set_option('display.max_colwidth', 150)
# Summary table for the negative-review topic model.
topic_model_n.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,main,part_of_speech,keyb,mix,Representative_Docs
0,-1,542,-1_coffee_service_like_order,"[coffee, service, like, order, just, staff, place, good, time, drink]","[coffee shop, coffee, cafe, barista, customer service, latte, espresso, customers, customer, drinks]","[coffee, service, order, staff, place, good, time, drink, customer, experience, milk, food, drinks, people, cafe]","[coffee shop, coffee, cafe, barista, customer service, latte, espresso, customers, customer, drinks, taste, tasted, review, rude, service]","[coffee shop, customer service, latte, tasted, rude]","[I really really wanted to like this place because it is so close to my home, we desperately need a coffee spot there, and it has a really cool vi..."
1,0,105,0_rude_coffee_barista_staff,"[rude, coffee, barista, staff, customers, asked, just, attitude, service, customer]","[barista, rude staff, baristas, customer service, coffee shop, cafe, customer, customers, coffee, rude]","[rude, coffee, barista, staff, customers, attitude, service, customer, place, time, owner, experience, bad, drinks, drink]","[barista, rude staff, baristas, customer service, coffee shop, cafe, customer, customers, coffee, rude, latte, staff, shop, racist, service]","[rude staff, baristas, coffee shop, customer, racist]","[Had a poor experience at Dineen Coffee Co. The man taking my order (mustache, not in uniform) was rude ignored my “good morning” and snapped at m..."
2,1,104,1_coffee_place_nice_great,"[coffee, place, nice, great, good, cafe, really, atmosphere, shop, like]","[coffee shop, café, cafe, coffee mediocre, place coffee, coffee good, great coffee, coffee, espresso, latte]","[coffee, place, nice, great, good, cafe, atmosphere, shop, staff, time, small, better, seating, friendly, food]","[coffee shop, café, cafe, coffee mediocre, place coffee, coffee good, great coffee, coffee, espresso, latte, shop, place, drinks, cozy, cup]","[coffee shop, coffee mediocre, espresso, latte, cozy]","[This café near Lansdowne is easy to reach, though the location can be a bit confusing. It has a chill atmosphere with good music, and a family-fr..."
3,2,97,2_matcha_matcha latte_latte_milk,"[matcha, matcha latte, latte, milk, drink, like, iced, strawberry, drinks, powder]","[matcha latte, matcha lattes, matcha flavour, make matcha, iced matcha, strawberry matcha, matcha powder, matcha, quality matcha, lattes]","[matcha, matcha latte, latte, milk, drink, iced, strawberry, drinks, powder, flavour, matcha powder, cup, iced matcha, ice, good]","[matcha latte, matcha lattes, matcha flavour, make matcha, iced matcha, strawberry matcha, matcha powder, matcha, quality matcha, lattes, latte, w...","[matcha lattes, strawberry matcha, quality matcha, worst matcha, neo coffee]","[My boyfriend and I loved the vibes and atmosphere of the Rooms Coffee. However, we were super surprised how they use such low grade Matcha for th..."
4,3,55,3_latte_milk_like_chai,"[latte, milk, like, chai, tasted, ordered, tea, taste, tasted like, pistachio]","[iced latte, latte, latte just, ordered latte, lattes, saffron latte, chai latte, pistachio flavour, chai taste, flavor expected]","[latte, milk, chai, tea, taste, pistachio, oat, iced, drink, flavour, coffee, good, art, flavor, espresso]","[iced latte, latte, latte just, ordered latte, lattes, saffron latte, chai latte, pistachio flavour, chai taste, flavor expected, hojicha latte, c...","[iced latte, saffron latte, pistachio flavour, chai taste, hojicha latte]","[I came here today for the first time , I ordered a pistachio- saffron latte and butter croissant. I’ve had pistachio lattes many times and they u..."
5,4,53,4_sandwich_food_chicken_breakfast,"[sandwich, food, chicken, breakfast, bacon, egg, didn, got, sandwiches, place]","[meals, breakfast family, breakfast, breakfast sandwich, baked goods, egg sandwich, afford food, food, eggs bacon, avocado toast]","[sandwich, food, chicken, breakfast, bacon, egg, sandwiches, place, bread, menu, eggs, allergic, toast, ingredients, service]","[meals, breakfast family, breakfast, breakfast sandwich, baked goods, egg sandwich, afford food, food, eggs bacon, avocado toast, cake ordered, sa...","[breakfast sandwich, baked goods, afford food, avocado toast, eggs]",[I visited this place on February 11th and paid $14 +2.99 (for the egg) for the tiny avocado toast you see in the picture—plus tax. When I asked w...
6,5,42,5_coffee_beans_espresso_like,"[coffee, beans, espresso, like, taste, roasted, pour, good, jet fuel, fuel]","[special coffee, speciality coffee, coffee beans, coffee, coffee great, brewed coffee, coffee taste, coffee tastes, espresso horrible, coffee quite]","[coffee, beans, espresso, taste, roasted, pour, good, great, cup, americano, ice, disappointing, large, okay, paper]","[special coffee, speciality coffee, coffee beans, coffee, coffee great, brewed coffee, coffee taste, coffee tastes, espresso horrible, coffee quit...","[speciality coffee, brewed coffee, coffee tastes, choice beans, espresso shot]",[Very minimalist vibe was happy to wait for two Large Black Americanos as they don’t serve drip coffee. They do have several sugary variations. No...
7,6,38,6_cappuccino_milk_coffee_ordered,"[cappuccino, milk, coffee, ordered, whipped cream, whipped, cup, got, drink, like]","[coffee cappuccino, make cappuccino, cappuccino really, cappuccino regular, cappuccino tasted, cappuccino, ordered cappuccino, today cappuccino, c...","[cappuccino, milk, coffee, cup, drink, cream, hot, disappointing, taste, today, sure, double, temperature, options, better]","[coffee cappuccino, make cappuccino, cappuccino really, cappuccino regular, cappuccino tasted, cappuccino, ordered cappuccino, today cappuccino, c...","[coffee cappuccino, regular milk, cafe, latte, caphe]",[Gluten-Free Options: Yes Cross Contamination: Unsure Halal/ Kosher Options: No Vegan Option: Unsure Vegetarian Options: Yes Nut Free Options: Yes...
8,7,36,7_croissant_croissants_good_fresh,"[croissant, croissants, good, fresh, baked, cold, cheese, almond, warm, got]","[croissant good, croissant, croissants fresh, chocolate croissant, croissant fresh, place croissants, croissants, almond croissant, coming croissa...","[croissant, croissants, good, fresh, baked, cold, cheese, almond, warm, chocolate, french, stale, place, pastry, coffee]","[croissant good, croissant, croissants fresh, chocolate croissant, croissant fresh, place croissants, croissants, almond croissant, coming croissa...","[croissant fresh, croissants, bakery, pastries, coffee good]","[Used to go here in 2020, their croissants were the best. Now, not only they aren't as good as they used to be but the service is even worse. I or..."
9,8,33,8_coffee_prices_price_latte,"[coffee, prices, price, latte, good, 10, iced, 50, place, small]","[specialty coffee, best coffee, coffee shop, coffee place, starbucks, small coffee, coffee, good coffee, vanilla latte, prices]","[coffee, prices, price, latte, good, iced, place, small, overpriced, cup, iced latte, expensive, regular, shop, guys]","[specialty coffee, best coffee, coffee shop, coffee place, starbucks, small coffee, coffee, good coffee, vanilla latte, prices, iced latte, latte,...","[specialty coffee, coffee place, starbucks, prices, iced latte]",[This place is interesting. I was shocked these guys are charging airport prices for coffee!?! $10 dollars for two very lukewarm coffees - $1.50 f...
