In [2]:
# import libraries
# import transforms
# from transforms.api import Input, Output, transform
from bertopic import BERTopic
import numpy as np
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
import ast
# from pyspark.sql.types import StructType, StructField, IntegerType, ArrayType, StringType, DoubleType, FloatType
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
import spacy
from bertopic.vectorizers import ClassTfidfTransformer
# import en_core_web_sm
from sentence_transformers import SentenceTransformer
# from transforms.external.systems import use_external_systems, EgressPolicy, Credential, ExportControl
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def getSentimentGroup(compound):
    """Map a compound sentiment score to a coarse sentiment label.

    The range [-1, 1] is split into six buckets at -0.66, -0.33, 0, 0.33
    and 0.66. Every bucket is closed on its lower bound and open on its
    upper bound, except the top bucket, which also includes 1 so that the
    most positive score is labelled instead of falling through.

    Parameters
    ----------
    compound : float
        Compound sentiment score, expected to lie in [-1, 1].

    Returns
    -------
    str or None
        One of 'Very Good', 'Good', 'Neutral Postive', 'Neutral Negative',
        'Bad', 'Very Bad'. None is returned when the value matches no
        bucket (outside [-1, 1], or NaN).
    """
    # The upper bound is inclusive here: a score of exactly 1 used to match no
    # branch at all and came back as None.
    if 0.66 <= compound <= 1:
        return 'Very Good'
    elif 0.33 <= compound < 0.66:
        return 'Good'
    elif 0 <= compound < 0.33:
        # NOTE(review): 'Postive' is misspelled, but the label is kept
        # byte-identical because existing outputs may already contain it.
        return 'Neutral Postive'
    elif -0.33 <= compound < 0:
        return 'Neutral Negative'
    elif -0.66 <= compound < -0.33:
        return 'Bad'
    elif -1 <= compound < -0.66:
        return 'Very Bad'
    # Anything else (out of range, NaN) is deliberately left unlabelled.
    return None
       
 
def getSentimentDict(sentence):
    """Return the full polarity-score result for one piece of text.

    Despite the historical local name `compound`, the whole object returned
    by `polarity_scores` is handed back, not just the compound value; callers
    pick out `['compound']` themselves.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    The value returned by `SentimentIntensityAnalyzer.polarity_scores`.
    """
    # Build the analyzer once and reuse it: this function is applied to every
    # row of the DataFrame, and constructing a fresh analyzer per call repeats
    # the same set-up work each time.
    analyzer = getattr(getSentimentDict, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        getSentimentDict._analyzer = analyzer
    return analyzer.polarity_scores(sentence)


# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
n_neighbors = 15
min_topic_size = 15
min_cluster_size = 15
top_n_words = 10
diversity = 0.9

# ---------------------------------------------------------------------------
# Pipeline components
# ---------------------------------------------------------------------------
# Dimensionality reduction: 5 components, cosine metric, fixed random_state.
umap_model = UMAP(
    n_neighbors=n_neighbors,
    n_components=5,
    min_dist=0,
    metric='cosine',
    random_state=42,
)

# Clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Bag-of-words tokeniser: English stop words removed, uni- and bi-grams,
# terms must appear in at least two documents.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

# Topic representation models.
keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity)

# spaCy pipeline is loaded here; `nlp` is not referenced again in this cell.
nlp = spacy.load("en_core_web_sm")
pos = PartOfSpeech(top_n_words=top_n_words)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Every representation is computed side by side under its own key.
representation_model = dict(KeyBERT=keybert_model, MMR=mmr_model, pos=pos)

# Sentence encoder, pinned to CPU.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")

# ---------------------------------------------------------------------------
# Topic model
# ---------------------------------------------------------------------------
# NOTE(review): `vectorizer_model`, `ctfidf_model` and `min_topic_size` are
# configured above but are NOT passed to BERTopic below (and `top_n_words`
# only reaches PartOfSpeech), so the model falls back to its own defaults for
# them. Confirm whether that is intentional before wiring them in -- doing so
# would change the resulting topics.
model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
)


# Load the documents together with their precomputed embeddings.
df = pd.read_csv('input/sentence_embedded.csv') # change the filename to your .csv file name and location

# Both columns are required further down: `text` feeds the topic model and the
# sentiment scoring, `embedding` feeds the topic model. Rows missing either one
# are dropped up front so that no NaN reaches code that expects a string.
df = df.dropna(subset=['text', 'embedding'])

# The embedding column is stored as text in the CSV; parse each cell back into
# a Python literal (presumably a list of floats -- TODO confirm against the
# file that produces this CSV).
df['embedding'] = df['embedding'].apply(ast.literal_eval)

docs = df['text'].to_list()

# One conversion for the whole column instead of one array per row.
embeddings = np.array(df['embedding'].to_list(), dtype=np.float32)

# Fit on the precomputed embeddings and keep the per-document assignments.
topics, probs = model.fit_transform(docs, embeddings)
df["topic"] = topics
df["probability"] = probs

# Sentiment: keep the raw score object, its compound value, and the bucket
# label derived from that compound value as three separate columns.
df['sentiment_analysis'] = df['text'].apply(getSentimentDict)
df['sentiment_compound'] = df['sentiment_analysis'].apply(lambda scores: scores['compound'])
df['sentiment'] = df['sentiment_compound'].apply(getSentimentGroup)

2024-08-27 11:31:52,812 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-27 11:32:32,150 - BERTopic - Dimensionality - Completed ✓
2024-08-27 11:32:32,152 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-27 11:32:32,759 - BERTopic - Cluster - Completed ✓
2024-08-27 11:32:32,768 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-27 11:32:57,223 - BERTopic - Representation - Completed ✓


In [8]:
# Per-topic summary table of the fitted model.
out = model.get_topic_info()

print(out)

# Persist the summary for later cells / external tools.
# NOTE(review): the index is written too (pandas default), so reading this
# file back without `index_col` yields an extra unnamed first column --
# confirm whether downstream readers rely on that before changing it.
out.to_csv(path_or_buf="temp_files/topic_info.csv")

     Topic  Count                              Name  \
0       -1   3912                  -1_and_the_to_we   
1        0    451  0_location_shopping_walk_walking   
2        1    353                    1_de_et_le_est   
3        2    352       2_experience_say_it_overall   
4        3    309        3_shower_bathroom_tub_bath   
..     ...    ...                               ...   
102    101     16       101_she_standing_health_her   
103    102     16            102_name_names_by_knew   
104    103     15     103_drawers_desk_closet_couch   
105    104     15    104_nights_august_spent_during   
106    105     15     105_issues_minor_few_complain   

                                        Representation  \
0    [and, the, to, we, hotel, was, of, in, with, for]   
1    [location, shopping, walk, walking, distance, ...   
2        [de, et, le, est, la, und, un, les, pour, au]   
3    [experience, say, it, overall, expectations, t...   
4    [shower, bathroom, tub, bath, toilet, marble

In [25]:
# Work on an independent copy of the topic summary. `pd.DataFrame(out)` does
# not copy DataFrame input by default, so depending on the pandas version the
# in-place edits made to `topic_info_df` in later cells could show up in `out`
# as well.
topic_info_df = out.copy()

topic_info_df.head()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,pos,Representative_Docs
0,-1,3912,-1_and_the_to_we,"[and, the, to, we, hotel, was, of, in, with, for]","[hotel, hotels, rooms, room, restaurant, staye...","[and, the, to, we, hotel, was, of, in, with, for]","[hotel, room, service, stay, nice, location, d...",[Having done a great deal of research about Ne...
1,0,451,0_location_shopping_walk_walking,"[location, shopping, walk, walking, distance, ...","[downtown, location, near, located, place, mid...","[location, shopping, walk, walking, distance, ...","[location, shopping, walk, distance, restauran...",[Close to great shopping and restaurants this ...
2,1,353,1_de_et_le_est,"[de, et, le, est, la, und, un, les, pour, au]","[hôtel, pas, dans, une, bien, être, de, le, pe...","[de, et, le, est, la, und, un, les, pour, au]","[et, und, pour, au, très, vous, sont, chambres...","[Bonjour, J'ai séjourné dans cette hôtel en ao..."
3,2,352,2_experience_say_it_overall,"[experience, say, it, overall, expectations, t...","[experience, overall, wonderful, amazing, grea...","[experience, say, it, overall, expectations, t...","[experience, overall, expectations, impressed,...","[Overall a great experience, it was really a w..."
4,3,309,3_shower_bathroom_tub_bath,"[shower, bathroom, tub, bath, toilet, marble, ...","[bathrooms, bathroom, shower, bath, bathtub, s...","[shower, bathroom, tub, bath, toilet, marble, ...","[shower, bathroom, tub, bath, toilet, marble, ...","[Bathroom has separate bath and shower, The ba..."


In [26]:
def _strip_topic_prefix(topic_id, name):
    """Remove a leading '<topic_id>_' from `name`; return `name` unchanged otherwise."""
    prefix = f"{topic_id}_"
    return name[len(prefix):] if name.startswith(prefix) else name


# Drop the numeric topic id that prefixes each name (e.g. '3_shower_bathroom'
# -> 'shower_bathroom'). Only the exact '<Topic>_' prefix is removed, so
# running this cell a second time is a no-op: it does not eat the first
# keyword, and it does not fail once the underscores have been replaced.
topic_info_df['Name'] = [
    _strip_topic_prefix(topic_id, name)
    for topic_id, name in zip(topic_info_df['Topic'], topic_info_df['Name'])
]

topic_info_df.head()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,pos,Representative_Docs
0,-1,3912,and_the_to_we,"[and, the, to, we, hotel, was, of, in, with, for]","[hotel, hotels, rooms, room, restaurant, staye...","[and, the, to, we, hotel, was, of, in, with, for]","[hotel, room, service, stay, nice, location, d...",[Having done a great deal of research about Ne...
1,0,451,location_shopping_walk_walking,"[location, shopping, walk, walking, distance, ...","[downtown, location, near, located, place, mid...","[location, shopping, walk, walking, distance, ...","[location, shopping, walk, distance, restauran...",[Close to great shopping and restaurants this ...
2,1,353,de_et_le_est,"[de, et, le, est, la, und, un, les, pour, au]","[hôtel, pas, dans, une, bien, être, de, le, pe...","[de, et, le, est, la, und, un, les, pour, au]","[et, und, pour, au, très, vous, sont, chambres...","[Bonjour, J'ai séjourné dans cette hôtel en ao..."
3,2,352,experience_say_it_overall,"[experience, say, it, overall, expectations, t...","[experience, overall, wonderful, amazing, grea...","[experience, say, it, overall, expectations, t...","[experience, overall, expectations, impressed,...","[Overall a great experience, it was really a w..."
4,3,309,shower_bathroom_tub_bath,"[shower, bathroom, tub, bath, toilet, marble, ...","[bathrooms, bathroom, shower, bath, bathtub, s...","[shower, bathroom, tub, bath, toilet, marble, ...","[shower, bathroom, tub, bath, toilet, marble, ...","[Bathroom has separate bath and shower, The ba..."


In [27]:
# Make the topic names human-readable: underscores become spaces.
spaced_names = topic_info_df['Name'].str.replace('_', ' ')
topic_info_df['Name'] = spaced_names

topic_info_df.head()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,pos,Representative_Docs
0,-1,3912,and the to we,"[and, the, to, we, hotel, was, of, in, with, for]","[hotel, hotels, rooms, room, restaurant, staye...","[and, the, to, we, hotel, was, of, in, with, for]","[hotel, room, service, stay, nice, location, d...",[Having done a great deal of research about Ne...
1,0,451,location shopping walk walking,"[location, shopping, walk, walking, distance, ...","[downtown, location, near, located, place, mid...","[location, shopping, walk, walking, distance, ...","[location, shopping, walk, distance, restauran...",[Close to great shopping and restaurants this ...
2,1,353,de et le est,"[de, et, le, est, la, und, un, les, pour, au]","[hôtel, pas, dans, une, bien, être, de, le, pe...","[de, et, le, est, la, und, un, les, pour, au]","[et, und, pour, au, très, vous, sont, chambres...","[Bonjour, J'ai séjourné dans cette hôtel en ao..."
3,2,352,experience say it overall,"[experience, say, it, overall, expectations, t...","[experience, overall, wonderful, amazing, grea...","[experience, say, it, overall, expectations, t...","[experience, overall, expectations, impressed,...","[Overall a great experience, it was really a w..."
4,3,309,shower bathroom tub bath,"[shower, bathroom, tub, bath, toilet, marble, ...","[bathrooms, bathroom, shower, bath, bathtub, s...","[shower, bathroom, tub, bath, toilet, marble, ...","[shower, bathroom, tub, bath, toilet, marble, ...","[Bathroom has separate bath and shower, The ba..."


In [58]:
# Load the saved topic-correlation table.
corr_matrix = pd.read_csv('temp_files/lets_fininsh.csv')

# Labels come from the first column, skipping the first data row.
# NOTE(review): this rebinds `topics`, which earlier held the per-document
# topic assignments returned by `model.fit_transform`.
topics = corr_matrix.iloc[1:, 0].to_numpy()

# Numeric body: skip the first data row and the first two columns, keeping one
# 1-D array per remaining row.
rows_array = list(corr_matrix.iloc[1:, 1:].to_numpy()[:, 1:])

# Keep a frame without the two leading columns.
corr_matrix = corr_matrix.drop(columns=corr_matrix.columns[[0, 1]])

print(rows_array)

# corr_matrix.head()

# Drop the second row (index 1)
# df = df.drop(index=[0,1])

[array([1.        , 0.02680252, 0.25574796, 0.174959  , 0.24810014,
       0.17344308, 0.15337189, 0.18543768, 0.26575947, 0.1724201 ,
       0.19642293, 0.02094883, 0.15050889, 0.23618756, 0.21829454,
       0.15073459, 0.14758993, 0.20546146, 0.70584944, 0.16084192,
       0.15662851, 0.16068698, 0.19574302, 0.00958129, 0.17367334,
       0.17669551, 0.0141002 , 0.11414108, 0.11766784, 0.07240703,
       0.1346838 , 0.14514125, 0.17939231, 0.11374096, 0.14698832,
       0.10983341, 0.17145703, 0.07399387, 0.12879951, 0.13661863,
       0.08558059, 0.11867829, 0.0911229 , 0.1753101 , 0.18436915,
       0.15793144, 0.12649345, 0.1536175 , 0.10201046, 0.12872109,
       0.34105262, 0.16939768, 0.15769213, 0.08600853, 0.17121432,
       0.03654593, 0.15243553, 0.1192226 , 0.20329342, 0.12327771,
       0.09687721, 0.07516861, 0.22915702, 0.12383274, 0.07498299,
       0.11502647, 0.06328701, 0.07954869, 0.12834377, 0.11869976,
       0.13734148, 0.17393297, 0.08509667, 0.0971817 , 0.0924

In [65]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import squareform, pdist
import matplotlib.pyplot as plt

# Group topics into a fixed number of clusters from their correlation matrix.
corr_matrix = rows_array

# Build a float ndarray that this cell owns and zero its diagonal on the array
# itself. Writing through `DataFrame.values` only works when pandas happens to
# return a writable view, which is not guaranteed.
corr_values = np.array(corr_matrix, dtype=float)
np.fill_diagonal(corr_values, 0)

# Label rows and columns with the topic names.
corr_df = pd.DataFrame(corr_values, index=topics, columns=topics)

# Step 1: Convert correlation matrix to distance matrix
# NOTE(review): because the correlation diagonal was zeroed above, every
# self-distance here is 1, not 0 -- confirm that this is intended.
distance_matrix = 1 - corr_df

# Step 2: Pairwise distances between the rows of the distance matrix.
# NOTE(review): `pdist` treats each row as an observation vector and measures
# the distance between rows (Euclidean by default); it does not flatten the
# square matrix the way `squareform` would. Behaviour is left unchanged here --
# confirm which of the two was meant.
condensed_dist_matrix = pdist(distance_matrix)

# Step 3: Perform hierarchical clustering
Z = linkage(condensed_dist_matrix, method='ward')

# Step 4: Determine the number of clusters and cut the dendrogram
num_clusters = 10  # For example, if you want 10 categories
clusters = fcluster(Z, num_clusters, criterion='maxclust')

# Step 5: Assign clusters to topics
topic_clusters = pd.Series(clusters, index=corr_df.index)

# Step 6: Group topics by their cluster and output the result with topic names
clustered_topics = topic_clusters.groupby(topic_clusters).apply(lambda x: list(x.index))

# Output the clustering with topic names in each bucket. The loop variable has
# its own name so that it does not overwrite the `topics` label array used
# above; overwriting it made this cell fail when executed a second time.
for cluster_id, cluster_topics in clustered_topics.items():
    print(f"Cluster {cluster_id}: {cluster_topics}")
 

Cluster 1: ['1_de_et_le_est', '11_di_il_la_che', '23_el_que_de_en', '26_lol_cooney_genial_russian', '55_00_ten_10_ouch', '78_highly_recommended_recommend_agno']
Cluster 2: ['28_internet_wifi_free_wi', '34_lobby_gorgeous_is_small', '40_carlton_ritz_charlotte_at', '46_decor_furnishings_style_modern', '48_mandarin_oriental_boston_at', '53_worth_expensive_penny_cheap', '59_parking_valet_35_seems', '60_air_conditioning_conditioner_temperature', '64_star_stars_five_category', '66_check_fast_quick_efficient', '67_mirror_tv_bathroom_built', '71_san_diego_francisco_located', '72_tv_channels_large_hd', '73_waldorf_astoria_towers_grandest', '75_seattle_puget_sound_fairmont', '76_grounds_garden_maintained_beautiful', '79_turn_turndown_down_service', '80_smell_odor_smelled_mildew', '81_dirty_clean_carpets_dusty', '83_ice_water_cold_pack', '84_conrad_shack_shake_new', '85_room_spotless_noticed_huge', '87_clean_spacious_modern_room', '89_slippers_robes_toiletries_sized', '90_hyatt_park_chicago_connec