In [6]:
import pandas as pd
import numpy as np
#
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
#

In [7]:
# Load the raw CSV into memory. Rows the parser cannot handle are skipped
# (on_bad_lines="skip") instead of aborting the whole read.
df_input = pd.read_csv(
    'data_2023.csv',
    on_bad_lines="skip",
    low_memory=False,
)

In [8]:
# Build the cleaned comment frame `df` from the raw frame `df_input`.
# Working on a copy keeps `df_input` untouched, so this cell can be re-run
# without reading the CSV again.
df = df_input.copy()
# Drop rows that have no translated comment
df = df.dropna(subset=["Comment_Translation"])
# Keep only the columns used in this notebook
df = df[['CGID', 'Survey_Completed_Date', 'CustomerNumber', 'HeinekenRegion', 'Country', 'CustomerCluster',
         'BusinessSegment', 'CustomerType', 'NetPromoterScore', 'Reasons_List', 'Comment', 'Comment_Translation']]
# Extract months (currently disabled)
#df['Survey_Completed_Date'] = pd.to_datetime(df['Survey_Completed_Date'])
#df['Month'] = pd.DatetimeIndex(df['Survey_Completed_Date']).month
# Filter countries
#df = df[df['Country'] != 'VN']
excluded_countries = [
    'MX',  # Different operating system
    'BI',  # Translation error
    'KH',  # Translation error
]
# .copy() detaches the result from the filtered slice so the column assignment
# below writes to an independent frame (no SettingWithCopyWarning).
df = df[~df['Country'].isin(excluded_countries)].copy()

# Data cleaning
# Convert everything to string type
df['Comment_Translation'] = df['Comment_Translation'].astype('str')
# Exclude purely numeric comments. str.isnumeric() is true for every string
# str.isdigit() accepts, so a single check covers both.
df = df[~df['Comment_Translation'].str.isnumeric()]
# Exclude certain patterns that confuse the model
# (e.g. Nil vs NB (short for Nigerian breweries)), plus the bare '100%' answer
df = df[~df['Comment_Translation'].isin(['100%', 'Nil', 'Nill'])]
# Exclude CSAT
df = df[df['NetPromoterScore'] != -1]
# Remove certain patterns + HTML linebreak.
# The patterns are applied one after another, as regexes, to every column of
# the frame. '=-' has to come before '=': if '=' is stripped first, '=-' can
# never match again and a stray '-' is left in the text.
rmv_lst = ['<br/>', '=-', '=']
for pattern in rmv_lst:
    df = df.replace(pattern, '', regex=True)
# Remove NULL/BLANK comments, including ones that are only whitespace once the
# patterns above have been stripped
df = df[df['Comment_Translation'].str.strip() != '']
#display(df)

# Model

In [16]:
# Configure the BERTopic pipeline step by step and fit it on the cleaned
# translated comments in `df`.

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
#embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Step 2 - Reduce dimensionality
# PCA is used in BERTopic's `umap_model` slot (the variable keeps that name to
# match the keyword). random_state pins the randomized SVD solver PCA may pick
# on its own, so re-running the cell yields the same components.
umap_model = PCA(n_components=2, random_state=42)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(nr_topics=50,
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model                 # Step 5 - Topic representation
)
# BERTopic documents its input as a list of strings. Converting here also drops
# the filtered, non-contiguous index the Series carries after cleaning, so
# nothing inside the model can align on stale row labels.
docs = df['Comment_Translation'].tolist()
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Persist the fitted model under "BERTopic_Model" using pickle serialization.
# NOTE(review): unpickling can execute arbitrary code, so only load this
# artifact from a location you trust.
topic_model.save(
    "BERTopic_Model",
    serialization="pickle",
)

In [9]:
# Build the results frame: a copy of the cleaned data with the model's
# per-document topic assignment appended as a `Topic` column.
output = df.assign(Topic=topic_model.topics_)

### Optimization

In [54]:
# Re-predict a topic for every comment with the fitted model.
temp = output.copy()
cmt = temp['Comment_Translation'].tolist()
# transform() accepts a list of documents, so all comments are predicted in a
# single batched call instead of one model invocation per comment. An empty
# input yields an empty list, as the per-item loop did.
if cmt:
    predicted_topics, _ = topic_model.transform(cmt)
    updated_topic = list(predicted_topics)
else:
    updated_topic = []
# NOTE(review): `updated_topic` is not attached to `output` in the visible
# cells, so the exported CSV does not include it - confirm whether that is intended.

In [56]:
# Export the results frame to CSV without the row index, encoded as UTF-8
# with a byte-order mark ('utf_8_sig').
output.to_csv(
    'TM_Results.csv',
    index=False,
    encoding='utf_8_sig',
)