# Libraries and Functions

In [128]:
from utils import * 

import numpy as np
import pandas as pd
from pprint import pprint
import os


# Models
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Coherence Scores
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Widen pandas' display limits so large topic tables render in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# `pd.option_context(...)` only takes effect inside a `with` block; calling it
# bare was a no-op, so the column width is set globally like the options above.
pd.set_option('display.max_colwidth', 500)

# Enable logging for gensim - optional
# The root logger emits "time : level : message" records at ERROR level and above.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Suppress DeprecationWarning messages so they do not clutter cell output.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


# Read the notebook settings from config.yaml. get_config is not defined in
# this notebook -- presumably it comes from `from utils import *` (verify).
# Later cells read the input CSV paths from config['csv_input_local'].
config = get_config('config.yaml')

In [194]:
# Adapted from https://github.com/MaartenGr/BERTopic/pull/95
# with minor modifications.

def preprocess_text(documents):
    r"""Normalise raw documents before tokenisation.

    Each document is processed in this order:
        * lower-cased
        * every "\n" and "\t" replaced by a single space
        * every character other than A-Z, a-z, 0-9 and space removed
        * a document that ends up as the empty string becomes "emptydoc"

    Returns a new list with one cleaned string per input document.
    """
    non_alnum = re.compile(r'[^A-Za-z0-9 ]+')

    def _clean(text):
        # Same stage order as a stage-by-stage pass: lower, \n, \t, strip symbols.
        text = text.lower().replace("\n", " ").replace("\t", " ")
        text = non_alnum.sub('', text)
        # Only the exactly-empty string is replaced; whitespace-only text is kept.
        return text if text != "" else "emptydoc"

    return [_clean(doc) for doc in documents]

def get_coherence_score(documents, bertopic_model, coherence=None):
    """Compute a gensim coherence score for the topics of a BERTopic model.

    Steps:
        * Clean the documents with ``preprocess_text``
        * Tokenize them with the default ``CountVectorizer`` tokenizer
        * Build a gensim dictionary and bag-of-words corpus from the tokens
        * Collect the words of every non-outlier topic and score them with
          ``CoherenceModel``

    Parameters:
        documents: iterable of raw document strings to score the topics
            against.
        bertopic_model: BERTopic model whose ``get_topics()`` mapping
            (topic id -> list of ``(word, score)`` pairs) supplies the topics.
        coherence: name of the gensim coherence measure; ``None`` selects
            ``'c_v'``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    if coherence is None:
        coherence = 'c_v'

    cleaned_documents = preprocess_text(documents)

    # Only the tokenizer is needed here. It is derived from the vectorizer's
    # token pattern, so the vectorizer does not have to be fitted first.
    vectorizer_model = CountVectorizer(ngram_range=(1, 1))
    tokenizer = vectorizer_model.build_tokenizer()

    tokens = [tokenizer(doc) for doc in cleaned_documents]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Take the topic ids from the model itself rather than from a global
    # `topics` list left over from an earlier fit_transform call, and skip
    # only the outlier topic (-1) instead of assuming that it always exists.
    topic_words_ = [
        [word for word, _ in word_scores]
        for topic_id, word_scores in sorted(bertopic_model.get_topics().items())
        if topic_id != -1
    ]

    coherence_model = CoherenceModel(topics=topic_words_,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence=coherence)

    return coherence_model.get_coherence()

# BOLT

## P1

In [197]:
# Load the BOLT P1 review dataset (the CSV path is read from the config)
import pandas as pd
df = (
    pd.read_csv(config['csv_input_local']['bolt_apple_google_p1'], index_col=0)
    .reset_index(drop=True)
)
reviews = df['review'].tolist()
# reviews = reviews[:500]  # uncomment to work on a small sample while testing
len(reviews)

40365

In [138]:
# %%time
# from bertopic import BERTopic

# # topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
# # topics, probs = topic_model.fit_transform(reviews)
# topic_model = BERTopic(language="english", nr_topics=20, low_memory=True, verbose=True)
# topics, _ = topic_model.fit_transform(reviews)

Batches:   0%|          | 0/1262 [00:00<?, ?it/s]

2021-08-04 13:02:49,149 - BERTopic - Transformed documents to Embeddings
2021-08-04 13:03:14,531 - BERTopic - Reduced dimensionality with UMAP
2021-08-04 13:03:18,536 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-08-04 13:04:17,529 - BERTopic - Reduced number of topics from 533 to 21


Wall time: 6min 37s


In [139]:
# # Save model
# # topic_model.save("my_model.pt")
# topic_model.save("bert_topic_model/bolt_model_p1.pt", save_embedding_model=True)


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [196]:
# Load the BOLT P1 BERTopic model from disk (same path the commented-out
# save cell above writes to).
my_model = BERTopic.load("bert_topic_model/bolt_model_p1.pt")

In [200]:
# freq = my_model.get_topic_info(); freq.head(50)

In [142]:
# for i in range(0,len(freq)-1):
#   # print(i)
#   print('topic ' + str(i))
#   print(my_model.get_topic(i))  # Select the most frequent topic

In [199]:
# my_model.visualize_topics()

In [202]:
# my_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

### Get Coherence Score

In [144]:
# Coherence of the loaded P1 model's topics over `reviews`; no measure is
# passed, so get_coherence_score falls back to 'c_v'.
get_coherence_score(reviews, my_model)

0.4410106989450021

## P2

In [188]:
# Load the BOLT P2 review dataset (the CSV path is read from the config)
import pandas as pd
df = (
    pd.read_csv(config['csv_input_local']['bolt_apple_google_p2'], index_col=0)
    .reset_index(drop=True)
)
reviews = df['review'].tolist()
# reviews = reviews[:500]  # uncomment to work on a small sample while testing
len(reviews)

17930

In [189]:
# %%time
# from bertopic import BERTopic

# # topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
# # topics, probs = topic_model.fit_transform(reviews)
# topic_model = BERTopic(language="english", nr_topics=20, low_memory=True, verbose=True)
# topics, _ = topic_model.fit_transform(reviews)

Batches:   0%|          | 0/561 [00:00<?, ?it/s]

2021-08-04 13:21:35,143 - BERTopic - Transformed documents to Embeddings
2021-08-04 13:21:47,190 - BERTopic - Reduced dimensionality with UMAP
2021-08-04 13:21:50,221 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-08-04 13:22:13,817 - BERTopic - Reduced number of topics from 264 to 21


Wall time: 2min 31s


In [190]:
# # Save model
# topic_model.save("bert_topic_model/bolt_model_p2.pt", save_embedding_model=True)


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [191]:
# Load the BOLT P2 BERTopic model from disk (same path the commented-out
# save cell above writes to).
my_model = BERTopic.load("bert_topic_model/bolt_model_p2.pt")

In [155]:
# freq = my_model.get_topic_info(); freq.head(50)

Unnamed: 0,Topic,Count,Name
0,-1,4366,-1_this_my_driver_that
1,0,1805,0_drivers_friendly_good_driver
2,1,1142,1_service_code_promo_app
3,2,891,2_uber_taxify_drivers_app
4,3,874,3_time_minutes_waiting_arrival
5,4,790,4_great_its_think_grace
6,5,759,5_prices_affordable_discounts_price
7,6,748,6_perfect_experience_love_great
8,7,732,7_service_exceptional_great_excellent
9,8,686,8_ride_rides_enjoyed_enjoy


In [186]:
# my_model.visualize_topics()

In [None]:
# my_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

### Get Coherence Score

In [157]:
# Coherence of the loaded P2 model's topics over `reviews`; no measure is
# passed, so get_coherence_score falls back to 'c_v'.
get_coherence_score(reviews, my_model)

0.4151859968841558

## P3

In [171]:
# Load the BOLT P3 review dataset (the CSV path is read from the config)
import pandas as pd
df = (
    pd.read_csv(config['csv_input_local']['bolt_apple_google_p3'], index_col=0)
    .reset_index(drop=True)
)
reviews = df['review'].tolist()
# reviews = reviews[:500]  # uncomment to work on a small sample while testing
len(reviews)

10785

In [61]:
# %%time
# from bertopic import BERTopic

# # topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
# # topics, probs = topic_model.fit_transform(reviews)
# topic_model = BERTopic(language="english", nr_topics=20, low_memory=True, verbose=True)
# topics, _ = topic_model.fit_transform(reviews)

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

2021-08-04 12:24:15,634 - BERTopic - Transformed documents to Embeddings
2021-08-04 12:24:21,703 - BERTopic - Reduced dimensionality with UMAP
2021-08-04 12:24:22,132 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-08-04 12:24:31,458 - BERTopic - Reduced number of topics from 103 to 21


Wall time: 1min 43s


In [173]:
# # Save model
# # topic_model.save("my_model.pt")
# topic_model.save("bert_topic_model/bolt_model_p3.pt", save_embedding_model=True)

In [172]:
# Load the BOLT P3 BERTopic model from disk (same path the commented-out
# save cell above writes to).
my_model = BERTopic.load("bert_topic_model/bolt_model_p3.pt")

In [66]:
# my_model.topics

{-1: [('app', 0.018353204641459606),
  ('my', 0.016679866022725625),
  ('driver', 0.015753801934949206),
  ('on', 0.01485196499772251),
  ('your', 0.012536687454366514),
  ('service', 0.012335311559943295),
  ('ride', 0.012250616571304954),
  ('drivers', 0.012044195580789677),
  ('trip', 0.011666838177053471),
  ('be', 0.011023728757457999)],
 0: [('drivers', 0.06191044168497363),
  ('friendly', 0.0500010120485949),
  ('good', 0.046331205045209205),
  ('nice', 0.030445943712041762),
  ('driver', 0.027910788752815312),
  ('cars', 0.021509001279395097),
  ('safe', 0.01866213358189536),
  ('clean', 0.017881654417416598),
  ('excellent', 0.017044106172922076),
  ('polite', 0.014187469581955218)],
 1: [('app', 0.05698491537237867),
  ('drivers', 0.048502691102256246),
  ('good', 0.03560001868590272),
  ('friendly', 0.023154923412571426),
  ('awesome', 0.0222456674635554),
  ('nice', 0.01989353292091475),
  ('cars', 0.014434753055681973),
  ('driver', 0.014255213811518399),
  ('easy', 0.0141

In [174]:
# freq = my_model.get_topic_info(); freq.head(50)

In [175]:
# for i in range(0,len(freq)-1):
#   # print(i)
#   print('topic ' + str(i))
#   print(my_model.get_topic(i))  # Select the most frequent topic

In [176]:
# my_model.visualize_topics()

In [177]:
# my_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

### Get Coherence Score

In [117]:
# Coherence of the loaded P3 model's topics over `reviews`; no measure is
# passed, so get_coherence_score falls back to 'c_v'.
get_coherence_score(reviews, my_model)

0.4518213156209174