## Save and load topic modeling
- Links: [Medium Blog](https://medium.com/grabngoinfo/topic-modeling-with-deep-learning-using-python-bertopic-cf91f5676504)

# Import libraries

In [8]:
# Import the local package's version string and display it as the cell result.
from topic_modeling import __version__

__version__

''

In [34]:
import pandas as pd
import os

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from umap import UMAP

from topic_modeling.config.core import DATASET_DIR
from topic_modeling.processing.data_handling import read_data_from_file

# Processing Data

In [41]:
# Alternative loading code, currently disabled:
#   train_dataset_dir = str(DATASET_DIR / "Restaurant_Reviews.tsv")
#   df = pd.read_csv(train_dataset_dir, sep='\t')
#
#   train_dataset_dir = str(DATASET_DIR / "training.jsonl")
#   df = pd.read_json(train_dataset_dir, lines=True)

# Load the dataset through the project helper and preview the first rows.
df = read_data_from_file("training.jsonl")
df.head()

Unnamed: 0,text,label
0,Only skill development is good.,Positive
1,"working culture,behavior, work life balance,jo...",Negative
2,Nothing,Positive
3,Company is good but company owners are treatin...,Negative
4,Best company,Positive


In [22]:
# Pull the raw review texts out of the frame as a plain Python list.
docs = df["text"].tolist()

In [24]:
# Drop missing entries so only real documents reach the embedding model.
# `is not None` is the idiomatic identity check; `!= None` goes through `__ne__`,
# which objects can override.
# NOTE: NaN values, if any, are not removed by this filter.
docs = [doc for doc in docs if doc is not None]
print(len(docs))

699


In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer handed to BERTopic for building topic representations.
# It removes English stop words and counts unigrams and bigrams.
# The n-gram range is configured here because BERTopic documents that its own
# `n_gram_range` argument is not used when a custom vectorizer is supplied.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

In [26]:
%%time
# Load the sentence-embedding model by name.
model_embedding = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every document once up front; later cells reuse these embeddings
# for fitting, transforming and the 2-D projection.
corpus_embeddings = model_embedding.encode(docs)

CPU times: user 4.18 s, sys: 134 ms, total: 4.31 s
Wall time: 1.92 s


In [49]:
%%time
# Fit the topic model on the documents, reusing the precomputed embeddings.
model = BERTopic(
    language="english",
    # NOTE: BERTopic documents that `n_gram_range` is not used when a custom
    # `vectorizer_model` is supplied; the effective n-gram range is whatever
    # the vectorizer itself was built with.
    n_gram_range=(1, 2),
    vectorizer_model=vectorizer_model,
    # A fixed random_state makes the UMAP reduction, and therefore the
    # resulting topics, reproducible across runs (UMAP may run single-threaded
    # as a consequence). The other values follow the configuration shown in
    # the BERTopic reproducibility FAQ.
    umap_model=UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    ),
    # Optional tuning knobs, currently disabled:
    #   nr_topics="auto", min_topic_size=2, diversity=0.7,
    #   seed_topic_list=[[...], ...], calculate_probabilities=True
).fit(docs, corpus_embeddings)

CPU times: user 5.62 s, sys: 239 ms, total: 5.85 s
Wall time: 3.22 s


In [50]:
# Assign a topic (and its probability) to each document, reusing the
# precomputed embeddings instead of re-encoding the texts.
topics, probabilities = model.transform(docs, embeddings=corpus_embeddings)

In [51]:
# Per-topic document counts as a DataFrame with "Topic" and "Count" columns.
df_topic_freq = model.get_topic_freq()
# Count real topics only. Topic -1 is the outlier bucket and is not guaranteed
# to be present, so filter it out explicitly rather than subtracting one row.
topics_count = int((df_topic_freq["Topic"] != -1).sum())
df_topic_freq

Unnamed: 0,Topic,Count
1,-1,109
2,0,90
3,1,67
10,2,61
8,3,57
4,4,53
9,5,47
11,6,35
12,7,29
16,8,26


In [52]:
# Display the value stored in topics_count.
topics_count

17

In [53]:
# Render BERTopic's topic visualisation for the fitted model.
model.visualize_topics()

In [54]:
# Bar charts of the top terms per topic, limited to topics_count topics.
model.visualize_barchart(top_n_topics=topics_count)

In [55]:
# Project the document embeddings to 2-D for plotting. A fixed random_state
# keeps the layout identical between runs, so the figure is reproducible.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(corpus_embeddings)
# Plot each document at its projected position using the fitted topic model.
model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)