In [1]:
from bertopic import BERTopic

from topic_modeling.pipeline import train_umap, get_embeddings
from topic_modeling.processing.data_handling import read_data_from_file
from topic_modeling.processing.data_processing import delete_none_docs
from topic_modeling.config.core import config

In [None]:
# Load the dataset named by the BERTopic config; it is passed to `predict` at the end of this cell.
df = read_data_from_file(config.bert_topic_config.training_dataset_name)

def predict(user_data, model_path=None):
    """Run the saved BERTopic model over ``user_data.text`` and return its topic table.

    Parameters
    ----------
    user_data
        Object exposing a ``text`` attribute that supports ``to_list()``
        (the notebook passes the frame returned by ``read_data_from_file``).
    model_path : str, optional
        Location handed to ``BERTopic.load``. When omitted, the path that was
        previously hard-coded in this function is used, so existing
        ``predict(df)`` calls behave as before.

    Returns
    -------
    The object returned by ``model.get_topic_info()`` after fitting on the
    cleaned documents.
    """
    if model_path is None:
        # Original hard-coded location of the trained model (environment-specific).
        model_path = "/teamspace/studios/this_studio/nlp-hr-feedback/training_pipeline/topic_modeling/topic_modeling/trained_model/bert_topic_modeling_v0.0.0"

    docs = delete_none_docs(user_data.text.to_list())

    # NOTE(review): neither result is consumed below — the model is fitted on the
    # raw `docs`. The calls are kept because the helpers are opaque from here and
    # may have side effects; drop them (or pass the embeddings to the model) once
    # that is confirmed.
    corpus_embeddings = get_embeddings(docs=docs)
    reduced_embeddings = train_umap(corpus_embeddings)

    model = BERTopic.load(model_path)

    # NOTE(review): fit_transform fits the loaded model on `docs` again rather than
    # only predicting with it — confirm this is intended for a "predict" helper.
    model.fit_transform(docs)

    # Named `topic_info` (not `df`) so the notebook-global `df` is not shadowed.
    topic_info = model.get_topic_info()
    return topic_info


# Run the helper on the loaded dataset and preview the first rows of the returned topic table.
result = predict(df)
result.head()

In [2]:
# Reload the configured dataset, pass its `text` column through `delete_none_docs`,
# and keep only the first 500 documents for this exploratory run.
df = read_data_from_file(config.bert_topic_config.training_dataset_name)

docs = delete_none_docs(df.text.to_list())[:500]
# Sanity check: number of documents actually kept.
len(docs)


500

In [None]:
# Embed the sampled documents, then pass the embeddings through `train_umap`
# (presumably a UMAP dimensionality reduction, judging by the helper name — confirm).
# `reduced_embeddings` is consumed by the `visualize_documents` cells further down.
corpus_embeddings = get_embeddings(docs=docs)
reduced_embeddings = train_umap(corpus_embeddings)

In [3]:
# NOTE(review): BERTopic is already imported in the first cell; this repeat import is redundant.
from bertopic import BERTopic
# Load the previously saved BERTopic model from an absolute, environment-specific path.
model = BERTopic.load("/teamspace/studios/this_studio/nlp-hr-feedback/training_pipeline/topic_modeling/topic_modeling/trained_model/bert_topic_modeling_v0.0.0")

In [4]:

# Background: https://github.com/MaartenGr/BERTopic/issues/278
# NOTE(review): fit_transform fits the loaded model on `docs` again instead of only
# predicting with it — presumably deliberate given the issue linked above; confirm.
topics, probabilities = model.fit_transform(docs)

In [5]:
# Display the topic id assigned to each document. The -1 entries in the output are
# presumably BERTopic's outlier label — confirm against the BERTopic docs.
topics

[2,
 -1,
 0,
 -1,
 8,
 5,
 7,
 6,
 -1,
 2,
 -1,
 -1,
 8,
 7,
 1,
 7,
 4,
 2,
 1,
 4,
 6,
 3,
 5,
 -1,
 4,
 1,
 2,
 3,
 3,
 3,
 -1,
 8,
 2,
 3,
 3,
 -1,
 3,
 -1,
 5,
 -1,
 3,
 3,
 0,
 5,
 2,
 3,
 0,
 9,
 5,
 8,
 0,
 8,
 3,
 -1,
 5,
 7,
 6,
 -1,
 6,
 -1,
 4,
 8,
 5,
 9,
 4,
 1,
 2,
 -1,
 0,
 3,
 8,
 5,
 7,
 3,
 -1,
 -1,
 7,
 -1,
 4,
 4,
 1,
 0,
 2,
 1,
 5,
 0,
 0,
 1,
 6,
 9,
 0,
 1,
 1,
 0,
 6,
 1,
 7,
 5,
 -1,
 0,
 0,
 -1,
 7,
 4,
 4,
 -1,
 2,
 0,
 2,
 1,
 0,
 4,
 2,
 -1,
 -1,
 -1,
 -1,
 1,
 6,
 -1,
 6,
 -1,
 3,
 -1,
 0,
 2,
 2,
 -1,
 1,
 -1,
 1,
 3,
 0,
 0,
 0,
 0,
 6,
 1,
 0,
 -1,
 1,
 -1,
 -1,
 0,
 -1,
 8,
 5,
 1,
 -1,
 1,
 0,
 -1,
 6,
 2,
 2,
 5,
 2,
 3,
 -1,
 9,
 0,
 0,
 2,
 -1,
 0,
 0,
 3,
 0,
 -1,
 9,
 -1,
 1,
 -1,
 3,
 -1,
 7,
 -1,
 0,
 9,
 -1,
 8,
 -1,
 4,
 -1,
 4,
 1,
 -1,
 7,
 -1,
 -1,
 -1,
 0,
 0,
 -1,
 6,
 5,
 7,
 6,
 -1,
 -1,
 1,
 8,
 2,
 3,
 1,
 -1,
 -1,
 -1,
 6,
 -1,
 -1,
 8,
 2,
 1,
 6,
 2,
 -1,
 -1,
 0,
 3,
 3,
 -1,
 4,
 4,
 3,
 6,
 7,
 2,
 -1,
 5,
 4,
 5,
 0,
 0,
 7,

In [6]:
# Per-topic summary table (the preview below shows Topic, Count, Name, Representation, Representative_Docs).
# NOTE(review): this rebinds `df`, which earlier held the raw dataset.
df = model.get_topic_info()

In [7]:
# Preview the first rows of the topic summary table.
df.head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,132,-1_work_good_working_new,"[work, good, working, new, job, company, time,...",[Quaterly meets in a year about new products l...
1,0,73,0_dislike_like_na_ok,"[dislike, like, na, ok, say, yes, problem, sha...","[Nothing for Dislike, No dislike, Nothing to d..."
2,1,49,1_culture_good_company_team,"[culture, good, company, team, working, work, ...","[Work culture is very good, Over all good work..."
3,2,46,2_management_politics_staff_strategy,"[management, politics, staff, strategy, term, ...",[Slow to responses . Management is unclear as ...
4,3,46,3_bajaj_electricals_electrical_good,"[bajaj, electricals, electrical, good, company...","[Bajaj working culture very good,, I like work..."


In [None]:
# Render the model's document plot, supplying the precomputed `reduced_embeddings`.
model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
# Same call as the previous cell, but the figure is kept in `fig` so it can be exported below.
fig = model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
# Export the figure held in `fig` to a PNG file on disk.
import plotly.io as pio

# Absolute, environment-specific destination for the exported image.
file_path = "/teamspace/studios/this_studio/nlp-hr-feedback/training_pipeline/topic_modeling/topic_modeling/datasets/img.png"

pio.write_image(fig, file=file_path, format="png")