# BERTopic: Text only

In [1]:
import sys
sys.path.append('../../../')
import time
import pandas as pd

from src.helpers.postgres_helpers import execute_sql_select

## 1. Load prepared data from DB

In [2]:
table_name = f"tg_oct_topic_modeling"
database = "telegram"

query = f"""SELECT * FROM {table_name}"""

data = execute_sql_select(command=query, database=database, return_result_as_df=True)
df = data.copy()

Column names:  ['index', 'channel_name', 'channel_id', 'message_id', 'text', 'cleaned_text', 'msg_date', 'media_id', 'msg_type', 'img_id', 'file_path', 'dhash', 'phash']
Connection to DB closed


In [3]:
len(df)

40110

In [4]:
len(df.channel_name.unique())

571

## Drop duplicate texts

In [7]:
df = df.drop_duplicates('cleaned_text')
len(df)

36717

In [8]:
# Select docs 
docs = list(df.cleaned_text.values)
print(len(docs))

36717


## 3. Train model

### Deduplication by text, min_topic_sizes = 10,20,50

In [8]:
from bertopic import BERTopic
from umap import UMAP

# Set seed in UMAP model for reproducibility
umap_model = UMAP(
    random_state=42
)

MIN_TOPIC_SIZES = [10, 20, 50]

for VAL in MIN_TOPIC_SIZES: 
    # Train our model with texts only
    topic_model = BERTopic(
        language='multilingual', 
        umap_model=umap_model, 
        min_topic_size = VAL
    ) 

    print(f"Generating topics with min_topic_size = {str(VAL)}.")
    
    t0 = time.perf_counter()  
    # Train model 
    topics, probs = topic_model.fit_transform(docs)
    topic_info = topic_model.get_topic_info()
    print(f"{len(topic_info)} topics found.")
    print("Time elapsed: ", time.strftime("%H:%M:%S", time.gmtime((time.perf_counter() - t0))))

    # save model using safetensors
    version = f"v6_dedup_{str(VAL)}"
    model_name = f"bertopic_text_{version}"
    topic_model.save(f"models/{model_name}", serialization="safetensors", save_ctfidf=True)

    topic_info = topic_model.get_topic_info()
    topic_info.to_csv(f"bertopic_results/{model_name}_topic_info.csv")

    docs_topic_info = topic_model.get_document_info(docs)
    docs_topic_info.to_csv(f"bertopic_results/{model_name}_docs_topic_info.csv")

Generating topics with min_topic_size = 10.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

447 topics found.
Time elapsed:  00:01:25
Generating topics with min_topic_size = 20.
168 topics found.
Time elapsed:  00:01:01
Generating topics with min_topic_size = 50.
9 topics found.
Time elapsed:  00:01:01
