# BERTopic: Multimodal (Texts + Images)

In [1]:
import sys
import time
from pathlib import Path

import pandas as pd

# Extend the import path before the project-local import below.
sys.path.append('../../../')
from src.helpers.postgres_helpers import execute_sql_select

## 1. Load data

In [2]:
# Where the corpus lives: database name and table name.
database = "telegram"
table_name = "tg_oct_topic_modeling"

# Fetch every column of every row from the table.
query = f"""SELECT * FROM {table_name}"""

# `data` keeps the query result untouched; the following cells work on the copy `df`.
data = execute_sql_select(
    return_result_as_df=True,
    database=database,
    command=query,
)
df = data.copy()

Column names:  ['index', 'channel_name', 'channel_id', 'message_id', 'text', 'cleaned_text', 'msg_date', 'media_id', 'msg_type', 'img_id', 'file_path', 'dhash', 'phash']
Connection to DB closed


## 2. Drop rows missing cleaned-text or dhash (base corpus; deduplication currently disabled)

In [4]:
# Deduplication is switched off; the original call is kept here for reference:
# df = df.drop_duplicates('cleaned_text').drop_duplicates('dhash')

# Keep only the rows in which both the cleaned text and the dhash are present.
df = df.dropna(
    how='any',
    subset=['cleaned_text', 'dhash'],
)
len(df)

40110

## Select docs and images

In [5]:
# Two lists built from the same frame in the same row order:
# image paths from `file_path`, texts from `cleaned_text`.
images = list(df["file_path"].values)
docs = list(df["cleaned_text"].values)

## Initialize models

In [7]:
from bertopic import BERTopic
from bertopic.representation import VisualRepresentation
#from sentence_transformers import SentenceTransformer
from bertopic.backend import MultiModalBackend
from umap import UMAP


# Embedding backend using the 'clip-ViT-B-32' checkpoint, processed in batches of 32
embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)

# Checkpoint handed to VisualRepresentation as its image-to-text model
image_to_text_model = "nlpconnect/vit-gpt2-image-captioning"

# Additional topic representation, registered under the "Visual_Aspect" key
representation_model = dict(
    Visual_Aspect=VisualRepresentation(image_to_text_model=image_to_text_model),
)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


## 3. Train models with min_topic_sizes 10, 20, 50

In [8]:
# Set seed in UMAP model for reproducibility
umap_model = UMAP(
    random_state=42
)

MIN_TOPIC_SIZES = [10, 20, 50]

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folders exist *before* spending time on training.
Path("models").mkdir(parents=True, exist_ok=True)
Path("bertopic_results").mkdir(parents=True, exist_ok=True)


for VAL in MIN_TOPIC_SIZES:

    # One model per candidate min_topic_size, trained on texts AND images
    topic_model = BERTopic(
        embedding_model=embedding_model,
        representation_model=representation_model,
        umap_model=umap_model,
        min_topic_size=VAL,
    )

    print(f"Generating topics with min_topic_size = {VAL}.")

    t0 = time.perf_counter()

    # Train model
    # NOTE(review): no `embeddings=` argument is passed, so the embeddings are
    # presumably recomputed on every iteration although docs/images never
    # change. Precomputing them once and passing them in could save most of the
    # runtime - confirm against the BERTopic multimodal API.
    topics, probs = topic_model.fit_transform(documents=docs, images=images)
    print("Time elapsed: ", time.strftime("%H:%M:%S", time.gmtime((time.perf_counter() - t0))))

    topic_info = topic_model.get_topic_info()
    print(f"{len(topic_info)} topics found.")
    print("Time elapsed: ", time.strftime("%H:%M:%S", time.gmtime((time.perf_counter() - t0))))

    # save model using safetensors
    version = f"v6_{VAL}"
    model_name = f"bertopic_multimodal_{version}"
    topic_model.save(f"models/{model_name}", serialization="safetensors", save_ctfidf=True)

    # `topic_info` from above is reused; nothing between the two points changes the model.
    topic_info.to_csv(f"bertopic_results/{model_name}_topic_info.csv")

    # NOTE(review): `get_document_info` is documented to take the documents the
    # model was trained on, but it receives the image paths here, so the
    # "Document" column of the CSV holds file paths rather than texts. Left
    # unchanged because downstream consumers may rely on that layout - confirm.
    docs_topic_info = topic_model.get_document_info(images)
    docs_topic_info.to_csv(f"bertopic_results/{model_name}_docs_topic_info.csv")

Token indices sequence length is longer than the specified maximum sequence length for this model (103 > 77). Running this sequence through the model will result in indexing errors


Generating topics with min_topic_size = 10.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Time elapsed:  00:16:05
483 topics found.
Time elapsed:  00:16:05
Generating topics with min_topic_size = 20.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Time elapsed:  00:15:25
211 topics found.
Time elapsed:  00:15:25
Generating topics with min_topic_size = 50.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Time elapsed:  00:15:36
85 topics found.
Time elapsed:  00:15:36
