this notebook is for trying out small BERTopic models (2000 messages each) per different word_counts and then comparing them. If it works, this will be the procedure. 

1. cut df into word count ranges

In [1]:
import pandas as pd
df = pd.read_csv('path/to/textanal_short.csv')


### step 1: cut df into word count ranges

In [9]:
contains_floats_df = df['text_clean'].apply(lambda x: isinstance(x, float)).any()

if contains_floats_df:
    print("The column contains float values.")
    df = df[~df['text_clean'].apply(lambda x: isinstance(x, float))]
else:
    print("No float values in 'text_clean'.")

No float values in 'text_clean'.


In [5]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame and it already includes a 'word_count' column

# Corrected Define the word count bins and their labels
bins = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 130, 150, 170, 200, 250, np.inf]
labels = ['5-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100', '101-110', '111-130', '131-150', '151-170', '171-200', '201-250', '251+']

# Categorize each message into word count ranges
df['word_count_range'] = pd.cut(df['word_count'], bins=bins, labels=labels, right=False)


In [2]:
!pip freeze > requirements.txt

subsamples holds df with the ranges

In [8]:
# Function to sample messages from each range
def sample_messages(df, word_count_range):
    return df[df['word_count_range'] == word_count_range].sample(n=2000, random_state=1)

# Dictionary to hold the subsamples for each word count range
subsamples = {}

for label in labels:
    # Attempt to sample 5,000 messages from each range
    try:
        subsamples[label] = sample_messages(df, label)
    except ValueError:
        # If the range has fewer than 5,000 messages, take all messages from that range
        subsamples[label] = df[df['word_count_range'] == label]
        print(f"Not enough messages in range {label}. Took all available messages.")

# At this point, `subsamples` is a dictionary where each key is a word count range (label)
# and each value is a DataFrame containing the subsample of 2,000 messages for that range.


### step 2 prepare text for BERT and run for each subsample

### step 3 now run BERTopic for each subsample


In [16]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

# Download stopwords and combine them
nltk.download('stopwords')
stopwords_combined = stopwords.words('german') + stopwords.words('french') + \
                      stopwords.words('spanish') + stopwords.words('english')

# Initialize Sentence Transformer Model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare UMAP and HDBSCAN models
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=2, cluster_selection_method='eom')

# The rest of your BERTopic setup remains the same
# Iterate over each subsample and run BERTopic
bertopic_models = {}
for label, df_subsample in subsamples.items():
    # Ensure the text column exists
    if 'text_clean' not in df_subsample.columns:
        print(f"Column 'text_clean' not found in subsample {label}. Skipping...")
        continue
    
    # Encode documents using Sentence Transformer
    documents = df_subsample['text_clean'].tolist()
    document_vectors = model.encode(documents, show_progress_bar=True)
    
    # Initialize CountVectorizer with custom stopwords
    vectorizer = CountVectorizer(stop_words=stopwords_combined)
    
    # Create and fit BERTopic model
    topic_model = BERTopic(language="multilingual", embedding_model=model, umap_model=umap_model,
                           hdbscan_model=hdbscan_model, vectorizer_model=vectorizer, verbose=True)
    
    # Fit the model with the texts and the corresponding embeddings
    topics, probabilities = topic_model.fit_transform(documents, document_vectors)
    
    # Store the model for later access
    bertopic_models[label] = topic_model
    
    # Visualize the top n topics (optional: save the visualization)
    topic_model.visualize_barchart(top_n_topics=10)
    # topic_model.visualize_barchart(top_n_topics=10).write_html(f"{label}_barchart.html")
    
    # Print topic information
    print(topic_model.get_topic_info().head())

# At this point, bertopic_models contains all your models, accessible by their word count range labels.


[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:31:28,874 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:31:44,553 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:31:44,555 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:31:44,652 - BERTopic - Cluster - Completed ✓
2024-03-20 14:31:44,658 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:31:44,838 - BERTopic - Representation - Completed ✓


   Topic  Count                                               Name  \
0     -1    443    -1_channel_german_epochtimesdeutschland_beitrag   
1      0    152                                0_berlin_2022_02_24   
2      1     99            1_unzensiert_unzensiertv2_infoseite_gut   
3      2     95  2_epochtimesdeutschland_beitrag_neuer_deutschland   
4      3     59                   3_tagesschau_trump_biden_politik   

                                      Representation  \
0  [channel, german, epochtimesdeutschland, beitr...   
1  [berlin, 2022, 02, 24, 2021, 01, 10, demo, 03,...   
2  [unzensiert, unzensiertv2, infoseite, gut, net...   
3  [epochtimesdeutschland, beitrag, neuer, deutsc...   
4  [tagesschau, trump, biden, politik, politische...   

                                 Representative_Docs  
0  [our channel in german  bulgarian channel , ou...  
1  [der platz füllt sich!münchen 16.03.2022, demo...  
2  [@unzensiert infoseite @unzensiert / @unzensie...  
3  [warum windräder vi

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:31:47,491 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:32:03,245 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:32:03,248 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:32:03,352 - BERTopic - Cluster - Completed ✓
2024-03-20 14:32:03,357 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:32:03,532 - BERTopic - Representation - Completed ✓


   Topic  Count                                         Name  \
0     -1    334                 -1_channel_com_fairtalk_icic   
1      0    144         0_kurze_videos_finden_aussagekräftig   
2      1    104  1_epochtimesdeutschland_beitrag_neuer_times   
3      2     80            2_ukraine_nato_abonniere_russland   
4      3     77       3_telegram_newsletter_facebook_twitter   

                                      Representation  \
0  [channel, com, fairtalk, icic, news, law, rock...   
1  [kurze, videos, finden, aussagekräftig, unterh...   
2  [epochtimesdeutschland, beitrag, neuer, times,...   
3  [ukraine, nato, abonniere, russland, на, kanal...   
4  [telegram, newsletter, facebook, twitter, quer...   

                                 Representative_Docs  
0  [#einappell jetzt schon hier: vimeo.com/ondema...  
1  [gute nacht. bis morgen . [14643]kurze videos ...  
2  [landwirtschaft: immer mehr biobauern in deuts...  
3  [  eines der regierungsflugzeuge kehrt nach mo...  
4  

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:32:06,349 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:32:21,452 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:32:21,455 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:32:21,540 - BERTopic - Cluster - Completed ✓
2024-03-20 14:32:21,544 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:32:21,826 - BERTopic - Representation - Completed ✓


   Topic  Count                                               Name  \
0     -1    460      -1_schnell_finden_aussagekräftig_unterhaltsam   
1      0    110             0_ukraine_abonniere_russische_russland   
2      1    102              1_unzensierten_posten_überall_gruppen   
3      2     81  2_wirtschaftskrieg_infokrieg_viruskrieg_klagew...   
4      3     55                 3_telegram_twitter_instagram_stuht   

                                      Representation  \
0  [schnell, finden, aussagekräftig, unterhaltsam...   
1  [ukraine, abonniere, russische, russland, über...   
2  [unzensierten, posten, überall, gruppen, neuen...   
3  [wirtschaftskrieg, infokrieg, viruskrieg, klag...   
4  [telegram, twitter, instagram, stuht, kai, fac...   

                                 Representative_Docs  
0  [erinnere jetzt einen menschen an sein licht [...  
1  [   gepanzerter zug yenisei ist gestern im geb...  
2  [geisteskranke welt - und sie machen alle mit ...  
3  [cropenergies ag / 

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:32:25,469 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:32:38,330 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:32:38,333 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:32:38,423 - BERTopic - Cluster - Completed ✓
2024-03-20 14:32:38,428 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:32:38,715 - BERTopic - Representation - Completed ✓


   Topic  Count                                          Name  \
0     -1    382                  -1_mehr_deutschland_на_kanal   
1      0     99        0_ukraine_russland_russische_abonniere   
2      1     89        1_politik_politiker_polizei_polizisten   
3      2     88               2_corona_maßnahmen_pandemie_pcr   
4      3     75  3_unterhaltsam_aussagekräftig_videos_schnell   

                                      Representation  \
0  [mehr, deutschland, на, kanal, diebasis, teleg...   
1  [ukraine, russland, russische, abonniere, ukra...   
2  [politik, politiker, polizei, polizisten, deut...   
3  [corona, maßnahmen, pandemie, pcr, test, unter...   
4  [unterhaltsam, aussagekräftig, videos, schnell...   

                                 Representative_Docs  
0  [ sei du selbst die veränderung, die du dir wü...  
1  [selenskyj hat sein volk über die opferzahlen ...  
2  [der kölner kardinal woelki ist verärgert, wei...  
3  [analyse-dateien inzidenzen 30.04.2021   - inz...

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:32:42,790 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:32:56,595 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:32:56,599 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:32:56,679 - BERTopic - Cluster - Completed ✓
2024-03-20 14:32:56,692 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:32:57,026 - BERTopic - Representation - Completed ✓


   Topic  Count                                          Name  \
0     -1    520                  -1_menschen_kanal_mehr_covid   
1      0    155       0_ukraine_russland_russische_russischen   
2      1     93                  1_corona_dr_maßnahmen_impfen   
3      2     68                    2_euro_000_news_milliarden   
4      3     66  3_kenjebsen_kostenlos_gesundheit_psychologin   

                                      Representation  \
0  [menschen, kanal, mehr, covid, video, 19, bitt...   
1  [ukraine, russland, russische, russischen, put...   
2  [corona, dr, maßnahmen, impfen, med, pandemie,...   
3  [euro, 000, news, milliarden, gold, dollar, ju...   
4  [kenjebsen, kostenlos, gesundheit, psychologin...   

                                 Representative_Docs  
0  [meine lieben bikerfreunde, was sagt ihr dazu?...  
1  [ die ukraine ist nach russland zurückgekehrt ...  
2  [ bild legt nach!  "aus unerreichbaren inziden...  
3  [die fast 50.000 eu-beamten sollen mitten in d...

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:33:01,913 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:33:15,536 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:33:15,539 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:33:15,616 - BERTopic - Cluster - Completed ✓
2024-03-20 14:33:15,621 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:33:16,040 - BERTopic - Representation - Completed ✓


   Topic  Count                                Name  \
0     -1    659        -1_mehr_kanal_russland_wurde   
1      0    104         0_video_videos_youtube_film   
2      1     62    1_euro_bank_milliarden_inflation   
3      2     61  2_channel_german_bulgarian_russian   
4      3     49       3_straße_brandenburger_uhr_00   

                                      Representation  \
0  [mehr, kanal, russland, wurde, ukraine, gnv, c...   
1  [video, videos, youtube, film, livestreams, bi...   
2  [euro, bank, milliarden, inflation, europäisch...   
3  [channel, german, bulgarian, russian, ukrainia...   
4  [straße, brandenburger, uhr, 00, 18, tor, mont...   

                                 Representative_Docs  
0  [wer hat zeit und interesse einen impfbus zu "...  
1  [exklusiv: alle 100 videos! beim @der rote hah...  
2  [spritpreis-schock nach tankrabatt-ende!böses ...  
3  [  photo: geranium-2 barrage munitions hit mil...  
4  [berlin: demonstration für pressefreiheit vor ...  


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:33:21,817 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:33:36,126 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:33:36,128 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:33:36,206 - BERTopic - Cluster - Completed ✓
2024-03-20 14:33:36,214 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:33:36,681 - BERTopic - Representation - Completed ✓


   Topic  Count                                Name  \
0     -1    482         -1_kanal_krieg_wer_menschen   
1      0    300           0_corona_covid_19_impfung   
2      1     67  1_channel_german_bulgarian_russian   
3      2     51       2_video_videos_youtube_natron   
4      3     46          3_epoch_times_direkt_infos   

                                      Representation  \
0  [kanal, krieg, wer, menschen, ukraine, wurde, ...   
1  [corona, covid, 19, impfung, mrna, gesundheit,...   
2  [channel, german, bulgarian, russian, ukrainia...   
3  [video, videos, youtube, natron, dominik, sku,...   
4  [epoch, times, direkt, infos, folgt, artikel, ...   

                                 Representative_Docs  
0  [zwar kein traktor, aber im herzen sind wir da...  
1  [umfrage der us-behörde cdc zeigt, dass mehr a...  
2  [  russian troops destroyed a storage depot of...  
3  [  videos vom aktuellen geschehen    video 1:r...  
4  [5,5 prozent mehr lohn: arbeitgeber und ig met...  


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:33:43,442 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:33:56,773 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:33:56,780 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:33:56,855 - BERTopic - Cluster - Completed ✓
2024-03-20 14:33:56,862 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:33:57,410 - BERTopic - Representation - Completed ✓


   Topic  Count                                       Name  \
0     -1    622               -1_mehr_kanal_video_menschen   
1      0     83           0_telegram_twitter_facebook_musk   
2      1     65  1_corona_newsletter_weiterlesen_abonniert   
3      2     55                2_covid_19_coronavirus_sars   
4      3     49       3_channel_german_bulgarian_ukrainian   

                                      Representation  \
0  [mehr, kanal, video, menschen, us, ukraine, co...   
1  [telegram, twitter, facebook, musk, insoumissi...   
2  [corona, newsletter, weiterlesen, abonniert, g...   
3  [covid, 19, coronavirus, sars, virus, cov, pcr...   
4  [channel, german, bulgarian, ukrainian, russia...   

                                 Representative_Docs  
0  [ alcyon pleyaden favoriten - papst franziskus...  
1  [  présidentielle : les 18 propositions de mél...  
2  [dammbruch!! bewiesen: die corona impfungen si...  
3  [ärzte zeigen ihr gesicht: klares nein zur cov...  
4  [  footage o

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:34:04,393 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:34:18,274 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:34:18,284 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:34:18,359 - BERTopic - Cluster - Completed ✓
2024-03-20 14:34:18,371 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:34:18,921 - BERTopic - Representation - Completed ✓


   Topic  Count                                              Name  \
0     -1    573                       -1_mehr_bitte_ukraine_kanal   
1      0    101                            0_covid_test_corona_dr   
2      1     68  1_report24_unterstüzung_unterstuetzen_verbreiten   
3      2     63                 2_china_us_geopolitik_interessant   
4      3     53                 3_straße_freiheit_protest_münchen   

                                      Representation  \
0  [mehr, bitte, ukraine, kanal, wurde, menschen,...   
1  [covid, test, corona, dr, impfung, 19, grippe,...   
2  [report24, unterstüzung, unterstuetzen, verbre...   
3  [china, us, geopolitik, interessant, sanktione...   
4  [straße, freiheit, protest, münchen, frieden, ...   

                                 Representative_Docs  
0  [allround-tv vor ort: krank nach impfung      ...  
1  [nun endlich bewiesen, prof. drosten hat den p...  
2  [die umverteilung von windstrom aus dem norden...  
3  [abbruchunternehmen habec

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:34:25,759 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:34:38,821 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:34:38,825 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:34:38,913 - BERTopic - Cluster - Completed ✓
2024-03-20 14:34:38,921 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:34:39,550 - BERTopic - Representation - Completed ✓


   Topic  Count                                              Name  \
0     -1    649                      -1_mehr_kanal_menschen_wurde   
1      0    113                0_channel_german_bulgarian_russian   
2      1     52                 1_covid_virus_impfung_coronavirus   
3      2     44  2_report24_unterstüzung_unterstuetzen_informiert   
4      3     38                   3_euro_2021_milliarden_erdbeben   

                                      Representation  \
0  [mehr, kanal, menschen, wurde, на, bitte, ukra...   
1  [channel, german, bulgarian, russian, ukrainia...   
2  [covid, virus, impfung, coronavirus, 19, coron...   
3  [report24, unterstüzung, unterstuetzen, inform...   
4  [euro, 2021, milliarden, erdbeben, prozent, 00...   

                                 Representative_Docs  
0  [reformation 2.0 in magdeburg am dom (29.04.20...  
1  [  footage of the aftermath of the night strik...  
2  [durch die schockierenden fälle von sinusvenen...  
3  [es gibt themen, über die

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:34:46,577 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:34:59,177 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:34:59,184 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:34:59,252 - BERTopic - Cluster - Completed ✓
2024-03-20 14:34:59,265 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:34:59,989 - BERTopic - Representation - Completed ✓


   Topic  Count                                       Name  \
0     -1    560               -1_mehr_kanal_menschen_wurde   
1      0     82         0_channel_german_bulgarian_russian   
2      1     80    1_ukraine_russland_russischen_russische   
3      2     77                 2_opc_vitamin_haut_wirkung   
4      3     73  3_lockdown_mehr_bundesregierung_lockdowns   

                                      Representation  \
0  [mehr, kanal, menschen, wurde, geht, deutschla...   
1  [channel, german, bulgarian, russian, russia, ...   
2  [ukraine, russland, russischen, russische, str...   
3  [opc, vitamin, haut, wirkung, bio, körper, han...   
4  [lockdown, mehr, bundesregierung, lockdowns, b...   

                                 Representative_Docs  
0  [     #freemichaelballweg demo am 9. juli 2022...  
1  [   a captive of the ukrainian armed forces to...  
2  [ heutige bedingungslose unterstützung des wes...  
3  [opc ist ein bestandteil von traubenkernextrak...  
4  [      für n

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:35:07,100 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:35:20,028 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:35:20,030 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:35:20,114 - BERTopic - Cluster - Completed ✓
2024-03-20 14:35:20,118 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:35:20,869 - BERTopic - Representation - Completed ✓


   Topic  Count                                     Name  \
0     -1    615             -1_mehr_menschen_immer_wurde   
1      0     68  0_russland_russischen_ukraine_russische   
2      1     62           1_corona_maßnahmen_covid_daten   
3      2     54            2_video_zwanzig4_dlive_videos   
4      3     46  3_deutschland_deutschen_fraktion_berlin   

                                      Representation  \
0  [mehr, menschen, immer, wurde, kanal, uhr, geh...   
1  [russland, russischen, ukraine, russische, str...   
2  [corona, maßnahmen, covid, daten, 19, pandemie...   
3  [video, zwanzig4, dlive, videos, youtube, movi...   
4  [deutschland, deutschen, fraktion, berlin, fra...   

                                 Representative_Docs  
0  ["endstation karlsruhe .jetzt lasst uns noch '...  
1  [  außenminister der russischen föderation ser...  
2  [absolute zensurgefahr   dieses buch wird wahr...  
3  [  20:iv live wahlparty zur landtagswahl baden...  
4  [wenn das die deutschen 

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:35:27,874 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:35:42,348 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:35:42,350 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:35:42,444 - BERTopic - Cluster - Completed ✓
2024-03-20 14:35:42,452 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:35:43,243 - BERTopic - Representation - Completed ✓


   Topic  Count                                     Name  \
0     -1    601              -1_mehr_menschen_wurde_gibt   
1      0     92       0_channel_german_russian_bulgarian   
2      1     68  1_russischen_ukraine_russische_russland   
3      2     44         2_russland_gas_dollar_milliarden   
4      3     43          3_impfung_honig_lauterbach_eier   

                                      Representation  \
0  [mehr, menschen, wurde, gibt, uhr, immer, buch...   
1  [channel, german, russian, bulgarian, russia, ...   
2  [russischen, ukraine, russische, russland, str...   
3  [russland, gas, dollar, milliarden, industrie,...   
4  [impfung, honig, lauterbach, eier, buch, mediz...   

                                 Representative_Docs  
0  [   infos zum 1. august 2021 in berlin  der 1....  
1  [    the russian armed forces are advancing in...  
2  [die russische  luftwaffe hat einem tag hat 97...  
3  [russland exportiert in eu-länder öl und gas a...  
4  [dieses buch enthält den

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:35:50,364 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:36:01,930 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:36:01,932 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:36:02,035 - BERTopic - Cluster - Completed ✓
2024-03-20 14:36:02,040 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:36:03,021 - BERTopic - Representation - Completed ✓


   Topic  Count                                    Name  \
0     -1    697          -1_mehr_menschen_ukraine_wurde   
1      0    119         0_channel_russian_german_russia   
2      1     63                 1_virus_covid_corona_kw   
3      2     39  2_gesundheit_holistische_heilung_honig   
4      3     34               3_contre_site_plus_macron   

                                      Representation  \
0  [mehr, menschen, ukraine, wurde, kanal, russla...   
1  [channel, russian, german, russia, ukrainian, ...   
2  [virus, covid, corona, kw, 19, pandemie, cov, ...   
3  [gesundheit, holistische, heilung, honig, impf...   
4  [contre, site, plus, macron, facebook, twitter...   

                                 Representative_Docs  
0  [   erst die krankmachende und hochgefährliche...  
1  [     highlights:  g7 finance ministers to app...  
2  [medienstudie: die verengung der welt. zur med...  
3  [  verbotene und gefährliche pestizide in euro...  
4  [  retraites : le gouvernement

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:36:10,287 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:36:20,424 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:36:20,425 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:36:20,512 - BERTopic - Cluster - Completed ✓
2024-03-20 14:36:20,517 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:36:21,322 - BERTopic - Representation - Completed ✓


   Topic  Count                                     Name  \
0     -1    550           -1_ukraine_mehr_wurde_menschen   
1      0     71       0_channel_german_ukrainian_russian   
2      1     55           1_diebasis_grünen_grüne_partei   
3      2     49       2_polizei_stgb_polizisten_verboten   
4      3     39  3_gaza_israel_gazastreifen_israelischen   

                                      Representation  \
0  [ukraine, mehr, wurde, menschen, deutschland, ...   
1  [channel, german, ukrainian, russian, bulgaria...   
2  [diebasis, grünen, grüne, partei, betten, krei...   
3  [polizei, stgb, polizisten, verboten, verbot, ...   
4  [gaza, israel, gazastreifen, israelischen, ham...   

                                 Representative_Docs  
0  [  dringender unterstützungsaufruf   am 01.08....  
1  [   ukrainian militants have shelled a penal c...  
2  [      nrw funkt  partei  diebasis    gründung...  
3  [#politpolizei#polizeigründas wirklich #allerl...  
4  [ eu-mitarbeiter sind wü

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:36:28,270 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:36:38,019 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:36:38,021 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:36:38,093 - BERTopic - Cluster - Completed ✓
2024-03-20 14:36:38,099 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:36:39,044 - BERTopic - Representation - Completed ✓


   Topic  Count                                Name  \
0     -1    520        -1_mehr_wurde_menschen_immer   
1      0    129             0_covid_corona_19_virus   
2      1     84     1_russland_tonnen_ukraine_putin   
3      2     77          2_video_videos_film_google   
4      3     57  3_channel_russian_german_ukrainian   

                                      Representation  \
0  [mehr, wurde, menschen, immer, ukraine, schon,...   
1  [covid, corona, 19, virus, impfung, dr, corona...   
2  [russland, tonnen, ukraine, putin, russischen,...   
3  [video, videos, film, google, telegram, youtub...   
4  [channel, russian, german, ukrainian, forces, ...   

                                 Representative_Docs  
0  [es war mal die kleine güzey,  die lange zeit ...  
1  [geheime cdc-berichte bestätigen, dass 6 milli...  
2  [  rise of russia : american thinker nannte di...  
3  [  bundesregierung muss stellungnahme an un zu...  
4  [     the highlights of the night:  the europe...  


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2024-03-20 14:36:46,165 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-20 14:36:54,951 - BERTopic - Dimensionality - Completed ✓
2024-03-20 14:36:54,953 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-20 14:36:55,021 - BERTopic - Cluster - Completed ✓
2024-03-20 14:36:55,026 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-20 14:36:56,553 - BERTopic - Representation - Completed ✓


   Topic  Count                                             Name  \
0     -1    669                       -1_menschen_uhr_wurde_mehr   
1      0     85                      0_menschen_mehr_immer_leben   
2      1     54        1_virus_covid_coronavirus_schutzmaßnahmen   
3      2     53  2_streitkräfte_richtung_russischen_ukrainischen   
4      3     47                  3_video_youtube_videos_pankalla   

                                      Representation  \
0  [menschen, uhr, wurde, mehr, immer, deutschlan...   
1  [menschen, mehr, immer, leben, viele, welt, tu...   
2  [virus, covid, coronavirus, schutzmaßnahmen, s...   
3  [streitkräfte, richtung, russischen, ukrainisc...   
4  [video, youtube, videos, pankalla, film, demo,...   

                                 Representative_Docs  
0  [ montagstermine für den 14.03.2022 in thüring...  
1  [facebookfund     das hab ich gerade von meine...  
2  [jetzt wird es hart.  die fraktionen der cdu/c...  
3  [     sonderchronik der spezial

### step 3: find similar topics across models

In [19]:
def find_similar_topics(word, topic_model):
    similar_topics, similarity = topic_model.find_topics(word, top_n=5)
    topic_info_df = topic_model.get_topic_info()

    print(f"The top 5 topics relating to '{word}' are: ")
    print("")

    for topic, similarity_score in zip(similar_topics, similarity):
        if topic == -1:
            print("Topic number: -1 (Outliers/Noise)")
            print("Similarity score:", similarity_score)
        else:
            name = topic_info_df[topic_info_df.Topic == topic]['Name'].values[0]
            representation = topic_info_df[topic_info_df.Topic == topic]['Representation'].values[0]
            representative_docs = topic_info_df[topic_info_df.Topic == topic]['Representative_Docs'].values[0]
            print(f"Topic number: {topic}")
            print(f"Name of the topic: {name}")
            print(f"Representative words: {representation}")
            print(f"Representative messages: {representative_docs}")
            print(f"Similarity score: {similarity_score}")
        print("----------------------")

In [20]:
keywords = ["wahl", "politik", "elite", "betrug", "volk", "leute", "unterdrückung", "staat", 
            "gewalt", "machtbegrenzung", "schwarmintelligenz", "freiheit", "basis", "Partei", 
            "system", "plandemie", "basisdemokratie", "Demokratie"]

for word in keywords:
    print(f"\nFinding topics similar to '{word}' across all models:")
    for label, topic_model in bertopic_models.items():
        print(f"\nWord Count Range: {label}")
        find_similar_topics(word, topic_model)
    print("\n" + "="*80)



Finding topics similar to 'wahl' across all models:

Word Count Range: 5-10
The top 5 topics relating to 'wahl' are: 

Topic number: 61
Name of the topic: 61_zeitung_fake_news_verbreitet
Representative words: ['zeitung', 'fake', 'news', 'verbreitet', 'kanzler', 'weitergeben', 'ard20', '2024güzey', 'abgeführt', 'abgesagt']
Representative messages: ['schweizer bauer, eine zeitung welche unabhängig recherchiert!', 'leserbrief, kleine zeitung kärnten, 4. 7. 2021', 'bauern-demo eskaliert: auftritt von özdemir abgesagt   berliner zeitung']
Similarity score: 0.2034912258386612
----------------------
Topic number: 53
Name of the topic: 53_claudio_siber_david_reitschuster
Representative words: ['claudio', 'siber', 'david', 'reitschuster', 'online', 'onlinekanal', 'newskanal', 'faz', 'timeskanal', 'oilprice']
Representative messages: ['   faz onlinekanal david claudio siber', '   reitschuster onlinekanal david claudio siber', 'gefunden bei:reitschuster online   kanal david claudio siber']
Simil