In [1]:
import pymongo
import re
from bertopic import BERTopic
from scipy.spatial import distance
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import numpy
# German stop-word list from NLTK; it is passed as `stop_words` to the
# CountVectorizer used when fitting the topic model further down.
german_stop_words = stopwords.words('german')

In [2]:
# Connect to the MongoDB server on localhost:27017; every collection used in
# this notebook is reached through this client.
client = pymongo.MongoClient("mongodb://localhost:27017/")

# 1. Get the Data
The overall idea is to create a topic model over a set of channels, so that we can then assign each channel a vector representing the relative frequency of its topics. To do this, we first have to get the data from the database.

### Get all the channels
First of all we want to know which channels exist, such that we can then query all messages for a given channel.

In [3]:
# The `channels` collection of the `telegram` database.
channels = client.telegram.channels

def get_list_of_all_channel_names():
    """Return the names of all channels whose language is "de" or None.

    Only the `channel` field is fetched from each matching document.

    Returns:
        list: the `channel` value of every matching document, in cursor order.
    """
    language_filter = {"$or": [{"language": "de"}, {"language": None}]}
    name_only = {"_id": 0, "channel": 1}
    return [doc['channel'] for doc in channels.find(language_filter, name_only)]

So let us see some channels...

In [5]:
# Fetch the channel names once (reused by later cells) and show the total
# together with a small sample.
channel_names = get_list_of_all_channel_names()
print('Number of channels: ', len(channel_names))
print("First ten channels: ", channel_names[:10])

Number of channels:  46275
First ten channels:  ['chrisaresoffiziell', 'attilahildmann', 'insvenswelt', 'arcadimagazin', 'achseostwest', 'connectivevents', 'antispiegel', 'evahermanoffiziell', 'oliverjanich', 'compactmagazin']


### Get messages of a particular channel
In order to perform topic modelling on a set of channels, we have to get the messages of these channels. As we are only interested in German text, we keep only the German messages.
To improve the topic modelling, we replace all URLs by \<URL\> and all usernames by \<USERNAME\>.

In [6]:
def replace_URL(string):
    """Replace every http:// or https:// URL in `string` with ``<URL>``.

    A URL is taken to be ``http://`` or ``https://`` followed by the longest
    run of non-whitespace characters.

    Args:
        string: the text to process.

    Returns:
        str: the text with each URL replaced by the literal ``<URL>``.
    """
    # Raw string: in a normal string literal '\S' is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'https?://\S+', '<URL>', string)

In [7]:
# Quick check: both the http and the https link should come back as <URL>.
string = "Das ist ein Beispiel mit einer URL http://beispiel.org oder in der secure Variante https://beispiel.com"
print(replace_URL(string))

Das ist ein Beispiel mit einer URL <URL> oder in der secure Variante <URL>


In [8]:
def replace_username(string):
    """Replace every ``@mention`` in `string` with ``<USERNAME>``.

    A mention is an ``@`` followed by the longest run of non-whitespace
    characters. Note that this also matches the ``@domain`` part of an e-mail
    address, because the pattern does not require the ``@`` to start a word.

    Args:
        string: the text to process.

    Returns:
        str: the text with each mention replaced by the literal ``<USERNAME>``.
    """
    # Raw string: in a normal string literal '\S' is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'@\S+', '<USERNAME>', string)

In [9]:
# Quick check: both mentions should come back as <USERNAME>.
string = "Folgt alle @ATTILAHILDMANN oder dem neuen Account @ATTILAHILDMANN_2"
print(replace_username(string))

Folgt alle <USERNAME> oder dem neuen Account <USERNAME>


In [10]:
def replace_URL_and_username(string):
    """Apply both placeholder substitutions to `string`.

    URLs are replaced first, then usernames are replaced in the result.

    Args:
        string: the text to process.

    Returns:
        The text after `replace_URL` followed by `replace_username`.
    """
    without_urls = replace_URL(string)
    return replace_username(without_urls)

In [11]:
# The `messages` collection of the `telegram` database.
messages = client.telegram.messages


def get_messages_from_channel(channel_name, max_number_of_messages=100000):
    """Return the preprocessed German message texts of one channel.

    Queries the `messages` collection for documents whose `channel_name`
    equals `channel_name` and whose `language` is "de", fetching only the
    `text` field. Messages without a text are skipped; every remaining text is
    passed through `replace_URL_and_username`.

    Args:
        channel_name: value to match against the `channel_name` field.
        max_number_of_messages: passed to the query as `limit`.

    Returns:
        list: the preprocessed texts, in cursor order. The list can be shorter
        than the number of fetched documents because text-less messages are
        dropped after the limit has been applied.
    """
    query = {"channel_name": channel_name, "language": "de"}
    cursor = messages.find(query, {"_id": 0, "text": 1}, limit=max_number_of_messages)
    # .get(): a projected document that has no "text" field comes back without
    # that key, so index access would raise KeyError; treat it like a null text.
    return [
        replace_URL_and_username(text)
        for text in (doc.get('text') for doc in cursor)
        if text is not None
    ]

So let us see some messages of one particular channel...

In [12]:
# Sample at most 300 messages of one channel and show a few of them.
channel_messages = get_messages_from_channel("connectivevents", 300);
print('Number of messages: ', len(channel_messages))
print('First three messages: ', channel_messages[:3])

Number of messages:  300
First three messages:  ["LOCKDOWN FOREVER\n❓\n❓\n❓\n🔺\n Bundesländer einigen sich bei Telefonschalte auf Verlängerung des Lockdowns\n🔺\n Am Samstagnachmittag einigten sich die Länder in einer Telefonschalte zur Vorbereitung der kommenden Konferenz der Ministerpräsidenten mit der Kanzlerin darauf, den aktuellen Lockdown über den 10. Januar hinaus zu verlängern. \n🔺\n \nDie Chefs der Staatskanzleien waren sich allerdings über die Dauer der Verlängerung nicht einig\n❕\n🔺\n Die Chefs der Staatskanzleien hätten sich bei einer Telefonkonferenz am Samstagnachmittag nicht einigen können, ob die Verlängerung zunächst für zwei oder drei Wochen beschlossen werden sollte, berichtet die Frankfurter Allgemeine Sonntagszeitung\nNoch Fragen?\n \n🤬\nHier den vollständigen Artikel lesen\n👇\n<URL>\nMehr kostenfreie Info's und Enthüllungen:\n👉\n \n<URL>", '😅\n😂\n mehr zum lachen & lustig sein auf: \n<URL>', '😅\n😂\n mehr zum lachen & lustig sein auf: \n<URL>']


### Get messages of a set of channels
We want to create a topic model over a set of channels, so we have to collect messages from these channels. We have to make sure that we sample the same amount of messages from each channel.

In [28]:
# A channel with fewer than MIN_NUMBER_OF_MESSAGES messages is not used.
MIN_NUMBER_OF_MESSAGES = 300

def get_messages_from_channels(channels, verbose=False, max_number_of_messages=500):
    """Collect the same number of messages from every sufficiently large channel.

    Each channel is loaded with `get_messages_from_channel`. Channels that
    yield fewer than MIN_NUMBER_OF_MESSAGES messages are dropped. The per-channel
    cap shrinks to the smallest message count seen among the kept channels, and
    at the end every kept channel contributes exactly that many messages (the
    first ones of its list).

    Args:
        channels: iterable of channel names.
        verbose: if true, print a progress line for every tenth channel.
        max_number_of_messages: initial upper bound on messages per channel
            (previously hard-coded to 500). Must be a positive integer; if it
            is below MIN_NUMBER_OF_MESSAGES no channel can qualify.

    Returns:
        tuple: (all_messages, channels_with_enough_messages) where
        `all_messages` is the flat list of sampled messages and
        `channels_with_enough_messages` lists the kept channels in input order.

    Raises:
        ValueError: if `max_number_of_messages` is smaller than 1.
    """
    if max_number_of_messages < 1:
        raise ValueError('max_number_of_messages must be a positive integer')

    channel_to_msgs_dict = {}
    for idx, channel in enumerate(channels):
        # The current (possibly already reduced) cap is used as the query limit.
        msg_list = get_messages_from_channel(channel, max_number_of_messages)
        nbo_messages = len(msg_list)
        if nbo_messages >= MIN_NUMBER_OF_MESSAGES:
            channel_to_msgs_dict[channel] = msg_list
            max_number_of_messages = min(max_number_of_messages, nbo_messages)

        if verbose and idx % 10 == 0:
            print('Loaded ' + str(idx) + ' channels')

    # Truncate every kept channel to the final cap so all contribute equally.
    all_messages = []
    for msg_list in channel_to_msgs_dict.values():
        all_messages.extend(msg_list[:max_number_of_messages])

    return all_messages, list(channel_to_msgs_dict)

In [22]:
# Sample messages from the first 1000 channel names; `remaining_channels` are
# the channels that had enough messages to be kept.
all_messages, remaining_channels = get_messages_from_channels(channel_names[:1000], verbose=True)
print('Number of messages: ', len(all_messages))
print('Remaining channels: ', remaining_channels)

Loaded 0 channels
Loaded 10 channels
Loaded 20 channels
Loaded 30 channels
Loaded 40 channels
Loaded 50 channels
Loaded 60 channels
Loaded 70 channels
Loaded 80 channels
Loaded 90 channels
Loaded 100 channels
Loaded 110 channels
Loaded 120 channels
Loaded 130 channels
Loaded 140 channels
Loaded 150 channels
Loaded 160 channels
Loaded 170 channels
Loaded 180 channels
Loaded 190 channels
Loaded 200 channels
Loaded 210 channels
Loaded 220 channels
Loaded 230 channels
Loaded 240 channels
Loaded 250 channels
Loaded 260 channels
Loaded 270 channels
Loaded 280 channels
Loaded 290 channels
Loaded 300 channels
Loaded 310 channels
Loaded 320 channels
Loaded 330 channels
Loaded 340 channels
Loaded 350 channels
Loaded 360 channels
Loaded 370 channels
Loaded 380 channels
Loaded 390 channels
Loaded 400 channels
Loaded 410 channels
Loaded 420 channels
Loaded 430 channels
Loaded 440 channels
Loaded 450 channels
Loaded 460 channels
Loaded 470 channels
Loaded 480 channels
Loaded 490 channels
Loaded 500 

# 2. Fit the model
In the second step we can use the collected messages to create a topic model

In [23]:
# Unigram-only CountVectorizer (ngram_range=(1, 1)) using the German stop-word
# list; it is handed to BERTopic as its vectorizer.
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words=german_stop_words)
# nr_topics="auto" and calculate_probabilities=False are BERTopic settings;
# see the BERTopic documentation for their exact semantics.
model = BERTopic(language = "german", nr_topics="auto", calculate_probabilities = False, vectorizer_model=vectorizer_model, verbose=True)
# Fit the model on all sampled messages and keep the per-message results.
topics, probabilities = model.fit_transform(all_messages)
# Persist the fitted model under the name 'model_1000'.
model.save('model_1000')

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=4819.0), HTML(value='')))




2021-05-03 19:17:31,149 - BERTopic - Transformed documents to Embeddings
2021-05-03 19:19:10,521 - BERTopic - Reduced dimensionality with UMAP
2021-05-03 19:19:28,797 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-05-03 20:11:48,037 - BERTopic - Reduced number of topics from 2109 to 2013
  self._set_arrayXarray(i, j, x)


In [24]:
# Overview of the fitted topics (columns Topic, Count, Name in the output below).
print(model.get_topic_info())

      Topic  Count                                               Name
0        -1  75101      -1_lauterbach_chlordioxid_kanzlerin_thüringen
1         3   8851  3_ignazbearth_unblogdlive_gestrigen_einsteigerset
2       339   2681     339_wiener_salzburg_österreichischen_hauptchat
3       769    821  769_amtsenthebungsverfahren_republikaner_impea...
4       563    807       563_chinas_chinesischen_chinesische_chinesen
...     ...    ...                                                ...
1980   1009     10  1009_mikrogeräte_sternförmige_darmschleimhaut_...
1981   2000     10  2000_abstimmung_wahlbeteiligung_referendum_hun...
1982   1999     10             1999_gefährten_lucky_stolz_liebevoller
1983   1003     10  1003_immobilieninvestment_oportunity_beraterst...
2012   2107     10  2107_biermarken_mitschläger_kalchreuth_geschle...

[2013 rows x 3 columns]


In [27]:
# Inspect the (word, score) pairs of a single topic, here topic 339.
model.get_topic(339)

[('wiener', 0.0043820582962283096),
 ('salzburg', 0.003823720394698017),
 ('österreichischen', 0.003583943778832602),
 ('hauptchat', 0.003500585103270566),
 ('österreichische', 0.002723549814401268),
 ('österreicher', 0.002659468120426261),
 ('müllner', 0.0025520203967139784),
 ('innsbruck', 0.0020280766748765394),
 ('ungarn', 0.002026565838617191),
 ('abgeriegelt', 0.0020156541320707233)]

# 3. Get the topic distribution of the channels
Now that we have trained the model, we can predict the topic for each message. To compare channels, we compute the topic distribution of each channel. This means that we predict the topic for each message in the channel and then use a vector containing the relative frequency of the topics as the topic representation of the channel.

In [31]:
# First of all we load our model. Several models have been trained:
#   - model_800:  trained on the first 800 channels, nr_topics set to 200
#   - model_1500: trained on the first 1500 channels, nr_topics=200,
#                 +link_preview +link_preview_title
# NOTE: this rebinds `model`, replacing any model fitted earlier in the session.
model = BERTopic.load('model_1500')

In [20]:
def get_topic_distribution(topics_per_msg, no_topics):
    """Return the relative frequency of each topic among the given messages.

    Messages assigned to topic -1 are ignored. The counts of the remaining
    topics are normalised so that they sum to 1.

    Args:
        topics_per_msg: iterable of integer topic ids, one per message.
        no_topics: length of the returned vector; every topic id other than -1
            must be a valid index into a vector of this length.

    Returns:
        numpy.ndarray: vector of length `no_topics` holding the relative topic
        frequencies. If no message has a topic other than -1 (including empty
        input) the all-zero vector is returned instead of dividing by zero.

    Raises:
        IndexError: if a topic id is out of range for `no_topics`.
    """
    topic_counts = numpy.zeros(no_topics)
    for topic in topics_per_msg:
        if topic != -1:
            topic_counts[topic] += 1
    total = topic_counts.sum()
    if total == 0:
        # Nothing to normalise; 0/0 would yield a vector of NaNs.
        return topic_counts
    return topic_counts / total

We do not want to recompute the topic distributions every time, therefore we store the topic distribution vector in the channels collection in the new field "topic_1500" (also for the model_800 some reduced topic vector was stored in the field "topic_800_reduced")

In [51]:
def save_topic_vectors(channel_to_topic_distr_dict):
    """Store each channel's topic vector in the `channels` collection.

    For every entry, the document whose `channel` field equals the key gets
    its `topic_1500` field set to the vector converted with `.tolist()`.

    Args:
        channel_to_topic_distr_dict: mapping from channel name to a topic
            distribution object that provides `.tolist()`.
    """
    for channel_name in channel_to_topic_distr_dict:
        vector = channel_to_topic_distr_dict[channel_name].tolist()
        channels.update_one({"channel": channel_name}, {"$set": {"topic_1500": vector}})

So now we can iterate over all the channels, get all the corresponding messages, predict the topic of each message, and compute the topic distribution of the channel. As this is computationally very expensive, it was done using Google Colab.

In [18]:
def predict_topics(no_topics=None):
    """Compute and store the topic distribution of every sufficiently large channel.

    For each name in `channel_names` the channel's messages are loaded; channels
    with fewer than 100 messages are skipped. The loaded `model` predicts a
    topic per message, the predictions are turned into a relative-frequency
    vector, and all vectors are written to the database at the end.

    Args:
        no_topics: length of the topic vectors. Previously this name was read
            from an undefined global, which raises NameError on a fresh kernel.
            When omitted it is derived from the model as the largest topic id
            plus one, so every predicted topic id is a valid index. Pass it
            explicitly to reproduce vectors of a specific length.
    """
    if no_topics is None:
        # Topic ids are not necessarily contiguous, so size the vector by the
        # largest id rather than by the number of topics.
        no_topics = int(model.get_topic_info()['Topic'].max()) + 1

    channel_to_topic_distr = {}
    for channel in channel_names:
        msgs = get_messages_from_channel(channel)
        if len(msgs) >= 100:
            topics, _ = model.transform(msgs)
            channel_to_topic_distr[channel] = get_topic_distribution(topics, no_topics)
    save_topic_vectors(channel_to_topic_distr)

# 4. Compare the topic distributions of the channels
Now that we have computed the topic distributions for all the channels (all German channels containing at least 100 German messages), we want to compare their topic distributions in order to find similar channels.

First of all let us get an overview for how many channels and messages we have computed the topic vector representation

In [13]:
def get_channel_names_with_topic_1500():
    """Return the names of all channels that have a `topic_1500` field.

    Returns:
        list: the `channel` value of every document in the `channels`
        collection for which `topic_1500` exists, in cursor order.
    """
    has_topic_vector = {'topic_1500': {'$exists': True}}
    cursor = channels.find(has_topic_vector, {'_id': 0, 'channel': 1})
    names = []
    for doc in cursor:
        names.append(doc['channel'])
    return names


def get_number_of_messages(channels):
    """Return the total number of messages over the given channels.

    Each channel is loaded with `get_messages_from_channel` using its default
    limit, so the per-channel count is whatever that function returns.

    Args:
        channels: iterable of channel names.

    Returns:
        int: sum of the lengths of the loaded message lists.
    """
    return sum(len(get_messages_from_channel(channel)) for channel in channels)

# Overview: how many channels have a stored topic vector, and how many
# messages those channels contain (this reloads every channel's messages).
channels_with_topic = get_channel_names_with_topic_1500()
print('Number of channels: ', len(channels_with_topic))
print('Number of messages: ', get_number_of_messages(channels_with_topic))

Number of channels:  2215
Number of messages:  5301885


Ok, then we can start now by retrieving the topic_1500 vector for each channel from the database

In [34]:
# Select every channel document that has a stored `topic_1500` vector ...
query = {'topic_1500': {'$exists': True}}
# ... and fetch only the channel name and that vector.
name_topic_filter = {'_id': 0, 'channel': 1, 'topic_1500': 1}
# Mapping: channel name -> stored topic_1500 value.
channel_to_topic = {channel['channel']: channel['topic_1500'] for channel in channels.find(query, name_topic_filter)}

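One way to measure the similarity of two probability distributions is to compute the Jensen-Shannon divergence. With the natural logarithm (SciPy's default) it is guaranteed to lie in the interval [0, ln 2] ≈ [0, 0.693] (it lies in [0, 1] only when the base-2 logarithm is used). A value of 0 means the distributions are identical, and a higher value means a lower similarity.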

In [35]:
def jensen_shannon_divergence(distr_1, distr_2):
    """Return the Jensen-Shannon divergence between two distributions.

    `distance.jensenshannon` returns the Jensen-Shannon distance, i.e. the
    square root of the divergence, so the result is squared here.

    Args:
        distr_1: first probability vector.
        distr_2: second probability vector of the same length.

    Returns:
        The squared Jensen-Shannon distance of the two vectors.
    """
    js_distance = distance.jensenshannon(distr_1, distr_2)
    return js_distance ** 2

In [36]:
# Jensen-Shannon divergence of every unordered pair of distinct channels,
# sorted from largest to smallest.
topic_vectors = list(channel_to_topic.values())
divergences = sorted(
    (
        jensen_shannon_divergence(topic_vectors[first], topic_vectors[second])
        for first in range(len(topic_vectors))
        for second in range(first + 1, len(topic_vectors))
    ),
    reverse=True,
)
print(divergences[:500])

[0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454, 0.6931471805599454,