In [1]:
import pandas as pd
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel
from stop_words import get_stop_words

In [2]:
# Read the cleaned data set and pull the 'body' column out as a plain
# Python list; these texts are the raw input for topic modelling.
df = pd.read_csv('cleaned_final.csv')
lst = df['body'].tolist()

In [3]:
# Tokenise every text: strip punctuation with translate(), lower-case,
# split on whitespace, then drop stop words and purely numeric tokens.
import string

# Translation table that maps every ASCII punctuation character to None,
# i.e. deletes it.
table = str.maketrans(dict.fromkeys(string.punctuation))

# Build a *new* list rather than calling .append() on the list returned by
# get_stop_words(): that list may be the library's cached object, in which
# case appending to it would add another 'page' on every re-run of this cell.
stoplist = (
    get_stop_words('german')
    + ['page']
    + get_stop_words('french')
    + get_stop_words('english')
    + get_stop_words('pt')
)
# Set copy used only for the membership test below; `stoplist` itself stays
# a list so any later cell that treats it as one keeps working.
stopword_set = frozenset(stoplist)

# Rows that are not strings (e.g. the float NaN pandas uses for a missing
# value) are skipped entirely.
texts = [
    [
        word
        for word in text.translate(table).lower().split()
        if word not in stopword_set and not word.isdigit()
    ]
    for text in lst
    if isinstance(text, str)
]
len(texts)

18348

In [4]:
from collections import defaultdict

# Count how often each token occurs across all tokenised texts.
frequency = defaultdict(int)
for token in (tok for doc in texts for tok in doc):
    frequency[token] += 1

In [5]:
from gensim import corpora

# Keep only tokens that occur more than once in the whole corpus, then build
# the gensim Dictionary from the filtered documents.
processed_corpus = [
    [tok for tok in doc if frequency[tok] > 1]
    for doc in texts
]
dictionary = corpora.Dictionary(processed_corpus)

In [6]:
# Vectorize the processed corpus: run every document through
# dictionary.doc2bow to get its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))

In [7]:
# Fit an HDP topic model on the bag-of-words corpus.
# HdpModel is already imported in the first cell, so it is not re-imported.
# The fit is stochastic: a fixed random_state makes the topics reproducible
# across "Restart Kernel -> Run All".
hdp = HdpModel(bow_corpus, dictionary, random_state=42)

In [9]:
# Display 20 topics with 10 words each.
hdp.print_topics(num_topics=20, num_words=10)

[(0,
  '0.005*menschen + 0.005*covid19 + 0.005*mehr + 0.004*sei + 0.003*coronavirus + 0.003*virus + 0.003*gibt + 0.003*patienten + 0.003*sagt + 0.003*viele'),
 (1,
  '0.008*covid19 + 0.006*menschen + 0.004*patienten + 0.004*coronavirus + 0.004*mehr + 0.004*sagt + 0.004*sei + 0.004*zahl + 0.003*gibt + 0.003*personen'),
 (2,
  '0.005*mehr + 0.005*patienten + 0.004*covid19 + 0.004*menschen + 0.004*virus + 0.003*sagt + 0.003*schon + 0.003*coronavirus + 0.003*viele + 0.003*gibt'),
 (3,
  '0.004*covid19 + 0.003*tour + 0.003*sagt + 0.002*mehr + 0.002*menschen + 0.002*zwei + 0.002*wurden + 0.002*sei + 0.002*wochen + 0.002*coronavirus'),
 (4,
  '0.004*ioc + 0.003*mehr + 0.003*covid19 + 0.003*welt + 0.002*sagte + 0.002*pandemie + 0.002*jahr + 0.002*schon + 0.002*absage + 0.002*menschen'),
 (5,
  '0.009*personen + 0.005*menschen + 0.005*covid19 + 0.005*landkreis + 0.004*quarantäne + 0.004*zahl + 0.003*kreis + 0.003*aktuell + 0.003*positiv + 0.003*insgesamt'),
 (6,
  '0.013*worms + 0.012*vg + 0.00