In [1]:
import pandas as pd
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
from pprint import pprint


In [2]:
# The German stopword list ships as a separate NLTK corpus. Fetch it once when
# it is missing so the notebook also runs on a fresh environment instead of
# failing with a LookupError.
import nltk
from nltk.corpus import stopwords

try:
    stop_words = stopwords.words('german')
except LookupError:
    # Raised by NLTK when the 'stopwords' corpus has not been downloaded yet.
    nltk.download('stopwords')
    stop_words = stopwords.words('german')

In [3]:
# Read the CSV; only its "description" column is used as the document
# collection further down.
df = pd.read_csv(filepath_or_buffer="data/open_tasks_Corn_EKV_DE.csv")
data = df.loc[:, "description"]

In [46]:
import spacy

# Load the German language model. Only part-of-speech tags are read from the
# parsed documents in this notebook, so the dependency parser and the
# named-entity recognizer are switched off to make every nlp(...) call cheaper.
nlp = spacy.load("de_core_news_sm", disable=["parser", "ner"])

def preprocess_data(text):
    """Tokenize one German document and drop its verbs and stopwords.

    Parameters
    ----------
    text : str
        Raw document text. Anything that is not a string (for example the
        NaN pandas uses for an empty CSV cell) is treated as an empty
        document.

    Returns
    -------
    list of str
        Tokens produced by gensim's ``simple_preprocess`` that are neither in
        ``stop_words`` nor equal to the lowercased text of a token that spaCy
        tagged ``VERB`` in this document.
    """
    # spaCy rejects non-string input, so a single missing description would
    # abort the whole ``apply`` run; such rows become empty documents instead.
    if not isinstance(text, str):
        return []

    # Collect the verbs of this document. A set keeps the membership test
    # below constant-time instead of scanning a list for every token.
    doc = nlp(text)
    verbs = {token.text.lower() for token in doc if token.pos_ == "VERB"}

    # Merge both exclusion sources once per document (stop_words may be any
    # iterable of strings).
    excluded = verbs.union(stop_words)

    return [word for word in simple_preprocess(text) if word not in excluded]

In [48]:
# Run the preprocessing over every document and collect the resulting token
# lists in a plain Python list (one entry per row of `data`).
preprocessed_data = data.map(preprocess_data).tolist()

In [49]:
# Vocabulary: maps every distinct token to an integer id
id2word = corpora.Dictionary(documents=preprocessed_data)

# Bag-of-words corpus: one doc2bow vector per preprocessed document
corpus = list(map(id2word.doc2bow, preprocessed_data))

In [61]:
# Fit the LDA topic model on the bag-of-words corpus
num_topics = 3
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [63]:
# Print the keywords for each topic (disabled; uncomment the line below to show them)
# lda_model.print_topics()

In [64]:
# Evaluation: c_v topic coherence of the fitted model
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=preprocessed_data,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.6074114916865837


In [65]:
# Visualization: interactive topic explorer built with pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to notebook mode so prepared visualizations are displayed
# inline as cell output.
pyLDAvis.enable_notebook()

In [66]:
# Convert the fitted model, corpus and vocabulary into pyLDAvis display data
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)

In [67]:
# Bare expression: the prepared visualization is rendered as this cell's output
vis