In [None]:
import requests
import pandas as pd
import lxml.etree as ET
from bs4 import BeautifulSoup as soup
import unicodedata
import tqdm
from time import sleep
import time

In [None]:
# Load the DataFrame stored under key "df" in the HDF5 file df_prep.h5 and display it.
df = pd.read_hdf("df_prep.h5", key="df")
df

# Vorbereitungen:

## Laden spezifischer Bibliotheken: 

In [None]:
import spacy
from spacy import displacy
import os
import plotly.graph_objects as go
import plotly.express as px

In [None]:
#Code für den Download eines der Sprachmodelle: 
!python -m spacy download de

In [None]:
# Load a trained spaCy model for German.
nlp = spacy.load('de_core_news_sm')

In [None]:
# Show the names of the components in the loaded pipeline.
print(nlp.pipe_names)

## Tokenization: 

In [None]:
def process_text(text):
    """Run the notebook-level spaCy pipeline ``nlp`` on ``text`` and return its result."""
    doc = nlp(text)
    return doc

In [None]:
# Run process_text on every cleaned title and keep the result in the 'token' column.
df['token'] = df['title_clean'].map(process_text)
df

## Lemmatization: 

In [None]:
def get_lemma(doc):
    """Return a list with the ``lemma_`` attribute of every token in ``doc``, in order."""
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

In [None]:
# Derive the list of lemmas for every row of the 'token' column.
df['lemma'] = df['token'].map(get_lemma)
df

### Einfache Worthäufigkeitsprüfung: 

In [None]:
# Enter the term to search for here:
term = "Pandemie"  

In [None]:
# Count how often `term` occurs as a substring in the string form of each cell,
# summed over the whole column. Longer words that contain the term are counted too.
print(
    f'"{term}" erscheint in der Spalte Token '
    f'{sum(str(cell).count(term) for cell in df["token"])} mal.'
)
print(
    f'"{term}" erscheint in der Spalte Lemma '
    f'{sum(str(cell).count(term) for cell in df["lemma"])} mal.'
)

## Optional: Part of speech tags

In [None]:
def get_pos(doc):
    """Return a list of ``(pos_, tag_)`` tuples, one per token in ``doc``, in order."""
    pairs = []
    for token in doc:
        pairs.append((token.pos_, token.tag_))
    return pairs

In [None]:
# Attach the (pos_, tag_) pairs of every row's tokens as the 'POS' column
# and preview the first rows.
df['POS'] = df['token'].map(get_pos)
df.head()

# Topic Modeling:

### Latent Dirichlet Allocation (LDA):

In [None]:
import gensim
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_string

In [None]:
# Join the lemmas of all rows into one whitespace-separated string.
# The lemma lists are flattened directly instead of via Series.explode():
# explode() turns an empty list into NaN, and str.join would then raise a
# TypeError as soon as a single row has no lemmas.
lemmatized_text = ' '.join(lemma for lemmas in df['lemma'] for lemma in lemmas)

# Alternative:
#lemmatized_text = df.token.apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

In [None]:
# Print the complete joined lemma string for inspection.
print(lemmatized_text)

In [None]:
# Build the dictionary and the document-term matrix.
# Every row of df['lemma'] is treated as its own document. Previously the whole
# corpus was preprocessed as one string and the resulting flat token list was
# iterated, so each single token became a one-word "document" and LDA had no
# word co-occurrence to learn topics from.
processed_docs = [preprocess_string(' '.join(lemmas)) for lemmas in df['lemma']]

# Flat token list kept under the original name in case other cells still use it.
processed_text = [token for doc in processed_docs for token in doc]

# NOTE(review): preprocess_string's default filters look English-oriented
# (stop words, stemming) while the corpus is German - confirm they are wanted.
dictionary = corpora.Dictionary(processed_docs)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
#Alternative: 
#dictionary = corpora.Dictionary(lemmatized_text.apply(lambda x: x.split()))
#doc_term_matrix = [dictionary.doc2bow(doc.split()) for doc in lemmatized_text]


In [None]:
# Fit the LDA topic model on the document-term matrix.
# random_state is fixed because LDA training is randomized: without a seed the
# topics differ on every run and the notebook's results cannot be reproduced.
lda_model = gensim.models.LdaModel(
    doc_term_matrix,
    num_topics=4,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

In [None]:
# Prepare the pyLDAvis view of the fitted model and show it inline in the notebook.
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)
vis