In [10]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

In [11]:
tokenizer = RegexpTokenizer(r'\w+')

In [12]:
# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [13]:
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 


In [14]:
# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

In [15]:
# list for tokenized documents in loop
texts = []

In [16]:
# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

In [17]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

In [18]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [19]:
print(corpus[0])

[(0, 2), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)]


In [20]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

In [21]:
print(ldamodel.print_topics(num_topics=2, num_words=4))

[(0, '0.068*"brother" + 0.068*"mother" + 0.068*"drive" + 0.041*"pressur"'), (1, '0.086*"health" + 0.086*"good" + 0.086*"brocolli" + 0.061*"eat"')]


Beispiele für die Vorverarbeitung. Noch eine Möglichkeit wäre TextBlob

## spaCy

In [5]:
import spacy
nlp = spacy.load("de_core_news_sm")

In [6]:
doc = nlp("Das ist ein Beispielsatz für eine Vorverarbeitung.")

1. Does the substring match a tokenizer exception rule? For example, “don’t” does not contain whitespace, but should be split into two tokens, “do” and “n’t”, while “U.K.” should always remain one token.
2. Can a prefix, suffix or infix be split off? For example punctuation like commas, periods, hyphens or quotes.

- Doc = A container for accessing linguistic annotations.
- Tokenizer: Segment text, and create Doc objects with the discovered segment boundaries.
- Lemmatizer: Determine the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.
- Tagger: Annotate part-of-speech tags on Doc objects.
- Dependency Parsing: Assigning syntactic dependency labels, describing the relations between individual tokens, like subject or object.

- Text: The original word text.
- Lemma: The base form of the word.
- POS: The simple UPOS part-of-speech tag.
- Tag: The detailed part-of-speech tag.
- Dep: Syntactic dependency, i.e. the relation between tokens.
- Shape: The word shape – capitalization, punctuation, digits.
- is alpha: Is the token an alpha character?
- is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [7]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Das der PRON PDS sb Xxx True True
ist sein AUX VAFIN ROOT xxx True True
ein einen DET ART nk xxx True True
Beispielsatz Beispielsatz NOUN NN pd Xxxxx True False
für für ADP APPR mnr xxx True True
eine einen DET ART nk xxxx True True
Vorverarbeitung Vorverarbeitung NOUN NN nk Xxxxx True False
. . PUNCT $. punct . False False


## NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk import pos_tag, word_tokenize
from germalemma import GermaLemma

In [2]:
stop_words = set(stopwords.words('german'))  

In [3]:
doc = '''Das ist ein Beispielsatz für eine Vorverarbeitung. Und noch ein zweiter Satz, weil es so schön ist. Dieser Satz ist für viele Corpi und mehrere Anzahlen.'''

In [4]:
word_tokens = word_tokenize(doc) 
tokenized = sent_tokenize(doc)


wordsList = [w for w in word_tokens if not w in stop_words] 
wordsList = [] 

for w in word_tokens:  
    if w not in stop_words:  
        wordsList.append(w) 
        
print(word_tokens)  
print(wordsList)  


for i in tokenized:
    wordsList = nltk.word_tokenize(i) 
    tagged = nltk.pos_tag(wordsList) 
  
    print(tagged) 

['Das', 'ist', 'ein', 'Beispielsatz', 'für', 'eine', 'Vorverarbeitung', '.', 'Und', 'noch', 'ein', 'zweiter', 'Satz', ',', 'weil', 'es', 'so', 'schön', 'ist', '.', 'Dieser', 'Satz', 'ist', 'für', 'viele', 'Corpi', 'und', 'mehrere', 'Anzahlen', '.']
['Das', 'Beispielsatz', 'Vorverarbeitung', '.', 'Und', 'zweiter', 'Satz', ',', 'schön', '.', 'Dieser', 'Satz', 'viele', 'Corpi', 'mehrere', 'Anzahlen', '.']
[('Das', 'NNP'), ('ist', 'NN'), ('ein', 'NN'), ('Beispielsatz', 'NNP'), ('für', 'NN'), ('eine', 'NN'), ('Vorverarbeitung', 'NNP'), ('.', '.')]
[('Und', 'NNP'), ('noch', 'CC'), ('ein', 'RB'), ('zweiter', 'NN'), ('Satz', 'NNP'), (',', ','), ('weil', 'VBP'), ('es', 'RB'), ('so', 'RB'), ('schön', 'JJ'), ('ist', 'NN'), ('.', '.')]
[('Dieser', 'NNP'), ('Satz', 'NNP'), ('ist', 'NN'), ('für', 'NN'), ('viele', 'NN'), ('Corpi', 'NNP'), ('und', 'NN'), ('mehrere', 'RB'), ('Anzahlen', 'NNP'), ('.', '.')]


Bei NLTK ist es etwas schwieriger mit der Lammatizierung. Da müssten wir nochmal schauen

In [None]:
#from nltk.stem import GermanWortschatzLemmatizer

In [None]:
#lemmatizer = GermaLemma()

# passing the word and the POS tag ("N" for noun)
#lemma = lemmatizer.find_lemma(doc)
#print(lemma)
# -> lemma is "Feinstaubbelastung"