# NLTK - Natural Language Toolkit

In [34]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import nltk

In [74]:
# English and German sample texts used by the tokenization and tagging cells.
text_en = "Hello, what's happening? I play football. It makes fun! I love you."
# The German text contains the abbreviation "St." in the middle of a sentence.
text_de = "Hallo aus St. Petersburg in Russland. Wie geht es dir? Hab Spaß!"

## deutsche Stopwörter von NLTK

In [8]:
from nltk.corpus import stopwords

# German stop-word list from the NLTK stopwords corpus.
stop_ger = stopwords.words('german')

# Show the first eight entries, then the total size of the list.
print(stop_ger[:8])
print("Anzahl deutscher Stopwörter: {}".format(len(stop_ger)))

['aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also']
Anzahl deutscher Stopwörter: 231


## Tokenization

#### Texte in einzelne Sätze zerlegen

In [82]:
# Split the English sample text into sentences; "english" is the tokenizer's
# default language and is spelled out here for clarity.
sentences_en = nltk.sent_tokenize(text_en, language="english")

In [25]:
# Split the German sample text into sentences. sent_tokenize defaults to the
# English Punkt model, so the German model is requested explicitly.
sentences_de = nltk.sent_tokenize(text_de, language="german")
sentences_de

['Hallo aus St. Petersburg in Russland.', 'Wie geht es dir?', 'Hab Spaß!']

#### Text in Wörter zerlegen

In [26]:
# Tokenize each German sentence into words. word_tokenize also defaults to the
# English Punkt model, so the German model is selected explicitly.
# The loop variable is deliberately not called `s`, so this cell cannot
# overwrite the stemmer that the Stemming section stores in `s`.
for sentence in sentences_de:
    print(nltk.word_tokenize(sentence, language="german"))

['Hallo', 'aus', 'St.', 'Petersburg', 'in', 'Russland', '.']
['Wie', 'geht', 'es', 'dir', '?']
['Hab', 'Spaß', '!']


## POS-Tagging (Part-of-Speech-Tagging)

In [83]:
# Tokenize every English sentence and print its (token, POS tag) pairs.
# The loop variable is deliberately not called `s`, so re-running this cell
# cannot overwrite the stemmer that the Stemming section stores in `s`.
for sentence in sentences_en:
    print(nltk.pos_tag(nltk.word_tokenize(sentence)))

[('Hello', 'NNP'), (',', ','), ('what', 'WP'), ("'s", 'VBZ'), ('happening', 'VBG'), ('?', '.')]
[('I', 'PRP'), ('play', 'VBP'), ('football', 'NN'), ('.', '.')]
[('It', 'PRP'), ('makes', 'VBZ'), ('fun', 'NN'), ('!', '.')]
[('I', 'PRP'), ('love', 'VBP'), ('you', 'PRP'), ('.', '.')]


In [22]:
# Print the Penn Treebank tag reference: each tag with its description and examples.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## Stemming

In [39]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for German.
s = SnowballStemmer("german")

In [50]:
# Each call is a separate top-level expression so that every result is displayed.
s.stem("Bäume")
s.stem("Eier") # does not work optimally yet --> look for better stemming tools
s.stem("laufen")
s.stem("Reifen")
s.stem("lief") # mostly just cuts off the end of the word

'baum'

'eier'

'lauf'

'reif'

'lief'

## Lemmatizer
Lemmatizer sind besser als Stemmer, weil sie zusätzlich die Wortassoziation kennen -> dadurch entsteht aber auch mehr Rechenaufwand.

#### ! spaCy hat eine eigene Implementierung

In [51]:
from nltk.stem.wordnet import WordNetLemmatizer
# WordNet-based lemmatizer (applied to English words in this notebook).
l = WordNetLemmatizer()

In [55]:
# The part of speech has to be given ("v" = verb); without it the word is
# treated as a noun.
l.lemmatize("going", pos="v")

'go'

In [84]:
# POS-tag the tokens of the first English sentence; yields (token, tag) pairs.
words_tagged = nltk.pos_tag(nltk.word_tokenize(sentences_en[0]))

In [85]:
from nltk.corpus import wordnet

In [86]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Every other tag falls back to ``wordnet.NOUN``.
    """
    # (tag prefix, name of the constant on the `wordnet` corpus reader)
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    # No known prefix (e.g. punctuation or pronoun tags): treat as a noun.
    return wordnet.NOUN

In [87]:
# Lemmatize as a verb, passing the part of speech via the WordNet constant.
l.lemmatize("going", wordnet.VERB)

'go'

In [88]:
# Lemmatize every tagged token, converting its Treebank tag to a WordNet POS first.
for token, treebank_tag in words_tagged:
    wordnet_pos = get_wordnet_pos(treebank_tag)
    print(l.lemmatize(token, wordnet_pos))

Hello
,
what
's
happen
?
