https://datascience.blog.wzb.eu/2016/07/13/accurate-part-of-speech-tagging-of-german-texts-with-nltk/

https://datascience.blog.wzb.eu/2016/07/13/autocorrecting-misspelled-words-in-python-using-hunspell/

https://github.com/ptnplanet/NLTK-Contributions/blob/master/ClassifierBasedGermanTagger/ClassifierBasedGermanTagger.py

https://github.com/WZBSocialScienceCenter/germalemma

In [40]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem.snowball import GermanStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from nltk.tag.sequential import ClassifierBasedTagger
from nltk.corpus import ConllCorpusReader
from sklearn.utils import shuffle
%matplotlib inline

# Reading in the data

In [41]:
sw = '''aber           |  but

alle           |  all
allem
allen
aller
alles

als            |  than, as
also           |  so
am             |  an + dem
an             |  at

ander          |  other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders

auch           |  also
auf            |  on
aus            |  out of
bei            |  by
bin            |  am
bis            |  until
bist           |  art
da             |  there
damit          |  with it
dann           |  then

der            |  the
den
des
dem
die
das

daß            |  that

derselbe       |  the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe

dazu           |  to that

dein           |  thy
deine
deinem
deinen
deiner
deines

denn           |  because

derer          |  of those
dessen         |  of him

dich           |  thee
dir            |  to thee
du             |  thou

dies           |  this
diese
diesem
diesen
dieser
dieses


doch           |  (several meanings)
dort           |  (over) there


durch          |  through

ein            |  a
eine
einem
einen
einer
eines

einig          |  some
einige
einigem
einigen
einiger
einiges

einmal         |  once

er             |  he
ihn            |  him
ihm            |  to him

es             |  it
etwas          |  something

euer           |  your
eure
eurem
euren
eurer
eures

für            |  for
gegen          |  towards
gewesen        |  p.p. of sein
hab            |  have
habe           |  have
haben          |  have
hat            |  has
hatte          |  had
hatten         |  had
hier           |  here
hin            |  there
hinter         |  behind

ich            |  I
mich           |  me
mir            |  to me


ihr            |  you, to her
ihre
ihrem
ihren
ihrer
ihres
euch           |  to you

im             |  in + dem
in             |  in
indem          |  while
ins            |  in + das
ist            |  is

jede           |  each, every
jedem
jeden
jeder
jedes

jene           |  that
jenem
jenen
jener
jenes

jetzt          |  now
kann           |  can

kein           |  no
keine
keinem
keinen
keiner
keines

können         |  can
könnte         |  could
machen         |  do
man            |  one

manche         |  some, many a
manchem
manchen
mancher
manches

mein           |  my
meine
meinem
meinen
meiner
meines

mit            |  with
muss           |  must
musste         |  had to
nach           |  to(wards)
nicht          |  not
nichts         |  nothing
noch           |  still, yet
nun            |  now
nur            |  only
ob             |  whether
oder           |  or
ohne           |  without
sehr           |  very

sein           |  his
seine
seinem
seinen
seiner
seines

selbst         |  self
sich           |  herself

sie            |  they, she
ihnen          |  to them

sind           |  are
so             |  so

solche         |  such
solchem
solchen
solcher
solches

soll           |  shall
sollte         |  should
sondern        |  but
sonst          |  else
über           |  over
um             |  about, around
und            |  and

uns            |  us
unse
unsem
unsen
unser
unses

unter          |  under
viel           |  much
vom            |  von + dem
von            |  from
vor            |  before
während        |  while
war            |  was
waren          |  were
warst          |  wast
was            |  what
weg            |  away, off
weil           |  because
weiter         |  further

welche         |  which
welchem
welchen
welcher
welches

wenn           |  when
werde          |  will
werden         |  will
wie            |  how
wieder         |  again
will           |  want
wir            |  we
wird           |  will
wirst          |  willst
wo             |  where
wollen         |  want
wollte         |  wanted
würde          |  would
würden         |  would
zu             |  to
zum            |  zu + dem
zur            |  zu + der
zwar           |  indeed
zwischen       |  between
'''

In [42]:
names = [
    'augsburger_allgemeine',
    'badische_zeitung',
    'berliner_zeitung',
    'frankfurter_allgemeine_zeitung',
    'koelner_stadt_anzeiger',
    'leipziger_volkszeitung',
    'mitteldeutsche_zeitung',
    'rheinische_post',
    'saechsische_zeitung',
    'stuttgarter_zeitung',
    'sueddeutsche_zeitung',
    'weser_kurier',
]

In [43]:
filenames = []
dates = ["oct31", 'nov1', 'nov2', 'nov3', 'nov4']
for name in names:
    for date in dates:
        filename = "data/" + name + "/" + name + "_" + date + ".json"
        filenames.append(filename)

In [44]:
# Some sites have two headlines -- combine them.
def combine_headlines(df, filename):
    if 'article_headline_strong' in df.columns:
        df['article_headline'] = df['article_headline_strong'] + " " + df['article_headline_normal']
        df.drop(['article_headline_strong', 'article_headline_normal'], axis=1, inplace=True)
    return df

In [45]:
def prep_article(df):
    df.article_text = df.article_text.apply(lambda x: ' '.join(x))
    return df

In [46]:
# Print an error message if there is a problem reading in a filename.

dfs = []
for filename in filenames:
    try:
        with open(filename, 'r') as f_obj:
            df = pd.read_json(f_obj)
    except FileNotFoundError:
        print("\nERROR :::: ", filename, "\n")
    else:
        df['source'] = filename.split('/')[1]
        combine_headlines(df, filename)
        prep_article(df)
        print("Success: ", filename)
        dfs.append(df)

Success:  data/augsburger_allgemeine/augsburger_allgemeine_oct31.json
Success:  data/augsburger_allgemeine/augsburger_allgemeine_nov1.json
Success:  data/augsburger_allgemeine/augsburger_allgemeine_nov2.json
Success:  data/augsburger_allgemeine/augsburger_allgemeine_nov3.json
Success:  data/augsburger_allgemeine/augsburger_allgemeine_nov4.json
Success:  data/badische_zeitung/badische_zeitung_oct31.json
Success:  data/badische_zeitung/badische_zeitung_nov1.json
Success:  data/badische_zeitung/badische_zeitung_nov2.json
Success:  data/badische_zeitung/badische_zeitung_nov3.json
Success:  data/badische_zeitung/badische_zeitung_nov4.json
Success:  data/berliner_zeitung/berliner_zeitung_oct31.json
Success:  data/berliner_zeitung/berliner_zeitung_nov1.json
Success:  data/berliner_zeitung/berliner_zeitung_nov2.json
Success:  data/berliner_zeitung/berliner_zeitung_nov3.json
Success:  data/berliner_zeitung/berliner_zeitung_nov4.json
Success:  data/frankfurter_allgemeine_zeitung/frankfurter_allg

In [47]:
# This command creates the actual DataFrame.

df = pd.concat(dfs)

In [48]:
# How many total words are in the corpus?

a = df.dropna().article_text
a = np.array(a)
big_text = []
for i in a:
    i = re.sub(r'[^A-Za-z ]*', '', i)
    big_text.append(i)
corpus = " ".join(big_text)
len(corpus)

2543410

# Word2Vec Pipeline

In [59]:
# Put all article text into a list of big strings.
# May need to go back and df.dropna()
# Plus text cleaning.  get rid of punctuation? etc.


corpus = []
gsm = GermanStemmer()
for row in df.iterrows():
    article_text = row[1][2]
    article_text = article_text.lower()
    article_text = re.sub(r'[^A-Za-zäüöß ]*', '', article_text)
    stemmed = gsm.stem(article_text)
    tokenized = word_tokenize(stemmed)
    lemmatized = lemmatize_german(tokenized)
    corpus.append(lemmatized)

In [60]:
model = gensim.models.Word2Vec(corpus, size=100, window=10, min_count=3, workers=2,sg=1)

In [61]:
list(model.wv.vocab.items())[:20]

[('einst', <gensim.models.keyedvectors.Vocab at 0x121075208>),
 ('starten', <gensim.models.keyedvectors.Vocab at 0x121075630>),
 ('auf', <gensim.models.keyedvectors.Vocab at 0x121075710>),
 ('der', <gensim.models.keyedvectors.Vocab at 0x121075ac8>),
 ('gelande', <gensim.models.keyedvectors.Vocab at 0x12129e160>),
 ('bundeswehr', <gensim.models.keyedvectors.Vocab at 0x12129e080>),
 ('nachdem', <gensim.models.keyedvectors.Vocab at 0x12129e978>),
 ('letzter', <gensim.models.keyedvectors.Vocab at 0x12129e198>),
 ('aus', <gensim.models.keyedvectors.Vocab at 0x12129ecf8>),
 ('langst', <gensim.models.keyedvectors.Vocab at 0x12129eef0>),
 ('abziehen', <gensim.models.keyedvectors.Vocab at 0x12129e9e8>),
 ('sein', <gensim.models.keyedvectors.Vocab at 0x12129ee80>),
 ('sollen', <gensim.models.keyedvectors.Vocab at 0x12129ef28>),
 ('dort', <gensim.models.keyedvectors.Vocab at 0x12129edd8>),
 ('nun', <gensim.models.keyedvectors.Vocab at 0x12129e6a0>),
 ('gedeihen', <gensim.models.keyedvectors.Vocab

In [141]:
# YES

model.most_similar('ostdeutsch', topn=10)

[('differenzen', 0.9250619411468506),
 ('sondierer', 0.92262202501297),
 ('fortschritte', 0.9128716588020325),
 ('problemen', 0.8996192812919617),
 ('bekenntnis', 0.8994232416152954),
 ('einfuhren', 0.8973312973976135),
 ('gezielt', 0.8950703144073486),
 ('jamaikasondierungen', 0.8910605907440186),
 ('klimaschutz', 0.8854465484619141),
 ('bundeln', 0.8846082091331482)]

In [137]:
# YES

model.most_similar('westdeutsch', topn=10)

[('sechzig', 0.969581663608551),
 ('altenpflege', 0.9694417715072632),
 ('skandalen', 0.9664402008056641),
 ('schlagzeile', 0.9612240791320801),
 ('ruff', 0.9607419371604919),
 ('grossartiges', 0.9607359170913696),
 ('arbeitsbedingungen', 0.9607052206993103),
 ('berufsschule', 0.9606802463531494),
 ('ddrzeiten', 0.9593487977981567),
 ('bruche', 0.9592036008834839)]

# Preprocessing tools thanks to the internet

In [2]:
class ClassifierBasedGermanTagger(ClassifierBasedTagger):
    """A classifier based German part-of-speech tagger. It has an accuracy of
    96.09% after being trained on 90% of the German TIGER corpus. The tagger
    extends the NLTK ClassifierBasedTagger and implements a slightly modified
    feature detector.
    """

    def feature_detector(self, tokens, index, history):
        """Implementing a slightly modified feature detector.
        @param tokens: The tokens from the sentence to tag.
        @param index: The current token index to tag.
        @param history: The previous tagged tokens.
        """

        word = tokens[index]
        if index == 0: # At the beginning of the sentence
            prevword = prevprevword = None
            prevtag = prevprevtag = None
            #word = word.lower() # Lowercase at the beginning of sentence
        elif index == 1:
            prevword = tokens[index-1] # Note: no lowercase
            prevprevword = None
            prevtag = history[index-1]
            prevprevtag = None
        else:
            prevword = tokens[index-1]
            prevprevword = tokens[index-2]
            prevtag = history[index-1]
            prevprevtag = history[index-2]

        if re.match('[0-9]+([\.,][0-9]*)?|[0-9]*[\.,][0-9]+$', word):
            # Included "," as decimal point
            shape = 'number'
        elif re.compile('\W+$', re.UNICODE).match(word):
            # Included unicode flag
            shape = 'punct'
        elif re.match('([A-ZÄÖÜ]+[a-zäöüß]*-?)+$', word):
            # Included dash for dashed words and umlauts
            shape = 'upcase'
        elif re.match('[a-zäöüß]+', word):
            # Included umlauts
            shape = 'downcase'
        elif re.compile("\w+", re.UNICODE).match(word):
            # Included unicode flag
            shape = 'mixedcase'
        else:
            shape = 'other'

        features = {
            'prevtag': prevtag,
            'prevprevtag': prevprevtag,
            'word': word,
            'word.lower': word.lower(),
            'suffix3': word.lower()[-3:],
            #'suffix2': word.lower()[-2:],
            #'suffix1': word.lower()[-1:],
            'preffix1': word[:1], # included
            'prevprevword': prevprevword,
            'prevword': prevword,
            'prevtag+word': '%s+%s' % (prevtag, word),
            'prevprevtag+word': '%s+%s' % (prevprevtag, word),
            'prevword+word': '%s+%s' % (prevword, word),
            'shape': shape
            }
        return features

In [4]:
corp = ConllCorpusReader('.', 'tiger_release_aug07.corrected.16012013.conll09',
                                     ['ignore', 'words', 'ignore', 'ignore', 'pos'],
                                     encoding='utf-8')

In [5]:
tagged_sents = corp.tagged_sents()
shuffle(tagged_sents)
split_perc = 0.1
split_size = int(len(tagged_sents) * split_perc)
train_sents, test_sents = tagged_sents[split_size:], tagged_sents[:split_size]

In [6]:
tagger = ClassifierBasedGermanTagger(train=train_sents)

In [7]:
accuracy = tagger.evaluate(test_sents)

In [8]:
def read_lemmata_from_tiger_corpus(tiger_corpus_file, valid_cols_n=15, col_words=1, col_lemmata=2):
    lemmata_mapping = {}

    with open(tiger_corpus_file) as f:
        for line in f:
            parts = line.split()
            if len(parts) == valid_cols_n:
                w, lemma = parts[col_words], parts[col_lemmata]
                if w != lemma and w not in lemmata_mapping and not lemma.startswith('--'):
                    lemmata_mapping[w] = lemma

    return lemmata_mapping

In [9]:
# This is a dictionary that maps words to their lemma.

lemmata_mapping = read_lemmata_from_tiger_corpus('tiger_release_aug07.corrected.16012013.conll09')

In [37]:
def lemmatize_german(corpus):
    lemmata = []
    for w in corpus:
        w_lemma = lemmata_mapping.get(w, None)
        if w_lemma:
            lemmata.append(w_lemma)
        else:
            lemmata.append(w)
    return lemmata

In [55]:
test_article_string = df.iloc[0, 2]

In [56]:
test_article_list = word_tokenize(test_article_string)

In [57]:
# string --> word_tokenize --> list of strings
test_article_list[:10]

['Einst',
 'starteten',
 'auf',
 'dem',
 'Gelände',
 'die',
 'Kampfflugzeuge',
 'der',
 'Bundeswehr',
 '.']

In [58]:
lemmatize_german(test_article_list)

['einst',
 'starten',
 'auf',
 'der',
 'Gelände',
 'der',
 'Kampfflugzeug',
 'der',
 'Bundeswehr',
 '-',
 'nachdem',
 'der',
 'letzter',
 '``',
 'Tornado',
 "''",
 'aus',
 'Memmingerberg',
 'lang',
 'abziehen',
 'sein',
 ',',
 'sollen',
 'dort',
 'nun',
 'Hanfpflanzen',
 'gedeihen',
 '-',
 'in',
 'ein',
 'Atomschutzbunker',
 'der',
 'ehemalig',
 'Fliegerhorstes',
 'einer',
 'paar',
 'Kilometer',
 'von',
 'Memmingen',
 'entfernen',
 'wollen',
 'Wissenschaftler',
 'künftig',
 'an',
 'Cannabispflanze',
 'forschen',
 '-',
 'der',
 'Projekt',
 'werden',
 'von',
 'ein',
 'Unternehmer',
 'aus',
 'Schwabe',
 'und',
 'der',
 'technisch',
 'Universität',
 'München',
 'vorantreiben',
 '-',
 'noch',
 'stehen',
 'allerdings',
 'der',
 'Genehmigung',
 'der',
 'Bundesinstitut',
 'für',
 'Arzneimittel',
 'und',
 'Medizinprodukt',
 'aus',
 '-',
 'sollen',
 'der',
 'Erlaubnis',
 'bis',
 'Ende',
 'der',
 'Jahr',
 'vorliegen',
 ',',
 'können',
 'der',
 'Anbau',
 'der',
 'Cannabispflanze',
 'in',
 'Frühjah