# Spell checker 

We can add new words to the spell checker, or use it directly with languages other than English.

In [None]:
from spellchecker import SpellChecker

# German checker; it is used by the exercise cell that follows.
# NOTE(review): pyspellchecker documents `case_sensitive` as only taking effect
# when no language dictionary is loaded, so with language='de' the flag may be
# silently ignored — confirm against the installed version.
spell_de = SpellChecker(language='de', case_sensitive=True)

spell = SpellChecker()  # loads the default word frequency list
# A whole text file of words can be loaded instead:
# spell.word_frequency.load_text_file('meine_datei.txt')

# If we just want to make sure some words are not flagged as misspelled,
# add them to the word frequency list.
extra_words = ['Nürnberg', 'München']
spell.word_frequency.load_words(extra_words)

# Kept as the LAST expression of the cell so the notebook actually displays the
# result; both words are reported as known now.
spell.known(extra_words)

## Exercise 1:

Load all German cities from ``de.csv`` into the spell checker. You need to parse the CSV file with Python's built-in `csv` module. The data can be found at [https://simplemaps.com/data/de-cities](https://simplemaps.com/data/de-cities).

In [None]:
import csv

# Path to the downloaded city list (see the link in the exercise text).
CITIES_CSV = "de.csv"

city_list = []
# newline="" is what the csv module expects for files it parses itself.
with open(CITIES_CSV, newline="", encoding="utf-8") as csv_file:
    # Each row is a dict keyed by the CSV header line.
    cities = csv.DictReader(csv_file)
    for city in cities:
        # TODO (exercise): append the city name from this row to city_list.
        # NOTE(review): the name column is presumably called "city" — check the
        # header of the downloaded file.
        pass

# TODO (exercise): load city_list into spell_de before checking it.
# Pass the flat list itself; wrapping it in another list would hand the
# spell checker a list instead of words.
spell_de.known(city_list)

# German word2vec

spaCy supports many other languages, such as German, French, and Spanish. The full list can be found at [https://spacy.io/usage/models](https://spacy.io/usage/models). Let's take a look at how it works and compare it with the English version. Please keep in mind that we use the small version of the model. You can always download a more accurate German model from [https://spacy.io/models/de](https://spacy.io/models/de).

In [27]:
import spacy

# English pipeline, loaded through the "en" shortcut name.
nlp = spacy.load("en")

# One Doc holding the five words we want to compare with each other.
tokens = nlp(u'king queen horse cat lamp')

# Print the full pairwise similarity table, row by row (including each word
# against itself).
for left, right in [(a, b) for a in tokens for b in tokens]:
    print(left.text, right.text, left.similarity(right))

king king 1.0
king queen 0.49073455
king horse 0.33970487
king cat 0.3899064
king lamp 0.39127484
queen king 0.49073455
queen queen 1.0
queen horse 0.46672326
queen cat 0.5033449
queen lamp 0.38795537
horse king 0.33970487
horse queen 0.46672326
horse horse 1.0
horse cat 0.3925663
horse lamp 0.415988
cat king 0.3899064
cat queen 0.5033449
cat horse 0.3925663
cat cat 1.0
cat lamp 0.3361321
lamp king 0.39127484
lamp queen 0.38795537
lamp horse 0.415988
lamp cat 0.3361321
lamp lamp 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


In [26]:
import spacy

# German pipeline, loaded through the "de" shortcut name.
nlp = spacy.load("de")

# German counterparts of the English word list used in the previous cell.
# NOTE(review): 'königen' is not a German word — 'Königin' (queen) was probably
# intended, and German nouns are normally capitalised, which may matter for the
# vector lookup. Changing the input changes the scores, so re-run the cell if
# this is corrected.
tokens = nlp(u'könig königen pferd katze lampe')

# Print the full pairwise similarity table, row by row (including each word
# against itself).
for left, right in [(a, b) for a in tokens for b in tokens]:
    print(left.text, right.text, left.similarity(right))

könig könig 1.0
könig königen 0.37712693
könig pferd 0.5495152
könig katze 0.508159
könig lampe 0.26994777
königen könig 0.37712693
königen königen 1.0
königen pferd 0.28978667
königen katze 0.6135397
königen lampe 0.37072814
pferd könig 0.5495152
pferd königen 0.28978667
pferd pferd 1.0
pferd katze 0.46782535
pferd lampe 0.3315421
katze könig 0.508159
katze königen 0.6135397
katze pferd 0.46782535
katze katze 1.0
katze lampe 0.29568455
lampe könig 0.26994777
lampe königen 0.37072814
lampe pferd 0.3315421
lampe katze 0.29568455
lampe lampe 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


In [4]:
# Parse three German sentences from the "Rotkäppchen" fairy tale into separate
# Doc objects, in order.
# NOTE(review): this relies on `nlp` being the German model at the time the cell
# runs — the execution counts in this notebook are not sequential, so verify
# with Restart & Run All.
doc1, doc2, doc3 = map(nlp, (
    u"Es war einmal ein kleines süßes Mädchen, das hatte jedermann lieb, der sie nur ansah, am allerliebsten aber ihre Großmutter, die wusste gar nicht, was sie alles dem Kinde geben sollte.",
    u"Einmal schenkte sie ihm ein Käppchen von rotem Samt, und weil ihm das so wohl stand, und es nichts anders mehr tragen wollte, hieß es nur das Rotkäppchen.",
    u"Eines Tages sprach seine Mutter zu ihm: Komm, Rotkäppchen, da hast du ein Stück Kuchen und eine Flasche Wein, bring das der Großmutter hinaus; sie ist krank und schwach und wird sich daran laben.",
))

In [6]:
# Number of components in the vector spaCy exposes for the whole of doc1.
len(doc1.vector)

96

# New Named Entity Recognition training

Based on [https://spacy.io/usage/training](https://spacy.io/usage/training), we can train the model with NER entries.

In [30]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# Training data: a list of (text, annotations) pairs. Every item of
# annotations["entities"] is a (start_char, end_char, label) triple whose
# character offsets are end-exclusive — here text[7:15] is "Nürnberg" and
# text[20:26] is "Kraków", both labelled "GPE".
TRAIN_DATA = [
    ("I like Nürnberg and Kraków.", {"entities": [(7, 15, "GPE"), (20, 26, "GPE")]}),
]


def _print_entities(nlp, examples):
    """Run ``nlp`` over every example text and print its entities and tokens."""
    for text, _ in examples:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def train_ner(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        model: Name or path of an existing spaCy model to load. When ``None``,
            a blank ``'en'`` pipeline is created and its weights are
            initialised before training.
        output_dir: Optional directory the trained pipeline is written to. When
            given, the saved pipeline is loaded back and evaluated again as a
            round-trip check.
        n_iter: Number of passes over ``TRAIN_DATA``.

    The function returns nothing; it prints the per-iteration losses and the
    entities/tokens predicted for every training text.

    NOTE(review): ``create_pipe`` / ``update(texts, annotations)`` look like the
    spaCy 2.x training API — confirm the installed spaCy version before porting.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Reuse the pipeline's entity recognizer when it has one, otherwise create
    # the built-in component and append it to the pipeline.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the training data. Examples without
    # an "entities" key are tolerated instead of raising a TypeError.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Shuffle a private copy: mutating the module-level TRAIN_DATA in place
    # would leak state into every other notebook cell that reads it.
    train_examples = list(TRAIN_DATA)

    # Disable all other pipes so that only the NER component is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        # Reset and initialise the weights only when training a new model;
        # an existing model keeps its weights.
        if model is None:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(train_examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    _print_entities(nlp, TRAIN_DATA)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: also works for nested, not-yet-existing directories
        # and does not fail when the directory is already there.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _print_entities(nlp2, TRAIN_DATA)

            
# Continue training the existing 'en' model and save the result to disk.
# NOTE(review): the stored output of this cell reports "Loaded model 'de'", so
# it was last executed with different arguments — re-run to refresh it. The
# absolute path is machine-specific and is read again by a later cell.
train_ner(model='en', output_dir='/home/codete/workshop/')

Loaded model 'de'
Losses {'ner': 9.517211854457855}
Losses {'ner': 9.851449131965637}
Losses {'ner': 5.231690675318276}
Losses {'ner': 9.805707931518555}
Losses {'ner': 6.0083559161710625}
Losses {'ner': 4.325909856152975}
Losses {'ner': 6.56257762936616}
Losses {'ner': 5.230663496709894}
Losses {'ner': 6.867360518317582}
Losses {'ner': 8.012482523918152}
Losses {'ner': 6.320870776597961}
Losses {'ner': 6.256802976131439}
Losses {'ner': 5.434958373902415}
Losses {'ner': 6.349413275718689}
Losses {'ner': 4.2376793867958575}
Losses {'ner': 7.630850732326508}
Losses {'ner': 4.1439585315024825}
Losses {'ner': 2.1816334187130115}
Losses {'ner': 8.86284452676773}
Losses {'ner': 3.8126182841625385}
Losses {'ner': 6.962542176246643}
Losses {'ner': 4.545411241294438}
Losses {'ner': 6.70014488697052}
Losses {'ner': 8.526988863945007}
Losses {'ner': 6.621102124452591}
Losses {'ner': 6.189238959038657}
Losses {'ner': 4.659776524218387}
Losses {'ner': 4.672071951588805}
Losses {'ner': 7.96494091032

In [21]:
# Sample text run through both models.
# NOTE(review): there is no space between "German people." and "The", and
# "The training it in" probably should read "is in". Both are kept as-is
# because the recorded output was produced from exactly this string.
text = (
    "Barack Obama is the president of the United States. "
    "Paris is a nice city. German people."
    "The training it in Nürnberg."
)

# 1) The pipeline saved by the training cell.
nlp2 = spacy.load('/home/codete/workshop')
doc = nlp2(text)
print("Entities", [(span.text, span.label_) for span in doc.ents])
print("Tokens", [(tok.text, tok.ent_type_) for tok in doc])

# 2) The stock English model, for comparison. Note that this rebinds the
#    notebook-wide names `nlp` and `doc2`.
nlp = spacy.load("en")
doc2 = nlp(text)
print("Entities", [(span.text, span.label_) for span in doc2.ents])
print("Tokens", [(tok.text, tok.ent_type_) for tok in doc2])


Entities [('Barack Obama', 'PERSON'), ('the United States', 'GPE'), ('Paris', 'GPE'), ('German', 'GPE'), ('Nürnberg', 'GPE')]
Tokens [('Barack', 'PERSON', 3), ('Obama', 'PERSON', 1), ('is', '', 2), ('the', '', 2), ('president', '', 2), ('of', '', 2), ('the', 'GPE', 3), ('United', 'GPE', 1), ('States', 'GPE', 1), ('.', '', 2), ('Paris', 'GPE', 3), ('is', '', 2), ('a', '', 2), ('nice', '', 2), ('city', '', 2), ('.', '', 2), ('German', 'GPE', 3), ('people', '', 2), ('.', '', 2), ('The', '', 2), ('training', '', 2), ('it', '', 2), ('in', '', 2), ('Nürnberg', 'GPE', 3), ('.', '', 2)]
Entities [('Barack Obama', 'PERSON'), ('the United States', 'GPE'), ('Paris', 'GPE'), ('German', 'NORP'), ('Nürnberg', 'GPE')]
Tokens [('Barack', 'PERSON', 3), ('Obama', 'PERSON', 1), ('is', '', 2), ('the', '', 2), ('president', '', 2), ('of', '', 2), ('the', 'GPE', 3), ('United', 'GPE', 1), ('States', 'GPE', 1), ('.', '', 2), ('Paris', 'GPE', 3), ('is', '', 2), ('a', '', 2), ('nice', '', 2), ('city', '', 2), (

In [29]:
import spacy

# Stock English pipeline, loaded through the "en" shortcut name.
nlp = spacy.load("en")

# Baseline check: the bare city name on its own, without sentence context.
doc = nlp(u'Nürnberg')

print("Entities", [(span.text, span.label_) for span in doc.ents])
print("Tokens", [(tok.text, tok.ent_type_) for tok in doc])


Entities []
Tokens [('Nürnberg', '', 2)]


## Exercise 2

Please add the German cities to the NER component of the spaCy model.

# Text summarization

We can easily summarize text with Gensim and similar tools. Gensim's summarizer is based on the TextRank algorithm.

In [61]:
# NOTE(review): as far as I know `gensim.summarization` is not shipped with
# Gensim 4.x — confirm the installed version (an older gensim may be required).
from gensim.summarization import summarizer as gensim_summarizer

# Read the article inside a context manager so the file handle is closed
# as soon as the text is in memory.
with open("./datasets/brexit.txt", "r", encoding="utf-8") as file:
    article = file.read()

# Two summaries of the same article: one sized relative to the input
# (ratio=0.15), one capped by an absolute word budget (word_count=20).
gensim_summary1 = gensim_summarizer.summarize(article, ratio=0.15)
gensim_summary2 = gensim_summarizer.summarize(article, word_count=20)

In [62]:
# Display the summary that was produced with ratio=0.15.
gensim_summary1

"Boris Johnson has launched the Conservative Party's election campaign, saying his Brexit deal delivers everything I campaigned for.\nAnd Conservative Party chairman James Cleverly said: We need to break the Brexit deadlock and get on with delivering on voters priorities - something the last Parliament proved incapable of doing."

In [63]:
# Display the summary that was produced with word_count=20.
gensim_summary2

"Boris Johnson has launched the Conservative Party's election campaign, saying his Brexit deal delivers everything I campaigned for."