### This notebook presents the application of three diferent NER algorithms: StanfordNER, Spacy and flair. They are evaluated by ability of recognize persons and locations of text sample. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

In [2]:
import spacy

import nltk
from nltk.tag.stanford import StanfordNERTagger as NERTagger

from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single

import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage

# Data

The first chapter of harry potter and the philosopher's stone is used to evaluate the performance of different NER algorithms

In [4]:
with open('data/harry.txt', encoding='UTF-8') as f:
    ttext = f.read()

# Comparison

In [5]:
person_outputs = {}

## spaCy

In [6]:
lib_key = 'spacy'
person_outputs[lib_key] = []

In [7]:
nlp = spacy.load('en')

for doc in nlp.pipe([ttext], n_threads=1, batch_size=1):
    display(doc.ents)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            person_outputs['spacy'].append(ent)

(Dursley,
 number four,
 Privet Drive,
 Dursley,
 Grunnings,
 Dursley,
 Dursleys,
 Dudley,
 Dursleys,
 Potters,
 Dursley,
 several years,
 Dursley,
 Potters,
 Potters,
 Dursley,
 Tuesday,
 Dursley,
 Dursley,
 half past eight,
 Dursley,
 Dursley,
 Dudley,
 Dudley,
 Dursley,
 number four,
 first,
 second,
 Dursley,
 Privet,
 Dursley,
 Dursley,
 Privet Drive,
 Dursley,
 that day,
 Dursley,
 Dursley,
 Dursley,
 a few minutes later,
 Dursley,
 Grunnings,
 Dursley,
 ninth,
 that morning,
 nighttime,
 Dursley,
 five,
 baker,
 The Potters,
 Harry,
 Dursley,
 Harry,
 Harry,
 Harvey,
 Harold,
 Dursley,
 that afternoon,
 five o'clock,
 a few seconds,
 Dursley,
 today,
 last!Even Muggles,
 Dursley,
 Dursley,
 Muggle,
 four,
 Shoo,
 Dursley,
 Dursley,
 Dursley,
 normal day,
 Next Door,
 Dudley,
 Dursley,
 Dudley,
 today,
 hunt at night,
 daylight,
 hundreds,
 Jim McGuffin,
 tonight,
 Jim,
 Ted,
 today,
 Kent,
 Yorkshire,
 Dundee,
 yesterday,
 Bonfire Night,
 next week,
 tonight,
 Dursley,
 Britain,

In [8]:
np.unique(np.array(sp_person_outputs[lib_key]))

array([Dursley, Dursley, Dursley, Dursleys, Dudley, Dursleys, Dursley,
       Dursley, Dursley, Dursley, Dursley, Dursley, Dursley, Dudley,
       Dudley, Dursley, Dursley, Dursley, Dursley, Privet Drive, Dursley,
       Dursley, Dursley, Dursley, Dursley, Dursley, Dursley, baker, Harry,
       Dursley, Harry, Harry, Harold, Dursley, Dursley, last!Even Muggles,
       Dursley, Dursley, Shoo, Dursley, Dursley, Dursley, Next Door,
       Dudley, Dursley, Dudley, Jim McGuffin, Jim, Ted, Kent, Yorkshire,
       Bonfire Night, Dursley, Dursley, Dursley, Dursley, Dursley,
       Dursley, Dursley, Dudley, Dursley, Harry, Dursley, Dursley,
       Dursley, Dursley, Dursley, Dursley, Dursley, Privet,
       Albus Dumbledore, Albus Dumbledore, Dursley, Dumbledore, Item,
       McGonagall, Item, quien, Item, McGonagall, McGonagall, Item,
       Flocks, Kent, Dumbledore, McGonagall, Item, Dumbledore, Muggle,
       McGonagall, Este, McGonagall, quien, McGonagall, Dumbledore,
       Pomfrey, McGonag

In [9]:
spacy_locations = []
for doc in nlp.pipe([ttext]):
    for ent in doc.ents:
        if ent.label_ == 'LOCATION':
            spacy_locations.append(ent)

In [10]:
spacy_locations

[]

## NLTK

In [11]:
st = NERTagger('../../../StanfordNER/classifiers/english.all.3class.distsim.crf.ser.gz', '../../../StanfordNER/stanford-ner.jar')

In [12]:
lib_key = 'nltk'
st_person_outputs[lib_key] = []
errors = []

text = ''.join(char for char in ttext if ord(char) < 128)
for sent in nltk.sent_tokenize(text):
    tokens = nltk.tokenize.word_tokenize(sent)
    try:
        tags = st.tag(tokens)
    except Exception:
        print(*tokens)
        print()
        errors.append(tokens)
    else:
        for tag in tags:
            if tag[1] == 'PERSON':
                person_outputs[lib_key].append(tag)

In [24]:
person_outputs['nltk']

[('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dudley', 'PERSON'),
 ('Potter', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursleys', 'PERSON'),
 ('Dudley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dudley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dudley', 'PERSON'),
 ('Dudley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Harry', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Potter', 'PERSON'),
 ('Potter', 'PERSON'),
 ('Harry', 'PERSON'),
 ('Harry', 'PERSON'),
 ('Harvey', 'PERSON'),
 ('Harold', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Dursley', 'PERSON'),
 ('Durs

In [14]:
nltk_locations = []
for sent in nltk.sent_tokenize(text):
    tokens = nltk.tokenize.word_tokenize(sent)
    tags = st.tag(tokens)
    for tag in tags:
        if tag[1] == 'LOCATION':
            nltk_locations.append(tag)

In [15]:
nltk_locations

[('Privet', 'LOCATION'),
 ('Drive', 'LOCATION'),
 ('Privet', 'LOCATION'),
 ('Drive', 'LOCATION'),
 ('Kent', 'LOCATION'),
 ('Yorkshire', 'LOCATION'),
 ('Britain', 'LOCATION'),
 ('Privet', 'LOCATION'),
 ('Drive', 'LOCATION'),
 ('Privet', 'LOCATION'),
 ('Drive', 'LOCATION'),
 ('Privet', 'LOCATION'),
 ('Drive', 'LOCATION'),
 ('London', 'LOCATION'),
 ('Underground', 'LOCATION'),
 ('Privet', 'LOCATION'),
 ('Drive', 'LOCATION'),
 ('Privet', 'LOCATION'),
 ('Drive', 'LOCATION'),
 ('Este', 'LOCATION')]

## Flair

In [16]:
tagger = SequenceTagger.load('ner')

2020-01-05 12:26:21,407 loading file C:\Users\aland\.flair\models\en-ner-conll03-v0.4.pt


In [17]:
lib_key = 'flair'
person_outputs[lib_key] = []
sentences = [sent for sent in split_single(ttext)]
for sent in sentences:
    sent = Sentence(sent)
    tagger.predict(sent)
    for entity in sent.get_spans('ner'):
        if entity.tag == 'PER':
            person_outputs[lib_key].append(entity)


2020-01-05 12:29:22,193 ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?
2020-01-05 12:29:22,194 Ignore 1 sentence(s) with no tokens.


In [18]:
person_outputs[lib_key]

[<PER-span (3,4): "Mrs. Dursley,">,
 <PER-span (2): "Dursley">,
 <PER-span (2): "Dursley">,
 <PER-span (2): "Dursleys">,
 <PER-span (8): "Dudley">,
 <PER-span (1): "Potter">,
 <PER-span (4): "Dursley's">,
 <PER-span (4): "Dursley">,
 <PER-span (1): "Dudley">,
 <PER-span (5): "Dursley">,
 <PER-span (2): "Dursley">,
 <PER-span (4): "Dursley">,
 <PER-span (1): "Dudley">,
 <PER-span (6): "Dursley">,
 <PER-span (1): "Dursley">,
 <PER-span (9): "Dudley">,
 <PER-span (2): "Dudley">,
 <PER-span (5): "Dursley">,
 <PER-span (5): "Dursley">,
 <PER-span (2): "Dursley">,
 <PER-span (3): "Dursley">,
 <PER-span (2): "Dursley">,
 <PER-span (2): "Dursley">,
 <PER-span (2): "Dursley">,
 <PER-span (6): "Dursley">,
 <PER-span (1): "Dursley">,
 <PER-span (2): "Dursley">,
 <PER-span (2): "Dursley,">,
 <PER-span (1): "He'd">,
 <PER-span (12): "Harry"">,
 <PER-span (2): "Dursley">,
 <PER-span (1): "Potter">,
 <PER-span (5): "Potter">,
 <PER-span (11): "Harry.">,
 <PER-span (11): "Harry.">,
 <PER-span (1): "He

In [19]:
flair_locations = []
for sent in sentences:
    sent = Sentence(sent)
    tagger.predict(sent)
    for entity in sent.get_spans('ner'):
        if entity.tag == 'LOC':
            flair_locations.append(entity)


2020-01-05 12:32:08,176 ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?
2020-01-05 12:32:08,177 Ignore 1 sentence(s) with no tokens.


In [20]:
flair_locations

[<LOC-span (11): "Privet">,
 <LOC-span (5): "Grunnings">,
 <LOC-span (2): "Harold.">,
 <LOC-span (9): "Jim?"">,
 <LOC-span (5): "Britain?">,
 <LOC-span (5,6): "Privet Drive">,
 <LOC-span (10,11): "Privet Drive.">,
 <LOC-span (5): "Dursleys'">,
 <LOC-span (5): "Kent">,
 <LOC-span (5): "SW">,
 <LOC-span (4,5): "Godric's Hollow.">,
 <LOC-span (1): "Este?">,
 <LOC-span (4): "Bristol">,
 <LOC-span (12): "London">,
 <LOC-span (11): "Dursleys'">,
 <LOC-span (8,9): "Privet Drive,">,
 <LOC-span (8): "Este">]