original at="https://github.com/mohammedterry/NLP_for_ML/blob/master/NER.ipynb"

In [1]:
# Sample sentence used as the shared input for every NER tool compared in this notebook.
example_document = '''Baidu's Apollo Project is one of the world's leading autonomous driving and AI programs, with one of the largest partner ecosystems and over 100 global partners as of 2018, including BYD, Dongfeng, Microsoft, Intel, Nvidia, Daimler AG, ZTE, Grab, Ford, Hyundai and Honda.'''

In [2]:
import json

import requests

# Hosted ANNIE named-entity recognizer on GATE Cloud; the document is sent
# as the raw plain-text request body.
url = "https://cloud-api.gate.ac.uk/process-document/annie-named-entity-recognizer"
headers = {'Content-Type': 'text/plain'}

# The timeout stops the cell from hanging forever if the service is
# unreachable, and raise_for_status() reports HTTP errors directly instead
# of letting them surface later as a confusing JSON decoding failure.
gate_reply = requests.post(url, data=example_document, headers=headers, timeout=30)
gate_reply.raise_for_status()

# `response` holds the decoded JSON payload (a dict with "text" and "entities").
response = gate_reply.json()

print(json.dumps(response, indent=2))

{
  "text": "Baidu's Apollo Project is one of the world's leading autonomous driving and AI programs, with one of the largest partner ecosystems and over 100 global partners as of 2018, including BYD, Dongfeng, Microsoft, Intel, Nvidia, Daimler AG, ZTE, Grab, Ford, Hyundai and Honda.",
  "entities": {
    "Date": [
      {
        "indices": [
          167,
          171
        ],
        "kind": "date",
        "rule": "TempYear2",
        "ruleFinal": "YearOnlyFinal"
      }
    ],
    "Organization": [
      {
        "indices": [
          8,
          22
        ],
        "orgType": "unknown",
        "rule": "OrgXBase",
        "ruleFinal": "OrgFinal"
      },
      {
        "indices": [
          198,
          207
        ],
        "orgType": "company",
        "rule": "GazOrganization",
        "ruleFinal": "OrgFinal"
      },
      {
        "indices": [
          209,
          214
        ],
        "orgType": "company",
        "rule": "GazOrganization",
        "rule

# NLTK

In [5]:
import nltk
# Fetch the NLTK data packages used by the tokenize -> POS-tag -> NE-chunk
# pipeline in nltk_ner. Re-running is harmless: the log reports
# "already up-to-date" for packages that are already installed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
# Last expression of the cell: its return value (True) is what gets displayed.
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [6]:
def nltk_ner(document):
    """Extract named entities from ``document`` with NLTK's default chunker.

    The text is tokenized, POS-tagged and passed to ``nltk.ne_chunk``.
    Every labelled node in the result becomes one entry of the returned set.

    Args:
        document: The raw text to analyse.

    Returns:
        A set of ``(entity_text, label)`` tuples, where ``entity_text`` is the
        node's words joined by single spaces and ``label`` is the node's label.
    """
    tokens = nltk.word_tokenize(document)
    tagged_tokens = nltk.pos_tag(tokens)

    entities = set()
    for node in nltk.ne_chunk(tagged_tokens):
        # Only entity nodes expose label(); other items are skipped.
        if not hasattr(node, 'label'):
            continue
        words = [leaf[0] for leaf in node]
        entities.add((' '.join(words), node.label()))
    return entities

In [7]:
# Run the NLTK pipeline on the sample sentence. In the recorded output several
# companies get questionable labels (e.g. 'Microsoft' -> PERSON, 'Honda' -> GPE).
nltk_ner(example_document)

{('AI', 'ORGANIZATION'),
 ('Apollo Project', 'PERSON'),
 ('BYD', 'ORGANIZATION'),
 ('Baidu', 'GPE'),
 ('Daimler AG', 'PERSON'),
 ('Dongfeng', 'PERSON'),
 ('Ford', 'ORGANIZATION'),
 ('Grab', 'PERSON'),
 ('Honda', 'GPE'),
 ('Hyundai', 'PERSON'),
 ('Intel', 'ORGANIZATION'),
 ('Microsoft', 'PERSON'),
 ('Nvidia', 'GPE'),
 ('ZTE', 'ORGANIZATION')}

# spaCy

In [9]:
# One-time setup: uncomment the next line to download the en_core_web_lg model.
#!python3 -m spacy download en_core_web_lg
import spacy
# Loaded once at notebook level and reused by spacy_large_ner.
sp_lg = spacy.load('en_core_web_lg') 

In [10]:
def spacy_large_ner(document):
    """Extract named entities from ``document`` with the loaded ``sp_lg`` pipeline.

    Args:
        document: The raw text to analyse.

    Returns:
        A set of ``(entity_text, label)`` tuples, one per distinct entity span
        found by the pipeline. Surrounding whitespace is stripped from the
        entity text.
    """
    parsed = sp_lg(document)

    found = set()
    for span in parsed.ents:
        found.add((span.text.strip(), span.label_))
    return found

In [11]:
# Run spaCy on the same sentence. The recorded output also contains non-entity
# numerics ('one', 'over 100' as CARDINAL) and labels 'Dongfeng' as GPE.
spacy_large_ner(example_document)

{('2018', 'DATE'),
 ('Apollo Project', 'ORG'),
 ('BYD', 'ORG'),
 ('Baidu', 'ORG'),
 ('Daimler AG', 'ORG'),
 ('Dongfeng', 'GPE'),
 ('Ford', 'ORG'),
 ('Honda', 'ORG'),
 ('Hyundai', 'ORG'),
 ('Intel', 'ORG'),
 ('Microsoft', 'ORG'),
 ('Nvidia', 'ORG'),
 ('ZTE', 'ORG'),
 ('one', 'CARDINAL'),
 ('over 100', 'CARDINAL')}

# Polyglot

In [14]:
!pip3 install -U git+https://github.com/aboSamoor/polyglot.git@master
!polyglot download embeddings2.en ner2.en
from polyglot.text import Text

Collecting git+https://github.com/aboSamoor/polyglot.git@master
  Cloning https://github.com/aboSamoor/polyglot.git (to revision master) to c:\users\gaurav\appdata\local\temp\pip-req-build-moik5t39
  Resolved https://github.com/aboSamoor/polyglot.git to commit 9b93b2ecbb9ba1f638c56b92665336e93230646a
Collecting PyICU>=1.8
  Using cached PyICU-2.7.4.tar.gz (298 kB)
  Using cached PyICU-2.7.3.tar.gz (295 kB)
  Using cached PyICU-2.7.2.tar.gz (293 kB)
  Using cached PyICU-2.7.1.tar.gz (189 kB)
  Using cached PyICU-2.7.tar.gz (189 kB)
  Using cached PyICU-2.6.tar.gz (233 kB)
  Using cached PyICU-2.5.tar.gz (225 kB)
  Using cached PyICU-2.4.3.tar.gz (219 kB)
  Using cached PyICU-2.4.2.tar.gz (219 kB)
  Using cached PyICU-2.4.1.tar.gz (219 kB)
  Using cached PyICU-2.4.tar.gz (219 kB)
  Using cached PyICU-2.3.1.tar.gz (214 kB)
  Using cached PyICU-2.3.tar.gz (214 kB)
  Using cached PyICU-2.2.tar.gz (211 kB)

  Running command git clone -q https://github.com/aboSamoor/polyglot.git 'C:\Users\Gaurav\AppData\Local\Temp\pip-req-build-moik5t39'
    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Gaurav\anaconda3\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Gaurav\\AppData\\Local\\Temp\\pip-install-3lhz9jy0\\pyicu_1a06f99755544fdaabf7b4f22c4fbe4e\\setup.py'"'"'; __file__='"'"'C:\\Users\\Gaurav\\AppData\\Local\\Temp\\pip-install-3lhz9jy0\\pyicu_1a06f99755544fdaabf7b4f22c4fbe4e\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Gaurav\AppData\Local\Temp\pip-pip-egg-info-tj_uvk2s'
         cwd: C:\Users\Gaurav\AppData\Local\Temp\pip-install-3lhz9jy0\pyicu_1a06f99755544fdaabf7b4f22c4fbe4


  Using cached PyICU-2.1.tar.gz (203 kB)
  Using cached PyICU-2.0.6.tar.gz (203 kB)
  Using cached PyICU-2.0.5.tar.gz (203 kB)
  Using cached PyICU-2.0.4.tar.gz (202 kB)
  Using cached PyICU-2.0.3.tar.gz (201 kB)
  Using cached PyICU-2.0.2.tar.gz (194 kB)
  Using cached PyICU-2.0.1.tar.gz (194 kB)
  Using cached PyICU-2.0.tar.gz (194 kB)
  Using cached PyICU-1.9.8.tar.gz (183 kB)
  Using cached PyICU-1.9.7.tar.gz (183 kB)
  Using cached PyICU-1.9.6.tar.gz (183 kB)
  Using cached PyICU-1.9.5.tar.gz (181 kB)
  Using cached PyICU-1.9.4.tar.gz (181 kB)
Collecting futures>=2.1.6
  Using cached futures-3.1.1-py3-none-any.whl (2.8 kB)
Collecting pycld2>=0.3
  Using cached pycld2-0.41.tar.gz (41.4 MB)
Collecting morfessor>=2.0.2a1
  Using cached Morfessor-2.0.6-py3-none-any.whl (35 kB)
Building wheels for collected packages: polyglot, pycld2, PyICU
  Building wheel for polyglot (setup.py): started
  Building wheel for polyglot (setup.py): finished with status 'done'
  Created wheel for polygl


      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 1311, in _execute_child
        hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
    FileNotFoundError: [WinError 2] The system cannot find the file specified
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "C:\Users\Gaurav\AppData\Local\Temp\pip-install-3lhz9jy0\pyicu_1a06f99755544fdaabf7b4f22c4fbe4e\setup.py", line 71, in <module>
        raise RuntimeError('''
    RuntimeError:
    Please install pkg-config on your system or set the ICU_VERSION environment
    variable to the version of ICU you have installed.
    
    (running 'icu-config --version')
    (running 'pkg-config --modversion icu-i18n')
    ----------------------------------------
    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Gaurav\anaconda3\python.exe' -c 'import io, os, sys, setuptools, toke

ModuleNotFoundError: No module named 'polyglot'


    Traceback (most recent call last):
      File "C:\Users\Gaurav\AppData\Local\Temp\pip-install-3lhz9jy0\pyicu_5187011848c446cfa69868a2f5cfde61\setup.py", line 68, in <module>
        ICU_VERSION = check_output(('pkg-config', '--modversion', 'icu-i18n')).strip()
      File "C:\Users\Gaurav\AppData\Local\Temp\pip-install-3lhz9jy0\pyicu_5187011848c446cfa69868a2f5cfde61\setup.py", line 18, in check_output
        return subprocess_check_output(popenargs)
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 415, in check_output
        return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 493, in run
        with Popen(*popenargs, **kwargs) as process:
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 858, in __init__
        self._execute_child(args, executable, preexec_fn, close_fds,
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 1311, in _execute_child
        hp, ht, pid,

        ICU_VERSION = check_output(('icu-config', '--version')).strip()
      File "C:\Users\Gaurav\AppData\Local\Temp\pip-install-3lhz9jy0\pyicu_a005e22fe2294ba4be0cafb3a7fa143d\setup.py", line 18, in check_output
        return subprocess_check_output(popenargs)
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 415, in check_output
        return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 493, in run
        with Popen(*popenargs, **kwargs) as process:
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 858, in __init__
        self._execute_child(args, executable, preexec_fn, close_fds,
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 1311, in _execute_child
        hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
    FileNotFoundError: [WinError 2] The system cannot find the file specified
    
    During handling of the above exception, another exception oc

In [None]:
def polyglot_ner(document):
    """Extract named entities from ``document`` with polyglot's ``Text``.

    Args:
        document: The raw text to analyse.

    Returns:
        A set of ``(entity_text, label)`` tuples, where ``entity_text`` is the
        entity's items joined by single spaces and ``label`` is the portion of
        the entity's ``tag`` after its last hyphen.
    """
    pairs = set()
    for chunk in Text(document).entities:
        phrase = ' '.join(chunk)
        # Keep only the text after the last '-' in the tag.
        label = chunk.tag.split('-')[-1]
        pairs.add((phrase, label))
    return pairs

In [None]:
# Needs the polyglot import above to have succeeded; in the recorded run the
# install failed, so this cell was never executed (In [None]).
polyglot_ner(example_document)


    FileNotFoundError: [WinError 2] The system cannot find the file specified
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "C:\Users\Gaurav\AppData\Local\Temp\pip-install-3lhz9jy0\pyicu_dbe0615b4bb7441b9a5b188ed8948e12\setup.py", line 50, in <module>
        raise RuntimeError('''
    RuntimeError:
    Please set the ICU_VERSION environment variable to the version of
    ICU you have installed.
    
    ----------------------------------------
    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Gaurav\anaconda3\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Gaurav\\AppData\\Local\\Temp\\pip-install-3lhz9jy0\\pyicu_296c3ddfe05c4a7194e923b93aa7e3fe\\setup.py'"'"'; __file__='"'"'C:\\Users\\Gaurav\\AppData\\Local\\Temp\\pip-install-3lhz9jy0\\pyicu_296c3ddfe05c4a7194e923b93aa7e3fe\\setup.py'"'"';f =

        ICU_VERSION = os.environ['ICU_VERSION']
      File "C:\Users\Gaurav\anaconda3\lib\os.py", line 675, in __getitem__
        raise KeyError(key) from None
    KeyError: 'ICU_VERSION'
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "C:\Users\Gaurav\AppData\Local\Temp\pip-install-3lhz9jy0\pyicu_12bec7bf370446cb85fded6cd6400db4\setup.py", line 46, in <module>
        ICU_VERSION = check_output(('icu-config', '--version')).strip()
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 415, in check_output
        return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 493, in run
        with Popen(*popenargs, **kwargs) as process:
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 858, in __init__
        self._execute_child(args, executable, preexec_fn, close_fds,
      File "C:\Users\Gaurav\anaconda3\lib\sub

        ICU_VERSION = os.environ['ICU_VERSION']
      File "C:\Users\Gaurav\anaconda3\lib\os.py", line 675, in __getitem__
        raise KeyError(key) from None
    KeyError: 'ICU_VERSION'
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "C:\Users\Gaurav\AppData\Local\Temp\pip-install-3lhz9jy0\pyicu_85b0ad5250884a0a92a0bdb911c8ebaa\setup.py", line 26, in <module>
        ICU_VERSION = check_output(('icu-config', '--version')).strip()
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 415, in check_output
        return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 493, in run
        with Popen(*popenargs, **kwargs) as process:
      File "C:\Users\Gaurav\anaconda3\lib\subprocess.py", line 858, in __init__
        self._execute_child(args, executable, preexec_fn, close_fds,
      File "C:\Users\Gaurav\anaconda3\lib\sub