## Named Entity Recognition (NER) with spaCy

In [6]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English spaCy pipeline; `nlp` is the callable used to process text in later cells.
nlp = en_core_web_sm.load()

In [7]:
from pprint import pprint

# Sample news sentence; the NER examples in this notebook are run on this text.
news = """Afghanistan's famous pop star, Aryana Sayeed, who escaped from the Taliban after the takeover of Kabul, has blamed 
Pakistan for empowering the terrorist outfit and expressed her utmost gratitude to India for helping Afghans during the 
ongoing crisis."""
# Run the spaCy pipeline on the sample text, then list every detected entity
# span together with its predicted label.
doc = nlp(news)
pprint([(ent.text, ent.label_) for ent in doc.ents])

# Label glossary for the output:
#   NORP - nationalities or religious or political groups
#   GPE  - geopolitical entity

[('Afghanistan', 'GPE'),
 ('Aryana Sayeed', 'PERSON'),
 ('Taliban', 'ORG'),
 ('Kabul', 'GPE'),
 ('Pakistan', 'GPE'),
 ('India', 'GPE'),
 ('Afghans', 'NORP')]


IOB tagging scheme: the entity tag of each token (B = beginning of an entity, I = inside an entity, O = outside any entity)

In [18]:
# One row per token: the token itself, its IOB tag, and its entity type
# (the type is an empty string for tokens outside any entity).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(Afghanistan, 'B', 'GPE'),
 ('s, 'O', ''),
 (famous, 'O', ''),
 (pop, 'O', ''),
 (star, 'O', ''),
 (,, 'O', ''),
 (Aryana, 'B', 'PERSON'),
 (Sayeed, 'I', 'PERSON'),
 (,, 'O', ''),
 (who, 'O', ''),
 (escaped, 'O', ''),
 (from, 'O', ''),
 (the, 'O', ''),
 (Taliban, 'B', 'ORG'),
 (after, 'O', ''),
 (the, 'O', ''),
 (takeover, 'O', ''),
 (of, 'O', ''),
 (Kabul, 'B', 'GPE'),
 (,, 'O', ''),
 (has, 'O', ''),
 (blamed, 'O', ''),
 (
, 'O', ''),
 (Pakistan, 'B', 'GPE'),
 (for, 'O', ''),
 (empowering, 'O', ''),
 (the, 'O', ''),
 (terrorist, 'O', ''),
 (outfit, 'O', ''),
 (and, 'O', ''),
 (expressed, 'O', ''),
 (her, 'O', ''),
 (utmost, 'O', ''),
 (gratitude, 'O', ''),
 (to, 'O', ''),
 (India, 'B', 'GPE'),
 (for, 'O', ''),
 (helping, 'O', ''),
 (Afghans, 'B', 'NORP'),
 (during, 'O', ''),
 (the, 'O', ''),
 (
, 'O', ''),
 (ongoing, 'O', ''),
 (crisis, 'O', ''),
 (., 'O', '')]


In [9]:
from bs4 import BeautifulSoup
import requests
import re

In [10]:
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one line.

    The HTML is parsed with BeautifulSoup's ``html5lib`` parser, every
    ``<script>``, ``<style>`` and ``<aside>`` element is removed, and each run
    of newline/tab characters in the remaining text is replaced by a single
    space (ordinary spaces are left as they are).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        str: The page text with script/style/aside content stripped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status (4xx/5xx).
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently treating an error page as
    # the article text.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not part of the readable article body.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [15]:
# Download the NDTV article and keep its text with script/style/aside content removed.
# NOTE(review): the name `ny_bb` does not describe the NDTV text it holds — consider renaming.
ny_bb = url_to_string('https://www.ndtv.com/india-news/afghan-pop-star-aryana-sayeed-blames-pak-for-empowering-taliban-terms-india-true-friend-2517171#pfrom=home-ndtv_topstories')
# Run the spaCy pipeline over the whole article text.
article = nlp(ny_bb)
# Number of entity spans spaCy detected in the article.
len(article.ents)

79

In [16]:
# Render the article inside the notebook using displaCy's entity ('ent') visualisation.
displacy.render(article, jupyter=True, style='ent')

## Stanford NER Tagger

In [25]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
import nltk
# Fetch the 'punkt' tokenizer data package; nltk.word_tokenize is called on the sample text later.
nltk.download('punkt')

# Local paths to the Stanford NER jar and the English 3-class model file.
# Raw strings keep every backslash literal: the model path previously contained
# an unescaped "\D", which is an invalid escape sequence that only worked by
# accident and triggers a warning on newer Python versions.
# NOTE(review): these are machine-specific absolute paths — adjust for your environment.
PATH_TO_JAR = r'C:\Users\dbhad\Downloads\stanford-ner.jar'
PATH_TO_MODEL = r'C:\Users\dbhad\Downloads\english.all.3class.distsim.crf.ser.gz'

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dbhad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [26]:
# Build the Stanford NER tagger from the model file and jar configured above.
tagger = StanfordNERTagger(model_filename=PATH_TO_MODEL,path_to_jar=PATH_TO_JAR, encoding='utf-8')

In [28]:
# Split the sample text into word tokens with NLTK, then tag each token with the Stanford tagger.
words = nltk.word_tokenize(news) 
tagged = tagger.tag(words)

In [29]:
# Display the (token, tag) pairs returned by the Stanford tagger.
tagged

[('Afghanistan', 'LOCATION'),
 ("'s", 'O'),
 ('famous', 'O'),
 ('pop', 'O'),
 ('star', 'O'),
 (',', 'O'),
 ('Aryana', 'PERSON'),
 ('Sayeed', 'PERSON'),
 (',', 'O'),
 ('who', 'O'),
 ('escaped', 'O'),
 ('from', 'O'),
 ('the', 'O'),
 ('Taliban', 'ORGANIZATION'),
 ('after', 'O'),
 ('the', 'O'),
 ('takeover', 'O'),
 ('of', 'O'),
 ('Kabul', 'LOCATION'),
 (',', 'O'),
 ('has', 'O'),
 ('blamed', 'O'),
 ('Pakistan', 'LOCATION'),
 ('for', 'O'),
 ('empowering', 'O'),
 ('the', 'O'),
 ('terrorist', 'O'),
 ('outfit', 'O'),
 ('and', 'O'),
 ('expressed', 'O'),
 ('her', 'O'),
 ('utmost', 'O'),
 ('gratitude', 'O'),
 ('to', 'O'),
 ('India', 'LOCATION'),
 ('for', 'O'),
 ('helping', 'O'),
 ('Afghans', 'O'),
 ('during', 'O'),
 ('the', 'O'),
 ('ongoing', 'O'),
 ('crisis', 'O'),
 ('.', 'O')]