# Named Entity Recognition

# DHARMATEJ VAYINENI (0937480)

# Importing libraries 

In [6]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [7]:
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.1/587.7 MB 544.7 kB/s eta 0:17:59
     ---------------------------------------- 0.3/587.7 MB 1.7 MB/s eta 0:05:38
     ---------------------------------------- 1.6/587.7 MB 8.8 MB/s eta 0:01:07
     --------------------------------------- 3.8/587.7 MB 17.3 MB/s eta 0:00:34
     --------------------------------------- 5.7/587.7 MB 20.4 MB/s eta 0:00:29
     --------------------------------------- 5.7/587.7 MB 20.4 MB/s eta 0:00:29
      -------------------------------------- 8.5/587.7 MB 23.7 MB/s eta 0:00:25
      -------------------------------------- 9.5/587.7 MB 23.5 MB/s eta 0:00:25
      -------------------------------

## Import SpaCy in English

In [8]:
#!python -m spacy download en_core_web_lg 
import spacy
nlp = spacy.load("en_core_web_lg")


In [9]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end=" | ") 



My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

## What are the attributes that SpaCy adds?

In [10]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


## Removing Stop words using Spacy

In [11]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)


[Dear, Ryan, need, sit, talk, Regards, Pete]


## Find all nouns using Spacy

In [12]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)


[friend, Ryan, Peters, adventure, games]


## Named Entity Recognition

In [13]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(Ryan Peters, PERSON) 

In [14]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

## Visualize NERS

In [15]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)


In [16]:
!pip install html5lib




## trying it on a real dataset

In [17]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'lxml')  # Use 'lxml' parser instead of 'html5lib'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://ieeexplore.ieee.org/Xplore/home.jsp')
article = nlp(ny_bb)
len(article.ents)



4

## look at the NERS

In [22]:
displacy.render(article, style='ent', jupyter=True)


## Most popular NER types

In [23]:
from collections import Counter

labels = [x.label_ for x in article.ents]
Counter(labels)



Counter({'ORG': 2, 'CARDINAL': 1, 'PERSON': 1})

In [24]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('IEEE.org IEEE', 1),
 ('IEEE-SA IEEE Spectrum', 1),
 ('onlinesupport@ieee.org', 1),
 ('Â© Copyright  IEEE', 1)]

## Let’s pick one sentence to analyze

In [32]:
article = nlp(ny_bb)
sentences = [x for x in article.sents]
print(sentences[2])

We apologize for the inconvenience and appreciate your patience.


## NER tags

In [34]:
displacy.render(nlp(str(sentences[1])), jupyter=True, style='ent')


## Types of words in the sentence

In [35]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[2])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('apologize', 'VERB', 'apologize'),
 ('inconvenience', 'NOUN', 'inconvenience'),
 ('appreciate', 'VERB', 'appreciate'),
 ('patience', 'NOUN', 'patience')]

## Sentence dependency tree

In [36]:
displacy.render(nlp(str(sentences[2])), style='dep', jupyter = True, options = {'distance': 120})
