# Named Entity Recognition

# DHARMATEJ VAYINENI (0937480)

# spacy 1

# Importing libraries 

## Import SpaCy in English

In [1]:
# One-time setup: uncomment the next line to download the large English pipeline.
#!python -m spacy download en_core_web_lg 
import spacy
# Load the large English pipeline; the cells below call it through `nlp`.
nlp = spacy.load("en_core_web_lg")

In [8]:
text = "My best friend chris thinks like an extrovert."
doc = nlp(text)
# Emit each token immediately followed by its " | " separator, all on one line.
for token in doc:
    print(f"{token} | ", end="")

My | best | friend | chris | thinks | like | an | extrovert | . | 

## What are the attributes that SpaCy adds?

In [9]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate a data frame for visualization of spaCy tokens.

    Args:
        doc: Iterable of tokens (e.g. a spaCy ``Doc``). Each token must expose
            ``text``, ``lemma_``, ``is_stop``, ``is_alpha``, ``pos_``,
            ``dep_``, ``ent_type_``, ``ent_iob_`` and ``is_punct``.
        include_punct: When True, punctuation tokens are kept; by default
            they are left out.

    Returns:
        A DataFrame with one row per kept token, indexed by the token's
        position in ``doc`` (the index is unnamed). An empty frame with the
        same columns is returned when no token is kept.
    """
    columns = ['token', 'text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    rows = [
        {'token': i, 'text': t.text, 'lemma_': t.lemma_,
         'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
         'pos_': t.pos_, 'dep_': t.dep_,
         'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
        for i, t in enumerate(doc)
        if include_punct or not t.is_punct
    ]

    # Naming the columns explicitly keeps set_index('token') from raising
    # KeyError when the row list is empty (empty doc or punctuation only).
    df = pd.DataFrame(rows, columns=columns).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,chris,chris,False,True,PROPN,appos,PERSON,B
4,thinks,think,False,True,VERB,ROOT,,O
5,like,like,False,True,ADP,prep,,O
6,an,an,True,True,DET,det,,O
7,extrovert,extrovert,False,True,NOUN,pobj,,O


## Removing Stop words using Spacy

In [10]:
text = "Dear venkat, we need to get a new playstation. Regards, tej"
doc = nlp(text)

# A token is kept only when it is neither a stop word nor punctuation.
non_stop = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]
print(non_stop)

[Dear, venkat, need, new, playstation, Regards, tej]


## Find all nouns using Spacy

In [11]:
text = "My best friend chris thinks like an extrovert."
doc = nlp(text)

# Common nouns and proper nouns both count as nouns here.
nouns = [tok for tok in doc if tok.pos_ == 'NOUN' or tok.pos_ == 'PROPN']
print(nouns)

[friend, chris, extrovert]


## Named Entity Recognition

In [12]:
# Run the pipeline on a lower-cased name to see whether it is still tagged as an entity.
text = "My best friend chris thinks like an extrovert."
doc = nlp(text)

# Print every detected entity as "(text, label)" on a single line.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(chris, PERSON) 

In [13]:
# A sentence that mentions a person, an organisation and a place.
text = "Emily Johnson, president of Global Enterprises, calls Los Angeles home." 
doc = nlp(text)

# Print every detected entity as "(text, label)" on a single line.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Emily Johnson, PERSON) (Global Enterprises, ORG) (Los Angeles, GPE) 

## Visualize named entities

In [14]:
from spacy import displacy

# Highlight the entities of the most recently parsed `doc` inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

## Trying it on a real dataset

In [15]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The page text with the contents of ``<script>``, ``<style>`` and
        ``<aside>`` elements removed, and every run of newlines/tabs
        replaced by a single space.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of running NER over an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    # Drop elements whose text is not part of the readable page content.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Fetch the page text, run the spaCy pipeline over it, and show how many entities were found.
ny_bb = url_to_string('https://tax.thomsonreuters.com/en/tax-accounting')
article = nlp(ny_bb)
len(article.ents)

94

## Look at the named entities

In [16]:
# Highlight every entity found in the scraped article.
displacy.render(article, style='ent', jupyter=True)

## Most popular NER types

In [17]:
from collections import Counter

# Pull the label off each entity, then tally how often each label occurs.
labels = list(map(lambda ent: ent.label_, article.ents))
Counter(labels)

Counter({'ORG': 72,
         'PRODUCT': 5,
         'PERSON': 4,
         'LAW': 3,
         'CARDINAL': 3,
         'FAC': 3,
         'WORK_OF_ART': 2,
         'GPE': 1,
         'DATE': 1})

In [18]:
# Count entities by their surface text and show the five most frequent ones.
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('Insights                              Book', 4),
 ('AI', 3),
 ('Checkpoint Edge', 3),
 ('one', 2),
 ('T.C. Burgin', 2)]

## Let’s pick one sentence to analyze

In [20]:
# `article` was already parsed when the page was fetched; reuse it rather
# than running the whole pipeline over the page a second time.
sentences = list(article.sents)
print(sentences[5])

Simplify project management, increase profits, and improve client satisfaction.                 


## NER tags

In [21]:
# Re-parse one sentence on its own and highlight its entities.
# NOTE(review): this renders sentences[1], while the cell above prints
# sentences[5] and the cells below analyse sentences[2] — confirm which
# sentence is intended.
displacy.render(nlp(str(sentences[1])), jupyter=True, style='ent')



## Types of words in the sentence

In [22]:
# One pass over the re-parsed sentence: keep tokens that are neither stop
# words nor tagged PUNCT, and show (surface form, POS tag, lemma) for each.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[2]))
 if not tok.is_stop and tok.pos_ != 'PUNCT']

[('Practical', 'PROPN', 'Practical'),
 ('Law', 'PROPN', 'Law'),
 ('                     ', 'SPACE', '                     '),
 ('Fast', 'PROPN', 'Fast'),
 ('track', 'NOUN', 'track'),
 ('case', 'NOUN', 'case'),
 ('onboarding', 'VERB', 'onboarde'),
 ('practice', 'VERB', 'practice'),
 ('confidence', 'NOUN', 'confidence')]

## Sentence dependency tree

In [24]:
# Draw the dependency parse of the re-parsed sentence; 'distance' sets the spacing between words.
displacy.render(nlp(str(sentences[2])), style='dep', jupyter = True, options = {'distance': 150})

# spacy 2 (German)

In [1]:
import spacy

# Load the German language model. It is downloaded only when it is not
# installed yet, so re-running the notebook does not repeat the ~568 MB fetch.
try:
    nlp = spacy.load("de_core_news_lg")
except OSError:
    from spacy.cli import download
    download("de_core_news_lg")
    nlp = spacy.load("de_core_news_lg")

Collecting de-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.7.0/de_core_news_lg-3.7.0-py3-none-any.whl (567.8 MB)
     ---------------------------------------- 0.0/567.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/567.8 MB ? eta -:--:--
     -------------------------------------- 0.0/567.8 MB 217.9 kB/s eta 0:43:27
     -------------------------------------- 0.0/567.8 MB 281.8 kB/s eta 0:33:35
     -------------------------------------- 0.1/567.8 MB 459.5 kB/s eta 0:20:36
     ---------------------------------------- 0.5/567.8 MB 2.1 MB/s eta 0:04:30
     ---------------------------------------- 2.1/567.8 MB 7.9 MB/s eta 0:01:12
     --------------------------------------- 3.5/567.8 MB 11.1 MB/s eta 0:00:52
     --------------------------------------- 4.9/567.8 MB 14.2 MB/s eta 0:00:40
     --------------------------------------- 6.8/567.8 MB 16.7 MB/s eta 0:00:34
      ----------------------------

In [2]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The page text with the contents of ``<script>``, ``<style>`` and
        ``<aside>`` elements removed, and every run of newlines/tabs
        replaced by a single space.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of running NER over an error page.
    res.raise_for_status()
    # The built-in 'html.parser' backend needs no extra parser package.
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose text is not part of the readable page content.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Fetch the sueddeutsche.de article text, parse it with the German pipeline, and count the entities.
ny_bb = url_to_string('https://www.sueddeutsche.de/sport/kroos-dfb-frankreich-nagelsmann-deutschland-1.6484717?reduced=true')
article = nlp(ny_bb)
len(article.ents)

90

## Have a Look at the Named Entities

In [3]:
# Visualize named entities in the article using displacy
from spacy import displacy
displacy.render(article, style='ent', jupyter=True)

## Popular NER Types

In [4]:
# Tally how often each entity label occurs in the article
from collections import Counter
labels = list(map(lambda ent: ent.label_, article.ents))
Counter(labels)

Counter({'ORG': 44, 'MISC': 25, 'PER': 20, 'LOC': 1})

## Most Popular Named Entities

In [5]:
# Collect the surface text of every entity and show the five most frequent ones
items = list(ent.text for ent in article.ents)
Counter(items).most_common(5)

[('eBay', 5),
 ('Toni Kroos', 4),
 ('Frankreich', 3),
 ('deutschen', 2),
 ('FARFETCH', 2)]

## Let’s Pick One Sentence to Analyze

In [6]:
# Collect the article's sentences into a list and print the first one
sentences = list(article.sents)
print(sentences[0])

Deutschland und das 2:0 gegen Frankreich: Toni Kroos, der Große  - Sport - SZ.de


## NER Tags

In [7]:
# Visualize named entities in the first sentence of the article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

## Types of Words in Sentence

In [8]:
# For the first sentence, list (surface form, POS tag, lemma) of every token
# that is neither a stop word nor tagged PUNCT
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[0]))
 if not tok.is_stop and tok.pos_ != 'PUNCT']

[('Deutschland', 'PROPN', 'Deutschland'),
 ('2:0', 'NOUN', '2:0'),
 ('Frankreich', 'PROPN', 'Frankreich'),
 ('Toni', 'PROPN', 'Toni'),
 ('Kroos', 'PROPN', 'Kroos'),
 (' ', 'SPACE', ' '),
 ('Sport', 'NOUN', 'Sport'),
 ('SZ.de', 'PROPN', 'SZ.de')]

## Sentence Dependency Tree

In [9]:
# Visualize dependency parse of the first sentence of the article with custom distance between words
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})

# spacy 3

## Import SpaCy in English

In [10]:
# One-time setup: uncomment the next line to download the large English pipeline.
#!python -m spacy download en_core_web_lg 
import spacy
# Switch `nlp` back to the English pipeline; the previous section bound it to the German model.
nlp = spacy.load("en_core_web_lg")

In [11]:
text = "My best friend gomez thinks like an introvert."
doc = nlp(text)
# Emit each token immediately followed by its " | " separator, all on one line.
for token in doc:
    print(f"{token} | ", end="")

My | best | friend | gomez | thinks | like | an | introvert | . | 

## What are the attributes that SpaCy adds?

In [12]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate a data frame for visualization of spaCy tokens.

    Args:
        doc: Iterable of tokens (e.g. a spaCy ``Doc``). Each token must expose
            ``text``, ``lemma_``, ``is_stop``, ``is_alpha``, ``pos_``,
            ``dep_``, ``ent_type_``, ``ent_iob_`` and ``is_punct``.
        include_punct: When True, punctuation tokens are kept; by default
            they are left out.

    Returns:
        A DataFrame with one row per kept token, indexed by the token's
        position in ``doc`` (the index is unnamed). An empty frame with the
        same columns is returned when no token is kept.
    """
    columns = ['token', 'text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    rows = []
    for i, t in enumerate(doc):
        if t.is_punct and not include_punct:
            continue
        rows.append({'token': i, 'text': t.text, 'lemma_': t.lemma_,
                     'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                     'pos_': t.pos_, 'dep_': t.dep_,
                     'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_})

    # Naming the columns explicitly keeps set_index('token') from raising
    # KeyError when the row list is empty (empty doc or punctuation only).
    df = pd.DataFrame(rows, columns=columns).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,gomez,gomez,False,True,PROPN,appos,PERSON,B
4,thinks,think,False,True,VERB,ROOT,,O
5,like,like,False,True,ADP,prep,,O
6,an,an,True,True,DET,det,,O
7,introvert,introvert,False,True,NOUN,pobj,,O


## Removing Stop words using Spacy

In [13]:
text = "Dear venkat, we need to get a new playstation. Regards, tej"
doc = nlp(text)

# Drop anything that is a stop word or punctuation; keep the rest.
non_stop = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]
print(non_stop)

[Dear, venkat, need, new, playstation, Regards, tej]


## Find all nouns using Spacy

In [14]:
text = "My best friend gomez thinks like an introvert."
doc = nlp(text)

# Common nouns and proper nouns both count as nouns here.
nouns = [tok for tok in doc if tok.pos_ == 'NOUN' or tok.pos_ == 'PROPN']
print(nouns)

[friend, gomez, introvert]


## Named Entity Recognition

In [15]:
# Run the pipeline on a lower-cased name to see whether it is still tagged as an entity.
text = "My best friend gomez thinks like an introvert."
doc = nlp(text)

# Print every detected entity as "(text, label)" on a single line.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(gomez, PERSON) 

In [16]:
# A sentence that mentions a person, an organisation and a place.
text = "Sarah Patel, CEO of Tech Innovations Ltd, resides in Silicon Valley." 
doc = nlp(text)

# Print every detected entity as "(text, label)" on a single line.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Sarah Patel, PERSON) (Tech Innovations Ltd, ORG) (Silicon Valley, LOC) 

## Visualize named entities

In [17]:
from spacy import displacy

# Highlight the entities of the most recently parsed `doc` inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [18]:
!pip install html5lib



## Trying it on a real dataset

In [19]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The page text with the contents of ``<script>``, ``<style>`` and
        ``<aside>`` elements removed, and every run of newlines/tabs
        replaced by a single space.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of running NER over an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    # Drop elements whose text is not part of the readable page content.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Fetch the page text, run the spaCy pipeline over it, and show how many entities were found.
ny_bb = url_to_string('https://www.thomsonreuters.com/en.html')
article = nlp(ny_bb)
len(article.ents)

140

## Look at the named entities

In [20]:
# Highlight every entity found in the scraped article.
displacy.render(article, style='ent', jupyter=True)

## Most popular NER types

In [21]:
from collections import Counter

# Pull the label off each entity, then tally how often each label occurs.
labels = list(map(lambda ent: ent.label_, article.ents))
Counter(labels)

Counter({'ORG': 91,
         'DATE': 28,
         'PRODUCT': 6,
         'GPE': 5,
         'LAW': 4,
         'CARDINAL': 3,
         'PERSON': 1,
         'WORK_OF_ART': 1,
         'FAC': 1})

In [22]:
# Count entities by their surface text and show the five most frequent ones.
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('today', 10),
 ('Thomson Reuters', 8),
 ('tomorrow', 6),
 ('AI', 5),
 ('Insights                              Book', 4)]

## Let’s pick one sentence to analyze

In [23]:
# `article` was already parsed when the page was fetched; reuse it rather
# than running the whole pipeline over the page a second time.
sentences = list(article.sents)
print(sentences[2])

Practical Law                      Fast track case onboarding and practice with confidence.


## NER tags

In [24]:
# Highlight the entities of the sentence printed above (index 2), the same
# sentence the part-of-speech and dependency cells below analyse.
displacy.render(nlp(str(sentences[2])), jupyter=True, style='ent')



## Types of words in the sentence

In [25]:
# One pass over the re-parsed sentence: keep tokens that are neither stop
# words nor tagged PUNCT, and show (surface form, POS tag, lemma) for each.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[2]))
 if not tok.is_stop and tok.pos_ != 'PUNCT']

[('Practical', 'PROPN', 'Practical'),
 ('Law', 'PROPN', 'Law'),
 ('                     ', 'SPACE', '                     '),
 ('Fast', 'PROPN', 'Fast'),
 ('track', 'NOUN', 'track'),
 ('case', 'NOUN', 'case'),
 ('onboarding', 'VERB', 'onboarde'),
 ('practice', 'VERB', 'practice'),
 ('confidence', 'NOUN', 'confidence')]

## Sentence dependency tree

In [26]:
# Draw the dependency parse of the re-parsed sentence; 'distance' sets the spacing between words.
displacy.render(nlp(str(sentences[2])), style='dep', jupyter = True, options = {'distance': 100})