<a href="https://colab.research.google.com/github/elvinaqa/NER-Python/blob/main/Spacy_and_Displacy_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import spacy

In [27]:
from spacy import displacy
from collections import Counter

In [28]:
import en_core_web_sm
# Load the installed `en_core_web_sm` spaCy pipeline once; every later cell
# reuses this `nlp` object to parse text.
nlp = en_core_web_sm.load()

In [29]:
# Parse a short toy sentence that mixes a country, a person, a location and a
# date, so the entity output below is easy to read.
doc = nlp("I want to go to Russia to meet Barack Obama in Mars on Wednesday")

In [30]:
# Show each named entity span found in the toy sentence with its label.
print([(ent.text, ent.label_) for ent in doc.ents])

[('Russia', 'GPE'), ('Barack Obama', 'PERSON'), ('Mars', 'LOC'), ('Wednesday', 'DATE')]


1.   B means the token begins an entity (e.g. "Barack")

2.   I means the token is inside an entity (e.g. "Obama")
3.   O means the token is outside any entity
4.   An empty string ('') in the entity-type column means the token has no entity label



In [31]:
# Token-level view: every token with its IOB code and entity type.
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(I, 'O', ''), (want, 'O', ''), (to, 'O', ''), (go, 'O', ''), (to, 'O', ''), (Russia, 'B', 'GPE'), (to, 'O', ''), (meet, 'O', ''), (Barack, 'B', 'PERSON'), (Obama, 'I', 'PERSON'), (in, 'O', ''), (Mars, 'B', 'LOC'), (on, 'O', ''), (Wednesday, 'B', 'DATE')]


In [33]:
from bs4 import BeautifulSoup
import requests
import re

In [34]:
def scrape(url, timeout=30):
  """Download a web page and return its text content as one string.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout `requests.get` can block indefinitely.

  Returns:
    The text of the page with <script>, <style> and <aside> elements removed
    and every run of newlines/tabs replaced by a single space.

  Raises:
    requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException: on connection errors or timeout.
  """
  res = requests.get(url, timeout=timeout)
  # Fail loudly on an error response instead of silently parsing an error
  # page as if it were the article.
  res.raise_for_status()
  soup = BeautifulSoup(res.text, 'html5lib')

  # Drop elements whose text is not part of the readable page content.
  for tag in soup(["script", "style", "aside"]):
    tag.decompose()
  return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [35]:
# Fetch the New York Times story that the rest of the notebook analyses.
article = scrape(
  "https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news"
)

In [36]:
# Inspect the raw scraped string; note that site navigation text is captured
# ahead of the story itself.
article

'     F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times                                                                                                             SectionsSEARCHSkip to contentSkip to site indexPoliticsToday’s PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Firedhttps://nyti.ms/2OtNre3AdvertisementContinue reading the main storySupported byContinue reading the main storyF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredPeter Strzok, a top F.B.I. counterintelligence agent who was taken off the special counsel investigation after his disparaging texts about President Trump were uncovered, was fired.Credit...T.J. Kirkpatrick for The New York TimesBy Adam Goldman and Michael S. SchmidtAug. 13, 2018WASHINGTON — Peter Strzok, the F.B.I. senior counterintelligence agent who disparaged President Trump in inflammatory text messages and helped oversee the Hillary Clinton email and Russia investigati

In [37]:
# Parse the scraped text with the spaCy pipeline. `article` is rebound from the
# raw string to the parsed document, so only parse while it is still a string:
# re-running this cell then leaves the parsed document untouched instead of
# feeding it back into `nlp`.
if isinstance(article, str):
  article = nlp(article)

There are 153 entities

In [40]:
# Number of named-entity spans found in the parsed article.
len(article.ents)

153

In [41]:
# Collect the label of every entity in the article, one entry per entity.
labels = [ent.label_ for ent in article.ents]

## Entities

In [50]:
# Tally how many entities carry each label.
Counter(labels)

Counter({'CARDINAL': 3,
         'DATE': 23,
         'GPE': 9,
         'LOC': 1,
         'NORP': 2,
         'ORDINAL': 1,
         'ORG': 37,
         'PERSON': 77})

In [51]:
# The five most frequent entity labels, most frequent first.
Counter(labels).most_common(5)

[('PERSON', 77), ('ORG', 37), ('DATE', 23), ('GPE', 9), ('CARDINAL', 3)]

In [53]:
# Collect the surface text of every entity in the article.
items = [ent.text for ent in article.ents]

## Most common entities

In [54]:
# The five most frequent entity strings. Counting is by exact text, so
# different spellings of the same entity are tallied separately.
Counter(items).most_common(5)

[('Strzok', 29), ('F.B.I.', 19), ('Trump', 13), ('Russia', 6), ('Clinton', 5)]

## Sentences

In [58]:
# Split the parsed article into sentence strings and preview the first five.
texts = [sentence.text for sentence in article.sents]
texts[:5]

['     F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times                                                                                                             ',
 'SectionsSEARCHSkip to contentSkip to site indexPoliticsToday’s PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Firedhttps://nyti.ms/2OtNre3AdvertisementContinue reading the main storySupported byContinue reading the main storyF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredPeter Strzok, a top F.B.I. counterintelligence agent who was taken off the special counsel investigation after his disparaging texts about President Trump were uncovered, was fired.',
 'Credit...',
 'T.J. Kirkpatrick for The New York TimesBy Adam Goldman and Michael S. SchmidtAug.',
 '13, 2018WASHINGTON']

In [64]:
# Highlight the named entities in one sentence of the article (index 5).
displacy.render(
  nlp(str(texts[5])),
  style='ent',
  jupyter=True,
)

In [67]:
# Draw the dependency parse of the same sentence; `distance` is passed through
# to displaCy as a rendering option.
displacy.render(
  nlp(str(texts[5])),
  style='dep',
  jupyter=True,
  options={'distance': 50},
)

In [69]:
# Text, part-of-speech tag and lemma of each token in the sentence, skipping
# stop words and punctuation.
[
  (tok.orth_, tok.pos_, tok.lemma_)
  for tok in nlp(str(texts[5]))
  if not tok.is_stop and tok.pos_ != 'PUNCT'
]

[('Peter', 'PROPN', 'Peter'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('senior', 'ADJ', 'senior'),
 ('counterintelligence', 'NOUN', 'counterintelligence'),
 ('agent', 'NOUN', 'agent'),
 ('disparaged', 'VERB', 'disparage'),
 ('President', 'PROPN', 'President'),
 ('Trump', 'PROPN', 'Trump'),
 ('inflammatory', 'ADJ', 'inflammatory'),
 ('text', 'NOUN', 'text'),
 ('messages', 'NOUN', 'message'),
 ('helped', 'VERB', 'help'),
 ('oversee', 'VERB', 'oversee'),
 ('Hillary', 'PROPN', 'Hillary'),
 ('Clinton', 'PROPN', 'Clinton'),
 ('email', 'NOUN', 'email'),
 ('Russia', 'PROPN', 'Russia'),
 ('investigations', 'NOUN', 'investigation'),
 ('fired', 'VERB', 'fire'),
 ('violating', 'VERB', 'violate'),
 ('bureau', 'NOUN', 'bureau'),
 ('policies', 'NOUN', 'policy'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('lawyer', 'NOUN', 'lawyer'),
 ('said', 'VERB', 'say'),
 ('Monday', 'PROPN', 'Monday')]

In [82]:
# Map each entity string in the sentence to its label (a repeated string keeps
# the label of its last occurrence).
{str(ent): ent.label_ for ent in nlp(str(texts[5])).ents}

{'F.B.I.': 'ORG',
 'Hillary Clinton': 'PERSON',
 'Monday': 'DATE',
 'Peter Strzok': 'PERSON',
 'Russia': 'GPE',
 'Strzok': 'PERSON',
 'Trump': 'PERSON'}

In [83]:
# Token-level IOB codes and entity types for the same sentence.
print([(token, token.ent_iob_, token.ent_type_) for token in nlp(texts[5])])

[(—, 'O', ''), (Peter, 'B', 'PERSON'), (Strzok, 'I', 'PERSON'), (,, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'ORG'), (senior, 'O', ''), (counterintelligence, 'O', ''), (agent, 'O', ''), (who, 'O', ''), (disparaged, 'O', ''), (President, 'O', ''), (Trump, 'B', 'PERSON'), (in, 'O', ''), (inflammatory, 'O', ''), (text, 'O', ''), (messages, 'O', ''), (and, 'O', ''), (helped, 'O', ''), (oversee, 'O', ''), (the, 'O', ''), (Hillary, 'B', 'PERSON'), (Clinton, 'I', 'PERSON'), (email, 'O', ''), (and, 'O', ''), (Russia, 'B', 'GPE'), (investigations, 'O', ''), (,, 'O', ''), (has, 'O', ''), (been, 'O', ''), (fired, 'O', ''), (for, 'O', ''), (violating, 'O', ''), (bureau, 'O', ''), (policies, 'O', ''), (,, 'O', ''), (Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (’s, 'O', ''), (lawyer, 'O', ''), (said, 'O', ''), (Monday, 'B', 'DATE'), (., 'O', '')]


In [85]:
# Render the whole parsed article with every named entity highlighted.
displacy.render(
  article,
  jupyter=True,
  style="ent",
)