In [15]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English spaCy pipeline; `nlp` is the shared pipeline object
# that every later cell calls to turn raw text into a parsed document.
nlp = en_core_web_sm.load()

In [17]:
# Parse a single example sentence; later cells inspect its per-token entity tags.
doc = nlp(
    'European authorities fined X a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market '
    'and ordered the company to alter its practices'
)


In [18]:
# BILUO tagging scheme
# Begin - first token of a multi-token entity
# In    - inner token of a multi-token entity
# Last  - final token of a multi-token entity
# Unit  - single-token entity
# Out   - non-entity token

In [19]:
# Print one (token, IOB tag, entity type) triple per token of `doc`.
# IOB tag legend:
#   "B" - the token begins an entity
#   "I" - the token is inside an entity
#   "O" - the token is outside any entity
#   ""  - no entity tag is set
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'), (authorities, 'O', ''), (fined, 'O', ''), (X, 'O', ''), (a, 'B', 'MONEY'), (record, 'I', 'MONEY'), ($, 'I', 'MONEY'), (5.1, 'I', 'MONEY'), (billion, 'I', 'MONEY'), (on, 'O', ''), (Wednesday, 'B', 'DATE'), (for, 'O', ''), (abusing, 'O', ''), (its, 'O', ''), (power, 'O', ''), (in, 'O', ''), (the, 'O', ''), (mobile, 'O', ''), (phone, 'O', ''), (market, 'O', ''), (and, 'O', ''), (ordered, 'O', ''), (the, 'O', ''), (company, 'O', ''), (to, 'O', ''), (alter, 'O', ''), (its, 'O', ''), (practices, 'O', '')]


In [20]:
# Extracting named entities from an article

from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its visible text as a single string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out, or the server answers with an HTTP error status (4xx/5xx).
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors rather than running NER on an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose text should not end up in the extracted string.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))




In [21]:
# Download the New York Times article, run it through the spaCy pipeline,
# and show how many named entities were detected.
ny_bb = url_to_string(
    'https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html'
    '?hp&action=click&pgtype=Homepage&clickSource=story-heading'
    '&module=first-column-region&region=top-news&WT.nav=top-news'
)
article = nlp(ny_bb)
len(article.ents)

158

In [22]:
# Display the raw text returned by url_to_string for the article.
# NOTE(review): this echoes the entire page text into the cell output.
ny_bb

'     F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times                                                                                                             SectionsSEARCHSkip to contentSkip to site indexPoliticsToday’s PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Firedhttps://nyti.ms/2OtNre3AdvertisementContinue reading the main storySupported byContinue reading the main storyF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredPeter Strzok, a top F.B.I. counterintelligence agent who was taken off the special counsel investigation after his disparaging texts about President Trump were uncovered, was fired.Credit...T.J. Kirkpatrick for The New York TimesBy Adam Goldman and Michael S. SchmidtAug. 13, 2018WASHINGTON — Peter Strzok, the F.B.I. senior counterintelligence agent who disparaged President Trump in inflammatory text messages and helped oversee the Hillary Clinton email and Russia investigati

In [23]:
# Tally how many entities of each label appear in the article.
labels = [entity.label_ for entity in article.ents]
Counter(labels)


#PERSON	People, including fictional.
#NORP	Nationalities or religious or political groups.
#FAC	Buildings, airports, highways, bridges, etc.
#ORG	Companies, agencies, institutions, etc.
#GPE	Countries, cities, states.
#LOC	Non-GPE locations, mountain ranges, bodies of water.
#PRODUCT	Objects, vehicles, foods, etc. (Not services.)
#EVENT	Named hurricanes, battles, wars, sports events, etc.
#WORK_OF_ART	Titles of books, songs, etc.
#LAW	Named documents made into laws.
#LANGUAGE	Any named language.
#DATE	Absolute or relative dates or periods.
#TIME	Times smaller than a day.
#PERCENT	Percentage, including "%".
#MONEY	Monetary values, including unit.
#QUANTITY	Measurements, as of weight or distance.
#ORDINAL	“first”, “second”, etc.
#CARDINAL	Numerals that do not fall under another type.

Counter({'PERSON': 75,
         'ORG': 39,
         'PRODUCT': 1,
         'CARDINAL': 5,
         'GPE': 11,
         'DATE': 23,
         'NORP': 3,
         'ORDINAL': 1})

In [24]:
# The three entity strings that occur most often in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Strzok', 28), ('F.B.I.', 18), ('Trump', 11)]

In [25]:
# Materialize the article's sentence spans into a list so they can be
# indexed by position in later cells.
sentences = list(article.sents)

In [26]:
# Display the list of sentence spans produced from `article.sents`.
# NOTE(review): this echoes every sentence of the article into the cell output.
sentences

[     F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times                                                                                                             ,
 SectionsSEARCHSkip,
 to contentSkip to site,
 indexPoliticsToday’s,
 PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Firedhttps://nyti.ms/2OtNre3AdvertisementContinue reading the main storySupported byContinue reading the main storyF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredPeter Strzok, a top F.B.I. counterintelligence agent who was taken off the special counsel investigation after his disparaging texts about President Trump were uncovered, was fired.,
 Credit...,
 T.J. Kirkpatrick for The New York TimesBy Adam Goldman and Michael S. SchmidtAug.,
 13, 2018WASHINGTON — Peter Strzok, the F.B.I. senior counterintelligence agent who disparaged President Trump in inflammatory text messages and helped oversee the Hillary Clinton email and R

In [27]:
# Highlight the named entities of the whole article. The already-parsed
# `article` Doc is rendered directly: str(sentences) is the repr of a Python
# list, so it would inject brackets and ", " separators into the text and
# force a second, redundant run of the pipeline.
displacy.render(article, jupyter=True, style='ent')

In [28]:
# For sentence 19 of the article, list (surface form, part of speech, lemma)
# for every token that is neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[19]))
    if not (token.is_stop or token.pos_ == 'PUNCT')
]

[('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('testified', 'VERB', 'testify'),
 ('House', 'PROPN', 'House'),
 ('July', 'PROPN', 'July'),
 ('allowed', 'VERB', 'allow'),
 ('political', 'ADJ', 'political'),
 ('views', 'NOUN', 'view'),
 ('interfere', 'VERB', 'interfere'),
 ('investigations', 'NOUN', 'investigation'),
 ('overseeing', 'VERB', 'oversee')]

In [29]:
# Render the article's named entities inline. The parsed `article` Doc is
# passed as-is; stringifying the `sentences` list would add list-repr
# brackets and commas to the text and re-run the pipeline unnecessarily.
displacy.render(article, jupyter=True, style='ent')