In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm pipeline once; the cells below reuse this `nlp` object.
nlp = en_core_web_sm.load()

In [2]:
# Run the pipeline over one example sentence; `doc` is inspected token by
# token further down.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
)


TypeError: unhashable type: 'dict'

In [29]:
# BILUO tagging scheme
# Begin - first token of a multi-token entity
# In    - inner token of a multi-token entity
# Last  - final token of a multi-token entity
# Unit  - single-token entity
# Out   - non-entity token

In [31]:
# pprint is not imported by the first cell, so import it here; without this
# the cell fails with NameError when the notebook is run on a fresh kernel.
from pprint import pprint

# One (token, IOB code, entity type) triple for every token in `doc`.
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

#"B" means the token begins an entity, 
#"I" means it is inside an entity, 
#"O" means it is outside an entity, and 
#"" means no entity tag is set.

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [32]:
# Extract named entities from an article

from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text as one whitespace-joined string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Defaults to 10 seconds so that a
        stalled connection cannot block the notebook indefinitely.

    Returns
    -------
    str
        Text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by a
        single space.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status (4xx/5xx).
    requests.RequestException
        For connection problems or when the timeout expires.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of extracting entities from
    # an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose text should not end up in the extracted string.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the article text, run the pipeline over it, and report how many
# entity spans were found.
ny_bb = url_to_string(
    'https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news'
)
article = nlp(ny_bb)
len(article.ents)

157

In [34]:
# Tally how many entities of each label the article contains.
labels = [entity.label_ for entity in article.ents]
Counter(labels)


#PERSON	People, including fictional.
#NORP	Nationalities or religious or political groups.
#FAC	Buildings, airports, highways, bridges, etc.
#ORG	Companies, agencies, institutions, etc.
#GPE	Countries, cities, states.
#LOC	Non-GPE locations, mountain ranges, bodies of water.
#PRODUCT	Objects, vehicles, foods, etc. (Not services.)
#EVENT	Named hurricanes, battles, wars, sports events, etc.
#WORK_OF_ART	Titles of books, songs, etc.
#LAW	Named documents made into laws.
#LANGUAGE	Any named language.
#DATE	Absolute or relative dates or periods.
#TIME	Times smaller than a day.
#PERCENT	Percentage, including "%".
#MONEY	Monetary values, including unit.
#QUANTITY	Measurements, as of weight or distance.
#ORDINAL	“first”, “second”, etc.
#CARDINAL	Numerals that do not fall under another type.

Counter({'PERSON': 78,
         'GPE': 14,
         'ORG': 37,
         'CARDINAL': 3,
         'DATE': 22,
         'NORP': 2,
         'ORDINAL': 1})

In [35]:
# The three entity strings that occur most often in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Strzok', 28), ('F.B.I.', 13), ('Trump', 12)]

In [37]:
# Collect the article's sentences into a list so that individual ones can be
# picked by index (sentence 19 is used in the cells that follow).
sentences = list(article.sents)
# print(sentences[19])

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president


In [38]:
# Highlight the named entities of the whole article.
# The parsed `article` is rendered directly: str(sentences) is the repr of a
# Python list, so it would add "[", "]" and ", " separators to the text and
# run the pipeline over the article a second time.
displacy.render(article, style='ent', jupyter=True)

In [39]:
# Dependency-parse visualisation of sentence 19, re-parsed from its own text.
displacy.render(
    nlp(str(sentences[19])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [40]:
# (surface form, part-of-speech tag, lemma) for each token of sentence 19
# that is neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[19]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('Firing', 'VERB', 'fire'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('removes', 'VERB', 'remove'),
 ('favorite', 'ADJ', 'favorite'),
 ('target', 'NOUN', 'target'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Trump', 'PROPN', 'Trump'),
 ('ranks', 'NOUN', 'rank'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('gives', 'VERB', 'give'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Bowdich', 'PROPN', 'Bowdich'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('director', 'NOUN', 'director'),
 ('Christopher', 'PROPN', 'Christopher'),
 ('A.', 'PROPN', 'A.'),
 ('Wray', 'PROPN', 'Wray'),
 ('chance', 'NOUN', 'chance'),
 ('president', 'NOUN', 'president')]

In [43]:
# Highlight the named entities of the whole article.
# Render the parsed `article` itself rather than nlp(str(sentences)): the
# string form of a Python list carries "[", "]" and ", " separators into the
# text and forces a second pass of the pipeline.
displacy.render(article, style='ent', jupyter=True)