In [13]:
import spacy
from spacy import displacy
from collections import Counter

import en_core_web_sm
# Load the spaCy pipeline shipped in the installed en_core_web_sm package;
# later cells call `nlp(text)` to turn raw text into a parsed document.
nlp = en_core_web_sm.load()

In [12]:
from bs4 import BeautifulSoup
import requests
import re

In [9]:
def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up
            (default 10).

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>``
        elements removed, and every run of newlines/tabs replaced by a
        single space.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout, requests waits indefinitely and can hang the kernel.
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of feeding an error page to the NLP pipeline.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Detach non-content elements so their text is not part of the result.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [14]:
# Fetch the BBC article, run the spaCy pipeline over its text and report
# how many entities were detected.
article_url = 'https://www.bbc.com/news/world-asia-india-53182169'
bbc_news = url_to_string(article_url)
article = nlp(bbc_news)
len(article.ents)

144

In [15]:
# Tally how many detected entities fall under each entity label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 31,
         'FAC': 1,
         'GPE': 28,
         'LOC': 12,
         'CARDINAL': 14,
         'PERSON': 19,
         'PRODUCT': 3,
         'NORP': 8,
         'DATE': 20,
         'ORDINAL': 3,
         'MONEY': 2,
         'TIME': 2,
         'WORK_OF_ART': 1})

The article contains 144 entities, distributed across 13 distinct entity labels.

In [16]:
# Find the three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('India', 13), ('Fair', 6), ('Asia', 6)]

The three most frequent entities (entity texts, not individual tokens) are India, Fair, and Asia.

In [24]:
# Collect the article's sentences into a list so one can be picked by index.
sentences = list(article.sents)
print(sentences[70])

Chandana Hiran, who authored one of the petitions, told the BBC the Unilever announcement was "a path-breaking decision" but was only "a first step towards inclusivity".


In [25]:
# Re-parse the sample sentence on its own and highlight its entities inline.
sentence_doc = nlp(str(sentences[70]))
displacy.render(sentence_doc, style='ent', jupyter=True)

In [26]:
# Re-parse the sample sentence and draw its dependency tree inline.
dep_options = {'distance': 120}
displacy.render(
    nlp(str(sentences[70])),
    style='dep',
    jupyter=True,
    options=dep_options,
)

In [30]:
# For every token of the sample sentence that is neither a stop word nor
# punctuation, show its surface form, part-of-speech tag and lemma.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[70]))
    if not (token.is_stop or token.pos_ == 'PUNCT')
]

[('Chandana', 'PROPN', 'Chandana'),
 ('Hiran', 'PROPN', 'Hiran'),
 ('authored', 'VERB', 'author'),
 ('petitions', 'NOUN', 'petition'),
 ('told', 'VERB', 'tell'),
 ('BBC', 'PROPN', 'BBC'),
 ('Unilever', 'PROPN', 'Unilever'),
 ('announcement', 'NOUN', 'announcement'),
 ('path', 'NOUN', 'path'),
 ('breaking', 'VERB', 'break'),
 ('decision', 'NOUN', 'decision'),
 ('step', 'NOUN', 'step'),
 ('inclusivity', 'NOUN', 'inclusivity')]

In [31]:
# Map each entity found in the re-parsed sample sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[70])).ents}

{'Chandana Hiran': 'PERSON',
 'one': 'CARDINAL',
 'BBC': 'ORG',
 'Unilever': 'ORG',
 'first': 'ORDINAL'}

In [32]:
# List every token of the sample sentence (taken from the full-article parse)
# together with its IOB entity tag and entity type; tokens outside any entity
# carry the tag 'O' and an empty type.
token_tags = [(token, token.ent_iob_, token.ent_type_) for token in sentences[70]]
print(token_tags)

[(Chandana, 'B', 'PERSON'), (Hiran, 'I', 'PERSON'), (,, 'O', ''), (who, 'O', ''), (authored, 'O', ''), (one, 'B', 'CARDINAL'), (of, 'O', ''), (the, 'O', ''), (petitions, 'O', ''), (,, 'O', ''), (told, 'O', ''), (the, 'O', ''), (BBC, 'B', 'ORG'), (the, 'O', ''), (Unilever, 'B', 'ORG'), (announcement, 'O', ''), (was, 'O', ''), (", 'O', ''), (a, 'O', ''), (path, 'O', ''), (-, 'O', ''), (breaking, 'O', ''), (decision, 'O', ''), (", 'O', ''), (but, 'O', ''), (was, 'O', ''), (only, 'O', ''), (", 'O', ''), (a, 'O', ''), (first, 'B', 'ORDINAL'), (step, 'O', ''), (towards, 'O', ''), (inclusivity, 'O', ''), (", 'O', ''), (., 'O', '')]


In [33]:
# Highlight every detected entity across the whole article inline.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)