In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

In [3]:
# Example news paragraph used as NER input. Adjacent string literals are
# concatenated by Python, so this is a single string with one piece per sentence.
sentence = (
    'Bengaluru: Karnataka Chief Minister BS Yediyurappa said he tested positive for the novel coronavirus according to a tweet he posted on Sunday. '
    'The Chief Minister, 77, said he is fine and is being hospitalised as a precaution on the recommendation of doctors. '
    'I have tested positive for coronavirus. '
    'Whilst I am fine, I am being hospitalised as a precaution on the recommendation of doctors. '
    'I request those who have come in contact with me recently to be observant and exercise self-quarantine'
)

In [4]:
# NLTK pipeline, innermost call first: tokenize -> POS-tag -> named-entity chunk.
ne_tree = ne_chunk(
    pos_tag(
        word_tokenize(sentence)
    )
)

In [5]:
# Print the chunked tree in its bracketed text form; recognised entities show
# up as labelled groups such as (GPE ...) and (PERSON ...) in the output below.
print(ne_tree)

(S
  (GPE Bengaluru/NN)
  :/:
  (PERSON Karnataka/NNP)
  Chief/NNP
  Minister/NNP
  BS/NNP
  (PERSON Yediyurappa/NNP)
  said/VBD
  he/PRP
  tested/VBD
  positive/JJ
  for/IN
  the/DT
  novel/NN
  coronavirus/NN
  according/VBG
  to/TO
  a/DT
  tweet/NN
  he/PRP
  posted/VBD
  on/IN
  Sunday/NNP
  ./.
  The/DT
  Chief/JJ
  Minister/NNP
  ,/,
  77/CD
  ,/,
  said/VBD
  he/PRP
  is/VBZ
  fine/JJ
  and/CC
  is/VBZ
  being/VBG
  hospitalised/VBN
  as/IN
  a/DT
  precaution/NN
  on/IN
  the/DT
  recommendation/NN
  of/IN
  doctors/NNS
  ./.
  I/PRP
  have/VBP
  tested/VBN
  positive/JJ
  for/IN
  coronavirus/NN
  ./.
  Whilst/NNP
  I/PRP
  am/VBP
  fine/JJ
  ,/,
  I/PRP
  am/VBP
  being/VBG
  hospitalised/VBN
  as/IN
  a/DT
  precaution/NN
  on/IN
  the/DT
  recommendation/NN
  of/IN
  doctors/NNS
  ./.
  I/PRP
  request/VBP
  those/DT
  who/WP
  have/VBP
  come/VBN
  in/IN
  contact/NN
  with/IN
  me/PRP
  recently/RB
  to/TO
  be/VB
  observant/JJ
  and/CC
  exercise/JJ
  self-quarantine/N

In [6]:
# Same paragraph as the first NER example: reuse `sentence` instead of pasting
# the text a second time, so the two copies cannot drift apart.
ex = sentence

In [7]:

def preprocess(sent):
    """Tokenize a raw text string and POS-tag the resulting tokens.

    Args:
        sent: The text to process.

    Returns:
        The result of ``nltk.pos_tag`` applied to the output of
        ``nltk.word_tokenize(sent)``; in this notebook it is displayed as a
        list of ``(token, tag)`` tuples.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [8]:
# Tokenize and POS-tag the example paragraph. The bare `sent` on the last line
# makes the notebook display the resulting list of (token, tag) pairs.
sent = preprocess(ex)
sent

[('Bengaluru', 'NN'),
 (':', ':'),
 ('Karnataka', 'NNP'),
 ('Chief', 'NNP'),
 ('Minister', 'NNP'),
 ('BS', 'NNP'),
 ('Yediyurappa', 'NNP'),
 ('said', 'VBD'),
 ('he', 'PRP'),
 ('tested', 'VBD'),
 ('positive', 'JJ'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('novel', 'NN'),
 ('coronavirus', 'NN'),
 ('according', 'VBG'),
 ('to', 'TO'),
 ('a', 'DT'),
 ('tweet', 'NN'),
 ('he', 'PRP'),
 ('posted', 'VBD'),
 ('on', 'IN'),
 ('Sunday', 'NNP'),
 ('.', '.'),
 ('The', 'DT'),
 ('Chief', 'JJ'),
 ('Minister', 'NNP'),
 (',', ','),
 ('77', 'CD'),
 (',', ','),
 ('said', 'VBD'),
 ('he', 'PRP'),
 ('is', 'VBZ'),
 ('fine', 'JJ'),
 ('and', 'CC'),
 ('is', 'VBZ'),
 ('being', 'VBG'),
 ('hospitalised', 'VBN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('precaution', 'NN'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('recommendation', 'NN'),
 ('of', 'IN'),
 ('doctors', 'NNS'),
 ('.', '.'),
 ('I', 'PRP'),
 ('have', 'VBP'),
 ('tested', 'VBN'),
 ('positive', 'JJ'),
 ('for', 'IN'),
 ('coronavirus', 'NN'),
 ('.', '.'),
 ('Whilst', 'NNP'),
 ('I', 'PR

In [9]:
# Chunk grammar with a single rule named NP: an optional determiner (<DT>),
# then any number of adjectives (<JJ>), then one <NN> noun. Other noun tags
# (e.g. NNS, NNP) are not part of this rule, so such tokens stay unchunked.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [10]:
# Build a regexp chunk parser from `pattern`, run it over the POS-tagged
# tokens in `sent`, and print the resulting chunk tree in bracketed text form.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  (NP Bengaluru/NN)
  :/:
  Karnataka/NNP
  Chief/NNP
  Minister/NNP
  BS/NNP
  Yediyurappa/NNP
  said/VBD
  he/PRP
  tested/VBD
  positive/JJ
  for/IN
  (NP the/DT novel/NN)
  (NP coronavirus/NN)
  according/VBG
  to/TO
  (NP a/DT tweet/NN)
  he/PRP
  posted/VBD
  on/IN
  Sunday/NNP
  ./.
  The/DT
  Chief/JJ
  Minister/NNP
  ,/,
  77/CD
  ,/,
  said/VBD
  he/PRP
  is/VBZ
  fine/JJ
  and/CC
  is/VBZ
  being/VBG
  hospitalised/VBN
  as/IN
  (NP a/DT precaution/NN)
  on/IN
  (NP the/DT recommendation/NN)
  of/IN
  doctors/NNS
  ./.
  I/PRP
  have/VBP
  tested/VBN
  positive/JJ
  for/IN
  (NP coronavirus/NN)
  ./.
  Whilst/NNP
  I/PRP
  am/VBP
  fine/JJ
  ,/,
  I/PRP
  am/VBP
  being/VBG
  hospitalised/VBN
  as/IN
  (NP a/DT precaution/NN)
  on/IN
  (NP the/DT recommendation/NN)
  of/IN
  doctors/NNS
  ./.
  I/PRP
  request/VBP
  those/DT
  who/WP
  have/VBP
  come/VBN
  in/IN
  (NP contact/NN)
  with/IN
  me/PRP
  recently/RB
  to/TO
  be/VB
  observant/JJ
  and/CC
  (NP exercise/JJ s

In [None]:
# Builds another parser from the same `pattern` and parses `sent` again (the
# same inputs that produced `cp` / `cs`), then calls draw() on the result.
# NOTE(review): draw() presumably opens a separate GUI window rather than
# rendering inline -- confirm it works in the environment this notebook runs in.
NPChunker = nltk.RegexpParser(pattern) 
result = NPChunker.parse(sent)
result.draw()

In [None]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree `cs` with tree2conlltags and pretty-print the result.
# NOTE(review): expected to be (word, POS tag, IOB chunk tag) triples --
# confirm against the NLTK documentation.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the spaCy pipeline shipped as the `en_core_web_sm` package; `nlp` is the
# callable the following cells use to process text.
nlp = en_core_web_sm.load()

In [None]:
# Run the spaCy pipeline on the same paragraph used in the NLTK cells. Reusing
# `sentence` avoids a third pasted copy of the text that could drift from the
# others. Then list every detected entity span together with its label.
doc = nlp(sentence)
pprint([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# One row per token in `doc`: the token, its `ent_iob_` value and its
# `ent_type_` value.
pprint([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])

In [None]:
from bs4 import BeautifulSoup
import requests
import re

In [None]:
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up; forwarded to
            ``requests.get``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The text of the parsed page with ``script``, ``style`` and ``aside``
        elements removed, and every run of newlines/tabs replaced by a single
        space.

    Raises:
        requests.HTTPError: If the response has a 4xx/5xx status, so an error
            page is never silently treated as article text.
        requests.RequestException: On connection failures or timeouts.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing the error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose contents are not part of the readable article text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [None]:
# Fetch the article text, run the spaCy pipeline on it, and show how many
# entity spans were found. The URL is split across adjacent literals only for
# readability; Python joins them into the same single string.
ny_bb = url_to_string(
    'https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html'
    '?hp&action=click&pgtype=Homepage&clickSource=story-heading'
    '&module=first-column-region&region=top-news&WT.nav=top-news'
)
article = nlp(ny_bb)
len(article.ents)

In [None]:
# Count how many entity spans in the article carry each label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

In [None]:
# The three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)