In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [None]:
pip install newspaper3k

In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
from newspaper import Article
 
url = 'https://english.onlinekhabar.com/kathmandu-mayor-balens-meeting-with-prime-minister-deuba.html'
 
toi_article = Article(url, language="en")
 
toi_article.download()
toi_article.parse()
toi_article.nlp()

print(toi_article.title)
print(toi_article.text)
 


Kathmandu Mayor Balen Shah meets PM Deuba
Kathmandu, June 1

Balen Shah, the mayor of Kathmandu, met Prime Minister Sher Bahadur Deuba.

Shah met PM Deuba at the prime minister’s residence in Baluwatar.

During the meeting, Deuba pledged his entire support for his five-year term.

He states, “In the metropolis’ executive team, Congress has a majority. For development tasks, you will have complete support from the team. Take every measure for waste management; you will have everyone’s support.”


In [9]:
def clean(text):
    
    # removing paragraph numbers
    text = re.sub('[0-9]+.\t','',str(text))
    # removing new line characters
    text = re.sub('\n ','',str(text))
    text = re.sub('\n',' ',str(text))
    # removing apostrophes
    text = re.sub("'s",'',str(text))
    # removing hyphens
    text = re.sub("-",' ',str(text))
    text = re.sub("— ",'',str(text))
    # removing quotation marks
    text = re.sub('\"','',str(text))
    # removing salutations
    text = re.sub("Mr\.",'Mr',str(text))
    text = re.sub("Mrs\.",'Mrs',str(text))
    # removing any reference to outside text
    text = re.sub("[\(\[].*?[\)\]]", "", str(text))
    
    return text


In [10]:
text = clean(toi_article.text)

In [11]:
print(text)

Kathmandu, June 1  Balen Shah, the mayor of Kathmandu, met Prime Minister Sher Bahadur Deuba.  Shah met PM Deuba at the prime minister’s residence in Baluwatar.  During the meeting, Deuba pledged his entire support for his five year term.  He states, “In the metropolis’ executive team, Congress has a majority. For development tasks, you will have complete support from the team. Take every measure for waste management; you will have everyone’s support.”


In [12]:
import spacy

nlp = spacy.load('en_core_web_sm',disable=['ner','textcat'])

doc = nlp(text)

for token in doc:
    print(token.text,'->',token.pos_)

Kathmandu -> PROPN
, -> PUNCT
June -> PROPN
1 -> NUM
  -> SPACE
Balen -> PROPN
Shah -> PROPN
, -> PUNCT
the -> DET
mayor -> NOUN
of -> ADP
Kathmandu -> PROPN
, -> PUNCT
met -> VERB
Prime -> PROPN
Minister -> PROPN
Sher -> PROPN
Bahadur -> PROPN
Deuba -> PROPN
. -> PUNCT
  -> SPACE
Shah -> PROPN
met -> VERB
PM -> PROPN
Deuba -> PROPN
at -> ADP
the -> DET
prime -> PROPN
minister -> PROPN
’s -> PART
residence -> NOUN
in -> ADP
Baluwatar -> PROPN
. -> PUNCT
  -> SPACE
During -> ADP
the -> DET
meeting -> NOUN
, -> PUNCT
Deuba -> PROPN
pledged -> VERB
his -> DET
entire -> ADJ
support -> NOUN
for -> ADP
his -> DET
five -> NUM
year -> NOUN
term -> NOUN
. -> PUNCT
  -> SPACE
He -> PRON
states -> VERB
, -> PUNCT
“ -> PUNCT
In -> ADP
the -> DET
metropolis -> PROPN
’ -> PUNCT
executive -> ADJ
team -> NOUN
, -> PUNCT
Congress -> PROPN
has -> AUX
a -> DET
majority -> NOUN
. -> PUNCT
For -> ADP
development -> NOUN
tasks -> NOUN
, -> PUNCT
you -> PRON
will -> VERB
have -> AUX
complete -> ADJ
support -

In [13]:
for token in doc:
    if token.pos_=='NOUN':
        print(token.text)

mayor
residence
meeting
support
year
term
team
majority
development
tasks
support
team
measure
waste
management
support


In [26]:
from pathlib import Path


In [30]:
from spacy import displacy 
image = displacy.render(doc, style='dep',jupyter=True)

In [15]:
doc = nlp(text)

for tok in doc:
  print(tok.text, "...", tok.dep_)

Kathmandu ... nsubj
, ... punct
June ... npadvmod
1 ... nummod
  ... 
Balen ... compound
Shah ... appos
, ... punct
the ... det
mayor ... appos
of ... prep
Kathmandu ... pobj
, ... punct
met ... ROOT
Prime ... compound
Minister ... compound
Sher ... compound
Bahadur ... compound
Deuba ... dobj
. ... punct
  ... 
Shah ... nsubj
met ... ROOT
PM ... compound
Deuba ... dobj
at ... prep
the ... det
prime ... compound
minister ... pobj
’s ... punct
residence ... appos
in ... prep
Baluwatar ... pobj
. ... punct
  ... 
During ... prep
the ... det
meeting ... pobj
, ... punct
Deuba ... nsubj
pledged ... ROOT
his ... poss
entire ... amod
support ... dobj
for ... prep
his ... poss
five ... nummod
year ... compound
term ... pobj
. ... punct
  ... 
He ... nsubj
states ... ROOT
, ... punct
“ ... punct
In ... prep
the ... det
metropolis ... poss
’ ... punct
executive ... amod
team ... pobj
, ... punct
Congress ... nsubj
has ... ccomp
a ... det
majority ... dobj
. ... punct
For ... prep
development ..

In [16]:
def get_entities(sent):
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    
  prv_tok_text = ""   
  prefix = ""
  modifier = ""


  for tok in nlp(sent):
    
    if tok.dep_ != "punct":
      if tok.dep_ == "compound":
        prefix = tok.text
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        

      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

In [17]:
sentence=[]
tokens = nlp(text)
for sent in tokens.sents:
    sentence.append((sent.text.strip()))
    print(sentence)

['Kathmandu, June 1  Balen Shah, the mayor of Kathmandu, met Prime Minister Sher Bahadur Deuba.']
['Kathmandu, June 1  Balen Shah, the mayor of Kathmandu, met Prime Minister Sher Bahadur Deuba.', 'Shah met PM Deuba at the prime minister’s residence in Baluwatar.']
['Kathmandu, June 1  Balen Shah, the mayor of Kathmandu, met Prime Minister Sher Bahadur Deuba.', 'Shah met PM Deuba at the prime minister’s residence in Baluwatar.', 'During the meeting, Deuba pledged his entire support for his five year term.']
['Kathmandu, June 1  Balen Shah, the mayor of Kathmandu, met Prime Minister Sher Bahadur Deuba.', 'Shah met PM Deuba at the prime minister’s residence in Baluwatar.', 'During the meeting, Deuba pledged his entire support for his five year term.', 'He states, “In the metropolis’ executive team, Congress has a majority.']
['Kathmandu, June 1  Balen Shah, the mayor of Kathmandu, met Prime Minister Sher Bahadur Deuba.', 'Shah met PM Deuba at the prime minister’s residence in Baluwatar.',

In [18]:
def obtain_relation(sent):
  
   doc = nlp(sent)
  
   matcher = Matcher(nlp.vocab)
  
   pattern = [{'DEP':'ROOT'},
           {'DEP':'prep','OP':"?"},
           {'DEP':'agent','OP':"?"}, 
           {'POS':'ADJ','OP':"?"}]
  
   matcher.add("matching_1", None, pattern)
  
   matcher = matcher(doc)
   h = len(matcher) - 1
  
   span = doc[matcher[h][1]:matcher[h][2]]
  
   return (span.text)

In [19]:
relations = [obtain_relation(j) for j in tqdm(text)]
pd.Series(relations).value_counts()[:50]


100%|██████████| 456/456 [00:02<00:00, 175.91it/s]


     75
e    56
a    32
t    31
r    26
s    20
m    20
i    19
h    18
o    18
n    17
u    16
p    11
l    10
y     7
v     7
,     7
d     7
.     6
f     5
g     5
D     4
w     4
’     3
S     3
c     3
b     3
B     3
k     2
K     2
M     2
P     2
1     1
H     1
“     1
I     1
x     1
C     1
j     1
F     1
J     1
T     1
;     1
”     1
dtype: int64

In [20]:
# subject extraction
source = [j[0] for j in sentence]

#object extraction
target = [k[1] for k in sentence]

data_kgf = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

NameError: ignored

In [None]:
import networkx as ntx


In [None]:
graph = ntx.from_pandas_edgelist(data_kgf, "source", "target",
                         edge_attr=True, create_using=ntx.MultiDiGraph())

In [None]:
plt.figure(figsize=(14, 14))
posn = ntx.spring_layout(graph)
ntx.draw(graph, with_labels=True, node_color='green', edge_cmap=plt.cm.Blues, pos = posn)
plt.show()

In [None]:
graph = ntx.from_pandas_edgelist(data_kgf[data_kgf['edge']=="Information from"], "source", "target",
                         edge_attr=True, create_using=ntx.MultiDiGraph())
 
plt.figure(figsize=(14,14))
pos = ntx.spring_layout(graph, k = 0.5) # k regulates the distance between nodes
ntx.draw(graph, with_labels=True, node_color='green', node_size=1400, edge_cmap=plt.cm.Blues, pos = posn)
plt.show()