<a href="https://colab.research.google.com/github/dhanvanthboppana/testgit/blob/master/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [2]:
# Load the sentence corpus from the CSV file in the working directory;
# a later cell iterates over its "sentence" column.
candidate_sentences = pd.read_csv("wiki_sentences_v2.csv")
# Show (rows, columns) as the cell output for a quick sanity check.
candidate_sentences.shape

(4318, 1)

In [3]:
def get_entities(sent):
  """Extract a [subject, object] pair of entity strings from one sentence.

  The sentence is parsed with the module-level ``nlp`` pipeline and its
  tokens are scanned left to right. Compound words and modifiers seen
  before a subject/object token are prepended to that token's text.

  Args:
    sent: the sentence text to parse.

  Returns:
    A two-element list ``[ent1, ent2]``: ``ent1`` is built from the last
    token whose dependency label contains "subj", ``ent2`` from the last
    token whose label contains "obj". Either is "" when no such token is
    found.
  """
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
  prv_tok_text = ""   # text of the previous non-punctuation token

  prefix = ""         # pending compound word(s)
  modifier = ""       # pending modifier word(s)

  for tok in nlp(sent):
    # punctuation is skipped entirely and does not update the previous-token state
    if tok.dep_ == "punct":
      continue

    # compound word: remember it, chaining onto a directly preceding compound
    if tok.dep_ == "compound":
      prefix = tok.text
      if prv_tok_dep == "compound":
        prefix = prv_tok_text + " " + tok.text

    # modifier (any label ending in "mod"): remember it, chaining onto a
    # directly preceding compound
    if tok.dep_.endswith("mod"):
      modifier = tok.text
      if prv_tok_dep == "compound":
        modifier = prv_tok_text + " " + tok.text

    # Substring membership instead of `find(...) == True`: the latter only
    # matched when the substring started at index 1.
    if "subj" in tok.dep_:
      # join only the non-empty parts so the result has single spaces
      ent1 = " ".join(part for part in (modifier, prefix, tok.text) if part)
      # the subject consumes the pending prefix/modifier
      prefix = ""
      modifier = ""

    if "obj" in tok.dep_:
      ent2 = " ".join(part for part in (modifier, prefix, tok.text) if part)

    # remember this token for the compound-chaining checks above
    prv_tok_dep = tok.dep_
    prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

In [4]:
def get_relation(sent):
  """Return the relation (predicate) phrase of a sentence.

  The sentence is parsed with the module-level ``nlp`` pipeline and matched
  against a token pattern: the ROOT token, optionally followed by a
  preposition, an agent token and an adjective.

  Args:
    sent: the sentence text to parse.

  Returns:
    The text of the last match reported by the matcher, or "" when the
    pattern does not match at all (e.g. an empty sentence).
  """
  doc = nlp(sent)

  # Matcher class object
  matcher = Matcher(nlp.vocab)

  # define the pattern
  pattern = [{'DEP': 'ROOT'},
             {'DEP': 'prep', 'OP': "?"},
             {'DEP': 'agent', 'OP': "?"},
             {'POS': 'ADJ', 'OP': "?"}]

  # Patterns are passed as a list of patterns; the older positional
  # (key, on_match, *patterns) form is rejected by spaCy 3.
  matcher.add("matching_1", [pattern])

  matches = matcher(doc)
  if not matches:
    # nothing matched: avoid indexing an empty list
    return ""

  # each match is (match_id, start, end); use the last one reported
  _, start, end = matches[-1]
  return doc[start:end].text

In [6]:
# Per-sentence extraction results, index-aligned with candidate_sentences.
entity_pairs = []   # [subject, object] pairs from get_entities
relations = []      # relation phrases from get_relation
p = set()           # texts of named entities labelled PERSON
loc = set()         # texts of named entities labelled GPE
for sentence in tqdm(candidate_sentences["sentence"]):
  doc = nlp(sentence)
  for ent in doc.ents:
    # Compare labels by value: `is` tests object identity, which only
    # works when both strings happen to be the same interned object.
    if ent.label_ == 'PERSON':
      p.add(ent.text.strip())
    elif ent.label_ == 'GPE':
      loc.add(ent.text.strip())
  entity_pairs.append(get_entities(sentence))
  relations.append(get_relation(sentence))

100%|██████████| 4318/4318 [02:08<00:00, 33.70it/s]


In [7]:
# Split every [subject, object] pair into the two edge endpoints.
source = [subject for subject, _obj in entity_pairs]
target = [obj for _subject, obj in entity_pairs]

# One row per sentence: subject -> object, labelled with the relation phrase.
edge_columns = {'source': source, 'target': target, 'edge': relations}
kg_df = pd.DataFrame(edge_columns)

In [8]:
# Build a directed multigraph from the edge table; edge_attr=True attaches
# the remaining column(s) of kg_df to each edge as attributes.
G = nx.from_pandas_edgelist(
    kg_df,
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

In [9]:
# Closeness centrality of every node in G; later cells treat `d` as a
# node -> score mapping and filter it by entity type.
d=nx.closeness_centrality(G)

In [10]:
# Keep only the centrality entries whose node name is in the PERSON set.
dp = {node: score for node, score in d.items() if node in p}

In [11]:
print("Important persons:\n")
# Rank person nodes by descending centrality and list the first ten names.
sdp = sorted(dp.items(), key=lambda entry: entry[1], reverse=True)
for name, _score in sdp[:10]:
  print(name)

Important persons:

john thimothy
cam gigandet
ali larter
chandran
allen maris
boris karloff
brad pitt
e. jack kaplan
jfk
kennedy


In [16]:
# Keep only the centrality entries whose node name is in the GPE (location) set.
dl = {node: score for node, score in d.items() if node in loc}

In [18]:
print("Important locations:\n")
# Rank location nodes by descending centrality and list the first ten names.
sdl = sorted(dl.items(), key=lambda entry: entry[1], reverse=True)
for name, _score in sdl[:10]:
  print(name)

Important locations:

mumbai
hollywood
washington
us
america
hyderabad
samoa
cyprus
hrithik roshan
eilis
