In [1]:
import json
import re
import time
import urllib
import urllib.parse
import urllib.request
from pprint import pprint

import pandas as pd
import py2neo
import spacy
import wikipedia
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token
from tqdm import tqdm

# Show which spaCy version this kernel is running.
print(spacy.__version__)

3.0.3


In [2]:
# Dependency labels (compared against spaCy ``token.dep_``) used to sort tokens
# into subjects, verbs and objects when building triples.
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
VERBS = ['ROOT', 'advcl']
OBJECTS = ["dobj", "dative", "attr", "oprd", 'pobj']
# Named-entity labels of interest.
ENTITY_LABELS = ['PERSON', 'NORP', 'GPE', 'ORG', 'FAC', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART']

# Read the Knowledge Graph API key from a local file.  The context manager
# closes the handle, and strip() drops a trailing newline that would otherwise
# be URL-encoded into the ``key`` request parameter.
with open('.api_key') as key_file:
    api_key = key_file.read().strip()

# Plain pipeline, used for per-token stop-word / punctuation checks.
non_nc = spacy.load('en_core_web_md')

# Same model with the 'merge_noun_chunks' component appended; used for
# subject-verb-object extraction.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('merge_noun_chunks')

print(non_nc.pipe_names)
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'merge_noun_chunks']


In [3]:
def _get_nested(mapping, *keys):
    """Follow ``keys`` through nested containers; return '' if any step is missing."""
    current = mapping
    for key in keys:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            # Missing key, or an intermediate value (e.g. None) that cannot be indexed.
            return ''
    return current


def query_google(query, api_key, limit=10, indent=True, return_lists=True):
    """Query the Google Knowledge Graph Search API.

    Args:
        query: Search string sent as the ``query`` request parameter.
        api_key: Value sent as the ``key`` request parameter.
        limit: Value sent as the ``limit`` request parameter.
        indent: Value sent as the ``indent`` request parameter.
        return_lists: When true, return three parallel lists extracted from
            the response; otherwise return the decoded JSON response as-is.

    Returns:
        If ``return_lists`` is true, a tuple ``(text_ls, node_label_ls, url_ls)``
        with one entry per element of ``response['itemListElement']``:
        ``result.detailedDescription.articleBody``, ``result['@type']`` and
        ``result.detailedDescription.url`` respectively.  A field that is
        absent is reported as ``''``.  If the response has no
        ``itemListElement`` key, three empty lists are returned.
        If ``return_lists`` is false, the decoded JSON response.

    Raises:
        Any error raised by ``urllib.request.urlopen`` or ``json.loads``
        propagates to the caller.
    """
    params = {
        'query': query,
        'limit': limit,
        'indent': indent,
        'key': api_key,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    url = service_url + '?' + urllib.parse.urlencode(params)

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as http_response:
        response = json.loads(http_response.read())

    if not return_lists:
        return response

    text_ls = []
    node_label_ls = []
    url_ls = []

    for element in response.get('itemListElement', []):
        node_label_ls.append(_get_nested(element, 'result', '@type'))
        text_ls.append(_get_nested(element, 'result', 'detailedDescription', 'articleBody'))
        url_ls.append(_get_nested(element, 'result', 'detailedDescription', 'url'))

    return text_ls, node_label_ls, url_ls

In [4]:
def remove_special_characters(text):
    """Return ``text`` with every newline, carriage return and tab replaced by a space."""
    return re.sub(r'[\n\r\t]', " ", text)


def remove_stop_words_and_punct(text, print_text=False):
    """Strip stop words and punctuation from ``text``.

    ``text`` is run through the ``non_nc`` pipeline; tokens flagged
    ``is_stop`` or ``is_punct`` are discarded and the remaining tokens are
    joined with single spaces.  With ``print_text`` set, every token and its
    stop-word flag is printed as it is visited.
    """
    kept_words = []

    for tok in non_nc(text):
        if print_text:
            print(tok, tok.is_stop)
            print('--------------')
        if tok.is_stop or tok.is_punct:
            continue
        kept_words.append(str(tok))

    return ' '.join(kept_words)


def create_svo_lists(doc, print_lists=False):
    """Sort the tokens of ``doc`` into subject, verb and object lists.

    Each token's ``dep_`` is checked against SUBJECTS, then VERBS, then
    OBJECTS; the first match decides where it goes.  Subjects and objects are
    stored as ``(token.lower_, token.idx)``, verbs as
    ``(token.lemma_, token.idx)``.  Tokens matching none of the three are
    ignored.

    Returns:
        ``(subject_ls, verb_ls, object_ls)``.  With ``print_lists`` set the
        three lists are also printed.
    """
    subject_ls, verb_ls, object_ls = [], [], []

    for tok in doc:
        dep = tok.dep_
        if dep in SUBJECTS:
            subject_ls.append((tok.lower_, tok.idx))
            continue
        if dep in VERBS:
            verb_ls.append((tok.lemma_, tok.idx))
            continue
        if dep in OBJECTS:
            object_ls.append((tok.lower_, tok.idx))

    if print_lists:
        for heading, entries in (
            ('SUBJECTS: ', subject_ls),
            ('VERBS: ', verb_ls),
            ('OBJECTS: ', object_ls),
        ):
            print(heading, entries)

    return subject_ls, verb_ls, object_ls


def remove_duplicates(tup, tup_posn):
    """Keep only the first tuple seen for each distinct value at index ``tup_posn``.

    The surviving tuples are returned as a list in their original order.
    """
    first_by_key = {}
    for item in tup:
        # setdefault leaves an already-recorded key untouched, so the earliest
        # tuple for each key wins.
        first_by_key.setdefault(item[tup_posn], item)
    return list(first_by_key.values())


def remove_dates(tup_ls):
    """Drop every entry whose element at index 2 consists solely of digits.

    Entries are kept in order; only ``str.isdigit()`` is consulted, so a value
    such as ``'august 4'`` is retained.
    """
    return [entry for entry in tup_ls if not entry[2].isdigit()]


def create_svo_triples(text):
    """Extract (subject, verb, object) triples from ``text``.

    The text is cleaned with ``remove_special_characters``, parsed with the
    ``nlp`` pipeline and split into subjects, verbs and objects by
    ``create_svo_lists``.  Every subject is paired with every object, and each
    object is given the verb whose ``idx`` is closest to the object's ``idx``
    (the first such verb on a tie).  Stop words and punctuation are stripped
    from subjects and objects only at this stage, after parsing, so the parser
    sees complete sentences.  Pairs whose subject or object becomes empty are
    dropped.

    The result is then reduced to the first triple per distinct object
    (``remove_duplicates`` on index 2) and triples with an all-digit object
    are removed (``remove_dates``).

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(subject, verb_lemma, object)`` string tuples.  An empty
        list is returned when no verb is found, instead of failing on an
        empty nearest-verb search.
    """
    clean_text = remove_special_characters(text)
    doc = nlp(clean_text)
    subject_ls, verb_ls, object_ls = create_svo_lists(doc)

    # Without any verb there is nothing to link a subject to an object.
    if not verb_ls:
        return []

    # Stop-word stripping runs a spaCy pipeline, so do it once per subject and
    # once per object rather than once per (subject, object) pair.
    subjects = []
    for subj in subject_ls:
        no_sw_subj = remove_stop_words_and_punct(subj[0])
        if no_sw_subj:
            subjects.append(no_sw_subj)

    object_verb_pairs = []
    for obj in object_ls:
        no_sw_obj = remove_stop_words_and_punct(obj[0])
        if not no_sw_obj:
            continue
        # The nearest verb depends only on the object, not on the subject.
        nearest_verb = min(verb_ls, key=lambda v: abs(obj[1] - v[1]))
        object_verb_pairs.append((no_sw_obj, nearest_verb[0]))

    # Same ordering as the nested subject/object loops: subjects outermost.
    graph_tup_ls = [
        (no_sw_subj, verb, no_sw_obj)
        for no_sw_subj in subjects
        for no_sw_obj, verb in object_verb_pairs
    ]

    dedup_tup_ls = remove_duplicates(graph_tup_ls, 2)
    return remove_dates(dedup_tup_ls)

In [39]:
def get_obj_properties(tup_ls):
    """Attach Knowledge Graph details for the object of each triple.

    For every tuple in ``tup_ls`` the object (index 2) is looked up with
    ``query_google(..., limit=1)`` and the first article text, node label and
    URL are appended to the triple.

    Args:
        tup_ls: Iterable of tuples whose first three items are
            (subject, verb, object).

    Returns:
        A list of 6-tuples ``(subject, verb, object, text, node_label, url)``.
        When the lookup fails or yields no result, the last three items are
        each ``[]``.
    """
    init_obj_tup_ls = []

    for tup in tup_ls:
        subj, verb, obj = tup[0], tup[1], tup[2]

        try:
            text, node_label_ls, url = query_google(obj, api_key, limit=1)
            new_tup = (subj, verb, obj, text[0], node_label_ls[0], url[0])
        except Exception:
            # Best-effort enrichment: network errors and empty result lists
            # fall back to empty properties.  KeyboardInterrupt/SystemExit are
            # not caught, so a long run can still be interrupted.
            new_tup = (subj, verb, obj, [], [], [])

        init_obj_tup_ls.append(new_tup)

    return init_obj_tup_ls
    

def format_verb(tup_ls):
    """Return the tuples with the verb (index 1) upper-cased and prefixed with ':'.

    The other five fields (indices 0 and 2-5) are copied through unchanged.
    """
    return [
        (edge[0], ':' + edge[1].upper(), edge[2], edge[3], edge[4], edge[5])
        for edge in tup_ls
    ]


def deduper(dup_ls):
    """Return the distinct items of ``dup_ls`` as a list, in first-seen order.

    Items must be hashable.
    """
    unique = {}
    for item in dup_ls:
        unique.setdefault(item, None)
    return list(unique)

'''
def add_layer(dedup_obj_ls):
    
    new_tup_ls = []

    for obj in dedup_obj_ls:

        text_ls, node_label_ls, url_ls = query_google(obj[2], api_key, limit=1)

        for text in text_ls:
            tup = create_svo_triples(text)
            #dedup_tup = remove_duplicates(tup, 2)
            if tup:
                new_tup_ls.extend(tup)
                
    return new_tup_ls
'''

def add_layer(tup_ls):
    """Build the next layer of triples from the article text of existing ones.

    For each tuple with a non-empty text field (index 3), triples are
    extracted from that text with ``create_svo_triples``.  All extracted
    triples are then passed to ``get_obj_properties`` and its result is
    returned.  Tuples with an empty text field contribute nothing.
    """
    svo_tup_ls = []

    for tup in tup_ls:
        article_text = tup[3]
        if not article_text:
            continue
        svo_tup_ls.extend(create_svo_triples(article_text))

    return get_obj_properties(svo_tup_ls)
        

def subj_equals_obj(tup_ls):
    """Filter out tuples whose subject (index 0) equals their object (index 2).

    Each surviving tuple is rebuilt from its first six fields.
    """
    return [
        (edge[0], edge[1], edge[2], edge[3], edge[4], edge[5])
        for edge in tup_ls
        if edge[0] != edge[2]
    ]

In [6]:
# Fetch the Wikipedia summary for 'barack obama' and display it; this text is
# the starting point for triple extraction below.
text = wikipedia.summary('barack obama')
text

'Barack Hussein Obama II ( (listen) bə-RAHK hoo-SAYN oh-BAH-mə; born August 4, 1961) is an American politician and attorney who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, Obama was the first African-American president of the United States. He previously served as a U.S. senator from Illinois from 2005 to 2008 and as an Illinois state senator from 1997 to 2004.\nObama was born in Honolulu, Hawaii. After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black person to be president of the Harvard Law Review. After graduating, he became a civil rights attorney and an academic, teaching constitutional law at the University of Chicago Law School from 1992 to 2004. Turning to elective politics, he represented the 13th district from 1997 until 2004 in the Illinois Senate, when he ran for the U.S. Senate. Obama received nationa

In [7]:
%%time
# Extract (subject, verb, object) triples from the summary text.
initial_tup_ls = create_svo_triples(text)  

CPU times: user 50.6 s, sys: 26.5 ms, total: 50.6 s
Wall time: 50.6 s


In [8]:
%%time
# Look up each triple's object in the Knowledge Graph and append its article
# text, node labels and URL; preview the first five enriched triples.
init_obj_tup_ls = get_obj_properties(initial_tup_ls)
init_obj_tup_ls[0:5]

CPU times: user 718 ms, sys: 51.9 ms, total: 770 ms
Wall time: 13 s


[('oh bah mə', 'be', 'american politician', '', ['Thing'], ''),
 ('oh bah mə', 'be', '44th president', [], [], []),
 ('oh bah mə',
  'be',
  'united states',
  'The United States of America, commonly known as the United States or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major self-governing territories, 326 Indian reservations, and some minor possessions. ',
  ['Country', 'AdministrativeArea', 'Place', 'Thing'],
  'https://en.wikipedia.org/wiki/United_States'),
 ('oh bah mə',
  'be',
  'democratic party',
  'The Democratic Party is one of the two major contemporary political parties in the United States, along with its main, historic rival, the Republican Party. ',
  ['Thing', 'Organization'],
  'https://en.wikipedia.org/wiki/Democratic_Party_(United_States)'),
 ('oh bah mə',
  'be',
  'african american president',
  'The National Museum of African American History and Culture is a Smithsonian Institution museum locate

In [10]:
%%time
# Build a second layer of triples from the article text attached to the
# first-layer objects; preview the first five.
new_layer_ls = add_layer(init_obj_tup_ls)
new_layer_ls[0:5]

CPU times: user 14.2 s, sys: 144 ms, total: 14.4 s
Wall time: 1min 6s


[('united states',
  'be',
  'america',
  'South America is a continent entirely in the Western Hemisphere and mostly in the Southern Hemisphere, with a relatively small portion in the Northern Hemisphere. It can also be described as a southern subcontinent of the Americas. ',
  ['Place', 'Continent', 'Thing'],
  'https://en.wikipedia.org/wiki/South_America'),
 ('united states',
  'be',
  'united states',
  'The United States of America, commonly known as the United States or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major self-governing territories, 326 Indian reservations, and some minor possessions. ',
  ['Country', 'Thing', 'AdministrativeArea', 'Place'],
  'https://en.wikipedia.org/wiki/United_States'),
 ('united states',
  'be',
  'country',
  "A country is a distinct territorial body\nor political entity. It is often referred to as the land of an individual's birth, residence or citizenship.\n",
  ['Thing'],
  'h

In [44]:
# Combine first- and second-layer triples, drop those whose subject equals
# their object, then rewrite each verb as an upper-case ':VERB' label.
starter_edge_ls = init_obj_tup_ls + new_layer_ls
clean_edge_ls = subj_equals_obj(starter_edge_ls)
edge_ls = format_verb(clean_edge_ls)
edge_ls[0:3]

[('oh bah mə', ':BE', 'american politician', '', ['Thing'], ''),
 ('oh bah mə', ':BE', '44th president', [], [], []),
 ('oh bah mə',
  ':BE',
  'united states',
  'The United States of America, commonly known as the United States or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major self-governing territories, 326 Indian reservations, and some minor possessions. ',
  ['Country', 'AdministrativeArea', 'Place', 'Thing'],
  'https://en.wikipedia.org/wiki/United_States')]