In [1]:
import json
import os
import re
import time
import urllib
import urllib.parse
import urllib.request
from pprint import pprint

import pandas as pd
import spacy
import wikipedia
from neo4j import GraphDatabase
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token
from tqdm import tqdm

# Show the installed spaCy version alongside the notebook's outputs.
print(spacy.__version__)

3.0.3


# Configure spaCy

In [2]:
# Dependency labels (token.dep_) that create_svo_lists sorts tokens by.
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
VERBS = ['ROOT', 'advcl']
OBJECTS = ["dobj", "dative", "attr", "oprd", 'pobj']
# Named-entity labels; not referenced by any other code visible in this notebook.
ENTITY_LABELS = ['PERSON', 'NORP', 'GPE', 'ORG', 'FAC', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART']

# API key sent as the `key` parameter of the kgsearch.googleapis.com requests.
# Read it inside a `with` block so the file handle is closed, and strip
# surrounding whitespace so a trailing newline never ends up in a request URL.
with open('.api_key') as key_file:
    api_key = key_file.read().strip()

# Plain pipeline: used by remove_stop_words_and_punct for token-level checks.
non_nc = spacy.load('en_core_web_sm')

# Same model plus the merge_noun_chunks component: used by create_svo_triples.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('merge_noun_chunks')

print(non_nc.pipe_names)
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'merge_noun_chunks']


# Neo4j Connector Class

In [3]:
class Neo4jConnection:
    """Thin wrapper around a Neo4j driver.

    Failures while creating the driver or running a query are reported with
    ``print`` instead of being raised to the caller.
    """

    def __init__(self, uri, user, pwd):
        """Remember the connection settings and try to create the driver.

        If driver creation raises, the error is printed and the driver
        stays ``None``.
        """
        self.__driver = None
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        try:
            self.__driver = GraphDatabase.driver(uri, auth=(user, pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the driver if one was created; otherwise do nothing."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` in a fresh session and return its records as a list.

        ``db`` selects the database for the session when it is not ``None``.
        Returns ``None`` if opening the session or running the query raised;
        the error is printed. The session is always closed.
        Asserts that the driver was created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            if db is None:
                session = self.__driver.session()
            else:
                session = self.__driver.session(database=db)
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

# Query Google

In [5]:
def _kg_field(element, *keys):
    """Follow ``keys`` into one search-result element; return '' if any step is missing."""
    try:
        for key in keys:
            element = element[key]
    except (KeyError, IndexError, TypeError):
        return ''
    return element


def query_google(query, api_key, limit=10, indent=True, return_lists=True):
    """Query the Google Knowledge Graph Search endpoint.

    Parameters
    ----------
    query : str
        Search text, sent as the ``query`` URL parameter.
    api_key : str
        Sent as the ``key`` URL parameter.
    limit, indent
        Forwarded unchanged as the ``limit`` and ``indent`` URL parameters.
    return_lists : bool
        When true, return three parallel lists; otherwise return the decoded
        JSON response as-is.

    Returns
    -------
    tuple of (text_ls, node_label_ls, url_ls) or the decoded response
        The lists hold, per result element, the ``articleBody``, the
        ``@type`` value and the ``url``. A field that is absent is
        represented by ``''`` so the three lists stay aligned.

    Raises
    ------
    Errors from ``urllib.request.urlopen`` and ``json.loads`` are not caught.
    """
    params = {
        'query': query,
        'limit': limit,
        'indent': indent,
        'key': api_key,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    url = service_url + '?' + urllib.parse.urlencode(params)

    # Close the HTTP response as soon as the payload has been read.
    with urllib.request.urlopen(url) as http_response:
        response = json.loads(http_response.read())

    if not return_lists:
        return response

    text_ls = []
    node_label_ls = []
    url_ls = []

    # A response without 'itemListElement' is treated as "no results".
    for element in response.get('itemListElement', []):
        node_label_ls.append(_kg_field(element, 'result', '@type'))
        text_ls.append(_kg_field(element, 'result', 'detailedDescription', 'articleBody'))
        url_ls.append(_kg_field(element, 'result', 'detailedDescription', 'url'))

    return text_ls, node_label_ls, url_ls

# NLP Functions

In [6]:
def remove_special_characters(text):
    """Return ``text`` with each newline, carriage return and tab replaced by one space."""
    return re.sub(r'[\n\r\t]', " ", text)


def remove_stop_words_and_punct(text, print_text=False):
    """Return the tokens of ``text`` that are neither stop words nor punctuation.

    The text is tokenised with the module-level ``non_nc`` pipeline and the
    surviving tokens are joined with single spaces. With ``print_text`` set,
    every token and its ``is_stop`` flag is printed as it is visited.
    """
    kept = []

    for tok in non_nc(text):
        if print_text:
            print(tok, tok.is_stop)
            print('--------------')
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(str(tok))

    return ' '.join(kept)


def create_svo_lists(doc, print_lists=False):
    """Sort the tokens of ``doc`` into subject, verb and object lists.

    A token is classified by its ``dep_`` label against the module-level
    SUBJECTS, VERBS and OBJECTS lists (checked in that order). Subjects and
    objects are stored as ``(token.lower_, token.idx)``, verbs as
    ``(token.lemma_, token.idx)``. Tokens matching none of the lists are
    skipped. With ``print_lists`` set, the three lists are printed.

    Returns ``(subject_ls, verb_ls, object_ls)``.
    """
    subject_ls, verb_ls, object_ls = [], [], []

    for tok in doc:
        role = tok.dep_
        if role in SUBJECTS:
            subject_ls.append((tok.lower_, tok.idx))
            continue
        if role in VERBS:
            verb_ls.append((tok.lemma_, tok.idx))
            continue
        if role in OBJECTS:
            object_ls.append((tok.lower_, tok.idx))

    if print_lists:
        print('SUBJECTS: ', subject_ls)
        print('VERBS: ', verb_ls)
        print('OBJECTS: ', object_ls)

    return subject_ls, verb_ls, object_ls


def remove_duplicates(tup, tup_posn):
    """Keep only the first item for each distinct value at index ``tup_posn``.

    ``tup`` is an iterable of indexable items; input order is preserved.
    Returns a new list.
    """
    seen = set()
    unique = []

    for item in tup:
        key = item[tup_posn]
        if key in seen:
            continue
        seen.add(key)
        unique.append(item)

    return unique


def remove_dates(tup_ls):
    """Return the entries of ``tup_ls`` whose third element is not made up solely of digits."""
    return [entry for entry in tup_ls if not entry[2].isdigit()]


def create_svo_triples(text):
    """Extract ``(subject, verb, object)`` triples from free text.

    The text is cleaned with ``remove_special_characters``, parsed with the
    module-level ``nlp`` pipeline and split into subject, verb and object
    lists by ``create_svo_lists``. Every subject is paired with every object,
    and each object is labelled with the verb whose character offset is
    closest to its own. Stop words and punctuation are stripped from subjects
    and objects; pairs where either side ends up empty are dropped.

    The result is de-duplicated on the object (position 2) and entries whose
    object is all digits are removed. Because triples are generated
    subject-major, de-duplicating on the object leaves each distinct object
    attached only to the first subject with a non-empty cleaned form.

    Returns a list of 3-tuples; an empty list when no verb is found.
    """
    clean_text = remove_special_characters(text)
    doc = nlp(clean_text)
    subject_ls, verb_ls, object_ls = create_svo_lists(doc)

    # No verb means there is nothing to label an edge with. Previously this
    # case hit min() on an empty sequence and raised ValueError.
    if not verb_ls:
        return []

    # Remove stop words once per subject/object rather than once per pair:
    # each call runs a spaCy pipeline, so repeating it in the nested loop
    # was the dominant avoidable cost.
    clean_subjects = []
    for subj in subject_ls:
        no_sw_subj = remove_stop_words_and_punct(subj[0])
        if no_sw_subj:
            clean_subjects.append(no_sw_subj)

    object_verb_pairs = []
    for obj in object_ls:
        no_sw_obj = remove_stop_words_and_punct(obj[0])
        if not no_sw_obj:
            continue
        # Verb with the smallest character distance to the object; on a tie
        # the verb that appears first in verb_ls wins.
        nearest_verb = min(verb_ls, key=lambda v: abs(obj[1] - v[1]))
        object_verb_pairs.append((no_sw_obj, nearest_verb[0]))

    # Subject-major order, same as the original nested loop.
    graph_tup_ls = [
        (subj, verb, obj)
        for subj in clean_subjects
        for obj, verb in object_verb_pairs
    ]

    dedup_tup_ls = remove_duplicates(graph_tup_ls, 2)
    clean_tup_ls = remove_dates(dedup_tup_ls)

    return clean_tup_ls

# Add to DataFrame

In [59]:
def make_verb_edge_string(verb):
    """Return ``verb`` upper-cased and wrapped as ``[:VERB]``."""
    label = str(verb).upper()
    return f'[:{label}]'


def _first_result_field(response, *path):
    """Return the value at ``path`` in the first search result, or ' ' if missing or empty."""
    try:
        value = response['itemListElement'][0]['result']
        for key in path:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        return ' '
    return value if value else ' '


def add_columns(row, limit=1, indent=True):
    """Add Knowledge Graph details for the row's object to ``row``.

    Parameters
    ----------
    row : pandas.Series
        A row whose third positional element is the text to look up. The
        frames built in this notebook have columns subject, verb, object,
        edge_string, so that element is the object.
    limit, indent
        Forwarded to ``query_google``.

    Returns
    -------
    pandas.Series
        The same ``row`` with ``description``, ``node_labels`` and ``url``
        set from the first search result. A field that is missing or empty
        is set to ``' '``.

    Notes
    -----
    Previously a field that was present but empty skipped the ``if`` without
    raising, so the fallback never ran and the function failed with
    ``UnboundLocalError``.
    """
    # .iloc keeps the original "third element" meaning without relying on the
    # deprecated positional fallback of Series.__getitem__.
    search_text = row.iloc[2]

    # Reuse the shared request helper instead of duplicating the HTTP code.
    response = query_google(search_text, api_key, limit=limit, indent=indent,
                            return_lists=False)

    row['description'] = _first_result_field(response, 'detailedDescription', 'articleBody')
    row['node_labels'] = _first_result_field(response, '@type')
    row['url'] = _first_result_field(response, 'detailedDescription', 'url')

    return row


def add_df_layer(df):
    """Build the next layer of triples from the objects in ``df``.

    For every value in ``df['object']`` the first Knowledge Graph result is
    fetched with ``query_google`` and its description text is turned into
    triples with ``create_svo_triples`` (de-duplicated on the object). The
    collected triples become a new frame with columns subject, verb and
    object, plus ``edge_string`` (from ``make_verb_edge_string``) and the
    columns added row by row by ``add_columns``.

    Returns the new DataFrame; ``df`` itself is not modified.
    """
    triples = []

    for obj in df['object'].tolist():
        # Only the description texts are needed here.
        descriptions, _labels, _urls = query_google(obj, api_key, limit=1)

        for description in descriptions:
            unique_triples = remove_duplicates(create_svo_triples(description), 2)
            if unique_triples:
                triples.extend(unique_triples)

    layer = pd.DataFrame(triples, columns = ['subject', 'verb', 'object'])
    layer['edge_string'] = layer['verb'].map(make_verb_edge_string)
    layer = layer.apply(add_columns, axis=1)

    return layer

# Populate the graph from the dataframe

In [63]:
def create_graph(rows, batch_size=10000):
    """Write the triples in ``rows`` to Neo4j.

    Parameters
    ----------
    rows : pandas.DataFrame
        One triple per row. The query reads ``subject``, ``object``,
        ``description``, ``url`` and ``edge_string`` from each record.
    batch_size : int
        Number of rows sent per query; forwarded to ``insert_data``.

    Returns
    -------
    Whatever ``insert_data`` returns.

    Notes
    -----
    Object nodes are merged on ``name`` alone, and ``description`` / ``url``
    are set only when the node is created. Merging on all three properties
    would try to create a second node whenever a known name arrives with a
    different description or url, which conflicts with the uniqueness
    constraint this notebook declares on ``Object.name``.

    NOTE(review): ``item.edge_string`` is passed to
    ``apoc.create.relationship`` as-is, and ``make_verb_edge_string`` yields
    values such as ``[:BE]`` — confirm the brackets are meant to be part of
    the stored relationship type.
    """
    query = '''
    UNWIND $rows AS item
    MERGE (s:Subject {name: item.subject})
    MERGE (o:Object {name: item.object})
    ON CREATE SET o.description = COALESCE(item.description, 'NOT SET'),
                  o.url = COALESCE(item.url, 'NOT SET')
    WITH s, o, item
    CALL apoc.create.relationship(s, item.edge_string, {}, o)
    YIELD rel
    RETURN COUNT(s), COUNT(o), COUNT(rel)
    '''

    return insert_data(query, rows, batch_size=batch_size)



def insert_data(query, rows, batch_size = 10000):
    """Run ``query`` against Neo4j once per batch of ``rows``.

    Parameters
    ----------
    query : str
        Cypher text that reads its input from the ``$rows`` parameter.
    rows : pandas.DataFrame
        Sliced into consecutive chunks of ``batch_size`` rows; each chunk is
        sent as a list of record dicts.
    batch_size : int
        Rows per query; must be at least 1.

    Returns
    -------
    dict or None
        ``{'batches': <count>, 'time': <seconds elapsed>}`` after the last
        batch, or ``None`` when ``rows`` is empty.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (the loop would never advance).

    Uses the module-level ``conn`` connection.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    batch = 0
    start = time.time()
    result = None

    while batch * batch_size < len(rows):

        chunk = rows.iloc[batch * batch_size:(batch + 1) * batch_size]
        res = conn.query(query, parameters={'rows': chunk.to_dict('records')})

        # conn.query returns None when the query failed (it prints the error
        # itself) and may return an empty list, so check before indexing.
        if res and res[0]:
            print(res[0])
        else:
            print(res)

        batch += 1
        result = {'batches': batch, 'time': time.time()-start}
        print(result)

    return result

In [36]:
# Seed text for the graph: the Wikipedia summary of the starting topic.
text = wikipedia.summary('barack obama')
text

'Barack Hussein Obama II (born August 4, 1961) is an American politician and attorney who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, Obama was the first African-American president of the United States. He previously served as a U.S. senator from Illinois from 2005 to 2008 and as an Illinois state senator from 1997 to 2004.\nObama was born in Honolulu, Hawaii. After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black person to be president of the Harvard Law Review. After graduating, he became a civil rights attorney and an academic, teaching constitutional law at the University of Chicago Law School from 1992 to 2004. Turning to elective politics, he represented the 13th district from 1997 until 2004 in the Illinois Senate, when he ran for the U.S. Senate. Obama received national attention in 2004 with his March Sen

In [37]:
# Extract (subject, verb, object) triples from the seed text and preview the first five.
final_tup_ls = create_svo_triples(text)  
final_tup_ls[0:5]

[('barack hussein obama ii', 'be', 'american politician'),
 ('barack hussein obama ii', 'be', '44th president'),
 ('barack hussein obama ii', 'be', 'united states'),
 ('barack hussein obama ii', 'be', 'democratic party'),
 ('barack hussein obama ii', 'be', 'african american president')]

In [40]:
%%time
# First-layer frame: one row per triple, plus the edge label derived from the verb.
df = pd.DataFrame(final_tup_ls, columns = ['subject', 'verb', 'object'])
df['edge_string'] = df['verb'].map(make_verb_edge_string)
# add_columns makes one HTTP request per row to fill description, node_labels and url.
df = df.apply(add_columns, axis=1)
df.head()

CPU times: user 858 ms, sys: 44.4 ms, total: 903 ms
Wall time: 12.6 s


Unnamed: 0,subject,verb,object,edge_string,description,node_labels,url
0,barack hussein obama ii,be,american politician,[:BE],,[Thing],
1,barack hussein obama ii,be,44th president,[:BE],,,
2,barack hussein obama ii,be,united states,[:BE],"The United States of America, commonly known a...","[Country, Place, AdministrativeArea, Thing]",https://en.wikipedia.org/wiki/United_States
3,barack hussein obama ii,be,democratic party,[:BE],The Democratic Party is one of the two major c...,"[Thing, Organization]",https://en.wikipedia.org/wiki/Democratic_Party...
4,barack hussein obama ii,be,african american president,[:BE],The National Museum of African American Histor...,"[Place, Museum, CivicStructure, TouristAttract...",https://en.wikipedia.org/wiki/National_Museum_...


In [56]:
# Connection settings are read from the environment so credentials do not
# have to be edited into the notebook. The fallbacks are the local
# development values this notebook was originally run with.
# NOTE(review): set NEO4J_PASSWORD outside the notebook for any shared or
# non-local database; do not rely on the fallback.
conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://0.0.0.0:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    pwd=os.environ.get("NEO4J_PASSWORD", "1234"),
)
# Uniqueness constraints on the node names, created only if missing.
conn.query('CREATE CONSTRAINT subj_constraint IF NOT EXISTS ON (s:Subject) ASSERT s.name IS UNIQUE')
conn.query('CREATE CONSTRAINT obj_constraint IF NOT EXISTS ON (o:Object) ASSERT o.name IS UNIQUE')
# Write the first-layer triples.
create_graph(df)

<Record COUNT(s)=112 COUNT(o)=112 COUNT(rel)=112>
{'batches': 1, 'time': 0.06975555419921875}


{'batches': 1, 'time': 0.06975555419921875}

In [60]:
%%time
# Expand by one layer: look up every object in df and extract triples from its description.
new_df = add_df_layer(df)

CPU times: user 14.2 s, sys: 359 ms, total: 14.6 s
Wall time: 1min 19s


In [61]:
# (rows, columns) of the second-layer frame.
new_df.shape

(506, 7)

In [62]:
# Write the second-layer triples to Neo4j.
create_graph(new_df)

<Record COUNT(s)=506 COUNT(o)=506 COUNT(rel)=506>
{'batches': 1, 'time': 0.12885475158691406}


{'batches': 1, 'time': 0.12885475158691406}