In [2]:
import json
import os
import re
import time
import urllib
import urllib.parse
import urllib.request
from pprint import pprint

import pandas as pd
import spacy
import wikipedia
from neo4j import GraphDatabase
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token
from tqdm import tqdm

print(spacy.__version__)

3.0.3


# Configure spacy

Before using spacy, we need to load a model.  The basic model is the small core English model, `en_core_web_sm`, which provides good, basic functionality with a small download size (< 20 MB).  However, one drawback of this basic model is that it doesn't have full word vectors.  Instead, it comes with context-sensitive tensors.  You can still do things like text similarity with it, but if you want to use spacy to create good word vectors, you should use a larger model such as `en_core_web_md` or `en_core_web_lg`, since the small models are not known for accuracy.  You can also use a variety of third-party models, but that is beyond the scope of this workshop.  Again, choose the model that works best with your setup.

To load the models we use the following command:

`python3 -m spacy download en_core_web_sm`

You can do this either as a cell in this notebook or via the CLI.

## API key for Google Knowledge Graph

See below for instructions on how to create this key.  When you have the key, save it to a file called `.api_key` in this directory.

In [11]:
# Dependency labels (compared against token.dep_ in create_svo_lists) that mark
# a token as a subject, verb, or object respectively.
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
VERBS = ['ROOT', 'advcl']
OBJECTS = ["dobj", "dative", "attr", "oprd", 'pobj']
ENTITY_LABELS = ['PERSON', 'NORP', 'GPE', 'ORG', 'FAC', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART']

# Google Knowledge Graph API key.  The file is closed as soon as it is read and
# surrounding whitespace is stripped, so a trailing newline in `.api_key` does
# not end up URL-encoded into the `key` request parameter.
with open('.api_key') as key_file:
    api_key = key_file.read().strip()

# Plain pipeline: used by remove_stop_words_and_punct for token-level flags.
non_nc = spacy.load('en_core_web_sm')

# Same model with noun chunks merged into single tokens: used by create_svo_triples.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('merge_noun_chunks')

print(non_nc.pipe_names)
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'merge_noun_chunks']


# Neo4j Connector Class

There are a few different Python packages that can be used to connect to Neo4j: `neo4j` (the official driver) or `py2neo` (a community-written driver).  There are many examples out there on how to use `py2neo`, so I have chosen to use `neo4j` for the purposes of providing a different example.

In [12]:
class Neo4jConnection:
    """Thin convenience wrapper around a `neo4j` driver.

    Failures are reported with `print` rather than raised: a failed driver
    creation leaves the connection without a driver, and a failed query
    returns ``None``.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        Args:
            uri: Address passed to ``GraphDatabase.driver``.
            user: User name for authentication.
            pwd: Password for authentication.
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None
        try:
            credentials = (self.__user, self.__pwd)
            self.__driver = GraphDatabase.driver(self.__uri, auth=credentials)
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the underlying driver, if one was created."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run a query in a fresh session and return its records.

        Args:
            query: Query text handed to ``session.run``.
            parameters: Optional query parameters handed to ``session.run``.
            db: Optional database name; when ``None`` the session is opened
                without a ``database`` argument.

        Returns:
            A list built from the result of ``session.run``, or ``None`` when
            opening the session or running the query raised.

        Raises:
            AssertionError: If the driver was never created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            if db is None:
                session = self.__driver.session()
            else:
                session = self.__driver.session(database=db)
            # Materialise the result while the session is still open.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

# Query Google Knowledge Graph

To query the Google Knowledge Graph you will require an API key, which permits you to have 100,000 read calls per day per project for free.  That will be more than sufficient for this workshop.  To obtain your key, follow [these instructions](https://developers.google.com/knowledge-graph/how-tos/authorizing).

In [13]:
def _kg_field(element, *path, default=''):
    """Follow `path` through nested containers, returning `default` if any step is missing."""
    value = element
    try:
        for key in path:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        return default
    return value


def query_google(query, api_key, limit=10, indent=True, return_lists=True):
    """Search the Google Knowledge Graph for `query`.

    Args:
        query: Search text sent as the `query` request parameter.
        api_key: API key sent as the `key` request parameter.
        limit: Sent as the `limit` request parameter.
        indent: Sent as the `indent` request parameter.
        return_lists: When true, return three parallel lists; otherwise
            return the decoded JSON response unchanged.

    Returns:
        ``(text_ls, node_label_ls, url_ls)`` with one entry per element of the
        response's ``itemListElement``: the ``articleBody``, the ``@type``
        value, and the ``detailedDescription`` URL.  A field that is absent
        is reported as ``''``.  With ``return_lists=False`` the decoded JSON
        response is returned instead.
    """
    params = {
        'query': query,
        'limit': limit,
        'indent': indent,
        'key': api_key,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    url = service_url + '?' + urllib.parse.urlencode(params)

    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(url) as http_response:
        response = json.loads(http_response.read())

    if not return_lists:
        return response

    text_ls = []
    node_label_ls = []
    url_ls = []

    # A response without `itemListElement` is treated as "no results".
    for element in response.get('itemListElement', []):
        node_label_ls.append(_kg_field(element, 'result', '@type'))
        text_ls.append(_kg_field(element, 'result', 'detailedDescription', 'articleBody'))
        url_ls.append(_kg_field(element, 'result', 'detailedDescription', 'url'))

    return text_ls, node_label_ls, url_ls

# NLP Functions

There are a variety of functions below that will use both spacy and regex to clean and prepare our data prior to loading it into the graph database.  The main goal is to create subject-verb-object (SVO) triples.  But before we can do that, we have to do some general cleaning of the data.  The below functions perform this for us in the following order:

1. Use regex to remove control characters (`remove_special_characters`)
2. Create spacy doc of (1) (`create_svo_lists`)
3. Get list of all subjects, verbs, and objects (`create_svo_lists`)
4. For each object, find the closest verb (`create_svo_triples`)
5. Remove all stop words and punctuation (`remove_stop_words_and_punct`)
6. Assemble SVO tuples (`create_svo_triples`)
7. Remove duplicate tuples (`remove_duplicates`)
8. Remove tuples whose object is purely numeric, such as a bare year (`remove_dates`)

In [21]:
def remove_special_characters(text):
    """Return `text` with every newline, carriage return, and tab replaced by a space."""
    return re.sub(r'[\n\r\t]', " ", text)


def remove_stop_words_and_punct(text, print_text=False):
    """Drop stop words and punctuation from `text`.

    The text is tokenised with the `non_nc` pipeline; tokens flagged as stop
    words or punctuation are discarded and the rest are joined with spaces.

    Args:
        text: The string to filter.
        print_text: When true, print each token with its stop-word flag.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    kept_tokens = []

    for tok in non_nc(text):
        if print_text:
            print(tok, tok.is_stop)
            print('--------------')
        if tok.is_stop or tok.is_punct:
            continue
        kept_tokens.append(str(tok))

    return ' '.join(kept_tokens)


def create_svo_lists(doc, print_lists=False):
    """Sort the tokens of `doc` into subjects, verbs, and objects.

    A token is classified by its dependency label (`dep_`): labels in
    `SUBJECTS` take priority, then `VERBS`, then `OBJECTS`; any other token
    is ignored.

    Args:
        doc: Iterable of tokens exposing `dep_`, `lower_`, `lemma_`, and `idx`.
        print_lists: When true, print the three lists before returning.

    Returns:
        ``(subject_ls, verb_ls, object_ls)``.  Subjects and objects are
        ``(lower-cased text, idx)`` pairs; verbs are ``(lemma, idx)`` pairs.
    """
    subject_ls, verb_ls, object_ls = [], [], []

    for token in doc:
        dep = token.dep_
        if dep in SUBJECTS:
            subject_ls.append((token.lower_, token.idx))
            continue
        if dep in VERBS:
            verb_ls.append((token.lemma_, token.idx))
            continue
        if dep in OBJECTS:
            object_ls.append((token.lower_, token.idx))

    if print_lists:
        print('SUBJECTS: ', subject_ls)
        print('VERBS: ', verb_ls)
        print('OBJECTS: ', object_ls)

    return subject_ls, verb_ls, object_ls


def remove_duplicates(tup, tup_posn):
    """Keep only the first entry for each distinct value at index `tup_posn`.

    Args:
        tup: Iterable of indexable entries (e.g. a list of tuples).
        tup_posn: Index inside each entry whose value must be unique.

    Returns:
        A list of the retained entries, in their original order.
    """
    first_seen = {}

    for entry in tup:
        # setdefault leaves an existing key untouched, so the earliest entry wins;
        # dict insertion order preserves the order of first appearance.
        first_seen.setdefault(entry[tup_posn], entry)

    return list(first_seen.values())


def remove_dates(tup_ls):
    """Return the entries of `tup_ls` whose third element is not made up solely of digits."""
    return [entry for entry in tup_ls if not entry[2].isdigit()]


def create_svo_triples(text):
    """Build de-duplicated (subject, verb, object) triples from `text`.

    The text is cleaned of control characters, parsed with `nlp`, and split
    into subject/verb/object lists.  Each object is paired with the verb
    whose character offset is closest to it, and every subject is combined
    with every (verb, object) pair.  Stop words and punctuation are removed
    from subjects and objects only at this late stage, so the parser still
    sees full sentences.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(subject, verb, object)`` tuples, de-duplicated on the
        object and with all-digit objects removed.  Empty when the text
        yields no subject, no verb, or no object.
    """
    clean_text = remove_special_characters(text)
    doc = nlp(clean_text)
    subject_ls, verb_ls, object_ls = create_svo_lists(doc)

    # Nothing to assemble; this also avoids calling min() on an empty verb list.
    if not (subject_ls and verb_ls and object_ls):
        return []

    # Stop-word removal runs a full pipeline pass per call, so do it once per
    # distinct phrase instead of once per (subject, object) combination.
    cleaned_cache = {}

    def _cleaned(phrase):
        if phrase not in cleaned_cache:
            cleaned_cache[phrase] = remove_stop_words_and_punct(phrase)
        return cleaned_cache[phrase]

    # Subjects that are blank after cleaning contribute no triples.
    subjects = [s for s in (_cleaned(subj[0]) for subj in subject_ls) if s]

    # The nearest verb depends only on the object, so resolve it once per object.
    verb_object_pairs = []
    for obj in object_ls:
        no_sw_obj = _cleaned(obj[0])
        if not no_sw_obj:
            continue
        # min() returns the first verb at the smallest distance.
        nearest_verb = min(verb_ls, key=lambda v, pos=obj[1]: abs(pos - v[1]))
        verb_object_pairs.append((nearest_verb[0], no_sw_obj))

    # Subject-major order.
    # NOTE(review): because the de-duplication below keys on the object and
    # keeps the first occurrence, each object ends up attached to the first
    # non-blank subject only.
    graph_tup_ls = [
        (subj, verb, obj)
        for subj in subjects
        for verb, obj in verb_object_pairs
    ]

    dedup_tup_ls = remove_duplicates(graph_tup_ls, 2)
    return remove_dates(dedup_tup_ls)

# Add to DataFrame

The following helper functions will be used to get the data we want into the format we need for creating the graph in the database.  They do the following:

- `make_verb_edge_string`: Best practice in Neo4j is to name the edges with the format of `:Verb`.  We will just quickly create a column with a string in that format.
- `add_columns`: For each row, we search the Google Knowledge Graph for the value in the object column to obtain a description of that object, possible node labels, and a URL with the complete information.  These will be used (if they exist, which they won't in all cases) to populate the properties in the graph.
- `add_df_layer`: For each object in the original graph, we will then go add some more SVO triples to the graph based on those objects.  We search the Google Knowledge Graph to obtain the same information in the previous step and add it to the tuple list.

In [15]:
def make_verb_edge_string(verb):
    """Return `verb` upper-cased and wrapped as ``[:VERB]``."""
    return '[:{}]'.format(str(verb).upper())


def _first_result_field(response, *path, default=' '):
    """Read a field of the first search result, or `default` if it is missing or empty."""
    value = response
    try:
        for key in ('itemListElement', 0, 'result') + path:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        return default
    return value if value else default


def add_columns(row, limit=1, indent=True):
    """Add Knowledge Graph details for the row's object to the row.

    Queries the Google Knowledge Graph with the value in the row's third
    position (the ``object`` column of the DataFrames built in this
    notebook) and stores fields of the first result on the row.

    Args:
        row: A pandas Series, as supplied by ``DataFrame.apply(..., axis=1)``.
        limit: Sent as the `limit` request parameter.
        indent: Sent as the `indent` request parameter.

    Returns:
        The same row with ``description``, ``node_labels``, and ``url`` set.
        Any field that is missing or empty in the response is stored as a
        single space (``' '``).
    """
    params = {
        # Positional access via .iloc: plain integer keys on a label-indexed
        # Series are deprecated in recent pandas.
        'query': row.iloc[2],
        'limit': limit,
        'indent': indent,
        'key': api_key,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    url = service_url + '?' + urllib.parse.urlencode(params)

    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(url) as http_response:
        response = json.loads(http_response.read())

    # Each lookup falls back to ' ' both when the field is absent and when it is
    # present but empty, so the three values below are always defined.
    row['description'] = _first_result_field(response, 'detailedDescription', 'articleBody')
    row['node_labels'] = _first_result_field(response, '@type')
    row['url'] = _first_result_field(response, 'detailedDescription', 'url')

    return row


def add_df_layer(df):
    """Build a second layer of triples from the objects of `df`.

    For every value in ``df['object']`` the Google Knowledge Graph is queried
    (one result per object); each returned description is turned into SVO
    triples, which are collected into a new DataFrame and enriched with
    `add_columns`.

    Args:
        df: DataFrame with an ``object`` column.

    Returns:
        A new DataFrame with ``subject``, ``verb``, ``object``, and
        ``edge_string`` columns, passed row by row through `add_columns`.
    """
    layer_triples = []

    for obj in df['object'].tolist():
        # Only the description texts are needed here; labels and URLs are unused.
        descriptions, _labels, _urls = query_google(obj, api_key, limit=1)

        for description in descriptions:
            layer_triples += remove_duplicates(create_svo_triples(description), 2)

    layer_df = pd.DataFrame(layer_triples, columns=['subject', 'verb', 'object'])
    layer_df['edge_string'] = layer_df['verb'].map(make_verb_edge_string)

    return layer_df.apply(add_columns, axis=1)

# Populate the graph from the DataFrame

These are the functions that will be used to take our DataFrame and populate the Neo4j database.  We should note that the `insert_data` function takes our data and processes it in batch mode.  While not necessarily required for this small graph, it is a good idea to use it for larger graphs to speed up the writes.  (See [this blog post](https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4) for more information.)

In [16]:
def create_graph(rows):
    """Write subject/object nodes and their connecting edges to Neo4j.

    Each row yields a ``Subject`` node, an ``Object`` node, and a relationship
    between them whose type is the row's ``edge_string``.

    Args:
        rows: DataFrame with ``subject``, ``object``, and ``edge_string``
            columns, and optionally ``description`` and ``url``.

    Returns:
        Whatever `insert_data` returns for the batched write.
    """
    # Object nodes are merged on `name` alone and their other properties are
    # set only on creation.  Merging on name + description + url would try to
    # create a second node, and so violate the uniqueness constraint on
    # Object.name, whenever an existing object arrives with a different
    # description or url.
    query = '''
    UNWIND $rows AS item
    MERGE (s:Subject {name: item.subject})
    MERGE (o:Object {name: item.object})
      ON CREATE SET o.description = COALESCE(item.description, 'NOT SET'),
                    o.url = COALESCE(item.url, 'NOT SET')
    WITH s, o, item
    CALL apoc.create.relationship(s, item.edge_string, {}, o)
    YIELD rel
    RETURN COUNT(s), COUNT(o), COUNT(rel)
    '''

    return insert_data(query, rows, batch_size=10000)



def insert_data(query, rows, batch_size = 10000):
    """Run `query` against Neo4j in batches of `rows`.

    Each batch is a slice of `rows` converted to a list of record dicts and
    passed to the module-level `conn` as the ``rows`` query parameter.

    Args:
        query: Query text expecting a ``$rows`` parameter.
        rows: DataFrame to send (sliced positionally).
        batch_size: Maximum number of rows per query; must be at least 1.

    Returns:
        ``{'batches': <count>, 'time': <seconds elapsed>}`` after the last
        batch, or ``None`` when `rows` is empty.

    Raises:
        ValueError: If `batch_size` is smaller than 1 (the loop could never
            advance).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    batch = 0
    start = time.time()
    result = None

    while batch * batch_size < len(rows):

        chunk = rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')
        res = conn.query(query, parameters={'rows': chunk})

        # conn.query returns None when the query failed and may return an empty
        # list; only index into the response when there is something there.
        first = res[0] if res else None
        print(first if first else res)

        batch += 1
        result = {'batches': batch, 'time': time.time()-start}
        print(result)

    return result

# Now let's get to work!

In [17]:
# Fetch the Wikipedia summary for Barack Obama; this text seeds the graph.
text = wikipedia.summary('barack obama')
text

'Barack Hussein Obama II ( (listen) bə-RAHK hoo-SAYN oh-BAH-mə; born August 4, 1961) is an American politician and attorney who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, Obama was the first African-American president of the United States. He previously served as a U.S. senator from Illinois from 2005 to 2008 and as an Illinois state senator from 1997 to 2004.\nObama was born in Honolulu, Hawaii. After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black person to be president of the Harvard Law Review. After graduating, he became a civil rights attorney and an academic, teaching constitutional law at the University of Chicago Law School from 1992 to 2004. Turning to elective politics, he represented the 13th district from 1997 until 2004 in the Illinois Senate, when he ran for the U.S. Senate. Obama received nationa

In [22]:
%%time
# Turn the summary into (subject, verb, object) triples and preview the first five.
final_tup_ls = create_svo_triples(text)  
final_tup_ls[0:5]

CPU times: user 46 s, sys: 52 ms, total: 46.1 s
Wall time: 46.2 s


[('oh bah mə', 'be', 'american politician'),
 ('oh bah mə', 'be', '44th president'),
 ('oh bah mə', 'be', 'united states'),
 ('oh bah mə', 'be', 'democratic party'),
 ('oh bah mə', 'be', 'african american president')]

# Now we create the DataFrame that will be used to populate the graph....

In [23]:
%%time
# One row per triple, plus the edge label string and the Knowledge Graph
# details (description, node labels, url) looked up by add_columns.
df = pd.DataFrame(final_tup_ls, columns = ['subject', 'verb', 'object'])
df['edge_string'] = df['verb'].map(make_verb_edge_string)
df = df.apply(add_columns, axis=1)
df.head()

CPU times: user 2.05 s, sys: 76.1 ms, total: 2.13 s
Wall time: 14.2 s


Unnamed: 0,subject,verb,object,edge_string,description,node_labels,url
0,oh bah mə,be,american politician,[:BE],,[Thing],
1,oh bah mə,be,44th president,[:BE],,,
2,oh bah mə,be,united states,[:BE],"The United States of America, commonly known a...","[Thing, Country, Place, AdministrativeArea]",https://en.wikipedia.org/wiki/United_States
3,oh bah mə,be,democratic party,[:BE],The Democratic Party is one of the two major c...,"[Organization, Thing]",https://en.wikipedia.org/wiki/Democratic_Party...
4,oh bah mə,be,african american president,[:BE],The National Museum of African American Histor...,"[Place, Museum, Thing, TouristAttraction, Civi...",https://en.wikipedia.org/wiki/National_Museum_...


# Connecting to Neo4j

(Note that here we are using the internal networking set up by Docker.)

To avoid duplication of nodes, we begin by creating some constraints on each subject and object based on their names.  For more information on this (along with how to set an index on your nodes), check out [this blog post](https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4).  We then add every row from the DataFrame to the graph via the `create_graph` function.

In [25]:
# Connection settings can be overridden through environment variables so that
# credentials do not have to live in the notebook; the fallbacks are the values
# this notebook originally used with its Docker networking.
conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://neo4j:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    pwd=os.environ.get("NEO4J_PASSWORD", "1234"),
)
# Uniqueness constraints on node names, created only if they do not exist yet.
conn.query('CREATE CONSTRAINT subj_constraint IF NOT EXISTS ON (s:Subject) ASSERT s.name IS UNIQUE')
conn.query('CREATE CONSTRAINT obj_constraint IF NOT EXISTS ON (o:Object) ASSERT o.name IS UNIQUE')
create_graph(df)

<Record COUNT(s)=112 COUNT(o)=112 COUNT(rel)=112>
{'batches': 1, 'time': 0.43721604347229004}


{'batches': 1, 'time': 0.43721604347229004}

# Build out the graph more

We have what amounts to a very small graph.  So let's build that out a bit more.  We are now going to take everything from the `object` column in the initial DataFrame, query Google for it, and get the SVO triples for each of those.  This will take a minute or two, but could be parallelized or run through something like `dask` to speed it up.

In [26]:
%%time
# Expand the graph: derive new triples from the Knowledge Graph description of each object in df.
new_df = add_df_layer(df)

CPU times: user 21.3 s, sys: 701 ms, total: 22.1 s
Wall time: 1min 28s


In [27]:
# (rows, columns) of the second-layer DataFrame.
new_df.shape

(506, 7)

In [28]:
# Write the second-layer triples to Neo4j.
create_graph(new_df)

<Record COUNT(s)=506 COUNT(o)=506 COUNT(rel)=506>
{'batches': 1, 'time': 0.26596856117248535}


{'batches': 1, 'time': 0.26596856117248535}

# But Michelle Obama is not in this graph???  Let's go ahead and add her...

In [30]:
# Repeat the extraction for Michelle Obama's Wikipedia summary and preview the first five triples.
michelle = wikipedia.summary('michelle obama')
michelle_tup_ls = create_svo_triples(michelle)
michelle_tup_ls[0:5]

[('michelle lavaughn robinson obama', 'be', 'american attorney'),
 ('michelle lavaughn robinson obama', 'be', 'lady'),
 ('michelle lavaughn robinson obama', 'be', 'united states'),
 ('michelle lavaughn robinson obama', 'be', 'african american woman'),
 ('michelle lavaughn robinson obama', 'be', 'position')]

In [31]:
# Same DataFrame construction as for the first summary: triples, edge label
# string, then Knowledge Graph details via add_columns.
michelle_df = pd.DataFrame(michelle_tup_ls, columns = ['subject', 'verb', 'object'])
michelle_df['edge_string'] = michelle_df['verb'].map(make_verb_edge_string)
michelle_df = michelle_df.apply(add_columns, axis=1)
michelle_df.head()

Unnamed: 0,subject,verb,object,edge_string,description,node_labels,url
0,michelle lavaughn robinson obama,be,american attorney,[:BE],,[Thing],
1,michelle lavaughn robinson obama,be,lady,[:BE],The word lady is a term of respect for a girl ...,[Thing],https://en.wikipedia.org/wiki/Lady
2,michelle lavaughn robinson obama,be,united states,[:BE],"The United States of America, commonly known a...","[Place, Country, AdministrativeArea, Thing]",https://en.wikipedia.org/wiki/United_States
3,michelle lavaughn robinson obama,be,african american woman,[:BE],,[Thing],
4,michelle lavaughn robinson obama,be,position,[:BE],"A job, employment, work or occupation, is a pe...",[Thing],https://en.wikipedia.org/wiki/Job


In [32]:
# Write the Michelle Obama triples to Neo4j.
create_graph(michelle_df)

<Record COUNT(s)=38 COUNT(o)=38 COUNT(rel)=38>
{'batches': 1, 'time': 0.08252167701721191}


{'batches': 1, 'time': 0.08252167701721191}

In [34]:
# Build the second layer from the objects in michelle_df and write it to Neo4j.
new_michelle_df = add_df_layer(michelle_df)
create_graph(new_michelle_df)

<Record COUNT(s)=146 COUNT(o)=146 COUNT(rel)=146>
{'batches': 1, 'time': 0.05074310302734375}


{'batches': 1, 'time': 0.05074310302734375}