In [1]:
import json
import os
import re
import time
import urllib
from pprint import pprint

import numpy as np
import pandas as pd
import spacy
import wikipedia
from py2neo import Node, Graph, Relationship, NodeMatcher
from py2neo.bulk import merge_nodes
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token
from tqdm import tqdm

# Show which spaCy version this run used, so the results below can be tied to it.
print(spacy.__version__)

3.0.3


In [2]:
# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook. The fallbacks are the values
# this notebook previously hard-coded, so it still runs unchanged when none of
# the variables are set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://neo4j:7687")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "1234")

# `graph` is the handle every later cell uses to run Cypher queries;
# `nodes_matcher` is a NodeMatcher bound to that same graph.
graph = Graph(NEO4J_URI, name=NEO4J_DATABASE, password=NEO4J_PASSWORD)
nodes_matcher = NodeMatcher(graph)

In [5]:
# Fetch the listed properties of every :Node and load them into a DataFrame.
node_cursor = graph.run('MATCH (n:Node) RETURN n.name, n.description, n.node_labels, n.url, n.word_vec')
df = node_cursor.to_data_frame()

# Replace the returned column labels with short, plain names.
short_column_names = ['name', 'description', 'node_labels', 'url', 'word_vec']
df.columns = short_column_names

df.head()

Unnamed: 0,name,description,node_labels,url,word_vec
0,house,"A house is a single-unit residential building,...",[Thing],https://en.wikipedia.org/wiki/House,"[0.04616522789001465, 0.3092450797557831, -0.0..."
1,representatives,"The speaker of a deliberative assembly, especi...",[Thing],https://en.wikipedia.org/wiki/Speaker_(politics),"[-0.014807065017521381, 0.2798156142234802, -0..."
2,lower chamber,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,national bicameral legislature,Bicameralism is the practice of having a legis...,[Thing],https://en.wikipedia.org/wiki/Bicameralism,"[0.011496711522340775, 0.1557886153459549, -0...."
4,article,An article is any member of a class of dedicat...,[Thing],https://en.wikipedia.org/wiki/Article_(grammar),"[-0.022769873961806297, 0.18098898231983185, -..."


# Let's try some more entity disambiguation

Recall that in the previous notebook we looked at the cosine similarity of word vectors. Now let's instead look at the overlap between the relationships of our starting node, 'oh bah mə', and those of 'barack hussein obama ii'. If the two nodes are strongly similar, we would expect to see a lot of relationship overlap.


In [41]:
# Distinct names of all nodes directly connected (in either direction) to the
# 'barack hussein obama ii' node.
bho = graph.run('MATCH (n:Node {name: "barack hussein obama ii"})--(m) RETURN DISTINCT m.name')
bho_ls = [row[0] for row in bho]  # each record holds a single field: m.name
print('Total number of connected nodes: ', len(bho_ls))
bho_ls

Total number of connected nodes:  5


['american politician',
 '44th president',
 'united states',
 'democratic party',
 'african american president']

In [38]:
# Distinct names of all nodes directly connected (in either direction) to the
# 'oh bah mə' node.
obm = graph.run('MATCH (n:Node {name: "oh bah mə"})--(m) RETURN DISTINCT m.name')
obm_ls = [row[0] for row in obm]  # each record holds a single field: m.name
obm_ls

['american politician',
 '44th president',
 'united states',
 'democratic party',
 'african american president',
 'illinois',
 'u.s senator',
 'great recession',
 'economic stimuli',
 'honolulu',
 'illinois state senator',
 'columbia university',
 'community organizer',
 'chicago',
 'harvard law school',
 'black person',
 'harvard law review',
 'civil rights attorney',
 'president',
 'university',
 'constitutional law',
 'law school',
 'elective politics',
 '13th district',
 'u.s senate',
 'illinois senate',
 'national attention',
 'march senate primary win',
 'ukraine',
 'u.s relations',
 '2016 u.s elections',
 'iran',
 'campaign',
 'jcpoa nuclear deal',
 'sanctions',
 'interference',
 'senate',
 'global climate change',
 'invasion',
 'cuba',
 'justices',
 'russia',
 'close primary campaign',
 'hillary clinton',
 '  general election',
 'republican nominee john mccain',
 'running mate',
 '2009 nobel peace prize laureate',
 'january',
 'lengthy debate',
 'landmark bills',
 'national deb

In [40]:
# Compare the two neighbour lists as sets and show the names they share.
bho_set, obm_set = set(bho_ls), set(obm_ls)
shared_neighbours = bho_set.intersection(obm_set)
# Print only when there is at least one neighbour in common.
if shared_neighbours:
    print(shared_neighbours)

{'democratic party', 'african american president', 'united states', '44th president', 'american politician'}


# Observation

So we can see that 100% of the nodes connected to 'barack hussein obama ii' are also in the connected-node list of 'oh bah mə'. This is a strong indicator that the former might be the same entity as the latter.