In [1]:
!pip install py2neo
!pip install wikipedia
!pip install spacy==3.0.3
!pip install scikit-learn
!pip install pandas





In [2]:
import json
import os
import re
import time
import urllib
from collections import Counter
from pprint import pprint

import numpy as np
import pandas as pd
import spacy
import wikipedia
from py2neo import Node, Graph, Relationship, NodeMatcher
from py2neo.bulk import merge_nodes
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token
from tqdm import tqdm

print(spacy.__version__)

3.0.3


In [3]:
# !python3 -m spacy download en_core_web_md
# !python3 -m spacy download en_core_web_lg

In [4]:
# Dependency labels (`token.dep_` values) that create_svo_lists collects as subjects.
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
# Dependency labels that create_svo_lists collects as verbs.
VERBS = ['ROOT', 'advcl']
# Dependency labels that create_svo_lists collects as objects.
OBJECTS = ["dobj", "dative", "attr", "oprd", 'pobj']
# Named-entity labels; not referenced anywhere else in the visible notebook.
ENTITY_LABELS = ['PERSON', 'NORP', 'GPE', 'ORG', 'FAC', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART']


# Pipeline WITHOUT noun-chunk merging; remove_stop_words_and_punct uses it to
# filter individual tokens.
non_nc = spacy.load('en_core_web_lg')

# Main parsing pipeline used by create_svo_triples and the corpus cells.
# NOTE(review): this loads the `md` model while `non_nc` loads `lg` — confirm
# that using two different models is intentional.
nlp = spacy.load('en_core_web_md')

# Append the noun-chunk merger so multi-word noun phrases come out as single
# tokens (see the pipe names printed below).
nlp.add_pipe('merge_noun_chunks')



print(non_nc.pipe_names)
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'merge_noun_chunks']


## Data Cleaning

In [5]:
def remove_special_characters(text):
    """Return ``text`` with each newline, carriage return and tab turned into a space.

    Every matching character is replaced individually, so a run of such
    characters becomes a run of spaces of the same length.
    """
    return re.sub(r'[\n\r\t]', " ", text)


def remove_stop_words_and_punct(text, print_text=False):
    """Strip stop words and punctuation from ``text``.

    The text is tokenized with the module-level ``non_nc`` pipeline. Tokens
    flagged as stop words or punctuation are discarded and the remaining
    tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.
    print_text : bool
        When true, print every token with its ``is_stop`` flag, followed by a
        separator line.

    Returns
    -------
    str
        The surviving tokens separated by spaces ('' when none survive).
    """
    kept_tokens = []

    for tok in non_nc(text):
        if print_text:
            print(tok, tok.is_stop)
            print('--------------')
        if tok.is_stop or tok.is_punct:
            continue
        kept_tokens.append(str(tok))

    return ' '.join(kept_tokens)


def create_svo_lists(doc, print_lists):
    """Sort the tokens of ``doc`` into subject, verb and object lists.

    A token is classified by its ``dep_`` label against SUBJECTS, VERBS and
    OBJECTS (checked in that order); tokens matching none are ignored.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``dep_``, ``lower_``, ``lemma_`` and ``idx``.
    print_lists : bool
        When true, print the three lists before returning.

    Returns
    -------
    tuple of three lists
        ``(subjects, verbs, objects)``. Subjects and objects are
        ``(token.lower_, token.idx)`` pairs; verbs are
        ``(token.lemma_, token.idx)`` pairs.
    """
    subject_ls, verb_ls, object_ls = [], [], []

    for tok in doc:
        role = tok.dep_
        if role in SUBJECTS:
            bucket, label = subject_ls, tok.lower_
        elif role in VERBS:
            bucket, label = verb_ls, tok.lemma_
        elif role in OBJECTS:
            bucket, label = object_ls, tok.lower_
        else:
            continue
        bucket.append((label, tok.idx))

    if print_lists:
        print('SUBJECTS: ', subject_ls)
        print('VERBS: ', verb_ls)
        print('OBJECTS: ', object_ls)

    return subject_ls, verb_ls, object_ls


def remove_duplicates(tup, tup_posn):
    """Keep only the first item seen for each distinct value at index ``tup_posn``.

    Parameters
    ----------
    tup : iterable of indexable items
        Items to filter; the value at ``tup_posn`` must be hashable.
    tup_posn : int
        Position inside each item that serves as the uniqueness key.

    Returns
    -------
    list
        The retained items, in their original order.
    """
    first_by_key = {}
    for item in tup:
        # setdefault only stores the item when its key has not been seen yet.
        first_by_key.setdefault(item[tup_posn], item)
    return list(first_by_key.values())


def remove_dates(tup_ls):
    """Drop every entry whose third element is a string made only of digits.

    Entries whose third element is empty or contains any non-digit character
    are kept, in their original order.
    """
    return [entry for entry in tup_ls if not entry[2].isdigit()]


def create_svo_triples(text, print_lists=False):
    """Extract (subject, verb, object) triples from ``text``.

    The text is cleaned with remove_special_characters, parsed with ``nlp``
    and split into subject/verb/object lists by create_svo_lists. Every
    subject is paired with every object; the verb of a pair is the one whose
    ``token.idx`` offset is closest to the object's. Stop words and
    punctuation are stripped from subject and object only at this stage, so
    the parser still sees the complete sentence.

    Parameters
    ----------
    text : str
        Raw text to parse.
    print_lists : bool
        Forwarded to create_svo_lists to print the intermediate lists.

    Returns
    -------
    list of (str, str, str)
        Triples after removing duplicates on the object (only the first
        triple per distinct object is kept) and dropping triples whose object
        is purely numeric. Empty when the text has no subject, verb or object.
    """
    clean_text = remove_special_characters(text)
    doc = nlp(clean_text)
    subject_ls, verb_ls, object_ls = create_svo_lists(doc, print_lists=print_lists)

    # Nothing to pair up. In particular, with no verb the nearest-verb lookup
    # below would call min() on an empty sequence and raise ValueError.
    if not (subject_ls and verb_ls and object_ls):
        return []

    # Remove stop words from subjects and objects. Each call runs a spaCy
    # pipeline, so do it once per subject/object instead of once per pair.
    clean_subjects = [remove_stop_words_and_punct(subj[0]) for subj in subject_ls]

    # The verb chosen for an object depends only on the object, so resolve it
    # once per object. min() returns the first verb at the smallest distance.
    obj_verb_pairs = []
    for obj in object_ls:
        no_sw_obj = remove_stop_words_and_punct(obj[0])
        if not no_sw_obj:
            continue
        nearest_verb = min(verb_ls, key=lambda v: abs(obj[1] - v[1]))
        obj_verb_pairs.append((nearest_verb[0], no_sw_obj))

    # Add entries iff neither subject nor object is blank, subject-major order.
    graph_tup_ls = [
        (no_sw_subj, verb, no_sw_obj)
        for no_sw_subj in clean_subjects
        if no_sw_subj
        for verb, no_sw_obj in obj_verb_pairs
    ]

    dedup_tup_ls = remove_duplicates(graph_tup_ls, 2)
    return remove_dates(dedup_tup_ls)

## Helper Functions

In [6]:
def get_obj_properties(tup_ls):
    """Return a new list of 3-tuples built from the first three elements of each input tuple.

    Any elements beyond the third are discarded.
    """
    return [(tup[0], tup[1], tup[2]) for tup in tup_ls]


def add_layer(tup_ls):
    """Extract SVO triples from the text stored at index 3 of each tuple.

    Tuples whose fourth element is falsy are skipped. The triples from all
    remaining tuples are concatenated and passed through get_obj_properties.
    """
    svo_tup_ls = []

    for tup in tup_ls:
        layer_text = tup[3]
        if not layer_text:
            continue
        svo_tup_ls.extend(create_svo_triples(layer_text))

    return get_obj_properties(svo_tup_ls)
        

def subj_equals_obj(tup_ls):
    """Drop self-referencing triples, i.e. those whose subject equals the object.

    Returns the surviving entries as fresh ``(subject, verb, object)``
    3-tuples; any extra elements are discarded.
    """
    return [(tup[0], tup[1], tup[2]) for tup in tup_ls if tup[0] != tup[2]]


def check_for_string_labels(tup_ls):
    """Keep only the entries whose third element is a list.

    This guards an edge case where the object is not fully populated and the
    node labels end up as a string instead of a list. It may not be strictly
    necessary and is not called anywhere in the visible notebook; run it if
    you come across that case.
    """
    return [el for el in tup_ls if isinstance(el[2], list)]


def create_word_vectors(tup_ls):
    """Append a vector to the first three elements of every tuple.

    The vector is NOT derived from the words: it is a fresh array of 300
    values drawn uniformly from [-1.0, 1.0) with ``np.random.uniform``, one
    draw per input tuple, in input order.

    Returns
    -------
    list of 4-tuples
        ``(tup[0], tup[1], tup[2], vector)`` for each input tuple.
    """
    return [
        (tup[0], tup[1], tup[2], np.random.uniform(low=-1.0, high=1.0, size=(300,)))
        for tup in tup_ls
    ]

## Helper functions that build the node and edge lists used to populate the graph

In [29]:
def dedup(tup_ls):
    """Reduce the input to one ``(tup[0], tup[1])`` pair per distinct ``tup[0]``.

    The first occurrence of each key wins and the original order is kept.
    Elements beyond the second are discarded.
    """
    first_seen = {}

    for tup in tup_ls:
        key = tup[0]
        if key not in first_seen:
            first_seen[key] = (key, tup[1])

    return list(first_seen.values())


def convert_vec_to_ls(tup_ls):
    """Convert the second element of every pair into a plain list of floats.

    Parameters
    ----------
    tup_ls : iterable
        Items whose element 0 is kept as-is and whose element 1 is an
        iterable of values accepted by ``float()``.

    Returns
    -------
    list of (object, list of float)
    """
    converted = []

    for name_and_vec in tup_ls:
        floats = list(map(float, name_and_vec[1]))
        converted.append((name_and_vec[0], floats))

    return converted


def add_nodes(tup_ls):
    """Bulk-merge node records into the graph and print the resulting node count.

    Each record in ``tup_ls`` supplies the values for the ``name`` and
    ``word_vec`` keys, in that order. Records are merged through
    ``merge_nodes`` with the merge key ``('Node', 'name')`` on an auto-commit
    transaction of the module-level ``graph``.
    """
    merge_nodes(graph.auto(), tup_ls, ('Node', 'name'), keys=['name', 'word_vec'])

    node_count = graph.nodes.match('Node').count()
    print('Number of nodes in graph: ', node_count)

In [30]:
def add_edges(edge_ls):
    """Create one relationship per edge tuple, typed with the tuple's verb.

    Edges are grouped by verb and each group is written in its own
    transaction on the module-level ``graph``. Endpoints are looked up by
    ``name`` through ``nodes_matcher``; missing endpoints are created as
    ``Node`` nodes.

    Parameters
    ----------
    edge_ls : iterable of tuples
        ``(subject, verb, object)`` or ``(subject, verb, object, word_vec)``.
        When a fourth element is present it is stored as the ``word_vec``
        property of a newly created object node.

    Notes
    -----
    The previous version truncated every tuple to three elements while
    grouping. Reading ``el[3]`` therefore always raised IndexError, a bare
    ``except`` swallowed it, and any edge whose object node did not already
    exist was silently dropped.
    """
    # Group the edges by verb, keeping the complete tuple so that the optional
    # word vector is still available when the object node has to be created.
    # Result: {verb1: [(sub1, verb1, obj1, ...), ...], verb2: [...]}
    edge_dc = {}
    for tup in edge_ls:
        edge_dc.setdefault(tup[1], []).append(tup)

    # Nodes already resolved during this call, by name.
    # NOTE(review): nodes created in a not-yet-committed transaction are
    # presumably invisible to nodes_matcher, so without this cache a name
    # occurring twice in one group could be created twice — confirm.
    node_cache = {}

    def _get_or_create(tx, name, word_vec=None):
        """Return the node called ``name``, creating it in ``tx`` if missing."""
        node = node_cache.get(name)
        if node is None:
            node = nodes_matcher.match(name=name).first()
        if node is None:
            properties = {'name': name}
            if word_vec is not None:
                # Store plain floats, like convert_vec_to_ls does for add_nodes.
                properties['word_vec'] = [float(v) for v in word_vec]
            node = Node('Node', **properties)
            tx.create(node)
        node_cache[name] = node
        return node

    for edge_label, tup_ls in tqdm(edge_dc.items()):

        tx = graph.begin()

        for el in tup_ls:
            source_node = _get_or_create(tx, el[0])
            target_node = _get_or_create(tx, el[2], el[3] if len(el) > 3 else None)
            try:
                rel = Relationship(source_node, edge_label, target_node)
            except Exception as exc:
                # Still best effort: skip this edge, but report it instead of
                # hiding the failure.
                print('Skipping edge {!r}: {}'.format(tuple(el[:3]), exc))
                continue
            tx.create(rel)

        tx.commit()

## Read File

In [9]:
# Read the raw corpus. The context manager closes the file handle as soon as
# the content has been read, instead of leaking it for the kernel's lifetime.
with open('./data/unsup_data.txt', 'r') as f:
    text = f.read()

In [10]:
# Replace newlines/tabs with spaces, then parse the whole corpus with the
# noun-chunk-merging pipeline.
clean_text = remove_special_characters(text)
doc = nlp(clean_text)

In [11]:
# Turn every '. ' into a newline so the corpus can be split into rows later.
# NOTE: this rebinds `text`, so re-running the cell above out of order would
# operate on this version rather than on the raw file content.
text = clean_text.replace('. ', '\n')

In [12]:
# Text of every token in the parsed corpus that is neither a stop word nor
# punctuation.
words = [token.text
         for token in doc
         if not token.is_stop and not token.is_punct]

In [13]:
word_freq = Counter(words)

In [14]:
# One row per line of `text`, i.e. per '. '-delimited chunk of the cleaned corpus.
rows = text.split('\n')

In [15]:
rows[:10]

['It is made quite clear that the supplier shall be required to fulfill and implement the duties, obligations and responsibilities laid down in the pertinent applicable standard [IATF 16949:2016]',
 "Purpose of the document Particular attention is drawn to the following duties and obligations: Purpose of the document The supplier shall determine the customer's requirements in accordance with section 4.3.2 of [IATF 16949:2016] and satisfy them in an effort to improve customer satisfaction",
 'Purpose of the document Among other things, this means that the supplier shall determine requirements that have not been specified by the client but are necessary for the specified or intended use in accordance with section 8.2.2 of [IATF 16949:2016], and shall be obliged to define the properties of the product that are essential to ensure that it can be used safely for its intended purpose in accordance with section 8.2.3 of [IATF 16949:2016]',
 'Purpose of the document Furthermore, the supplier s

## Parse text and build graph

In [26]:
def text_to_kg(text:str, failed_texts:list, print_lists:bool):
    """Extract SVO triples from ``text`` and write them to the graph.

    Steps: build the triples, drop those whose subject equals the object,
    attach a random vector to each edge, derive the node records, then call
    add_nodes and add_edges. The extracted triples are always printed.

    Parameters
    ----------
    text : str
        Text to process.
    failed_texts : list
        Mutated in place: ``text`` is appended when any step raises.
    print_lists : bool
        Forwarded to create_svo_triples to print the intermediate lists.

    Returns
    -------
    None
    """
    try:
        initial_tup_ls = create_svo_triples(text, print_lists=print_lists)

        init_obj_tup_ls = get_obj_properties(initial_tup_ls)
        print(init_obj_tup_ls)
        edge_ls = subj_equals_obj(init_obj_tup_ls)

        edges_word_vec_ls = create_word_vectors(edge_ls)

        # Node records are (name, word_vec) pairs, because dedup keeps only
        # elements 0 and 1. The subject record used to be a 5-tuple whose
        # element 1 was '', so the subject node was stored with an empty
        # word_vec. An empty edge list raises IndexError here and the text is
        # recorded as failed below.
        orig_node_tup_ls = [(edge_ls[0][0], np.random.uniform(low=-1.0, high=1.0, size=(300,)))]
        obj_node_tup_ls = [(tup[2], tup[3]) for tup in edges_word_vec_ls]
        full_node_tup_ls = orig_node_tup_ls + obj_node_tup_ls
        dedup_node_tup_ls = dedup(full_node_tup_ls)

        node_tup_ls = convert_vec_to_ls(dedup_node_tup_ls)

        add_nodes(node_tup_ls)
        add_edges(edges_word_vec_ls)
    except Exception:
        # Best effort per text: remember the failure and carry on. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop a long run.
        failed_texts.append(text)

## Connect to Neo4j

In [17]:
# Connection settings can be overridden through environment variables so that
# real credentials never have to be committed with the notebook. The defaults
# match the previous hard-coded local development values.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7999")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "secret")

graph = Graph(NEO4J_URI, name="neo4j", password=NEO4J_PASSWORD)
nodes_matcher = NodeMatcher(graph)

In [None]:
%%time
# Build the graph row by row. text_to_kg appends every row whose processing
# raised to failed_texts, so the list can be inspected or retried afterwards.
failed_texts = []
for t in rows:
    text_to_kg(t, failed_texts, print_lists=False)

In [None]:
len(failed_texts)

In [None]:
%%time
failed_texts = []
for t in failed_texts:
    text_to_kg(t, [], print_lists=True)

## ============================================

## Start Building

In [31]:
text = 'Particular attention is drawn to the following duties and obligations: Purpose of the document'

In [32]:
text_to_kg(text, [], print_lists=False)

[('particular attention', 'draw', 'following duties'), ('particular attention', 'purpose', 'document')]
Number of nodes in graph:  3


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 43.60it/s]


In [22]:
%%time
initial_tup_ls = create_svo_triples(text, print_lists=False)

CPU times: user 26.3 ms, sys: 2.01 ms, total: 28.3 ms
Wall time: 27.2 ms


In [23]:
initial_tup_ls

[('particular attention', 'draw', 'following duties'),
 ('particular attention', 'purpose', 'document')]

In [24]:
%%time
# Step-by-step replay of the first half of text_to_kg on the sample sentence.
init_obj_tup_ls = get_obj_properties(initial_tup_ls)
print(init_obj_tup_ls)
starter_edge_ls = init_obj_tup_ls
# Drop triples whose subject equals their object.
edge_ls = subj_equals_obj(starter_edge_ls)
# Alias of edge_ls; not used again in the visible notebook.
clean_edge_ls = edge_ls

[('particular attention', 'draw', 'following duties'), ('particular attention', 'purpose', 'document')]
CPU times: user 172 µs, sys: 69 µs, total: 241 µs
Wall time: 224 µs


In [25]:
edges_word_vec_ls = create_word_vectors(edge_ls)

## Creating some lists of tuples representing the node and edge lists

In [None]:
# Node records are (name, word_vec) pairs, because dedup keeps only elements 0
# and 1 of each record. The subject record used to be a 5-tuple whose element
# 1 was '', which left the subject node without a usable vector.
orig_node_tup_ls = [(edge_ls[0][0], np.random.uniform(low=-1.0, high=1.0, size=(300,)))]
obj_node_tup_ls = [(tup[2], tup[3]) for tup in edges_word_vec_ls]
full_node_tup_ls = orig_node_tup_ls + obj_node_tup_ls
dedup_node_tup_ls = dedup(full_node_tup_ls)

len(full_node_tup_ls), len(dedup_node_tup_ls)

## Create the node list that will be used to populate the graph

In [None]:
node_tup_ls = convert_vec_to_ls(dedup_node_tup_ls)

In [None]:
# Re-creates the connection already opened in the "Connect to neo4j" section.
# Settings can be overridden through environment variables so that real
# credentials never have to be committed; the defaults match the previous
# hard-coded local development values.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7999")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "secret")

graph = Graph(NEO4J_URI, name="neo4j", password=NEO4J_PASSWORD)
nodes_matcher = NodeMatcher(graph)

In [None]:
%%time
add_nodes(node_tup_ls)

In [None]:
%%time
add_edges(edges_word_vec_ls)

## Usage

In [None]:
def get_word_vec_similarity(node1, node2, node_ls):
    """Cosine similarity between the vectors of two named nodes.

    Parameters
    ----------
    node1, node2 : str
        Names of the nodes to compare.
    node_ls : iterable of tuples
        Node records whose first element is the node name and whose LAST
        element is the vector. This covers both the ``(name, word_vec)``
        pairs produced by dedup and 5-element records with the vector at
        index 4. The previous hard-coded ``tup[4]`` raised IndexError on the
        2-element pairs.

    Returns
    -------
    The result of ``cosine_similarity`` on the matching vectors: one row per
    record named ``node1``, one column per record named ``node2``.

    Raises
    ------
    ValueError
        If either name does not occur in ``node_ls``.
    """
    node1_vec = [tup[-1] for tup in node_ls if tup[0] == node1]
    node2_vec = [tup[-1] for tup in node_ls if tup[0] == node2]

    # Fail with a message naming the missing node rather than with an opaque
    # error on an empty input further down.
    for name, vecs in ((node1, node1_vec), (node2, node2_vec)):
        if not vecs:
            raise ValueError('No node named {!r} in node_ls'.format(name))

    return cosine_similarity(node1_vec, node2_vec)

In [None]:
# NOTE(review): both names must occur in dedup_node_tup_ls. The sample
# sentence processed above only yields 'particular attention',
# 'following duties' and 'document', so confirm that this cell is run against
# a node list that actually contains 'company' and 'standard'.
cs = get_word_vec_similarity('company', 'standard', dedup_node_tup_ls)
print(cs)

## Read PDF

In [None]:
!pip install PyPDF2

import PyPDF2

In [None]:
from PyPDF2 import PdfReader

# Concatenate the extracted text of every page, adding one newline after each.
# NOTE: this rebinds `text`, which earlier cells used for the corpus and for
# the sample sentence.
reader = PdfReader("./data/test.pdf")
text = ""
for page in reader.pages:
    text += page.extract_text() + "\n"

In [None]:
len(reader.pages)

In [None]:
# Replace `text` with the content of the single page at index 20; this needs a
# PDF with at least 21 pages.
text = reader.pages[20].extract_text()