# Building the Full Graph

In [1]:
%%script bash
# Install en_coref_lg if it hasn't been installed already.
# Branch on the exit status of `pip show` itself. A `[ ! $? ]` test only checks
# whether the string "$?" is empty (it never is), so it can never trigger the
# install branch.
if ! pip show en_coref_lg 2>/dev/null; then
  pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_lg-3.0.0/en_coref_lg-3.0.0.tar.gz;
  # Print the package metadata again to confirm the install succeeded.
  pip show en_coref_lg 2>/dev/null;
fi

Name: en-coref-lg
Version: 3.0.0
Summary: Coref-added English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse, named entities and coreference clusters.
Home-page: https://huggingface.co
Author: HuggingFace Inc. & Explosion AI
Author-email: thomas@huggingface.co
License: CC BY-SA 3.0
Location: /home/dan/.pyenv/versions/3.7.0/lib/python3.7/site-packages
Requires: spacy
Required-by: 


In [2]:
# Models
import spacy
from spacy import displacy
import en_coref_lg
import networkx as nx

# Data Wrangling
import pandas as pd
import csv
import dill as pickle
import helpers.data as data_helper
import helpers.graph_generator as graph_generator

# Utils
import time
from graphviz import Source
from tqdm import tqdm
from importlib import reload

In [3]:
%%time

# Load the en_coref_lg model once; later cells hand this `nlp` object to
# data_helper.parse_sentences.
nlp = en_coref_lg.load()

CPU times: user 16.7 s, sys: 3.55 s, total: 20.3 s
Wall time: 29.9 s


In [4]:
# List the contents of ../data.
! ls ../data

document.csv   entity_ids.pkl	sentences-coref.pkl  www.glozman.com
documents.pkl  name_entity.csv	sentences.pkl
download.sh    sentence.csv	trial_graph.pkl


In [5]:
%%time

# Load the labeled entity table (indexed by entity_id, per the preview below).
entities = data_helper.get_labeled_data()

CPU times: user 27.4 s, sys: 792 ms, total: 28.2 s
Wall time: 28.3 s


In [6]:
# Preview the first rows of the labeled entities.
entities.head()

Unnamed: 0_level_0,document_id,type,sentence_index,start_word_index,end_word_index,string,sentence_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:0:5:5,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,0,5,5,first,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:19:20,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,19,20,Ye Daying,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:47:48,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,47,48,Ye Ting,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:11:11,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,19,11,11,second,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:3:4,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,19,3,4,Ye Zhengming,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...


In [7]:
%%time

# Load the sentence table (indexed by sentence_id, per the preview below).
sentences = data_helper.get_sentences()

CPU times: user 11 s, sys: 196 ms, total: 11.2 s
Wall time: 11.2 s


In [8]:
# Preview the first rows of the sentence table.
sentences.head()

Unnamed: 0_level_0,document_id,sentence_index,sentence
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bn/abc/00/abc_0008@0008@abc@bn@en@on:0,bn/abc/00/abc_0008@0008@abc@bn@en@on,0,The explosion in Yemen did not help an already...
bn/abc/00/abc_0006@0006@abc@bn@en@on:0,bn/abc/00/abc_0006@0006@abc@bn@en@on,0,"Still in Asia , President Clinton signed a bil..."
bn/abc/00/abc_0006@0006@abc@bn@en@on:1,bn/abc/00/abc_0006@0006@abc@bn@en@on,1,This will end the annual review of China 's tr...
bn/abc/00/abc_0006@0006@abc@bn@en@on:2,bn/abc/00/abc_0006@0006@abc@bn@en@on,2,"China , in return , has agreed to open its mar..."
bn/abc/00/abc_0012@0012@abc@bn@en@on:0,bn/abc/00/abc_0012@0012@abc@bn@en@on,0,"And in Yemen , the investigation into the bomb..."


In [9]:
# Take the first 100 labeled entities as a small trial set and show its size.
trial_ents = entities.head(100)
trial_ents.shape[0]

100

In [10]:
# (document_id, sentence_index) pairs, one per trial entity; used below to look
# up the sentences those entities belong to.
trial_ent_ids = list(zip(trial_ents.document_id, trial_ents.sentence_index))

In [13]:
# Spot-check one of the (document_id, sentence_index) pairs.
trial_ent_ids[9]

('bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on', 25)

In [14]:
# One pair per trial entity, so this should equal len(trial_ents).
len(trial_ent_ids)

100

In [15]:
%%time

trial_sents = [sentence for _, sentence in sentences.iterrows() if (sentence.document_id, sentence.sentence_index) in trial_ent_ids]

CPU times: user 10.7 s, sys: 0 ns, total: 10.7 s
Wall time: 10.7 s


In [16]:
# Report how many sentences matched (several trial entities can share one
# sentence), then rebuild the matched rows as a DataFrame with just the three
# sentence columns. Note that this rebinds `trial_sents` from a list to a DataFrame.
print(len(trial_sents))
trial_sents = pd.DataFrame(trial_sents, columns=['document_id', 'sentence_index', 'sentence'])

50


In [17]:
# Run the loaded `nlp` model over the trial sentences via the project helper.
parsed_sentences = data_helper.parse_sentences(trial_sents, nlp=nlp)

100%|██████████| 50/50 [00:02<00:00, 24.25it/s]


In [18]:
# Store the parse results alongside their sentences (adds a column to trial_sents in place).
trial_sents['spacy_parsed'] = parsed_sentences

In [19]:
# Sanity check: label of the first entity found in the first parsed trial sentence.
trial_sents.iloc[0].spacy_parsed.ents[0].label_

'ORDINAL'

In [20]:
# Build the entity input table from the parsed trial sentences and preview it.
# NOTE(review): presumably these are the entities found by the parser rather than
# the labeled ones (the spans in the preview differ from `entities`) — confirm in
# helpers/data.py.
inputs = data_helper.get_inputs_from_sentences(trial_sents)
inputs.head()

Unnamed: 0_level_0,document_id,sentence_id,type,sentence_index,start_index,end_index,string
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:0:5:5,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,ORDINAL,0,5,5,first
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:20:21,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,PERSON,18,20,21,Ye Daying
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:47:50,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,PERSON,18,47,50,Marshall Ye Ting 's
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:3:4,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,PERSON,19,3,4,Ye Zhengming
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:7:10,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,PERSON,19,7,10,Marshall Ye Ting 's


In [21]:
%%time

# Build the trial graph from the extracted entity inputs and their parsed sentences.
trial_graph = graph_generator.generate_graph(inputs, trial_sents)

CPU times: user 62 ms, sys: 214 µs, total: 62.3 ms
Wall time: 59.6 ms


In [22]:
# Inspect the result: a dict mapping each entity type to a networkx DiGraph.
trial_graph

{'ORDINAL': <networkx.classes.digraph.DiGraph at 0x7f58bc8ee630>,
 'PERSON': <networkx.classes.digraph.DiGraph at 0x7f58bc8ee9e8>,
 'DATE': <networkx.classes.digraph.DiGraph at 0x7f58bc8ee5f8>,
 'GPE': <networkx.classes.digraph.DiGraph at 0x7f58bc8eecf8>,
 'NORP': <networkx.classes.digraph.DiGraph at 0x7f58bc8eeef0>,
 'CARDINAL': <networkx.classes.digraph.DiGraph at 0x7f58bc8eefd0>,
 'ORG': <networkx.classes.digraph.DiGraph at 0x7f58bc8df7f0>,
 'LAW': <networkx.classes.digraph.DiGraph at 0x7f58bc8df8d0>,
 'EVENT': <networkx.classes.digraph.DiGraph at 0x7f58bc8df9e8>,
 'WORK_OF_ART': <networkx.classes.digraph.DiGraph at 0x7f58b9f7fd68>,
 'PRODUCT': <networkx.classes.digraph.DiGraph at 0x7f58bcd67fd0>}

In [23]:
# Persist the trial graph to disk. `pickle` here is dill (imported as pickle above).
with open('../data/trial_graph.pkl', 'wb') as file:
    pickle.dump(trial_graph, file)

In [24]:
# Confirm trial_graph.pkl is present in ../data.
!ls ../data

document.csv   entity_ids.pkl	sentences-coref.pkl  www.glozman.com
documents.pkl  name_entity.csv	sentences.pkl
download.sh    sentence.csv	trial_graph.pkl


In [25]:
# Read the graph back to verify the round trip. Unpickling can execute arbitrary
# code, so only load files produced by this notebook.
with open('../data/trial_graph.pkl', 'rb') as file:
    trial_graph_retrieved = pickle.load(file)

In [26]:
# The reloaded dict should have the same entity-type keys as trial_graph.
trial_graph_retrieved

{'ORDINAL': <networkx.classes.digraph.DiGraph at 0x7f58d7f33b00>,
 'PERSON': <networkx.classes.digraph.DiGraph at 0x7f58bf732940>,
 'DATE': <networkx.classes.digraph.DiGraph at 0x7f58bf73d9e8>,
 'GPE': <networkx.classes.digraph.DiGraph at 0x7f58bf7a85c0>,
 'NORP': <networkx.classes.digraph.DiGraph at 0x7f58bf7c8898>,
 'CARDINAL': <networkx.classes.digraph.DiGraph at 0x7f58bf7a5ef0>,
 'ORG': <networkx.classes.digraph.DiGraph at 0x7f58d7f4bba8>,
 'LAW': <networkx.classes.digraph.DiGraph at 0x7f58d7f4b898>,
 'EVENT': <networkx.classes.digraph.DiGraph at 0x7f58d7f4b6a0>,
 'WORK_OF_ART': <networkx.classes.digraph.DiGraph at 0x7f58d7f4b358>,
 'PRODUCT': <networkx.classes.digraph.DiGraph at 0x7f58d7f4b1d0>}

In [None]:
%%time
# Parse every sentence in the corpus. This is long-running: the recorded progress
# bar shows ~42 minutes for 54% of 143,709 sentences.
# NOTE(review): this rebinds `parsed_sentences`, replacing the trial parses; the
# result is not cached to disk, so a kernel restart repeats the full run.
parsed_sentences = data_helper.parse_sentences(sentences, nlp=nlp)

 54%|█████▎    | 77231/143709 [41:39<29:54, 37.05it/s]  

In [None]:
# Build the graph over the full corpus.
# NOTE(review): the trial run passed generate_graph the frame returned by
# data_helper.get_inputs_from_sentences plus a sentence frame carrying a
# 'spacy_parsed' column. Here it receives the labeled `entities` frame and the raw
# `sentences` frame, and `parsed_sentences` from the previous cell is never
# attached or used — confirm generate_graph accepts these inputs.
graph = graph_generator.generate_graph(entities, sentences)