# Building the Full Graph

In [1]:
%%script bash
# Install en_coref_lg only if it is not already present in this environment.
# `pip show` exits non-zero when the package is missing, so branch on its exit
# status directly. (The previous `[ ! $? ]` check was a one-argument string
# test on a value that is never empty, so it was always false and the install
# branch could never run.)
if ! pip show en_coref_lg 2>/dev/null; then
  pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_lg-3.0.0/en_coref_lg-3.0.0.tar.gz
  # Print the package metadata to confirm the install succeeded.
  pip show en_coref_lg 2>/dev/null
fi

Name: en-coref-lg
Version: 3.0.0
Summary: Coref-added English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse, named entities and coreference clusters.
Home-page: https://huggingface.co
Author: HuggingFace Inc. & Explosion AI
Author-email: thomas@huggingface.co
License: CC BY-SA 3.0
Location: /home/dan/.pyenv/versions/3.7.0/lib/python3.7/site-packages
Requires: spacy
Required-by: 


In [2]:
# Models
import spacy
from spacy import displacy
import en_coref_lg
import networkx as nx

# Data Wrangling
import pandas as pd
import csv
import dill as pickle
import helpers.data as data_helper
import helpers.graph_generator as graph_generator

# Utils
import time
from graphviz import Source
from tqdm import tqdm
from importlib import reload

In [3]:
%%time

# Load the en_coref_lg pipeline (spaCy English model with coreference added,
# per the package summary shown above). Loading is slow — about 30 s wall time
# in the recorded run — so it is done once and reused by later cells.
nlp = en_coref_lg.load()

CPU times: user 16.7 s, sys: 3.55 s, total: 20.3 s
Wall time: 29.9 s


In [4]:
! ls ../data

document.csv   entity_ids.pkl	sentences-coref.pkl  www.glozman.com
documents.pkl  name_entity.csv	sentences.pkl
download.sh    sentence.csv	trial_graph.pkl


In [5]:
%%time

# Load the labeled entity table through the project helper. The preview in the
# following cell shows it indexed by entity_id with document_id, type,
# sentence_index, start/end word index, string and sentence_id columns.
entities = data_helper.get_labeled_data()

CPU times: user 27.4 s, sys: 792 ms, total: 28.2 s
Wall time: 28.3 s


In [6]:
entities.head()

Unnamed: 0_level_0,document_id,type,sentence_index,start_word_index,end_word_index,string,sentence_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:0:5:5,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,0,5,5,first,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:19:20,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,19,20,Ye Daying,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:47:48,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,47,48,Ye Ting,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:11:11,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,19,11,11,second,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:3:4,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,19,3,4,Ye Zhengming,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...


In [7]:
%%time

# Load the sentence table through the project helper. The preview in the
# following cell shows it indexed by sentence_id with document_id,
# sentence_index and sentence columns.
sentences = data_helper.get_sentences()

CPU times: user 11 s, sys: 196 ms, total: 11.2 s
Wall time: 11.2 s


In [8]:
sentences.head()

Unnamed: 0_level_0,document_id,sentence_index,sentence
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bn/abc/00/abc_0008@0008@abc@bn@en@on:0,bn/abc/00/abc_0008@0008@abc@bn@en@on,0,The explosion in Yemen did not help an already...
bn/abc/00/abc_0006@0006@abc@bn@en@on:0,bn/abc/00/abc_0006@0006@abc@bn@en@on,0,"Still in Asia , President Clinton signed a bil..."
bn/abc/00/abc_0006@0006@abc@bn@en@on:1,bn/abc/00/abc_0006@0006@abc@bn@en@on,1,This will end the annual review of China 's tr...
bn/abc/00/abc_0006@0006@abc@bn@en@on:2,bn/abc/00/abc_0006@0006@abc@bn@en@on,2,"China , in return , has agreed to open its mar..."
bn/abc/00/abc_0012@0012@abc@bn@en@on:0,bn/abc/00/abc_0012@0012@abc@bn@en@on,0,"And in Yemen , the investigation into the bomb..."


In [9]:
# Take the first 100 labeled entities as a small trial subset, then show its
# size as the cell output.
trial_ents = entities.head(100)
len(trial_ents)

100

In [10]:
# One (document_id, sentence_index) pair per trial entity, in row order.
# Several entities can share a sentence, so pairs may repeat.
trial_ent_ids = [
    (doc_id, sent_idx)
    for doc_id, sent_idx in zip(trial_ents.document_id, trial_ents.sentence_index)
]

In [13]:
list(trial_ent_ids)[9]

('bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on', 25)

In [14]:
len(trial_ent_ids)

100

In [15]:
%%time

trial_sents = [sentence for _, sentence in sentences.iterrows() if (sentence.document_id, sentence.sentence_index) in trial_ent_ids]

CPU times: user 10.7 s, sys: 0 ns, total: 10.7 s
Wall time: 10.7 s


In [16]:
# Number of sentence rows matched for the trial entities.
print(len(trial_sents))
# Rebuild a DataFrame from the list of row Series, keeping these three columns.
# NOTE(review): this rebinds `trial_sents` from a list to a DataFrame.
trial_sents = pd.DataFrame(trial_sents, columns=['document_id', 'sentence_index', 'sentence'])

50


In [17]:
# Run the loaded pipeline over the trial sentences via the project helper.
# NOTE(review): presumably returns one parse per row, in row order (the result
# is later assigned as a column of `trial_sents`) — confirm in helpers/data.py.
parsed_sentences = data_helper.parse_sentences(trial_sents, nlp=nlp)

100%|██████████| 50/50 [00:02<00:00, 24.25it/s]


In [18]:
# Attach the parses to the trial frame as a new `spacy_parsed` column
# (mutates `trial_sents` in place).
trial_sents['spacy_parsed'] = parsed_sentences

In [19]:
trial_sents.iloc[0].spacy_parsed.ents[0].label_

'ORDINAL'

In [20]:
# Derive an entity table from the parsed trial sentences via the project helper.
# NOTE(review): in the recorded preview the spans can differ from the labeled
# data (e.g. "Marshall Ye Ting 's" here vs "Ye Ting" in `entities`), so these
# look like model-predicted entities rather than gold labels — confirm.
inputs = data_helper.get_inputs_from_sentences(trial_sents)
inputs.head()

Unnamed: 0_level_0,document_id,sentence_id,type,sentence_index,start_index,end_index,string
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:0:5:5,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,ORDINAL,0,5,5,first
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:20:21,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,PERSON,18,20,21,Ye Daying
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:47:50,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,PERSON,18,47,50,Marshall Ye Ting 's
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:3:4,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,PERSON,19,3,4,Ye Zhengming
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:7:10,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...,PERSON,19,7,10,Marshall Ye Ting 's


In [21]:
%%time

# Build the graphs for the trial subset. As the following cell shows, the
# result is a dict mapping an entity type label (e.g. 'PERSON') to a
# networkx DiGraph.
trial_graph = graph_generator.generate_graph(inputs, trial_sents)

CPU times: user 62 ms, sys: 214 µs, total: 62.3 ms
Wall time: 59.6 ms


In [22]:
trial_graph

{'ORDINAL': <networkx.classes.digraph.DiGraph at 0x7f58bc8ee630>,
 'PERSON': <networkx.classes.digraph.DiGraph at 0x7f58bc8ee9e8>,
 'DATE': <networkx.classes.digraph.DiGraph at 0x7f58bc8ee5f8>,
 'GPE': <networkx.classes.digraph.DiGraph at 0x7f58bc8eecf8>,
 'NORP': <networkx.classes.digraph.DiGraph at 0x7f58bc8eeef0>,
 'CARDINAL': <networkx.classes.digraph.DiGraph at 0x7f58bc8eefd0>,
 'ORG': <networkx.classes.digraph.DiGraph at 0x7f58bc8df7f0>,
 'LAW': <networkx.classes.digraph.DiGraph at 0x7f58bc8df8d0>,
 'EVENT': <networkx.classes.digraph.DiGraph at 0x7f58bc8df9e8>,
 'WORK_OF_ART': <networkx.classes.digraph.DiGraph at 0x7f58b9f7fd68>,
 'PRODUCT': <networkx.classes.digraph.DiGraph at 0x7f58bcd67fd0>}

In [23]:
# Persist the trial graphs to disk. `pickle` here is dill (see the import cell).
with open('../data/trial_graph.pkl', 'wb') as file:
    pickle.dump(trial_graph, file)

In [24]:
!ls ../data

document.csv   entity_ids.pkl	sentences-coref.pkl  www.glozman.com
documents.pkl  name_entity.csv	sentences.pkl
download.sh    sentence.csv	trial_graph.pkl


In [25]:
# Round-trip check: read the trial graphs back from disk.
# NOTE: unpickling can execute arbitrary code — only load files that this
# project wrote itself.
with open('../data/trial_graph.pkl', 'rb') as file:
    trial_graph_retrieved = pickle.load(file)

In [26]:
trial_graph_retrieved

{'ORDINAL': <networkx.classes.digraph.DiGraph at 0x7f58d7f33b00>,
 'PERSON': <networkx.classes.digraph.DiGraph at 0x7f58bf732940>,
 'DATE': <networkx.classes.digraph.DiGraph at 0x7f58bf73d9e8>,
 'GPE': <networkx.classes.digraph.DiGraph at 0x7f58bf7a85c0>,
 'NORP': <networkx.classes.digraph.DiGraph at 0x7f58bf7c8898>,
 'CARDINAL': <networkx.classes.digraph.DiGraph at 0x7f58bf7a5ef0>,
 'ORG': <networkx.classes.digraph.DiGraph at 0x7f58d7f4bba8>,
 'LAW': <networkx.classes.digraph.DiGraph at 0x7f58d7f4b898>,
 'EVENT': <networkx.classes.digraph.DiGraph at 0x7f58d7f4b6a0>,
 'WORK_OF_ART': <networkx.classes.digraph.DiGraph at 0x7f58d7f4b358>,
 'PRODUCT': <networkx.classes.digraph.DiGraph at 0x7f58d7f4b1d0>}

In [27]:
%%time
# Parse the full sentence table (143,709 rows, about 1h10m wall time in the
# recorded run).
# NOTE(review): this rebinds `parsed_sentences`, replacing the trial-subset
# parses produced earlier in the notebook.
parsed_sentences = data_helper.parse_sentences(sentences, nlp=nlp)

100%|██████████| 143709/143709 [1:10:05<00:00, 34.18it/s]

CPU times: user 2h 33min 31s, sys: 1h 55min 20s, total: 4h 28min 52s
Wall time: 1h 10min 5s





In [49]:
def print_tree_or_none(spacy_sentence):
    """Return the tree form of a parsed sentence, passing ``None`` through.

    Args:
        spacy_sentence: A parsed sentence exposing ``print_tree()``, or ``None``
            when no parse is available.

    Returns:
        Whatever ``spacy_sentence.print_tree()`` returns, or ``None`` if the
        input is ``None``.
    """
    return None if spacy_sentence is None else spacy_sentence.print_tree()

# Convert every parse to its tree form (None entries stay None), with a
# progress bar over the full list.
sentence_trees = [print_tree_or_none(sentence) for sentence in tqdm(parsed_sentences)]
# Show the first tree as a sanity check.
sentence_trees[0]


  0%|          | 0/143709 [00:00<?, ?it/s][A
  0%|          | 160/143709 [00:00<01:29, 1598.73it/s][A
  0%|          | 322/143709 [00:00<01:29, 1602.88it/s][A
  0%|          | 484/143709 [00:00<01:29, 1607.95it/s][A
  0%|          | 661/143709 [00:00<01:26, 1652.16it/s][A
  1%|          | 830/143709 [00:00<01:25, 1663.13it/s][A
  1%|          | 988/143709 [00:00<01:27, 1632.68it/s][A
  1%|          | 1169/143709 [00:00<01:24, 1681.10it/s][A
  1%|          | 1333/143709 [00:00<01:25, 1665.47it/s][A
  1%|          | 1493/143709 [00:00<01:26, 1640.85it/s][A
  1%|          | 1652/143709 [00:01<01:28, 1604.13it/s][A
  1%|▏         | 1809/143709 [00:01<01:29, 1587.28it/s][A
  1%|▏         | 1985/143709 [00:01<01:26, 1633.33it/s][A
  1%|▏         | 2147/143709 [00:01<01:29, 1573.85it/s][A
  2%|▏         | 2310/143709 [00:01<01:28, 1589.52it/s][A
  2%|▏         | 2469/143709 [00:01<01:30, 1564.04it/s][A
  2%|▏         | 2639/143709 [00:01<01:28, 1600.98it/s][A
  2%|▏         

 19%|█▊        | 26658/143709 [00:17<03:28, 562.14it/s][A
 19%|█▊        | 26795/143709 [00:17<02:51, 682.72it/s][A
 19%|█▊        | 26926/143709 [00:17<02:26, 796.42it/s][A
 19%|█▉        | 27053/143709 [00:17<02:12, 878.31it/s][A
 19%|█▉        | 27195/143709 [00:17<01:57, 991.59it/s][A
 19%|█▉        | 27325/143709 [00:17<01:49, 1067.43it/s][A
 19%|█▉        | 27455/143709 [00:18<01:44, 1109.73it/s][A
 19%|█▉        | 27583/143709 [00:18<01:43, 1119.47it/s][A
 19%|█▉        | 27707/143709 [00:18<01:47, 1079.58it/s][A
 19%|█▉        | 27824/143709 [00:18<01:46, 1088.37it/s][A
 19%|█▉        | 27939/143709 [00:18<01:47, 1076.85it/s][A
 20%|█▉        | 28051/143709 [00:18<01:49, 1057.09it/s][A
 20%|█▉        | 28167/143709 [00:18<01:46, 1084.06it/s][A
 20%|█▉        | 28291/143709 [00:18<01:42, 1125.92it/s][A
 20%|█▉        | 28406/143709 [00:18<01:45, 1096.55it/s][A
 20%|█▉        | 28529/143709 [00:19<01:41, 1133.43it/s][A
 20%|█▉        | 28665/143709 [00:19<01:36, 1

 30%|███       | 43711/143709 [00:33<01:22, 1208.41it/s][A
 31%|███       | 43834/143709 [00:33<01:24, 1177.40it/s][A
 31%|███       | 43953/143709 [00:33<01:25, 1162.09it/s][A
 31%|███       | 44082/143709 [00:33<01:23, 1192.87it/s][A
 31%|███       | 44203/143709 [00:33<01:23, 1195.68it/s][A
 31%|███       | 44324/143709 [00:33<01:27, 1135.13it/s][A
 31%|███       | 44444/143709 [00:33<01:26, 1152.52it/s][A
 31%|███       | 44561/143709 [00:33<01:26, 1141.18it/s][A
 31%|███       | 44676/143709 [00:34<01:26, 1138.90it/s][A
 31%|███       | 44798/143709 [00:34<01:25, 1159.06it/s][A
 31%|███▏      | 44925/143709 [00:34<01:23, 1190.15it/s][A
 31%|███▏      | 45053/143709 [00:34<01:21, 1210.50it/s][A
 31%|███▏      | 45175/143709 [00:34<01:24, 1167.55it/s][A
 32%|███▏      | 45298/143709 [00:34<01:23, 1185.16it/s][A
 32%|███▏      | 45435/143709 [00:34<01:19, 1233.45it/s][A
 32%|███▏      | 45560/143709 [00:34<01:20, 1214.28it/s][A
 32%|███▏      | 45690/143709 [00:34<01:

 42%|████▏     | 60814/143709 [00:49<01:09, 1197.68it/s][A
 42%|████▏     | 60945/143709 [00:49<01:07, 1219.59it/s][A
 42%|████▏     | 61068/143709 [00:49<01:10, 1166.32it/s][A
 43%|████▎     | 61195/143709 [00:49<01:09, 1195.38it/s][A
 43%|████▎     | 61326/143709 [00:49<01:07, 1227.46it/s][A
 43%|████▎     | 61455/143709 [00:49<01:06, 1245.43it/s][A
 43%|████▎     | 61583/143709 [00:49<01:05, 1255.41it/s][A
 43%|████▎     | 61709/143709 [00:49<01:08, 1201.24it/s][A
 43%|████▎     | 61843/143709 [00:49<01:06, 1239.37it/s][A
 43%|████▎     | 61980/143709 [00:49<01:04, 1275.76it/s][A
 43%|████▎     | 62109/143709 [00:50<01:05, 1252.75it/s][A
 43%|████▎     | 62235/143709 [00:50<01:07, 1210.41it/s][A
 43%|████▎     | 62365/143709 [00:50<01:05, 1234.60it/s][A
 43%|████▎     | 62490/143709 [00:50<01:07, 1206.51it/s][A
 44%|████▎     | 62612/143709 [00:52<07:41, 175.83it/s] [A
 44%|████▎     | 62735/143709 [00:52<05:42, 236.53it/s][A
 44%|████▎     | 62855/143709 [00:52<04:1

 54%|█████▍    | 78271/143709 [01:05<00:47, 1380.14it/s][A
 55%|█████▍    | 78411/143709 [01:05<00:47, 1383.05it/s][A
 55%|█████▍    | 78577/143709 [01:05<00:44, 1454.99it/s][A
 55%|█████▍    | 78724/143709 [01:05<00:46, 1395.37it/s][A
 55%|█████▍    | 78865/143709 [01:05<00:46, 1387.51it/s][A
 55%|█████▍    | 79005/143709 [01:05<00:48, 1332.32it/s][A
 55%|█████▌    | 79143/143709 [01:05<00:48, 1341.59it/s][A
 55%|█████▌    | 79278/143709 [01:05<00:53, 1210.43it/s][A
 55%|█████▌    | 79407/143709 [01:05<00:52, 1230.43it/s][A
 55%|█████▌    | 79547/143709 [01:06<00:50, 1274.40it/s][A
 55%|█████▌    | 79687/143709 [01:06<00:49, 1302.97it/s][A
 56%|█████▌    | 79819/143709 [01:06<00:49, 1303.80it/s][A
 56%|█████▌    | 79959/143709 [01:06<00:47, 1331.03it/s][A
 56%|█████▌    | 80093/143709 [01:06<00:48, 1318.04it/s][A
 56%|█████▌    | 80253/143709 [01:06<00:45, 1390.84it/s][A
 56%|█████▌    | 80396/143709 [01:06<00:45, 1397.68it/s][A
 56%|█████▌    | 80537/143709 [01:06<00:

 68%|██████▊   | 97559/143709 [01:22<00:38, 1203.11it/s][A
 68%|██████▊   | 97698/143709 [01:22<00:36, 1252.91it/s][A
 68%|██████▊   | 97826/143709 [01:22<00:38, 1176.99it/s][A
 68%|██████▊   | 97947/143709 [01:22<00:38, 1176.58it/s][A
 68%|██████▊   | 98067/143709 [01:22<00:39, 1169.14it/s][A
 68%|██████▊   | 98186/143709 [01:22<00:39, 1165.87it/s][A
 68%|██████▊   | 98304/143709 [01:22<00:39, 1152.38it/s][A
 68%|██████▊   | 98420/143709 [01:22<00:39, 1152.21it/s][A
 69%|██████▊   | 98536/143709 [01:22<00:39, 1140.82it/s][A
 69%|██████▊   | 98651/143709 [01:23<00:41, 1096.81it/s][A
 69%|██████▊   | 98770/143709 [01:23<00:40, 1121.59it/s][A
 69%|██████▉   | 98913/143709 [01:23<00:37, 1197.56it/s][A
 69%|██████▉   | 99062/143709 [01:23<00:35, 1271.76it/s][A
 69%|██████▉   | 99201/143709 [01:23<00:34, 1304.26it/s][A
 69%|██████▉   | 99334/143709 [01:23<00:34, 1297.20it/s][A
 69%|██████▉   | 99466/143709 [01:23<00:33, 1303.30it/s][A
 69%|██████▉   | 99606/143709 [01:23<00:

 92%|█████████▏| 132754/143709 [01:39<00:04, 2297.33it/s][A
 93%|█████████▎| 132985/143709 [01:39<00:04, 2272.62it/s][A
 93%|█████████▎| 133213/143709 [01:39<00:04, 2233.53it/s][A
 93%|█████████▎| 133456/143709 [01:39<00:04, 2288.04it/s][A
 93%|█████████▎| 133711/143709 [01:39<00:04, 2359.54it/s][A
 93%|█████████▎| 133959/143709 [01:39<00:04, 2392.96it/s][A
 93%|█████████▎| 134200/143709 [01:39<00:03, 2395.21it/s][A
 94%|█████████▎| 134456/143709 [01:39<00:03, 2441.50it/s][A
 94%|█████████▎| 134712/143709 [01:40<00:03, 2469.86it/s][A
 94%|█████████▍| 134960/143709 [01:40<00:03, 2414.86it/s][A
 94%|█████████▍| 135203/143709 [01:40<00:03, 2335.16it/s][A
 94%|█████████▍| 135438/143709 [01:40<00:03, 2307.05it/s][A
 94%|█████████▍| 135683/143709 [01:40<00:03, 2347.90it/s][A
 95%|█████████▍| 135919/143709 [01:40<00:03, 2351.17it/s][A
 95%|█████████▍| 136155/143709 [01:40<00:03, 2352.91it/s][A
 95%|█████████▍| 136391/143709 [01:40<00:03, 2333.36it/s][A
 95%|█████████▌| 136625/

[{'word': 'help',
  'lemma': 'help',
  'NE': '',
  'POS_fine': 'VB',
  'POS_coarse': 'VERB',
  'arc': 'ROOT',
  'modifiers': [{'word': 'explosion',
    'lemma': 'explosion',
    'NE': '',
    'POS_fine': 'NN',
    'POS_coarse': 'NOUN',
    'arc': 'nsubj',
    'modifiers': [{'word': 'The',
      'lemma': 'the',
      'NE': '',
      'POS_fine': 'DT',
      'POS_coarse': 'DET',
      'arc': 'det',
      'modifiers': []},
     {'word': 'in',
      'lemma': 'in',
      'NE': '',
      'POS_fine': 'IN',
      'POS_coarse': 'ADP',
      'arc': 'prep',
      'modifiers': [{'word': 'Yemen',
        'lemma': 'Yemen',
        'NE': 'GPE',
        'POS_fine': 'NNP',
        'POS_coarse': 'PROPN',
        'arc': 'pobj',
        'modifiers': []}]}]},
   {'word': 'did',
    'lemma': 'do',
    'NE': '',
    'POS_fine': 'VBD',
    'POS_coarse': 'VERB',
    'arc': 'aux',
    'modifiers': []},
   {'word': 'not',
    'lemma': 'not',
    'NE': '',
    'POS_fine': 'RB',
    'POS_coarse': 'ADV',
    'arc': 

In [52]:
# Store the tree dicts in the `spacy_parsed` column (mutating `sentences` in
# place) and pickle the whole frame with dill.
# NOTE(review): the cell after this one assigns the parsed objects to the same
# column, so what `spacy_parsed` holds depends on which cell ran last.
sentences['spacy_parsed'] = sentence_trees
with open('../data/sentences.20181205.pkl', 'wb') as file:
    pickle.dump(sentences, file)

In [30]:
# Attach the parses themselves (not the tree dicts) as the `spacy_parsed` column.
# NOTE(review): the recorded execution count (30) is lower than that of the
# cells placed before it (49, 52), so the notebook was run out of order. Run
# top to bottom, this overwrites the tree dicts assigned in the previous cell.
sentences['spacy_parsed'] = parsed_sentences

In [36]:
# Re-import the helper module so edits to graph_generator made during this
# session are picked up without restarting the kernel.
reload(graph_generator)
# Build the graphs for the full data set from the labeled entities and the
# sentence table (which carries the `spacy_parsed` column at this point).
graph = graph_generator.generate_graph(entities, sentences)

100%|██████████| 161754/161754 [01:09<00:00, 2323.20it/s]


In [37]:
# Persist the full-data graphs to disk. `pickle` here is dill (see the import cell).
with open('../data/NET_Graphs.pkl', 'wb') as file:
    pickle.dump(graph, file)