# Building the Full Graph

In [1]:
# Models
import spacy
from spacy import displacy
import en_coref_lg
import networkx as nx

# Data Wrangling
import pandas as pd
import csv
import dill as pickle
import helpers.data as data_helper
import helpers.graph_generator as graph_generator

# Utils
import time
from graphviz import Source
from tqdm import tqdm
from importlib import reload

In [2]:
# List the data directory to see which input files and pickled artifacts are present.
! ls ../data

LP_NET_Graphs.pkl	entities.pkl		       sentences.pkl
NET_Graphs.pkl		entity_ids.pkl		       test_document_ids.pkl
dev_document_ids.pkl	name_entity.csv		       train_document_ids.pkl
document.csv		parsed_sentences.20181205.pkl  trial_graph.pkl
documents.20181205.pkl	sentence.csv		       www.glozman.com
documents.pkl		sentences-coref.pkl
download.sh		sentences.20181205.pkl


In [3]:
%%time

# Load the documents table through the project helper and preview its first rows.
# (`%%time` must stay on the first line of the cell; it reports the load cost.)
documents = data_helper.get_documents()
display(documents.head())

Unnamed: 0_level_0,document
document_id,Unnamed: 1_level_1
bc/cctv/00/cctv_0000@0000@cctv@bc@en@on,"In the summer of 2005 , a picture that people ..."
bc/cctv/00/cctv_0001@0001@cctv@bc@en@on,What kind of memory ? We respectfully invite y...
bc/cctv/00/cctv_0002@0002@cctv@bc@en@on,Abramov had a car accident in Moscow last nigh...
bc/cctv/00/cctv_0003@0003@cctv@bc@en@on,"Hello , dear viewers . Welcome to Focus Today ..."
bc/cctv/00/cctv_0004@0004@cctv@bc@en@on,There will be 120 million viewers tuning in to...


CPU times: user 207 ms, sys: 51.7 ms, total: 259 ms
Wall time: 254 ms


In [4]:
%%time
# Load the entities table through the project helper and preview its first rows.
entities = data_helper.get_entities()
display(entities.head())

Unnamed: 0,document_id,type,sentence_index,start_word_index,end_word_index,string
0,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,0,5,5,first
1,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,19,20,Ye Daying
2,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,47,48,Ye Ting
3,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,19,11,11,second
4,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,19,3,4,Ye Zhengming


CPU times: user 173 ms, sys: 76.5 ms, total: 250 ms
Wall time: 245 ms


In [5]:
# Add a per-row key of the form "<document_id>:<sentence_index>".
# Vectorised string concatenation replaces the original row-by-row `iterrows()`
# loop: it produces the same strings but avoids building a Series object for
# every row, which is slow on a frame with this many entities.
entities['sentence_id'] = (
    entities['document_id'].astype(str) + ':' + entities['sentence_index'].astype(str)
)

In [6]:
# Preview the frame to confirm the `sentence_id` column is present.
entities.head()

Unnamed: 0,document_id,type,sentence_index,start_word_index,end_word_index,string,sentence_id
0,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,0,5,5,first,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
1,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,19,20,Ye Daying,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
2,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,47,48,Ye Ting,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
3,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,19,11,11,second,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
4,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,19,3,4,Ye Zhengming,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...


In [7]:
# Check that the pickle read by the next cell exists before starting the slow load.
!ls ../data/sentences.pkl

../data/sentences.pkl


In [8]:
%%time

# Load the sentences table, then attach the object stored in sentences.pkl as
# the `spacy_parsed` column.
sentences = data_helper.get_sentences()
# NOTE(review): `pickle` is `dill` in this notebook (see the import cell).
# Unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../data/sentences.pkl', 'rb') as file:
    # NOTE(review): assumes the pickled object lines up with the rows of
    # `sentences` (same length and order, or a matching index) — TODO confirm.
    sentences['spacy_parsed'] = pickle.load(file)

CPU times: user 1min 28s, sys: 19.9 s, total: 1min 48s
Wall time: 2min 7s


In [9]:
# Preview the sentences frame, including the attached `spacy_parsed` column.
sentences.head()

Unnamed: 0_level_0,document_id,sentence_index,sentence,spacy_parsed
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bn/abc/00/abc_0008@0008@abc@bn@en@on:0,bn/abc/00/abc_0008@0008@abc@bn@en@on,0,The explosion in Yemen did not help an already...,"(The, explosion, in, Yemen, did, not, help, an..."
bn/abc/00/abc_0006@0006@abc@bn@en@on:0,bn/abc/00/abc_0006@0006@abc@bn@en@on,0,"Still in Asia , President Clinton signed a bil...","(Still, in, Asia, ,, President, Clinton, signe..."
bn/abc/00/abc_0006@0006@abc@bn@en@on:1,bn/abc/00/abc_0006@0006@abc@bn@en@on,1,This will end the annual review of China 's tr...,"(This, will, end, the, annual, review, of, Chi..."
bn/abc/00/abc_0006@0006@abc@bn@en@on:2,bn/abc/00/abc_0006@0006@abc@bn@en@on,2,"China , in return , has agreed to open its mar...","(China, ,, in, return, ,, has, agreed, to, ope..."
bn/abc/00/abc_0012@0012@abc@bn@en@on:0,bn/abc/00/abc_0012@0012@abc@bn@en@on,0,"And in Yemen , the investigation into the bomb...","(And, in, Yemen, ,, the, investigation, into, ..."


In [10]:
# Skipped for now - try again later.
# When enabled, this would store the parses on `sentences` and pickle the whole
# frame to a dated file.
# NOTE(review): `sentence_trees` is not defined in any of the cells shown here;
# it must be created before this code can be re-enabled.
# sentences['spacy_parsed'] = sentence_trees
# with open('../data/sentences.20181205.pkl', 'wb') as file:
#     pickle.dump(sentences, file)

In [11]:
# Load the collection of document IDs used to select the training subset below.
# NOTE(review): `pickle` is `dill` in this notebook; only unpickle trusted files.
with open('../data/train_document_ids.pkl', 'rb') as file:
    train_document_ids = pickle.load(file)

In [12]:
# Keep only the sentences whose document is in the training ID list,
# then show how many rows remain.
train_sentences = sentences.loc[sentences['document_id'].isin(train_document_ids)]
train_sentences.shape[0]

100415

In [13]:
# Keep only the entities whose document is in the training ID list,
# then show how many rows remain.
train_entities = entities.loc[entities['document_id'].isin(train_document_ids)]
train_entities.shape[0]

113115

In [14]:
# Re-import the helper module so edits made to it on disk are picked up without
# restarting the kernel.
reload(graph_generator)
# Build the graphs from the training entities and sentences. As displayed further
# down, the result is a dict mapping each entity type to a networkx DiGraph.
# NOTE: slow — the recorded run took roughly 24 minutes in total.
graph = graph_generator.generate_graph(train_entities, train_sentences)

100%|██████████| 113115/113115 [00:53<00:00, 2119.90it/s]
  log_total = np.log(sum([G.node[node]['weight'] for node in successor_nodes]))
100%|██████████| 18/18 [22:55<00:00,  8.14s/it]


In [15]:
# Persist the generated graphs to a dated pickle file (written with dill, which
# is imported as `pickle` in this notebook).
with open('../data/LP_NET_Graphs.20181205.pkl', 'wb') as file:
    pickle.dump(graph, file)

In [16]:
# Display the result: one graph object per entity type.
graph

{'ORDINAL': <networkx.classes.digraph.DiGraph at 0x7f1e966cc4e0>,
 'PERSON': <networkx.classes.digraph.DiGraph at 0x7f1e966cc3c8>,
 'DATE': <networkx.classes.digraph.DiGraph at 0x7f1eddfa9b00>,
 'GPE': <networkx.classes.digraph.DiGraph at 0x7f1e966cc940>,
 'NORP': <networkx.classes.digraph.DiGraph at 0x7f1e83c012e8>,
 'CARDINAL': <networkx.classes.digraph.DiGraph at 0x7f1e966cc9b0>,
 'ORG': <networkx.classes.digraph.DiGraph at 0x7f1e836e8c18>,
 'EVENT': <networkx.classes.digraph.DiGraph at 0x7f1e837640f0>,
 'WORK_OF_ART': <networkx.classes.digraph.DiGraph at 0x7f1e62d00940>,
 'MONEY': <networkx.classes.digraph.DiGraph at 0x7f1ea0e84550>,
 'LANGUAGE': <networkx.classes.digraph.DiGraph at 0x7f1e58383e10>,
 'FAC': <networkx.classes.digraph.DiGraph at 0x7f1e5ffc0f60>,
 'LOC': <networkx.classes.digraph.DiGraph at 0x7f1edd74e1d0>,
 'QUANTITY': <networkx.classes.digraph.DiGraph at 0x7f1e5fbda7b8>,
 'LAW': <networkx.classes.digraph.DiGraph at 0x7f1e5fbdaba8>,
 'TIME': <networkx.classes.digraph

In [19]:
# Number of nodes in the graph built for PERSON entities.
graph['PERSON'].number_of_nodes()

9640