In [1]:
%%script bash
# Install en_coref_lg if it hasn't been installed already.
# `pip show` exits non-zero when the package is missing, so its exit status is
# tested directly. (The previous `[ ! $? ]` was a string test on a non-empty
# value and therefore always false, so the install branch could never run.)
if ! pip show en_coref_lg 2>/dev/null; then
  pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_lg-3.0.0/en_coref_lg-3.0.0.tar.gz
  # Print the package metadata again to confirm the install succeeded.
  pip show en_coref_lg 2>/dev/null
fi

Name: en-coref-lg
Version: 3.0.0
Summary: Coref-added English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse, named entities and coreference clusters.
Home-page: https://huggingface.co
Author: HuggingFace Inc. & Explosion AI
Author-email: thomas@huggingface.co
License: CC BY-SA 3.0
Location: /home/dan/.pyenv/versions/3.7.0/lib/python3.7/site-packages
Requires: spacy
Required-by: 


In [6]:
# NLP
import spacy
from spacy import displacy
import en_coref_lg

# Data
import pandas as pd
import csv
import helpers.data as data_helper

# Model
import networkx as nx

# Utils
import time
from graphviz import Source
from tqdm import tqdm
import dill as pickle

In [7]:
# List the contents of ../data to see which input and cache files are available.
! ls ../data

NET_Graphs.pkl	name_entity.csv		       sentences.pkl
document.csv	parsed_sentences.20181205.pkl  trial_graph.pkl
documents.pkl	sentence.csv		       www.glozman.com
download.sh	sentences-coref.pkl
entity_ids.pkl	sentences.20181205.pkl


In [8]:
# Show the first two lines of document.csv (header row plus the first record)
# to check the column layout before loading it.
! head -n 2 ../data/document.csv

"document_id","document"
"bc/cctv/00/cctv_0000@0000@cctv@bc@en@on","In the summer of 2005 , a picture that people have long been looking forward to started emerging with frequency in various major Hong Kong media . With their unique charm , these well - known cartoon images once again caused Hong Kong to be a focus of worldwide attention . The world 's fifth Disney park will soon open to the public here . The most important thing about Disney is that it is a global brand . Well , for several years , although it was still under construction and , er , not yet open , it can be said that many people have viewed Hong Kong with new respect . Then welcome to the official writing ceremony of Hong Kong Disneyland . The construction of Hong Kong Disneyland began two years ago , in 2003 . In January of that year , the Hong Kong government turned over to Disney Corporation 200 hectares of land at the foot of Lantau Island that was obtained following the largest land reclamation project in recent

In [10]:
# Load the documents table through the project's `helpers.data` module.
# NOTE(review): get_documents() is not visible in this notebook — confirm which
# file it reads and how it sets the index.
documents = data_helper.get_documents()
# Display the first rows as a quick sanity check of the loaded frame.
documents.head()

Unnamed: 0_level_0,document
document_id,Unnamed: 1_level_1
bc/cctv/00/cctv_0000@0000@cctv@bc@en@on,"In the summer of 2005 , a picture that people ..."
bc/cctv/00/cctv_0001@0001@cctv@bc@en@on,What kind of memory ? We respectfully invite y...
bc/cctv/00/cctv_0002@0002@cctv@bc@en@on,Abramov had a car accident in Moscow last nigh...
bc/cctv/00/cctv_0003@0003@cctv@bc@en@on,"Hello , dear viewers . Welcome to Focus Today ..."
bc/cctv/00/cctv_0004@0004@cctv@bc@en@on,There will be 120 million viewers tuning in to...


In [11]:
# Number of rows in the documents table.
len(documents)

13109

In [12]:
# Load the en_coref_lg pipeline once; the same `nlp` object is reused for every
# document parsed below.
nlp = en_coref_lg.load()

In [16]:
%%time

def wrap_nlp_parse(nlp, document):
    """Run ``nlp`` on ``document``, returning ``None`` if parsing fails.

    Args:
        nlp: Callable pipeline applied to the text.
        document: The text to parse.

    Returns:
        Whatever ``nlp(document)`` returns, or ``None`` when the call raises
        an ordinary exception.

    Only ``Exception`` subclasses are swallowed. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so a long-running batch parse can still be
    stopped with a kernel interrupt instead of recording the interrupted
    document as a failed parse and carrying on.
    """
    try:
        return nlp(document)
    except Exception:
        # Best-effort: one unparseable document must not abort the whole batch.
        return None

# Parse every document's text with the loaded pipeline, one result per row;
# entries are None where wrap_nlp_parse could not parse the text.
parsed_documents = [
    wrap_nlp_parse(nlp, record.document)
    for _index, record in tqdm(documents.iterrows(), total=len(documents))
]

 30%|██▉       | 3870/13109 [26:31<2:10:52,  1.18it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Number of parse results; the parsing cell produces one entry per document row
# (None for documents that failed to parse).
len(parsed_documents)

In [17]:
# Inspect the parse tree of the first document.
# NOTE: this raises AttributeError if the first document failed to parse,
# because failed parses are stored as None.
parsed_documents[0].print_tree()

[{'word': 'picture',
  'lemma': 'picture',
  'NE': '',
  'POS_fine': 'NN',
  'POS_coarse': 'NOUN',
  'arc': 'ROOT',
  'modifiers': [{'word': 'In',
    'lemma': 'in',
    'NE': '',
    'POS_fine': 'IN',
    'POS_coarse': 'ADP',
    'arc': 'prep',
    'modifiers': [{'word': 'the summer of 2005',
      'lemma': 'the summer of 2005',
      'NE': 'DATE',
      'POS_fine': 'NN',
      'POS_coarse': 'NOUN',
      'arc': 'pobj',
      'modifiers': []}]},
   {'word': ',',
    'lemma': ',',
    'NE': '',
    'POS_fine': ',',
    'POS_coarse': 'PUNCT',
    'arc': 'punct',
    'modifiers': []},
   {'word': 'a',
    'lemma': 'a',
    'NE': '',
    'POS_fine': 'DT',
    'POS_coarse': 'DET',
    'arc': 'det',
    'modifiers': []},
   {'word': 'looking',
    'lemma': 'look',
    'NE': '',
    'POS_fine': 'VBG',
    'POS_coarse': 'VERB',
    'arc': 'relcl',
    'modifiers': [{'word': 'that',
      'lemma': 'that',
      'NE': '',
      'POS_fine': 'IN',
      'POS_coarse': 'ADP',
      'arc': 'mark',
 

In [18]:
def print_tree(spacy_parsed):
    """Return the tree form of a parsed document, or a placeholder string.

    Args:
        spacy_parsed: An object exposing ``print_tree()``, or ``None``.

    Returns:
        The result of ``spacy_parsed.print_tree()``, or the literal string
        ``'None'`` when ``spacy_parsed`` is ``None``.
    """
    return 'None' if spacy_parsed is None else spacy_parsed.print_tree()

In [19]:
# Convert each parse result to its tree form (the 'None' placeholder for failed
# parses) and store it as a new column on the documents table.
documents['spacy_tree'] = list(map(print_tree, tqdm(parsed_documents)))

100%|██████████| 13109/13109 [01:41<00:00, 129.08it/s] 


In [20]:
# Persist the documents table (including the spacy_tree column added above) to a
# dated pickle. `pickle` here is dill, per the `import dill as pickle` line in
# the imports cell.
with open('../data/documents.20181205.pkl', 'wb') as file:
    pickle.dump(documents, file)

In [24]:
# List the pickle files in ../data with human-readable sizes.
! ls -lah ../data/*.pkl

-rw-r--r-- 1 dan dan  15M Dec  5 02:11 ../data/NET_Graphs.pkl
-rw-r--r-- 1 dan dan 276M Dec  5 03:20 ../data/documents.20181205.pkl
-rw-r--r-- 1 dan dan 5.1G Dec  1 01:00 ../data/documents.pkl
-rw-r--r-- 1 dan dan 8.7M Dec  3 23:01 ../data/entity_ids.pkl
-rw-r--r-- 1 dan dan 965M Dec  5 02:05 ../data/parsed_sentences.20181205.pkl
-rw-r--r-- 1 dan dan 951M Dec  4 02:21 ../data/sentences-coref.pkl
-rw-r--r-- 1 dan dan 286M Dec  5 02:29 ../data/sentences.20181205.pkl
-rw-r--r-- 1 dan dan 5.1G Dec  1 00:12 ../data/sentences.pkl
-rw-r--r-- 1 dan dan  29K Dec  5 00:54 ../data/trial_graph.pkl


In [22]:
# Print each entity span found in the first parsed document with its label.
for ent in parsed_documents[0].ents:
    print(f'{ent}: {ent.label_}')

the summer of 2005: DATE
Hong Kong: GPE
Hong Kong: GPE
fifth: ORDINAL
Disney: ORG
Disney: ORG
several years: DATE
Hong Kong: GPE
Hong Kong: GPE
Disneyland: FAC
Hong Kong: GPE
Disneyland: FAC
two years ago: DATE
2003: DATE
January of that year: DATE
Hong Kong: GPE
Disney Corporation: ORG
200 hectares: QUANTITY
Lantau Island: GPE
recent years: DATE
One: CARDINAL
Hong Kong: GPE
Lantau Island: GPE
Hong Kong International Airport: FAC
Mickey Mouse 's: PERSON
Chinese: NORP
first: ORDINAL
only one month: DATE
Hong Kong: GPE
Disneyland: FAC
September 12: DATE
Disney: ORG
Disney: ORG
first: ORDINAL
Disney: ORG
the same day: DATE
two years: DATE
Disney: ORG
Disney: ORG
Disney: ORG
Disney: ORG
Disney Corporation: ORG
Hong Kong: GPE
Chinese Disney: ORG
China: GPE
Hong Kong: GPE
Hong Kong: GPE
now: DATE
more than seven million: CARDINAL
Hong Kong: GPE
about two years: DATE
34: CARDINAL
Hong Kong: GPE
one hundred years ago: DATE
today: DATE
Hong Kong: GPE
Mong Kok: PERSON
the Repulse Bay: LOC
Hong K