In [1]:
import spacy
from spacy import displacy
import pandas as pd
import csv
import networkx as nx
import time
from graphviz import Source
from tqdm import tqdm
import dill as pickle
from IPython.core.display import HTML
from sklearn.metrics import classification_report
from helpers.data import make_entity_id

In [2]:
! ls *.pkl

documents.pkl  sentences.pkl


In [3]:
parsed_sentences = None
with open('sentences.pkl', 'rb') as file:
    parsed_sentences = pickle.load(file)

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 9.78 µs


In [4]:
parsed_sentences[0].ents

(Yemen,
 the Middle East,
 tonight,
 Israeli,
 Palestinian,
 Egypt,
 the next several days)

In [5]:
len(parsed_sentences)

143709

In [6]:
sentences = pd.read_csv('../data/sentence.csv')
sentences.head()

Unnamed: 0,document_id,sentence_index,sentence
0,bn/abc/00/abc_0008@0008@abc@bn@en@on,0,The explosion in Yemen did not help an already...
1,bn/abc/00/abc_0006@0006@abc@bn@en@on,0,"Still in Asia , President Clinton signed a bil..."
2,bn/abc/00/abc_0006@0006@abc@bn@en@on,1,This will end the annual review of China 's tr...
3,bn/abc/00/abc_0006@0006@abc@bn@en@on,2,"China , in return , has agreed to open its mar..."
4,bn/abc/00/abc_0012@0012@abc@bn@en@on,0,"And in Yemen , the investigation into the bomb..."


In [7]:
sentences.loc[:, 'spacy_parsed'] = parsed_sentences[:]

In [8]:
Y = pd.read_csv('../data/name_entity.csv')
Y.head()

Unnamed: 0,document_id,type,sentence_index,start_word_index,end_word_index,string
0,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,0,5,5,first
1,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,19,20,Ye Daying
2,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,47,48,Ye Ting
3,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,19,11,11,second
4,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,19,3,4,Ye Zhengming


In [2]:
def entity_id_from(row):
    return make_entity_id(
        row.document_id,
        row.sentence_index,
        row.start_word_index,
        row.end_word_index,
    )

In [46]:
entity_ids = [entity_id_from(y) for _, y in Y.iterrows()]
with open('entity_ids.pkl', 'wb') as file:
    pickle.dump(entity_ids, file)

In [11]:
Y['entity_id'] = entity_ids

In [12]:
Y.loc[0]

document_id          bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on
type                                                          ORDINAL
sentence_index                                                      0
start_word_index                                                    5
end_word_index                                                      5
string                                                          first
entity_id           bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@...
Name: 0, dtype: object

In [13]:
Y = Y.set_index('entity_id')
Y.head()

Unnamed: 0_level_0,document_id,type,sentence_index,start_word_index,end_word_index,string
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:0:5:5,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,0,5,5,first
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:19:20,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,19,20,Ye Daying
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:47:48,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,18,47,48,Ye Ting
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:11:11,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,ORDINAL,19,11,11,second
bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:3:4,bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on,PERSON,19,3,4,Ye Zhengming


In [14]:
sentences.loc[0]

document_id                    bn/abc/00/abc_0008@0008@abc@bn@en@on
sentence_index                                                    0
sentence          The explosion in Yemen did not help an already...
spacy_parsed      (The, explosion, in, Yemen, did, not, help, an...
Name: 0, dtype: object

In [15]:
sentences.loc[0, 'spacy_parsed'].ents[0]

Yemen

In [16]:
spacy_X = []
for _, sentence in sentences.iterrows():
    if sentence.spacy_parsed is not None:
        for ent in sentence.spacy_parsed.ents:
            entity_id = ':'.join([
                sentence.document_id,
                str(sentence.sentence_index),
                str(ent.start),
                str(ent.end - 1),
            ])
            row = [
                entity_id,
                sentence.document_id,
                ent.label_,
                str(sentence.sentence_index),
                str(ent.start),
                str(ent.end - 1),
                str(ent),
            ]
            spacy_X.append(row)
    else:
        print('Error with {}'.format(sentence))

Error with document_id       nw/p2.5_c2e/00/p2.5_c2e_0034@0034@p2.5_c2e@nw@...
sentence_index                                                   12
sentence                                                        NaN
spacy_parsed                                                   None
Name: 95251, dtype: object
Error with document_id       tc/ch/00/ch_0021@0021@ch@tc@en@on
sentence_index                                  205
sentence                                        NaN
spacy_parsed                                   None
Name: 105131, dtype: object
Error with document_id       tc/ch/00/ch_0011@0011@ch@tc@en@on
sentence_index                                  251
sentence                                        NaN
spacy_parsed                                   None
Name: 112790, dtype: object


In [17]:
columns = (
    'entity_id',
    'document_id',
    'type',
    'sentence_index',
    'start_index',
    'end_index',
    'string',
)
X = pd.DataFrame(data=spacy_X, columns=columns)

In [18]:
X.head()

Unnamed: 0,entity_id,document_id,type,sentence_index,start_index,end_index,string
0,bn/abc/00/abc_0008@0008@abc@bn@en@on:0:3:3,bn/abc/00/abc_0008@0008@abc@bn@en@on,GPE,0,3,3,Yemen
1,bn/abc/00/abc_0008@0008@abc@bn@en@on:0:12:14,bn/abc/00/abc_0008@0008@abc@bn@en@on,LOC,0,12,14,the Middle East
2,bn/abc/00/abc_0008@0008@abc@bn@en@on:0:23:23,bn/abc/00/abc_0008@0008@abc@bn@en@on,TIME,0,23,23,tonight
3,bn/abc/00/abc_0008@0008@abc@bn@en@on:0:32:32,bn/abc/00/abc_0008@0008@abc@bn@en@on,NORP,0,32,32,Israeli
4,bn/abc/00/abc_0008@0008@abc@bn@en@on:0:34:34,bn/abc/00/abc_0008@0008@abc@bn@en@on,NORP,0,34,34,Palestinian


In [19]:
X = X.set_index('entity_id')

In [20]:
X.head()

Unnamed: 0_level_0,document_id,type,sentence_index,start_index,end_index,string
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bn/abc/00/abc_0008@0008@abc@bn@en@on:0:3:3,bn/abc/00/abc_0008@0008@abc@bn@en@on,GPE,0,3,3,Yemen
bn/abc/00/abc_0008@0008@abc@bn@en@on:0:12:14,bn/abc/00/abc_0008@0008@abc@bn@en@on,LOC,0,12,14,the Middle East
bn/abc/00/abc_0008@0008@abc@bn@en@on:0:23:23,bn/abc/00/abc_0008@0008@abc@bn@en@on,TIME,0,23,23,tonight
bn/abc/00/abc_0008@0008@abc@bn@en@on:0:32:32,bn/abc/00/abc_0008@0008@abc@bn@en@on,NORP,0,32,32,Israeli
bn/abc/00/abc_0008@0008@abc@bn@en@on:0:34:34,bn/abc/00/abc_0008@0008@abc@bn@en@on,NORP,0,34,34,Palestinian


In [21]:
len(X)

201700

In [22]:
len(Y)

161754

In [23]:
Y.loc['bn/abc/00/abc_0008@0008@abc@bn@en@on:0:3:3']

document_id         bn/abc/00/abc_0008@0008@abc@bn@en@on
type                                                 GPE
sentence_index                                         0
start_word_index                                       3
end_word_index                                         3
string                                             Yemen
Name: bn/abc/00/abc_0008@0008@abc@bn@en@on:0:3:3, dtype: object

In [24]:
X.loc['bn/abc/00/abc_0008@0008@abc@bn@en@on:0:3:3']

document_id       bn/abc/00/abc_0008@0008@abc@bn@en@on
type                                               GPE
sentence_index                                       0
start_index                                          3
end_index                                            3
string                                           Yemen
Name: bn/abc/00/abc_0008@0008@abc@bn@en@on:0:3:3, dtype: object

In [25]:
sample_document_id = 'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on'
document = X[X['document_id'] == sample_document_id]
len(document)

219

In [26]:
document_Y = Y[Y['document_id'] == sample_document_id]
len(document_Y)

178

In [27]:
document_Y.index

Index(['bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:0:5:5',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:19:20',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:18:47:48',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:11:11',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:3:4',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:19:8:9',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:22:1:1',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:22:5:7',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:25:7:7',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:25:0:1',
       ...
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:440:19:20',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:441:8:8',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:441:11:12',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc@en@on:441:26:29',
       'bc/phoenix/00/phoenix_0000@0000@phoenix@bc

In [28]:
entity_ids_Y = set(document_Y.index)
entity_ids_X = set(document.index)
difference = entity_ids_X - entity_ids_Y
len(difference)

89

In [29]:
sample_document_sentences = sentences[sentences.document_id == sample_document_id]
sample_document_entities = Y[Y.document_id == sample_document_id]
for _, sentence in sample_document_sentences.iterrows():
    entities = sample_document_entities[sample_document_entities.sentence_index == sentence.sentence_index]
    if len(entities) == len(sentence.spacy_parsed.ents): continue
        
    print('-' * 80)
    if len(entities) > 0:
        display(HTML('<h3>Labeled:</h3>'))
        print(['{}: {}'.format(entity.string, entity.type) for _, entity in entities.iterrows()])
        
    display(HTML('<h3>Tagged:</h3>'))
    if len(sentence.spacy_parsed.ents) > 0:
        displacy.render(sentence.spacy_parsed, jupyter=True, style='ent')
    else:
        display(sentence.sentence)
    

--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['two years: DATE', 'Ye Zhengming: PERSON', 'Ye Ting: PERSON', 'the Wannan Incident: EVENT', 'Kuomnintang: ORG']


--------------------------------------------------------------------------------


['eight years: DATE', 'Ye Daying: PERSON']


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['fifth: ORDINAL', 'sixth: ORDINAL', 'Ye Zhengming: PERSON']


--------------------------------------------------------------------------------


['Ye: PERSON', "There 's a Place Called * *PRO*-2 Wangjiazhan: WORK_OF_ART"]


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['Beijing: GPE']


--------------------------------------------------------------------------------


['Shanghai: GPE']


--------------------------------------------------------------------------------


["'69: DATE", '11 years: DATE']


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['Ye Zhengming: PERSON']


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['Quotations from Chairman Mao: WORK_OF_ART', 'Quotations from Chairman Mao: WORK_OF_ART']


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['Ye Daying: PERSON']


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['The Flower Girl .: WORK_OF_ART']


'The Flower Girl .'

--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['Shanghai: GPE']


--------------------------------------------------------------------------------


['Ye Daying: PERSON', 'forty - some yuan: MONEY']


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


['first: ORDINAL', '13 yuan: MONEY', '13 yuan: MONEY', '15 yuan: MONEY', '41 yuan: MONEY', 'the first year: DATE', 'the second year: DATE', 'the third year: DATE']


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


["'78: DATE", 'the Beijing Film Institute: ORG', 'Shanghai: GPE']


--------------------------------------------------------------------------------


["the Xi'an Film Studio: ORG"]


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


["Xi'an Incident: EVENT"]


"That Xi'an Incident thing , I played an extra once ."

--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


--------------------------------------------------------------------------------


In [30]:
all_entity_ids = set(Y.index).union(set(X.index))
len(all_entity_ids)

215368

In [34]:
def build_type_lists(entity_ids):
    spacy_types = []
    onto_types = []
    for entity_id in tqdm(entity_ids):
        try:
            y_row = Y.loc[entity_id]
        except KeyError:
            y_row = None

        if y_row is not None:
            onto_types.append(y_row.type)
        else:
            onto_types.append('')

        try:
            x_row = X.loc[entity_id]
        except KeyError:
            x_row = None

        if x_row is not None:
            spacy_types.append(x_row.type)
        else:
            spacy_types.append('')
    return [spacy_types, onto_types]

spacy_types, onto_types = build_type_lists(entity_ids)

100%|██████████| 161754/161754 [00:46<00:00, 3478.23it/s]


In [35]:
display(spacy_types[0:10])
display(onto_types[0:10])

['ORDINAL',
 '',
 '',
 'ORDINAL',
 'PERSON',
 '',
 'DATE',
 'PERSON',
 'DATE',
 'PERSON']

['ORDINAL',
 'PERSON',
 'PERSON',
 'ORDINAL',
 'PERSON',
 'PERSON',
 'DATE',
 'PERSON',
 'DATE',
 'PERSON']

In [36]:
print(classification_report(spacy_types, onto_types))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

                   0.00      0.00      0.00     13668
    CARDINAL       0.88      0.98      0.93     12279
        DATE       0.89      0.99      0.94     21316
       EVENT       0.72      0.96      0.82       952
         FAC       0.74      0.93      0.83      1147
         GPE       0.92      0.98      0.95     26469
    LANGUAGE       0.76      0.95      0.84       332
         LAW       0.74      0.95      0.84       443
         LOC       0.83      0.93      0.88      2381
       MONEY       0.91      0.99      0.95      5883
        NORP       0.93      0.98      0.95     11009
     ORDINAL       0.91      0.99      0.95      2529
         ORG       0.92      0.97      0.94     28216
     PERCENT       0.92      1.00      0.96      4462
      PERSON       0.93      0.99      0.95     25683
     PRODUCT       0.74      0.94      0.83      1014
    QUANTITY       0.72      0.95      0.82      1199
        TIME       0.76    