In [1]:
%load_ext autoreload
%autoreload 2

In [27]:

%matplotlib inline
import os

# TO USE A DATABASE OTHER THAN SQLITE, USE THIS LINE
# Note that this is necessary for parallel execution amongst other things...
# os.environ['SNORKELDB'] = 'postgres:///snorkel-intro'

from snorkel import SnorkelSession
session = SnorkelSession()

# Running a Sapcy corpus parser
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
#corpus_parser = CorpusParser() # use corenlp parser, slow but accurate for NER


# spacy lang_model = en_core_web_md
#en_depent_web_md

corpus_parser = CorpusParser(parser=Spacy(lang='en_core_web_md')) # use spacy parser, fast but not accurate enough for NER

In [28]:
# Parse the document and save into the sqlite server
# see the intro-turtorial 1

# Configure a doc pre-processer
from snorkel.parser import TextDocPreprocessor
doc_preprocessor = TextDocPreprocessor(path = 'data/samples', encoding = "utf-8")
print doc_preprocessor.path


corpus_parser.apply(doc_preprocessor)



data/samples
Clearing existing...
Running UDF...
parts: abs_sent_offset [0, 4, 12, 17, 21, 27, 32, 38, 46, 49, 54, 64, 67, 71, 73, 77, 84, 87, 92, 94, 97, 101, 113, 122, 126, 130, 139, 141, 143, 145, 149, 158, 161, 165, 169, 174, 179, 182, 184]
abs_sent_offset 0
abs_sent_offset_end 185
parts: abs_sent_offset [186, 191, 203, 214, 224, 225, 226, 235, 236, 238, 241, 246, 250, 258, 263, 266, 270, 277, 279, 285, 292, 293, 294, 304, 305, 307, 311, 321, 326, 327, 328, 334, 335, 336, 337]
abs_sent_offset 186
abs_sent_offset_end 339
parts: abs_sent_offset [339, 344, 353, 357, 362, 372, 375, 383, 388, 393, 394, 398, 400, 405, 407, 408, 412, 414, 415, 418, 422, 430, 437, 441, 442, 454, 459, 460, 462, 467, 478, 480, 482, 491, 495, 497, 500, 505, 507, 511, 517, 524, 526, 531, 532, 544, 551, 552, 554, 559, 561, 565, 567, 569, 573, 577, 579, 582, 587, 589, 592, 594, 598, 603, 606, 608, 610]
abs_sent_offset 339
abs_sent_offset_end 611
parts: abs_sent_offset [611, 616, 625, 629, 634, 644, 647, 655, 660

In [29]:
# Check if the document has beed loaded
from snorkel.models import Document, Sentence
for doc in session.query(Document).all():
    print doc.id
    #print doc.sentences

1
2
3


In [30]:
# Define Candidate Type
from snorkel.models import candidate_subclass, Document, Candidate
rel_contract = candidate_subclass('Contractor-Contractee', ['org1', 'org2'])

# Write the candidate extractor
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import SpacyOrganizationMatcher
ngrams         = Ngrams(n_max=7)
org_matcher = SpacyOrganizationMatcher()
cand_extractor = CandidateExtractor(rel_contract, [ngrams, ngrams], [org_matcher, org_matcher])

In [31]:
# Split the data into train, dev and test 
from snorkel.models import Document

docs = session.query(Document).order_by(Document.name).all()

train_sents = set()
dev_sents   = set()
test_sents  = set()

for i, doc in enumerate(docs):
    for s in doc.sentences:
        if i % 3 == 1:
            dev_sents.add(s)
        elif i % 3 == 2:
            test_sents.add(s)
        else:
            train_sents.add(s)
            
print len(train_sents)
print len(test_sents)
print len(dev_sents)

4
1
4


In [32]:
# Apply candidate extractors on all the sentences
# Save the candidates into the databse
# The rel_candidates will be splitted into train, dev, test 
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i)
    print("Number of candidates:", session.query(rel_contract).filter(rel_contract.split == i).count())
    


Clearing existing...
Running UDF...

('Number of candidates:', 8)
Clearing existing...
Running UDF...

('Number of candidates:', 12)
Clearing existing...
Running UDF...

('Number of candidates:', 1)


In [33]:
# Define the labeling functions
import re
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)
def LF_between_and(c):
    #print get_left_tokens(c[0], window=2)
    if "between" in get_left_tokens(c[0], window=5):
        #print get_between_tokens(c)
        if "and" in get_between_tokens(c):
            return 1
    return 0

def LF_agreement(c):
    if "agreement" in get_left_tokens(c[0], window=30):
        return 1
    return 0
    


In [34]:
# Test the first labeling function: LF_between_and
labeled = []
for c in session.query(rel_contract).filter(rel_contract.split == 0).all():
    if LF_between_and(c) != 0:
        labeled.append(c)
print("Number labeled:", len(labeled))
for i in range(len(labeled)):
    print labeled[i].get_parent()
    #print labeled[i].labels
    print




('Number labeled:', 4)
Sentence(Document b5dfde83-dde0-4235-b749-4c2aaf8abd06,3,THIS PURCHASE AND SALE AGREEMENT is entered into this  11 Feb,  2050, 
by and between Solarcity Corp. (hereinafter Buyer), 391 Foster City Blvd, Foster City, CA 94404 and The Hive Inc. (hereinafter Company), 720 University Ave #200, Palo Alto, CA 94301, USA.)

Sentence(Document b5dfde83-dde0-4235-b749-4c2aaf8abd06,3,THIS PURCHASE AND SALE AGREEMENT is entered into this  11 Feb,  2050, 
by and between Solarcity Corp. (hereinafter Buyer), 391 Foster City Blvd, Foster City, CA 94404 and The Hive Inc. (hereinafter Company), 720 University Ave #200, Palo Alto, CA 94301, USA.)

Sentence(Document b5dfde83-dde0-4235-b749-4c2aaf8abd06,2,THIS PURCHASE AND SALE AGREEMENT is entered into this  23rd, March,  2000, 
by and between Google Inc (hereinafter Buyer), 1545 Charleston Rd, Mountain View, CA 94043, and Cisco Systems, Inc. (hereinafter Company), 3571 N 1st St, San Jose, CA 95134, USA, and here is it. 
)

Sentence(

In [35]:
# Test the second labeling function: LF_agreement
labeled = []
for c in session.query(rel_contract).filter(rel_contract.split == 0).all():
    if LF_agreement(c) != 0:
        labeled.append(c)
print("Number labeled:", len(labeled))
for i in range(len(labeled)):
    print labeled[i].get_parent()
    print labeled[i]
    print




('Number labeled:', 7)
Sentence(Document b5dfde83-dde0-4235-b749-4c2aaf8abd06,3,THIS PURCHASE AND SALE AGREEMENT is entered into this  11 Feb,  2050, 
by and between Solarcity Corp. (hereinafter Buyer), 391 Foster City Blvd, Foster City, CA 94404 and The Hive Inc. (hereinafter Company), 720 University Ave #200, Palo Alto, CA 94301, USA.)
Contractor-Contractee(Span("Solarcity Corp. (", sentence=7, chars=[86,102], words=[20,22]), Span("Foster City Blvd", sentence=7, chars=[127,142], words=[28,30]))

Sentence(Document b5dfde83-dde0-4235-b749-4c2aaf8abd06,3,THIS PURCHASE AND SALE AGREEMENT is entered into this  11 Feb,  2050, 
by and between Solarcity Corp. (hereinafter Buyer), 391 Foster City Blvd, Foster City, CA 94404 and The Hive Inc. (hereinafter Company), 720 University Ave #200, Palo Alto, CA 94301, USA.)
Contractor-Contractee(Span("Solarcity Corp. (", sentence=7, chars=[86,102], words=[20,22]), Span("University Ave", sentence=7, chars=[212,225], words=[47,48]))

Sentence(Document b

In [36]:
LFs = [LF_between_and, LF_agreement]

# Apply the labelling functions
from snorkel.annotations import LabelAnnotator
import numpy as np
labeler = LabelAnnotator(lfs=LFs)

np.random.seed(1701)
L_train = labeler.apply(split=0)
# L_train is essentially the sparse matrix
# check the labeling results

for i in range(5):
    #print L_train.get_candidate(session, i)
    print L_train.get_candidate(session, i).get_parent()
    print L_train.get_candidate(session, i).org1.get_span(), " ", L_train.get_candidate(session, i).org2.get_span() 
    print L_train.get_candidate(session, i).labels
    print 
L_train.lf_stats(session)

Clearing existing...
Running UDF...

Sentence(Document b5dfde83-dde0-4235-b749-4c2aaf8abd06,3,THIS PURCHASE AND SALE AGREEMENT is entered into this  11 Feb,  2050, 
by and between Solarcity Corp. (hereinafter Buyer), 391 Foster City Blvd, Foster City, CA 94404 and The Hive Inc. (hereinafter Company), 720 University Ave #200, Palo Alto, CA 94301, USA.)
Solarcity Corp. (   Foster City Blvd
[Label (LF_agreement = 1)]

Sentence(Document b5dfde83-dde0-4235-b749-4c2aaf8abd06,3,THIS PURCHASE AND SALE AGREEMENT is entered into this  11 Feb,  2050, 
by and between Solarcity Corp. (hereinafter Buyer), 391 Foster City Blvd, Foster City, CA 94404 and The Hive Inc. (hereinafter Company), 720 University Ave #200, Palo Alto, CA 94301, USA.)
Solarcity Corp. (   University Ave
[Label (LF_between_and = 1), Label (LF_agreement = 1)]

Sentence(Document b5dfde83-dde0-4235-b749-4c2aaf8abd06,3,THIS PURCHASE AND SALE AGREEMENT is entered into this  11 Feb,  2050, 
by and between Solarcity Corp. (hereinafter B

Unnamed: 0,j,Coverage,Overlaps,Conflicts
LF_between_and,0,0.5,0.5,0.0
LF_agreement,1,0.875,0.5,0.0


In [37]:
# Launch Brat
from snorkel.contrib.brat import BratAnnotator

brat = BratAnnotator(session, rel_contract, encoding='utf-8', annotator_name='brat')

# Initilize the brat program
# the brat would copy the training data (split = 0) frrom the sqlite server 
# to the folder "snorkel/snorkel/snorkel/contrib/brat/brat-v1.3_Crunchy_Frog/data/contract/train"
brat.init_collection("contract/train", split=0, overwrite=False)

brat.view("contract/train")

Launching BRAT server at http://localhost:8001 [pid=90243]...


Error! Collection at 'contract/train' already exists. Please set overwrite=True to erase all existing annotations.


In [38]:
#load all the candidates and labeling results from the server
# the candiates are labeled 
train_cands = session.query(rel_contract).filter(rel_contract.split == 0).order_by(rel_contract.id).all()
for c in train_cands:
    print c
    print c.org1.get_span(), "--", c.org2.get_span()
    print c.labels
    print '-------------'

Contractor-Contractee(Span("Solarcity Corp. (", sentence=7, chars=[86,102], words=[20,22]), Span("Foster City Blvd", sentence=7, chars=[127,142], words=[28,30]))
Solarcity Corp. ( -- Foster City Blvd
[Label (LF_agreement = 1)]
-------------
Contractor-Contractee(Span("Solarcity Corp. (", sentence=7, chars=[86,102], words=[20,22]), Span("University Ave", sentence=7, chars=[212,225], words=[47,48]))
Solarcity Corp. ( -- University Ave
[Label (LF_between_and = 1), Label (LF_agreement = 1)]
-------------
Contractor-Contractee(Span("Solarcity Corp. (", sentence=7, chars=[86,102], words=[20,22]), Span("The Hive Inc.", sentence=7, chars=[171,183], words=[38,40]))
Solarcity Corp. ( -- The Hive Inc.
[Label (LF_between_and = 1), Label (LF_agreement = 1)]
-------------
Contractor-Contractee(Span("Foster City Blvd", sentence=7, chars=[127,142], words=[28,30]), Span("University Ave", sentence=7, chars=[212,225], words=[47,48]))
Foster City Blvd -- University Ave
[Label (LF_agreement = 1)]
---------

In [39]:
# import the gold labels from the brat files 
# match the relationship stored in the brat folder "contract/train" 
# with the training candidates in the current databse
# Note that the candidates exsit in "contract/train" but not exisit in the "train_cands"
# will not be imported to the databse
# 
brat.import_gold_labels(session, "contract/train", train_cands)


# Now we can load the brat_labels from the current database(session)
# Note that the annotator_name='brat' becasue when we we initilize the BratAnnotator
# the default annotator name will be 'brat'
from snorkel.annotations import load_gold_labels
L_gold_train = load_gold_labels(session, annotator_name='brat', split=0)
print L_gold_train




documents
{u'b5dfde83-dde0-4235-b749-4c2aaf8abd06': Document b5dfde83-dde0-4235-b749-4c2aaf8abd06}

annotations
{'b5dfde83-dde0-4235-b749-4c2aaf8abd06': {u'R1': (u'Contractor-Contractee', u'T1', u'T2'), u'R2': (u'Contractor-Contractee', u'T4', u'T3'), u'R3': (u'Contractor-Contractee', u'T5', u'T6'), u'T6': {'mention': u'The Hive Inc.', 'abs_char_start': 780, 'abs_char_end': 793, 'entity_type': u'Org'}, u'T4': {'mention': u'Google Inc', 'abs_char_start': 429, 'abs_char_end': 439, 'entity_type': u'Org'}, u'T5': {'mention': u'Solarcity Corp.', 'abs_char_start': 695, 'abs_char_end': 710, 'entity_type': u'Org'}, u'T2': {'mention': u'Microsoft Corp', 'abs_char_start': 311, 'abs_char_end': 325, 'entity_type': u'Org'}, u'T3': {'mention': u'Cisco Systems, Inc.', 'abs_char_start': 510, 'abs_char_end': 529, 'entity_type': u'Org'}, u'T1': {'mention': u'Cisco System', 'abs_char_start': 279, 'abs_char_end': 291, 'entity_type': u'Org'}}}
Annotations
R1 (u'Contractor-Contractee', u'T1', u'T2')
R2 (u'C

Mapped 1/3 (33%) of BRAT labels to candidates


In [148]:
# Fit the generative model
from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()
gen_model.train(L_train, epochs=1000, decay=0.95, step_size=0.01 / L_train.shape[0], reg_param=1e-6)

Inferred cardinality: 2


In [149]:
# error analysis of the generative model
tp, fp, tn, fn = gen_model.error_analysis(session, L_train, L_gold_train)

# generate the noisy-aware lables (marginals) for the training data
train_marginals = gen_model.marginals(L_train)
print train_marginals

Scores (Un-adjusted)
Pos. class accuracy: 1.0
Neg. class accuracy: 0.0
Precision            1.0
Recall               1.0
F1                   1.0
----------------------------------------
TP: 1 | FP: 0 | TN: 0 | FN: 0

[ 0.98803756]


In [150]:
# the coverage of the generative model for the LFs
gen_model.learned_lf_stats()

Unnamed: 0,Accuracy,Coverage,Precision,Recall
0,0.904649,0.7614,0.897992,0.692057
1,0.903163,0.7683,0.896807,0.692057


In [151]:
# label dev data using brat
brat.init_collection("contract/dev", split=1, overwrite=False)
brat.view("contract/dev")

Removed existing collection at 'contract/dev'


In [152]:
# load up the gold labels for dev data
dev_cands = session.query(rel_contract).filter(rel_contract.split == 1).order_by(rel_contract.id).all()
brat.import_gold_labels(session, "contract/dev", dev_cands)
L_gold_dev = load_gold_labels(session, annotator_name='brat', split=1)
print L_gold_dev



  (10, 0)	1


Mapped 1/2 (50%) of BRAT labels to candidates


In [153]:
# Training an LSTM model
# Carefule that the no. of relationships in dev dataset must >=2
from snorkel.learning.disc_models.rnn import reRNN

train_kwargs = {
    'lr':         0.01,
    'dim':        50,
    'n_epochs':   20,
    'dropout':    0.25,
    'print_freq': 1,
    'max_sentence_length': 100
}

lstm = reRNN(seed=1701, n_threads=None)
lstm.train(train_cands, train_marginals, X_dev=dev_cands, Y_dev=L_gold_dev, **train_kwargs)

[reRNN] Training model
[reRNN] n_train=1  #epochs=20  batch size=1
[reRNN] Epoch 0 (0.19s)	Average loss=0.683148	Dev F1=15.38
[reRNN] Epoch 1 (0.35s)	Average loss=0.556879	Dev F1=15.38
[reRNN] Epoch 2 (0.48s)	Average loss=0.345231	Dev F1=15.38
[reRNN] Epoch 3 (0.61s)	Average loss=0.114474	Dev F1=15.38
[reRNN] Epoch 4 (0.75s)	Average loss=0.070313	Dev F1=15.38
[reRNN] Epoch 5 (0.89s)	Average loss=0.064883	Dev F1=15.38
[reRNN] Epoch 6 (1.03s)	Average loss=0.068160	Dev F1=15.38
[reRNN] Epoch 7 (1.15s)	Average loss=0.072885	Dev F1=15.38
[reRNN] Epoch 8 (1.29s)	Average loss=0.076909	Dev F1=15.38
[reRNN] Epoch 9 (1.42s)	Average loss=0.079757	Dev F1=15.38
[reRNN] Epoch 10 (1.55s)	Average loss=0.081409	Dev F1=15.38
[reRNN] Epoch 11 (1.67s)	Average loss=0.081975	Dev F1=15.38
[reRNN] Epoch 12 (1.81s)	Average loss=0.081606	Dev F1=15.38
[reRNN] Epoch 13 (1.94s)	Average loss=0.080466	Dev F1=15.38
[reRNN] Epoch 14 (2.08s)	Average loss=0.078715	Dev F1=15.38
[reRNN] Epoch 15 (2.20s)	Average loss=0.076

In [154]:
# The accuracy of the lstm for the dev data
p, r, f1 = lstm.score(dev_cands, L_gold_dev)
print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))

Prec: 0.083, Recall: 1.000, F1 Score: 0.154


In [155]:
L_dev = labeler.apply_existing(split=1)
dev_marginals = gen_model.marginals(L_dev) 
lstm_dev_marginals = lstm.marginals(dev_cands)
for i in range(8):
    print L_dev.get_candidate(session, i).get_parent() # sentence
    print L_dev.get_candidate(session, i).org1.get_span(), " ", L_dev.get_candidate(session, i).org2.get_span() # candidate
    print L_dev.get_candidate(session, i).labels # labels by LFs
    print dev_marginals[i] # labels by generative models
    print lstm_dev_marginals[i]
    print 



Clearing existing...
Running UDF...

Sentence(Document b745346e-16a3-46ba-8adc-92b1fa085c38,0,THIS PURCHASE AND SALE AGREEMENT is entered into this  23rd Feb,  2017, 
by and between Microsoft Corporation (hereinafter Buyer), 600 North Wolfe Street, Baltimore, Maryland, 21287,
and Cisco Systems, Inc. (hereinafter Company), 3700 Momentum St, San Jose, CA 95134-2206, USA, and here is it. 
)
Microsoft Corporation   Buyer
[Label (LF_agreement = 1)]
0.898426912628
0.991041

Sentence(Document b745346e-16a3-46ba-8adc-92b1fa085c38,0,THIS PURCHASE AND SALE AGREEMENT is entered into this  23rd Feb,  2017, 
by and between Microsoft Corporation (hereinafter Buyer), 600 North Wolfe Street, Baltimore, Maryland, 21287,
and Cisco Systems, Inc. (hereinafter Company), 3700 Momentum St, San Jose, CA 95134-2206, USA, and here is it. 
)
Microsoft Corporation   North Wolfe Street
[Label (LF_agreement = 1)]
0.898426912628
0.991041

Sentence(Document b745346e-16a3-46ba-8adc-92b1fa085c38,0,THIS PURCHASE AND SAL

In [57]:
# finish the labeling of  test data
brat.init_collection("contract/test", split=2, overwrite=False)
brat.view("contract/test")

Removed existing collection at 'contract/test'


In [125]:
# load up the gold labels for test data
test_cands = session.query(rel_contract).filter(rel_contract.split == 2).order_by(rel_contract.id).all()
print test_cands
brat.import_gold_labels(session, "contract/test", test_cands)
L_gold_test = load_gold_labels(session, annotator_name='brat', split=2)
print L_gold_test

[Contractor-Contractee(Span("SmartPhone Services", sentence=10, chars=[0,18], words=[0,1]), Span("Darshi Inc.", sentence=10, chars=[24,34], words=[3,4]))]
  (0, 0)	1


Mapped 1/1 (100%) of BRAT labels to candidates


In [127]:
L_test = labeler.apply_existing(split=2)
# output marginals of the generative model
# It should be noted that the generative model depends on
# the LFs and operates on the output of labeler

test_marginals = gen_model.marginals(L_test) 
lstm_test_marginals = lstm.marginals(dev_cands)
for i in range(1):
    print L_test.get_candidate(session, i).get_parent() # sentence
    print L_test.get_candidate(session, i) # candidate
    print L_dev.get_candidate(session, i).org1.get_span(), " ", L_dev.get_candidate(session, i).org2.get_span() # candidate
    print L_test.get_candidate(session, i).labels # labels by LFs
    print test_marginals[i] # labels by generative models
    print lstm_test_marginals[i]
    print 

Clearing existing...
Running UDF...

Sentence(Document c5dbaa73-8076-4385-944c-1539a30152d7,0,SmartPhone Services and Darshi Inc. agree that the contract is valid.)
Contractor-Contractee(Span("SmartPhone Services", sentence=10, chars=[0,18], words=[0,1]), Span("Darshi Inc.", sentence=10, chars=[24,34], words=[3,4]))
Starbucks Corp   CA 95134-2206
[]
0.5
0.99784



In [11]:
from snorkel.contrib.brat import BratProject
bratProject = BratProject(session)

In [36]:
f = r"/Users/kunling/Dropbox/decisionengines/snorkel/snorkel/contrib/brat/brat-v1.3_Crunchy_Frog/data/contract/train"
bratProject.import_project(f)
from snorkel.annotations import load_gold_labels
L_gold_train = load_gold_labels(session, annotator_name='brat')
#print L_gold_train

CREATED TYPE Entity(Org,[org])
CREATED TYPE Relation(ContractorContractee,['org1', 'org2'])
0
Org
1
Contractor-Contractee
{'org_id': 35, 'split': 0}
Org(None)
Org(None)
{'org_id': 36, 'split': 0}
Org(None)
Org(None)
{'org_id': 37, 'split': 0}
Org(None)
Org(None)
{'org_id': 24, 'split': 0}
Org(None)
Org(None)
{'org_id': 38, 'split': 0}
Org(None)
Org(None)
{'org_id': 23, 'split': 0}
Org(None)
Org(None)
{'org2_id': 24, 'split': 1, 'org1_id': 23}
ContractorContractee(None, None)
ContractorContractee(None, None)
{'org2_id': 38, 'split': 1, 'org1_id': 36}
ContractorContractee(None, None)
ContractorContractee(None, None)
{'org2_id': 35, 'split': 1, 'org1_id': 37}
ContractorContractee(None, None)
ContractorContractee(None, None)


In [31]:
from snorkel.models import Candidate, StableLabel, Document, TemporarySpan, Sentence, candidate_subclass, GoldLabel
gold_labels = session.query(rel_contract).all()
print gold_labels

[Contractor-Contractee(Span("SmartPhone Services", sentence=12, chars=[0,18], words=[0,1]), Span("Darshi Inc.", sentence=12, chars=[24,34], words=[3,4]))]
