In [1]:

%matplotlib inline
import os

# To use a database other than SQLite, uncomment the line below.
# Note that this is necessary for parallel execution, among other things.
# os.environ['SNORKELDB'] = 'postgres:///snorkel-intro'

from snorkel import SnorkelSession
# Session object used for the database queries in the rest of the notebook.
session = SnorkelSession()

In [2]:
# Parse the documents and save them into the SQLite database
# (see intro tutorial 1).

# Configure a document preprocessor pointed at data/test/
from snorkel.parser import TextDocPreprocessor
doc_preprocessor = TextDocPreprocessor('data/test/')
print doc_preprocessor.path


# Run a spaCy corpus parser
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser

# spaCy lang_model = en_core_web_md
corpus_parser = CorpusParser(parser=Spacy()) # use the spaCy parser: fast, but not accurate enough for NER



#corpus_parser = CorpusParser() # alternative: use the CoreNLP parser, slow but accurate for NER
corpus_parser.apply(doc_preprocessor)



data/test/
Clearing existing...
Running UDF...


In [4]:
# Confirm that the documents have been loaded by listing their ids
from snorkel.models import Document, Sentence
loaded_docs = session.query(Document).all()
for doc in loaded_docs:
    print(doc.id)

1
2
3


In [5]:
# Define the candidate type: a 'Spouse' relation with the two arguments
# 'person1' and 'person2'
from snorkel.models import candidate_subclass, Document, Candidate
rel_spouse = candidate_subclass('Spouse', ['person1', 'person2'])

# Build the candidate extractor. The same n-gram space (n_max=7) and the same
# person matcher are supplied for both arguments of the relation.
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher
ngrams         = Ngrams(n_max=7)
person_matcher = PersonMatcher()
cand_extractor = CandidateExtractor(rel_spouse, [ngrams, ngrams], [person_matcher, person_matcher])

In [19]:
# Partition the sentences into train, dev and test sets by document position
from snorkel.models import Document

docs = session.query(Document).order_by(Document.name).all()

train_sents, dev_sents, test_sents = set(), set(), set()

# The document's position (in name order) modulo 3 selects the split:
# remainder 0 -> train, 1 -> dev, 2 -> test
split_by_remainder = {0: train_sents, 1: dev_sents, 2: test_sents}
for position, doc in enumerate(docs):
    split_by_remainder[position % 3].update(doc.sentences)

# Sizes are printed in the order train, test, dev
print(len(train_sents))
print(len(test_sents))
print(len(dev_sents))

1
15
19


In [20]:
# Apply the candidate extractor to the sentences of each split
# (0 = train, 1 = dev, 2 = test) and report how many candidates it produced
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i)
    n_candidates = session.query(rel_spouse).filter(rel_spouse.split == i).count()
    # Build the message as a single string: with the print statement used in
    # this notebook, print("label", value) displays a tuple instead of text.
    print("Number of candidates: {}".format(n_candidates))
    


Clearing existing...
Running UDF...

('Number of candidates:', 3)
Clearing existing...
Running UDF...

('Number of candidates:', 17)
Clearing existing...
Running UDF...

('Number of candidates:', 1)


In [21]:
# For every document (in name order), print the first entry of each of its
# sentences' char_offsets
docs = session.query(Document).order_by(Document.name).all()
for document in docs:
    # NOTE(review): sent_offsets is computed but not used in the visible code
    sent_offsets = [sent.abs_char_offsets[0] for sent in document.sentences]
    char_offsets = [sent.char_offsets[0] for sent in document.sentences]
    print char_offsets


[0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [22]:
import re
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)

In [23]:
# Keyword sets consulted by the labeling functions.

# Words that indicate a spouse relation
spouses = {'spouse', 'wife', 'husband', 'ex-wife', 'ex-husband'}

# Words for family relations, extended with their '-in-law' variants
family = {'father', 'mother', 'sister', 'brother', 'son', 'daughter',
              'grandfather', 'grandmother', 'uncle', 'aunt', 'cousin'}
family = family | {f + '-in-law' for f in family}

# Words for other relations. Every entry must be comma-separated: adjacent
# string literals are concatenated, so a missing comma between 'girlfriend'
# and 'boss' would yield the single entry 'girlfriendboss'.
other = {'boyfriend', 'girlfriend', 'boss', 'employee', 'secretary', 'co-worker'}

# Helper function to get last name
def last_name(s):
    """Return the text after the last space in `s`.

    Returns None when `s` contains no space (a single-part name).
    """
    _, separator, tail = s.rpartition(' ')
    if not separator:
        return None
    return tail

def LF_husband_wife(c):
    """Return 1 if any token between the two spans is in `spouses`, else 0."""
    if spouses.intersection(get_between_tokens(c)):
        return 1
    return 0

def LF_husband_wife_left_window(c):
    """Return 1 if a word in `spouses` is among the left tokens (window=2)
    of either span of the candidate, else 0.

    The first span is checked before the second.
    """
    for span_index in (0, 1):
        left_tokens = get_left_tokens(c[span_index], window=2)
        if spouses.intersection(left_tokens):
            return 1
    return 0
    
def LF_same_last_name(c):
    """Return 1 if both spans have the same non-empty last name but are not
    the identical text, else 0."""
    span1 = c.person1.get_span()
    span2 = c.person2.get_span()
    surname1 = last_name(span1)
    surname2 = last_name(span2)
    # Both last names must be present (multi-part names) and equal
    if not (surname1 and surname2):
        return 0
    if surname1 != surname2:
        return 0
    # Identical full spans do not count as a match
    return 1 if span1 != span2 else 0

def LF_no_spouse_in_sentence(c):
    """Randomly vote -1 for a candidate whose sentence has no spouse keyword.

    Returns -1 when a uniform random draw is below 0.75 and none of the words
    of the candidate's parent sentence is in `spouses`; otherwise returns 0.
    """
    # Imported locally so that the function works no matter which notebook
    # cell (if any) has already imported numpy when it is called.
    import numpy as np
    # The random draw is evaluated first, so one number is consumed from
    # numpy's global random state on every call.
    if np.random.rand() < 0.75 and len(spouses.intersection(c.get_parent().words)) == 0:
        return -1
    return 0

def LF_and_married(c):
    """Return 1 if 'and' is among the tokens between the spans and 'married'
    is among the right tokens of the candidate, else 0."""
    if 'and' not in get_between_tokens(c):
        return 0
    if 'married' not in get_right_tokens(c):
        return 0
    return 1
    
def LF_familial_relationship(c):
    """Return -1 if any token between the two spans is in `family`, else 0."""
    if family.intersection(get_between_tokens(c)):
        return -1
    return 0

def LF_family_left_window(c):
    """Return -1 if a word in `family` is among the left tokens (window=2)
    of either span of the candidate, else 0.

    The first span is checked before the second.
    """
    for span_index in (0, 1):
        left_tokens = get_left_tokens(c[span_index], window=2)
        if family.intersection(left_tokens):
            return -1
    return 0

def LF_other_relationship(c):
    """Return -1 if any token between the two spans is in `other`, else 0."""
    if other.intersection(get_between_tokens(c)):
        return -1
    return 0


In [24]:
# Test the first labeling function, LF_husband_wife:
# collect the training candidates (split 0) that receive a non-zero label
labeled = []
for c in session.query(rel_spouse).filter(rel_spouse.split == 0).all():
    if LF_husband_wife(c) != 0:
        labeled.append(c)
# Build the message as a single string: with the print statement used in this
# notebook, print("label", value) displays a tuple instead of text.
print("Number labeled: {}".format(len(labeled)))
# Show the parent sentence of every labeled candidate, separated by blank lines
for cand in labeled:
    print(cand.get_parent())
    print("")




('Number labeled:', 2)
Sentence(Document 1,0,Top Nigerian comedian Julius Agwu with his wife Ibiere held a birthday party for their daughter Zahra who turned six recently   The party was held at Monkey Joes in Houston Texas yesterday September 7)

Sentence(Document 1,0,Top Nigerian comedian Julius Agwu with his wife Ibiere held a birthday party for their daughter Zahra who turned six recently   The party was held at Monkey Joes in Houston Texas yesterday September 7)



In [25]:
# All labeling functions, in the order used for the label matrix columns
LFs = [
    LF_husband_wife,
    LF_husband_wife_left_window,
    LF_same_last_name,
    LF_no_spouse_in_sentence,
    LF_and_married,
    LF_familial_relationship,
    LF_family_left_window,
    LF_other_relationship,
]



Clearing existing...
Running UDF...

Spouse(Span("Ibiere", sentence=4, chars=[48,53], words=[8,8]), Span("Zahra", sentence=4, chars=[96,100], words=[16,16]))
[Label (LF_husband_wife_left_window = 1), Label (LF_familial_relationship = -1), Label (LF_family_left_window = -1)]
Spouse(Span("Julius Agwu", sentence=4, chars=[22,32], words=[3,4]), Span("Zahra", sentence=4, chars=[96,100], words=[16,16]))
[Label (LF_husband_wife = 1), Label (LF_familial_relationship = -1), Label (LF_family_left_window = -1)]
Spouse(Span("Julius Agwu", sentence=4, chars=[22,32], words=[3,4]), Span("Ibiere", sentence=4, chars=[48,53], words=[8,8]))
[Label (LF_husband_wife = 1), Label (LF_husband_wife_left_window = 1)]


In [27]:
# Launch BRAT
from snorkel.contrib.brat import BratAnnotator

brat = BratAnnotator(session, rel_spouse, encoding='utf-8', annotator_name='brat')

# Initialize the BRAT collection.
# BRAT copies the training data (split = 0) from the SQLite database
# to the folder "snorkel/snorkel/snorkel/contrib/brat/brat-v1.3_Crunchy_Frog/data/contract/train"
# NOTE(review): the folder above ends in "contract/train" while the collection
# created here is "geng_spouse/train" -- confirm which path is current.
brat.init_collection("geng_spouse/train", split=0, overwrite=True)

brat.view("geng_spouse/train")

Launching BRAT server at http://localhost:8001 [pid=79957]...
Killing BRAT server [79943]...
Removed existing collection at 'geng_spouse/train'


In [28]:
# Load all training candidates (split 0), ordered by id, from the database
# and show each one together with the labels stored on it
train_query = session.query(rel_spouse).filter(rel_spouse.split == 0)
train_cands = train_query.order_by(rel_spouse.id).all()
for cand in train_cands:
    print(cand)
    print(cand.labels)
    print('-------------')

Spouse(Span("Ibiere", sentence=4, chars=[48,53], words=[8,8]), Span("Zahra", sentence=4, chars=[96,100], words=[16,16]))
[Label (LF_husband_wife_left_window = 1), Label (LF_familial_relationship = -1), Label (LF_family_left_window = -1)]
-------------
Spouse(Span("Julius Agwu", sentence=4, chars=[22,32], words=[3,4]), Span("Zahra", sentence=4, chars=[96,100], words=[16,16]))
[Label (LF_husband_wife = 1), Label (LF_familial_relationship = -1), Label (LF_family_left_window = -1)]
-------------
Spouse(Span("Julius Agwu", sentence=4, chars=[22,32], words=[3,4]), Span("Ibiere", sentence=4, chars=[48,53], words=[8,8]))
[Label (LF_husband_wife = 1), Label (LF_husband_wife_left_window = 1)]
-------------


Mapped 1/1 (100%) of BRAT labels to candidates


In [41]:
# Run the labeling functions over the training split
from snorkel.annotations import LabelAnnotator
import numpy as np
labeler = LabelAnnotator(lfs=LFs)

# Seed numpy's global random state before the labeling functions run
np.random.seed(1701)
# L_train is essentially a sparse matrix of the labeling results
L_train = labeler.apply(split=0)

# Check the labeling results for the first three candidates
for row in range(3):
    cand = L_train.get_candidate(session, row)
    print(cand)
    print(cand.labels)
L_train.lf_stats(session)

Clearing existing...
Running UDF...

Spouse(Span("Ibiere", sentence=4, chars=[48,53], words=[8,8]), Span("Zahra", sentence=4, chars=[96,100], words=[16,16]))
[Label (LF_husband_wife_left_window = 1), Label (LF_familial_relationship = -1), Label (LF_family_left_window = -1)]
Spouse(Span("Julius Agwu", sentence=4, chars=[22,32], words=[3,4]), Span("Zahra", sentence=4, chars=[96,100], words=[16,16]))
[Label (LF_husband_wife = 1), Label (LF_familial_relationship = -1), Label (LF_family_left_window = -1)]
Spouse(Span("Julius Agwu", sentence=4, chars=[22,32], words=[3,4]), Span("Ibiere", sentence=4, chars=[48,53], words=[8,8]))
[Label (LF_husband_wife = 1), Label (LF_husband_wife_left_window = 1)]


Unnamed: 0,j,Coverage,Overlaps,Conflicts
LF_husband_wife,0,0.666667,0.666667,0.333333
LF_husband_wife_left_window,1,0.666667,0.666667,0.333333
LF_same_last_name,2,0.0,0.0,0.0
LF_no_spouse_in_sentence,3,0.0,0.0,0.0
LF_and_married,4,0.0,0.0,0.0
LF_familial_relationship,5,0.666667,0.666667,0.666667
LF_family_left_window,6,0.666667,0.666667,0.666667
LF_other_relationship,7,0.0,0.0,0.0


In [40]:
# Map the BRAT annotations to Snorkel candidates so that the accuracy of our
# labeled train_cands can be compared against them.
# Import a collection of BRAT annotations, map it onto the provided set
# of candidates, and create gold labels. This method DOES NOT create new
# candidates, so some labels may not import if a corresponding candidate
# cannot be found.


brat.import_gold_labels(session, "geng_spouse/train", train_cands)

from snorkel.annotations import load_gold_labels

# Load the gold labels stored under the 'brat' annotator for the training split
L_gold_train = load_gold_labels(session, annotator_name='brat', split=0)
print L_gold_train



  (2, 0)	1


In [61]:
# Train the generative model on the training label matrix
from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()
# The step size is scaled down by the number of rows of L_train
gen_step_size = 0.01 / L_train.shape[0]
gen_model.train(
    L_train,
    epochs=1000,
    decay=0.95,
    step_size=gen_step_size,
    reg_param=1e-6
)

Inferred cardinality: 2


In [71]:
# Error analysis of the generative model on the training label matrix,
# scored against the BRAT gold labels in L_gold_train
tp, fp, tn, fn = gen_model.error_analysis(session, L_train, L_gold_train)

# Generate the noise-aware labels (marginals) for the training data
train_marginals = gen_model.marginals(L_train)
print train_marginals

Scores (Un-adjusted)
Pos. class accuracy: 1.0
Neg. class accuracy: 1.0
Precision            1.0
Recall               1.0
F1                   1.0
----------------------------------------
TP: 1 | FP: 0 | TN: 2 | FN: 0

[ 0.08316157  0.08057584  0.96242559]


In [63]:
# Show the statistics (accuracy, coverage, precision, recall) that the
# generative model learned for each labeling function
gen_model.learned_lf_stats()

Unnamed: 0,Accuracy,Coverage,Precision,Recall
0,0.826128,0.7356,0.821563,0.617129
1,0.839635,0.7352,0.839171,0.606398
2,0.839749,0.7326,0.830454,0.622798
3,0.836043,0.7258,0.835214,0.599312
4,0.838271,0.7358,0.83598,0.618141
5,0.877854,0.7532,0.87925,0.664912
6,0.891821,0.758,0.891489,0.67868
7,0.833969,0.7342,0.832508,0.612877


In [77]:
# Label the dev split (split=1) with labeler.apply_existing
L_dev = labeler.apply_existing(split=1)

# Output marginals of the generative model.
# It should be noted that the generative model depends on
# the LFs and operates on the output of the labeler.
dev_marginals = gen_model.marginals(L_dev)

# Inspect the first eight dev candidates
for row in range(8):
    cand = L_dev.get_candidate(session, row)
    print(cand.get_parent())   # sentence
    print(cand)                # candidate
    print(cand.labels)         # labels by LFs
    print(dev_marginals[row])  # marginal from the generative model
    print("")

Clearing existing...
Running UDF...

Sentence(Document 2,4,The FBI agent and Red Reddington (James Spader) will be on the run in Season 3 of the NBC series after the pair's Season 2 clash with the Cabal ended with Liz killing the corrupt U.S. Attorney General Tom Connolly (Reed Birney).
)
Spouse(Span("James Spader", sentence=9, chars=[34,45], words=[7,8]), Span("Reed Birney", sentence=9, chars=[215,225], words=[44,45]))
[Label (LF_no_spouse_in_sentence = -1)]
0.168335729296

Sentence(Document 2,4,The FBI agent and Red Reddington (James Spader) will be on the run in Season 3 of the NBC series after the pair's Season 2 clash with the Cabal ended with Liz killing the corrupt U.S. Attorney General Tom Connolly (Reed Birney).
)
Spouse(Span("James Spader", sentence=9, chars=[34,45], words=[7,8]), Span("Tom Connolly", sentence=9, chars=[201,212], words=[41,42]))
[Label (LF_no_spouse_in_sentence = -1)]
0.168335729296

Sentence(Document 2,4,The FBI agent and Red Reddington (James Spader) will b

In [85]:
# Create and open the BRAT collections used to finish labeling the dev
# (split=1) and test (split=2) data.
# With overwrite=False, the recorded run reported that each collection already
# exists rather than erasing its annotations.
brat.init_collection("geng_spouse/dev", split=1, overwrite=False)
brat.view("geng_spouse/dev")

brat.init_collection("geng_spouse/test", split=2, overwrite=False)
brat.view("geng_spouse/test")

Error! Collection at 'geng_spouse/dev' already exists. Please set overwrite=True to erase all existing annotations.


Error! Collection at 'geng_spouse/test' already exists. Please set overwrite=True to erase all existing annotations.


In [86]:
# Load the gold labels for the dev data (split 1)
dev_query = session.query(rel_spouse).filter(rel_spouse.split == 1)
dev_cands = dev_query.order_by(rel_spouse.id).all()
# Map the BRAT annotations of the dev collection onto the dev candidates
brat.import_gold_labels(session, "geng_spouse/dev", dev_cands)
L_gold_dev = load_gold_labels(session, annotator_name='brat', split=1)
print(L_gold_dev)


  (16, 0)	1


Mapped 1/1 (100%) of BRAT labels to candidates


In [87]:
# Load the gold labels for the test data (split 2)
# NOTE(review): the recorded run of this cell ended in
# "ZeroDivisionError: float division by zero". Presumably the test collection
# has no BRAT annotations to map -- confirm, and annotate the test collection
# before re-running.
test_cands = session.query(rel_spouse).filter(rel_spouse.split == 2).order_by(rel_spouse.id).all()
brat.import_gold_labels(session, "geng_spouse/test", test_cands)
L_gold_test = load_gold_labels(session, annotator_name='brat', split=2)
print L_gold_test

ZeroDivisionError: float division by zero

In [88]:
# Train an LSTM model on the training candidates and their marginals,
# with the dev candidates and dev gold labels passed for evaluation
from snorkel.learning.disc_models.rnn import reRNN

# Keyword arguments forwarded to lstm.train
train_kwargs = dict(
    lr=0.01,
    dim=50,
    n_epochs=10,
    dropout=0.25,
    print_freq=1,
    max_sentence_length=100
)

lstm = reRNN(seed=1701, n_threads=None)
lstm.train(train_cands, train_marginals, X_dev=dev_cands, Y_dev=L_gold_dev, **train_kwargs)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[reRNN] Training model
[reRNN] n_train=3  #epochs=10  batch size=3
[reRNN] Epoch 0 (0.25s)	Average loss=0.690271	Dev F1=0.00
[reRNN] Epoch 1 (0.42s)	Average loss=0.665733	Dev F1=0.00
[reRNN] Epoch 2 (0.55s)	Average loss=0.689899	Dev F1=0.00
[reRNN] Epoch 3 (0.67s)	Average loss=0.661771	Dev F1=0.00
[reRNN] Epoch 4 (0.80s)	Average loss=0.662512	Dev F1=0.00
[reRNN] Epoch 5 (0.93s)	Average loss=0.664333	Dev F1=0.00
[reRNN] Epoch 6 (1.06s)	Average loss=0.664365	Dev F1=0.00
[reRNN] Epoch 7 (1.19s)	Average loss=0.662932	Dev F1=0.00
[reRNN] Epoch 8 (1.39s)	Average loss=0.660743	Dev F1=0.00
[reRNN] Epoch 9 (1.54s)	Average loss=0.658482	Dev F1=0.00
[reRNN] Training done (1.55s)


In [89]:
# Score the LSTM on the dev data: precision, recall and F1
p, r, f1 = lstm.score(dev_cands, L_gold_dev)
score_report = "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1)
print(score_report)

Prec: 0.000, Recall: 0.000, F1 Score: 0.000


In [97]:

lstm_marginals = lstm.marginals(dev_cands)
for i in range(8):
    print L_dev.get_candidate(session, i).get_parent() # sentence
    print L_dev.get_candidate(session, i).person1.get_span(), " ", L_dev.get_candidate(session, i).person2.get_span() # candidate
    print L_dev.get_candidate(session, i).labels # labels by LFs
    print dev_marginals[i] # labels by generative models
    print lstm_marginals[i]
    print 


Sentence(Document 2,4,The FBI agent and Red Reddington (James Spader) will be on the run in Season 3 of the NBC series after the pair's Season 2 clash with the Cabal ended with Liz killing the corrupt U.S. Attorney General Tom Connolly (Reed Birney).
)
James Spader   Reed Birney
[Label (LF_no_spouse_in_sentence = -1)]
0.168335729296
0.424342

Sentence(Document 2,4,The FBI agent and Red Reddington (James Spader) will be on the run in Season 3 of the NBC series after the pair's Season 2 clash with the Cabal ended with Liz killing the corrupt U.S. Attorney General Tom Connolly (Reed Birney).
)
James Spader   Tom Connolly
[Label (LF_no_spouse_in_sentence = -1)]
0.168335729296
0.418509

Sentence(Document 2,4,The FBI agent and Red Reddington (James Spader) will be on the run in Season 3 of the NBC series after the pair's Season 2 clash with the Cabal ended with Liz killing the corrupt U.S. Attorney General Tom Connolly (Reed Birney).
)
James Spader   Liz
[]
0.5
0.422157

Sentence(Document 2,