In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()

import os
from snorkel.parser import XMLMultiDocPreprocessor

# The following line is for testing only. Feel free to ignore it.
file_path = 'articles/training.xml'
train_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//article',
    text='.//front/article-meta/abstract/p/text()',
    id=  './/front/article-meta/article-id/text()'
)

file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,    
    doc='.//document',    
    text='.//passage/text/text()',    
    id='.//id/text()'
)

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,    
    doc='.//document',    
    text='.//passage/text/text()',    
    id='.//id/text()'
)

In [2]:
from snorkel.parser import CorpusParser

corpus_parser = CorpusParser()

# Parse the training documents first, with the default clearing behaviour.
# (Parallelism can be run with a Postgres DBMS, but not SQLite.)
corpus_parser.apply(list(train_preprocessor))

# Add the development and test documents without clearing what was parsed.
for extra_preprocessor in (dev_preprocessor, test_preprocessor):
    corpus_parser.apply(list(extra_preprocessor), clear=False)

Clearing existing...
Running UDF...
[=                                       ] 0%



[=                                       ] 1%



[=                                       ] 2%



[==                                      ] 3%



[==                                      ] 4%



[===                                     ] 5%



[===                                     ] 7%



[====                                    ] 8%



[====                                    ] 9%



[=====                                   ] 11%



[=====                                   ] 12%
































































































































































































































































































Running UDF...

Running UDF...





In [2]:
from six.moves.cPickle import load
from snorkel.models import Document, Sentence

# NOTE: unpickling can execute arbitrary code; only load doc_ids.pkl from a
# trusted source.
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = [set(ids) for ids in load(f)]

train_sents, dev_sents, test_sents = set(), set(), set()

# (document-name set, sentence bucket) pairs, checked in train/dev/test order.
split_buckets = (
    (train_ids, train_sents),
    (dev_ids, dev_sents),
    (test_ids, test_sents),
)

docs = session.query(Document).order_by(Document.name).all()
for doc in docs:
    for s in doc.sentences:
        for id_set, sentence_bucket in split_buckets:
            if doc.name in id_set:
                sentence_bucket.add(s)
                break
        else:
            # The document name is in none of the three id sets.
            raise Exception('ID <{0}> not found in any id set'.format(doc.name))

In [13]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher
import matchers
from snorkel.models import Document
from snorkel.viewer import SentenceNgramViewer
import os

# Candidate schema: a (biomarker, level, unit) triple.
BiomarkerLevelUnit = candidate_subclass('BiomarkerLevelUnit', ['biomarker', 'level', 'unit'])

# One n-gram generator per schema field.
biomarker_ngrams = Ngrams(n_max=1)
level_ngrams = Ngrams(n_max=8)
unit_ngrams = Ngrams(n_max=8)

# One matcher per schema field (three in total), built by the local
# `matchers` module.
bMatcher = matchers.getBiomarkerMatcher()
lMatcher = matchers.getLevelsMatcher()
uMatcher = matchers.getUnitsMatcher()

# The extractor pairs each n-gram generator with the matcher at the same
# position in the second list.
candidate_extractor = CandidateExtractor(
    BiomarkerLevelUnit,
    [biomarker_ngrams, level_ngrams, unit_ngrams],
    [bMatcher, lMatcher, uMatcher],
)


['Ym', 'yottameter', 'Zm', 'zettameter', 'Em', 'exameter', 'Pm', 'petameter', 'Tm', 'terameter', 'Gm', 'gigameter', 'Mm', 'megameter', 'km', 'kilometer', 'hm', 'hectometer', 'dam', 'dekameter', 'dm', 'decimeter', 'cm', 'centimeter', '\\u03bcm', 'micrometer', 'nm', 'nanometer', 'pm', 'picometer', 'fm', 'femtometer', 'am', 'attometer', 'zm', 'zeptometer', 'ym', 'yoctometer', 'Ym2', 'square yottameter', 'Zm2', 'square zettameter', 'Em2', 'square exameter', 'Pm2', 'square petameter', 'Tm2', 'square terameter', 'Gm2', 'square gigameter', 'Mm2', 'square megameter', 'km2', 'square kilometer', 'hm2', 'square hectometer', 'dam2', 'square dekameter', 'dm2', 'square decimeter', 'cm2', 'square centimeter', '\\u03bcm2', 'square micrometer', 'nm2', 'square nanometer', 'pm2', 'square picometer', 'fm2', 'square femtometer', 'am2', 'square attometer', 'zm2', 'square zeptometer', 'ym2', 'square yoctometer', 'Ym2', 'cubic yottameter', 'Zm2', 'cubic zettameter', 'Em2', 'cubic exameter', 'Pm2', 'cubic peta

In [15]:
# Extract candidates for every split; the split index follows the list order
# (0 = train, 1 = dev, 2 = test).
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    candidate_extractor.apply(sents, split=k)
    num_candidates = session.query(BiomarkerLevelUnit).filter(BiomarkerLevelUnit.split == k).count()
    # A single formatted string prints identically under Python 2 and 3
    # (the old two-argument print() call printed a tuple under Python 2).
    print("Number of candidates: {0}".format(num_candidates))

dev_cands = session.query(BiomarkerLevelUnit).filter(BiomarkerLevelUnit.split == 1).all()
# Preview only the first few candidates instead of dumping the whole list.
print(dev_cands[:5])

# Interactive viewer over the dev candidates, one sentence per page.
sv = SentenceNgramViewer(dev_cands, session, n_per_page=1)
sv

Clearing existing...
Running UDF...

('Number of candidates:', 176)
Clearing existing...
Running UDF...

('Number of candidates:', 60)
Clearing existing...
Running UDF...

('Number of candidates:', 6)
[BiomarkerLevelUnit(Span("ISO", sentence=14571, chars=[26,28], words=[3,3]), Span("85", sentence=14571, chars=[31,32], words=[5,5]), Span("kg", sentence=14571, chars=[39,40], words=[6,6])), BiomarkerLevelUnit(Span("CD", sentence=13855, chars=[179,180], words=[35,35]), Span("3", sentence=13855, chars=[195,195], words=[40,40]), Span("kg", sentence=13855, chars=[279,280], words=[58,58])), BiomarkerLevelUnit(Span("CD", sentence=13855, chars=[179,180], words=[35,35]), Span("3", sentence=13855, chars=[195,195], words=[40,40]), Span("kg", sentence=13855, chars=[221,222], words=[46,46])), BiomarkerLevelUnit(Span("CD", sentence=13855, chars=[179,180], words=[35,35]), Span("2", sentence=13855, chars=[147,147], words=[29,29]), Span("kg", sentence=13855, chars=[279,280], words=[58,58])), BiomarkerLev

<IPython.core.display.Javascript object>

In [None]:
# Number of dev-split candidates (function-call form works on Python 2 and 3).
print(len(dev_cands))

In [None]:
# Lemmas that LF_keyword looks for.
keyWords = [
    'basis',
    'target',
    'treat',
    'treatment',
    'inhibit',
    'inhibition',
    'inhibitor',
]
# Negation lemmas checked by presenceOfNot.
negationWords = [
    "not",
    "nor",
    "neither",
]

def presenceOfNot(m):
    """Return True if any entry of negationWords occurs in both
    m.post_window1('lemmas', 20) and m.pre_window2('lemmas', 20), else False.
    """
    return any(
        negation in m.post_window1('lemmas', 20) and negation in m.pre_window2('lemmas', 20)
        for negation in negationWords
    )

# 1
def LF_distance(m):
    """Labeling function based on the distance between the two entities.

    Args:
        m: candidate exposing ``e1_idxs``, ``e2_idxs``, ``lemmas`` and
            ``pre_window1(attribute, n)``.

    Returns:
        0 (abstain) when ``m.lemmas`` holds more than one comma and a comma is
        in ``m.pre_window1('lemmas', 1)``; otherwise -1 when the first indices
        of the two entities are equal or at least 8 apart, and 0 when they
        are 1 to 7 apart.
    """
    distance = abs(m.e2_idxs[0] - m.e1_idxs[0])
    comma_count = sum(1 for lemma in m.lemmas if lemma == ',')
    # Comma-heavy context with a comma right before entity 1: abstain.
    # (The debug print that used to fire here on every match was removed.)
    if comma_count > 1 and ',' in m.pre_window1('lemmas', 1):
        return 0
    # NOTE(review): a nearby distance abstains (0) rather than voting 1; an old
    # commented-out debug line read "RETURNING ONE" — confirm 0 is intended.
    if 0 < distance < 8:
        return 0
    return -1

def LF_keyword(m):
    """Labeling function keyed on the module-level ``keyWords`` list.

    Returns:
        For the first keyword found in both ``m.post_window1('lemmas', 20)``
        and ``m.pre_window2('lemmas', 20)``: -1 if ``presenceOfNot(m)`` is
        true, else 1. Returns 0 when no keyword matches.
    """
    # The window lookups do not depend on the keyword, so fetch them once.
    post_lemmas = m.post_window1('lemmas', 20)
    pre_lemmas = m.pre_window2('lemmas', 20)
    # NOTE(review): the previous condition OR-ed this test with an identical
    # copy of itself; the second copy was presumably meant to inspect other
    # windows — confirm which ones before extending this check.
    for word in keyWords:
        if word in post_lemmas and word in pre_lemmas:
            return -1 if presenceOfNot(m) else 1
    return 0

def LF_usedToTreat(m):
    """Labeling function: return 1 when 'used', 'to' and 'treat' all occur in
    ``m.pre_window1('lemmas', 20)``, otherwise 0.

    The previous version wrote ``('used' in m.pre_window1('lemmas'), 20)``,
    a two-element tuple that is always truthy, so 'used' was never checked.
    """
    # NOTE(review): the window holds lemmas, and the lemma of "used" is
    # usually "use" — confirm 'used' is the token to look for.
    window = m.pre_window1('lemmas', 20)
    if 'used' in window and 'to' in window and 'treat' in window:
        return 1
    return 0
     
def LF_usedToInhibit(m):
    """Labeling function: return 1 when 'used', 'to' and 'inhibit' all occur
    in ``m.pre_window1('lemmas', 20)``, otherwise 0.

    The previous version wrote ``('used' in m.pre_window1('lemmas'), 20)``,
    a two-element tuple that is always truthy, so 'used' was never checked.
    """
    # NOTE(review): the window holds lemmas, and the lemma of "used" is
    # usually "use" — confirm 'used' is the token to look for.
    window = m.pre_window1('lemmas', 20)
    if 'used' in window and 'to' in window and 'inhibit' in window:
        return 1
    return 0
     
def LF_inhibit(m):
    """Labeling function: return 1 when 'inhibit' occurs in both
    ``m.pre_window1('lemmas', 20)`` and ``m.pre_window2('lemmas', 20)``,
    otherwise 0.

    The second lookup previously asked for the misspelled attribute
    ``'lemms'``.
    """
    if 'inhibit' in m.pre_window1('lemmas', 20) and 'inhibit' in m.pre_window2('lemmas', 20):
        return 1
    return 0
    
def LF_target(m):
    """Labeling function: return 1 when 'target' occurs in both
    ``m.pre_window1('lemmas', 20)`` and ``m.pre_window2('lemmas', 20)``,
    otherwise 0.

    The second lookup previously asked for the misspelled attribute
    ``'lemms'``.
    """
    if 'target' in m.pre_window1('lemmas', 20) and 'target' in m.pre_window2('lemmas', 20):
        return 1
    return 0

def LF_inhibitionOfWith(m):
    """Labeling function: return 1 when 'inhibition' and 'of' are both in
    ``m.pre_window1('lemmas', 20)`` and 'with' is in
    ``m.post_window1('lemmas', 20)``; otherwise 0.
    """
    pre_lemmas = m.pre_window1('lemmas', 20)
    if 'inhibition' not in pre_lemmas or 'of' not in pre_lemmas:
        return 0
    if 'with' not in m.post_window1('lemmas', 20):
        return 0
    return 1
    
def LF_isATreatmentOf(m):
    """Labeling function: return 1 when 'is', 'a', 'treatment' and 'of' all
    occur in ``m.pre_window1('lemmas', 20)``; otherwise 0.
    """
    pre_lemmas = m.pre_window1('lemmas', 20)
    required = ('is', 'a', 'treatment', 'of')
    return 1 if all(token in pre_lemmas for token in required) else 0
    
def LF_antibody(m):
    """Labeling function: return 1 when 'antibody' occurs in
    ``m.post_window2('lemmas', 20)``; otherwise 0.
    """
    following_lemmas = m.post_window2('lemmas', 20)
    if 'antibody' not in following_lemmas:
        return 0
    return 1
def LF_hadisbad(m):
    """Labeling function: return -1 when ``m.mention1`` or ``m.mention2``
    equals "had"; otherwise abstain with 0.

    The previous version fell off the end and returned None instead of 0
    when neither mention matched.
    """
    # NOTE(review): LF_isaBiomarker calls m.mention1(attribute=...) as a method;
    # if mention1/mention2 are methods on these candidates too, the equality
    # checks below can never be true — confirm and call them if so.
    if m.mention1 == "had" or m.mention2 == "had":
        return -1
    return 0
def LF_duration(m):
    """Labeling function: return -1 when ``m.mention1`` or ``m.mention2``
    equals "duration"; otherwise abstain with 0.

    The previous version fell off the end and returned None instead of 0
    when neither mention matched.
    """
    # NOTE(review): LF_isaBiomarker calls m.mention1(attribute=...) as a method;
    # if mention1/mention2 are methods on these candidates too, the equality
    # checks below can never be true — confirm and call them if so.
    if m.mention1 == "duration" or m.mention2 == "duration":
        return -1
    return 0
def LF_isaBiomarker(m):
    """Labeling function for "<entity> is a biomarker/marker/indicator" patterns.

    Args:
        m: candidate exposing ``post_window1(attribute, n)``,
            ``pre_window2(attribute, n)`` and ``mention1(attribute=...)``.

    Returns:
        1 when all of the following hold, otherwise 0:
          * one of 'biomarker', 'marker', 'indicator' occurs in both
            ``m.post_window1('lemmas', 20)`` and ``m.pre_window2('lemmas', 20)``;
          * 'cop' occurs in ``m.post_window1('dep_labels', 20)``;
          * 'nsubj' occurs in ``m.mention1(attribute='dep_labels')``;
          * the marker index minus the 'cop' index is below 4.
    """
    marker_terms = ('biomarker', 'marker', 'indicator')
    post_lemmas = m.post_window1('lemmas', 20)
    pre_lemmas = m.pre_window2('lemmas', 20)
    if not any(term in post_lemmas and term in pre_lemmas for term in marker_terms):
        return 0

    # When several marker terms are present, the last one listed in
    # marker_terms supplies the index (same precedence as before). Explicit
    # membership tests replace the old bare ``except: pass`` around .index().
    marker_idx = -1
    for term in marker_terms:
        if term in post_lemmas:
            marker_idx = post_lemmas.index(term)

    post_dep_labels = m.post_window1('dep_labels', 20)
    if 'cop' not in post_dep_labels:
        return 0
    cop_idx = post_dep_labels.index('cop')

    # NOTE(review): the difference is not required to be positive, so a marker
    # that precedes the copula also passes — confirm that is intended.
    if 'nsubj' in m.mention1(attribute='dep_labels') and marker_idx - cop_idx < 4:
        return 1
    return 0

In [None]:
# Labeling functions handed to the LabelAnnotator. The name is spelled
# LF_inhibitionOfWith to match its definition; the earlier "OFWith" spelling
# raised a NameError.
LFs = [LF_distance, LF_keyword, LF_usedToTreat, LF_inhibit, LF_inhibitionOfWith, LF_isaBiomarker]

In [None]:
from snorkel.annotations import LabelAnnotator
# Build the annotator from the labeling functions collected in LFs.
labeler = LabelAnnotator(lfs=LFs)

In [None]:
# Apply the labeler to split 0 (the index given to train_sents during
# candidate extraction); %time reports how long the call takes.
%time L_train = labeler.apply(split=0)
# Display the result.
L_train

In [None]:
# Display per-labeling-function statistics for the split-0 labels.
L_train.lf_stats(session)

In [None]:
from snorkel.learning.structure import DependencySelector
ds = DependencySelector()
# Select dependencies from L_train using a threshold of 0.1.
deps = ds.select(L_train, threshold=0.1)
# Show how many dependencies were selected.
len(deps)

In [None]:
# Display the selected dependencies.
deps

In [None]:
# NOTE(review): this replaces the dependencies selected by DependencySelector
# with an empty set, so the generative model is trained with deps=set() —
# confirm this override is intentional.
deps = set()

In [None]:
from snorkel.learning import GenerativeModel

# Training options; the step size is 0.1 divided by the number of rows of
# L_train.
generative_train_options = dict(
    deps=deps,
    decay=0.95,
    step_size=0.1 / L_train.shape[0],
    reg_param=0.0,
)

gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train, **generative_train_options)

In [None]:
# Marginals computed by the trained generative model for L_train.
train_marginals = gen_model.marginals(L_train)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the generative model's training marginals, labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(train_marginals, bins=20)
ax.set_title('Distribution of training marginals')
ax.set_xlabel('Marginal value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Display the statistics the generative model learned for each labeling function.
gen_model.learned_lf_stats()

In [None]:
from snorkel.annotations import save_marginals
# Persist the training marginals through the session; a later cell reads them
# back with load_marginals.
save_marginals(session, L_train, train_marginals)

In [None]:
# Apply the labeler to the dev split (split=1).
# L_gold_dev is first assigned by the gold-label cell further down, so it is
# no longer printed here; doing so raised a NameError on a fresh kernel.
L_dev = labeler.apply_existing(split=1)

In [None]:
from load_external_annotations_new import load_external_labels
# Use BiomarkerLevelUnit, the candidate class defined in this notebook; the
# BiomarkerDrug name passed here before is never defined.
# NOTE(review): confirm load_external_labels supports this candidate class.
load_external_labels(session, BiomarkerLevelUnit, annotator_name='gold')

In [None]:
from snorkel.annotations import load_gold_labels
# Load the labels stored under annotator 'gold' for the dev split (split=1).
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
# Display the loaded labels.
L_gold_dev

In [None]:
# Score the generative model on the dev labels against the gold labels; the
# return value is discarded.
_ = gen_model.score(session, L_dev, L_gold_dev)

In [None]:
# Per-labeling-function statistics on the dev split, given the gold labels
# and the 'Accuracy' column of the generative model's learned statistics.
L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy'])

In [None]:
import numpy as np
from snorkel.annotations import load_gold_labels

# Reload the gold dev labels (split=1).
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

# Flatten to a 1-D array, then shift by +1 and halve to convert the labels
# to the [0, 1] range.
dev_gold_flat = np.ravel(L_gold_dev.todense())
dev_labels = (dev_gold_flat + 1) / 2

In [None]:
from snorkel.annotations import FeatureAnnotator
featurizer = FeatureAnnotator()

# Featurize split 0 (the training split); %time reports how long it takes.
%time F_train = featurizer.apply(split=0)
# Display the result.
F_train

In [None]:
%%time
# Featurize the dev (split=1) and test (split=2) splits with the same
# featurizer, via apply_existing.
F_dev  = featurizer.apply_existing(split=1)
F_test = featurizer.apply_existing(split=2)

In [None]:
from snorkel.annotations import load_marginals
# Read back the training marginals saved earlier with save_marginals.
train_marginals = load_marginals(session, F_train, split=0)

In [None]:
from snorkel.learning import SparseLogisticRegression
# Discriminative model that the random search below will tune.
disc_model = SparseLogisticRegression()

In [None]:
from snorkel.learning.utils import MentionScorer
from snorkel.learning import RandomSearch, ListParameter, RangeParameter

# Search space: learning rate plus L1 and L2 penalties, each ranging over
# 1e-6 .. 1e-2 with step=1 and log_base=10.
rate_param, l1_param, l2_param = [
    RangeParameter(param_name, 1e-6, 1e-2, step=1, log_base=10)
    for param_name in ('lr', 'l1_penalty', 'l2_penalty')
]

# Random search with n=20 over the three parameters.
searcher = RandomSearch(
    session,
    disc_model,
    F_train,
    train_marginals,
    [rate_param, l1_param, l2_param],
    n=20,
)

In [None]:
from snorkel.annotations import load_gold_labels
# Reload the gold dev labels (split=1) and display them.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

import numpy as np
# Seed NumPy's global RNG before fitting.
# NOTE(review): `searcher` is constructed in an earlier cell, before this seed
# is set — confirm nothing random happens at construction time.
np.random.seed(1701)
# Run the search, evaluating against the dev features and gold labels.
searcher.fit(F_dev, L_gold_dev, n_epochs=50, rebalance=0.5, print_freq=25)

### Scoring on the test set

Finally, we'll evaluate our performance on the blind test set of 500 documents. We'll load the gold labels the same way we did for the development set, and then use the `score` function of our extraction model to see how we did.

from load_external_annotations import load_external_labels
# Use BiomarkerLevelUnit, the candidate class defined in this notebook; the
# BiomarkerDrug name passed here before is never defined.
# NOTE(review): the dev labels were loaded from load_external_annotations_new
# with the keyword annotator_name=; this call uses a different module and the
# keyword annotator= — confirm which module and signature are current.
load_external_labels(session, BiomarkerLevelUnit, split=2, annotator='gold')

from snorkel.annotations import load_gold_labels
# Gold labels for the test split (split=2).
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
L_gold_test

# Score the discriminative model on the test features; the four returned
# values are discarded.
_, _, _, _ = disc_model.score(session, F_test, L_gold_test)

In [None]:
from snorkel.contrib.rnn import reRNN

train_kwargs = {
    'lr':         0.01,
    'dim':        100,
    'n_epochs':   50,
    'dropout':    0.5,
    'rebalance':  0.25,
    'print_freq': 5
}

# `train` and `dev` were never defined anywhere in this notebook, so the
# train() call raised a NameError on a fresh kernel. Query the candidates of
# split 0 (train) and split 1 (dev) here.
# NOTE(review): ordering by id is assumed to line the candidates up with
# train_marginals and dev_labels — confirm against the marginal/label loaders.
train = (
    session.query(BiomarkerLevelUnit)
    .filter(BiomarkerLevelUnit.split == 0)
    .order_by(BiomarkerLevelUnit.id)
    .all()
)
dev = (
    session.query(BiomarkerLevelUnit)
    .filter(BiomarkerLevelUnit.split == 1)
    .order_by(BiomarkerLevelUnit.id)
    .all()
)

lstm = reRNN(seed=1701, n_threads=None)
lstm.train(train, train_marginals, dev_candidates=dev, dev_labels=dev_labels, **train_kwargs)