In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()

import os
from snorkel.parser import XMLMultiDocPreprocessor

# Training corpus: each <article> element is one document; the text comes from
# the abstract paragraphs and the id from the article-id element.
file_path = 'articles/training.xml'
train_xpaths = {
    'doc': './/article',
    'text': './/front/article-meta/abstract/p/text()',
    'id': './/front/article-meta/article-id/text()',
}
train_preprocessor = XMLMultiDocPreprocessor(path=file_path, **train_xpaths)

# Development corpus: each <document> element is one document; the text comes
# from the passage text and the id from the id element.
file_path = 'articles/development.xml'
dev_xpaths = {
    'doc': './/document',
    'text': './/passage/text/text()',
    'id': './/id/text()',
}
dev_preprocessor = XMLMultiDocPreprocessor(path=file_path, **dev_xpaths)


In [None]:
from snorkel.parser import CorpusParser

# Parse both corpora with one CorpusParser, training documents first.
corpus_parser = CorpusParser()
# Parallelism can be run with a Postgres DBMS, but not SQLite.
corpus_parser.apply(list(train_preprocessor))
# clear=False is passed only on this second call - presumably so the training
# documents parsed just above are kept; confirm against the Snorkel docs.
corpus_parser.apply(list(dev_preprocessor), clear=False)

In [None]:
from six.moves.cPickle import load

# Document is queried below, so import it in this cell; otherwise a fresh
# kernel run fails here with a NameError.
from snorkel.models import Document

# NOTE(review): unpickling can execute arbitrary code; only load a
# doc_ids.pkl that comes from a trusted source.
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids = load(f)
train_ids, dev_ids = set(train_ids), set(dev_ids)
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(len(train_ids))
print(len(dev_ids))

# Assign every parsed sentence to the train or dev set according to the name
# of the document it belongs to.
train_sents, dev_sents = set(), set()
docs = session.query(Document).order_by(Document.name).all()
# Only the document count is printed; the full document list is not dumped.
print(len(docs))
for doc in docs:
    for s in doc.sentences:
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        else:
            # A document with sentences must belong to one of the two id sets.
            raise Exception('ID <{0}> not found in any id set'.format(doc.name))

In [None]:
from snorkel.models import Candidate, candidate_subclass

# Candidate subclass for this task, with two arguments named 'biomarker' and 'medium'.
BiomarkerMedium = candidate_subclass('BiomarkerMedium', ['biomarker', 'medium'])

from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher
import matchers
from snorkel.models import Document
from snorkel.viewer import SentenceNgramViewer
import os

# Candidate spaces: n-grams of at most 1 token for the biomarker argument and
# of at most 5 tokens for the medium argument.
biomarker_ngrams = Ngrams(n_max=1)
medium_ngrams = Ngrams(n_max=5)

# One matcher per argument, both provided by the local `matchers` module.
bMatcher = matchers.getBiomarkerMatcher()
mMatcher = matchers.getMediumMatcher()

# The extractor receives the candidate spaces and the matchers in the same
# (biomarker, medium) order.
candidate_spaces = [biomarker_ngrams, medium_ngrams]
candidate_matchers = [bMatcher, mMatcher]
candidate_extractor = CandidateExtractor(BiomarkerMedium, candidate_spaces, candidate_matchers)


In [None]:
# Extract candidates split by split: k=0 for the training sentences and k=1
# for the development sentences.
for k, sents in enumerate([train_sents, dev_sents]):
    candidate_extractor.apply(sents, split=k)
    n_candidates = session.query(BiomarkerMedium).filter(BiomarkerMedium.split == k).count()
    # Format into a single string: print("label", value) would print a tuple
    # under Python 2, where print is a statement.
    print("Number of candidates: {0}".format(n_candidates))

In [None]:
# Lemmas that LF_keyword looks for in the windows around the first mention.
sentence_keyword_lemmas = ["contain", "collect", "find", "sample", "fluid", "tissue", "detection"]
# Sentence list read by LF_References.
# NOTE(review): nothing in the visible cells appends to this list - confirm where it is filled.
sentences = []
# 1- distance far
def LF_distance_far(m):
    """Vote -1 when the two mentions start far apart, otherwise abstain.

    The distance is the absolute difference between the first entry of
    ``m.e2_idxs`` and the first entry of ``m.e1_idxs``.

    Returns:
        0 if that distance is below 10, -1 otherwise.
    """
    gap = abs(m.e2_idxs[0] - m.e1_idxs[0])
    return 0 if gap < 10 else -1
        
            
# 2- distance close
def LF_distance_close(m):
    """Vote 1 when the two mentions start close together, otherwise abstain.

    The distance is the absolute difference between the first entry of
    ``m.e2_idxs`` and the first entry of ``m.e1_idxs``.

    Returns:
        1 if that distance is below 5, 0 otherwise.
    """
    gap = abs(m.e2_idxs[0] - m.e1_idxs[0])
    return 1 if gap < 5 else 0

# 3-  Biomarker + preposition + Medium
def LF_IN(m):
    """Vote 1 when an 'IN' part-of-speech tag sits between the two mentions.

    Two arrangements are accepted, each using 5-element 'poses' windows:
    'IN' after mention 1 and before mention 2, or 'IN' after mention 2 and
    before mention 1.

    Returns:
        1 if either arrangement matches, 0 otherwise.
    """
    tag = 'IN'
    width = 5
    # Mention 1 comes first: tag after mention 1 and before mention 2.
    if tag in m.post_window1('poses', width) and tag in m.pre_window2('poses', width):
        return 1
    # Mention 2 comes first: tag after mention 2 and before mention 1.
    if tag in m.post_window2('poses', width) and tag in m.pre_window1('poses', width):
        return 1
    return 0
         
# 4- If the sentence contains common keywords
def LF_keyword(m):
    """Vote 1 when any keyword lemma appears near the first mention.

    Every entry of the module-level ``sentence_keyword_lemmas`` is looked up
    in the 20-element 'lemmas' windows before and after mention 1.

    Returns:
        1 as soon as one keyword is found, 0 if none is.
    """
    def _near_first_mention(lemma):
        # Check the window before the mention first, then the window after it.
        return lemma in m.pre_window1('lemmas', 20) or lemma in m.post_window1('lemmas', 20)

    return 1 if any(_near_first_mention(lemma) for lemma in sentence_keyword_lemmas) else 0

# 5- Medium "from" patients or subjects
def LF_From(m):
    """Vote 1 when "from" follows mention 2 ahead of 'patient' or 'subject'.

    The 20-element 'lemmas' window after mention 2 is searched for 'patient'
    and, only if that is absent, for 'subject'. The position found (0 when
    neither occurs) becomes the size of a second window after mention 2, and
    the vote is 1 when "from" is inside that second window.

    Returns:
        1 if "from" is found in the shortened window, 0 otherwise.
    """
    after_medium = m.post_window2('lemmas', 20)
    cutoff = 0
    for anchor in ('patient', 'subject'):
        if anchor in after_medium:
            cutoff = after_medium.index(anchor)
            break
    if "from" in m.post_window2('lemmas', cutoff):
        return 1
    return 0

# 6- Medium "-based": blood-based biomarker
def LF_based(m):
    """Vote 1 when "<mention 2>-based" appears near mention 1.

    The target is ``m.mention2(attribute='words')`` with "-based" appended,
    looked up in the 20-element 'words' windows before and after mention 1.

    NOTE(review): the concatenation assumes mention2(...) returns a string -
    confirm; a list return value would raise a TypeError here.

    Returns:
        1 if the target is in either window, 0 otherwise.
    """
    target = m.mention2(attribute='words') + "-based"
    if target in m.pre_window1('words', 20):
        return 1
    if target in m.post_window1('words', 20):
        return 1
    return 0

# 7- Medium "biomarker": blood biomarker
def LF_biomarker(m):
    """Vote 1 when the lemma 'biomarker' closely follows mention 2.

    Returns:
        1 if 'biomarker' is in the 3-element 'lemmas' window after mention 2,
        0 otherwise.
    """
    following = m.post_window2('lemmas', 3)
    if 'biomarker' in following:
        return 1
    return 0

# 8- if that relationship is in the references, MIGHT NOT WORK
def LF_References(m):
    """Vote -1 when an earlier sentence contains the word 'References'.

    ``m.post_window1('sent_id', 1)`` is used as the end of a slice over the
    module-level ``sentences`` list; each sentence in that slice is checked
    through its ``words()`` method.

    NOTE(review): this assumes the 'sent_id' lookup yields something usable as
    a slice bound, and that ``sentences`` has been filled in sentence order -
    confirm both before relying on this labeling function.

    Returns:
        -1 if 'References' is among the words of any sliced sentence,
        0 otherwise.
    """
    sent_id = m.post_window1('sent_id', 1)
    if any('References' in earlier.words() for earlier in sentences[0:sent_id]):
        return -1
    return 0

In [None]:
# Every labeling function defined above, in the order handed to the labeler.
LFs = [
    LF_distance_far,
    LF_distance_close,
    LF_IN,
    LF_keyword,
    LF_From,
    LF_based,
    LF_biomarker,
    LF_References,
]

In [None]:
from snorkel.annotations import LabelAnnotator
# Build a LabelAnnotator from the labeling functions listed in LFs.
labeler = LabelAnnotator(lfs=LFs)

In [None]:
# Apply the labeler to split 0 and time the call; the bare name below displays the result.
%time L_train = labeler.apply(split=0)
L_train

In [None]:
# Display the labeling-function statistics reported by lf_stats for L_train.
L_train.lf_stats(session)

In [None]:
from snorkel.learning.structure import DependencySelector
# Select dependencies from L_train with threshold 0.1, then show how many were returned.
ds = DependencySelector()
deps = ds.select(L_train, threshold=0.1)
len(deps)

In [None]:
# Display the current value of deps.
deps

In [None]:
# Rebind deps to an empty set. The next cell passes deps to gen_model.train(...),
# so whatever deps held before this cell is not used for training.
deps = set()

In [None]:
from snorkel.learning import GenerativeModel

gen_model = GenerativeModel(lf_propensity=True)

# The step size is 0.1 divided by the number of rows in L_train.
n_train_rows = L_train.shape[0]
gen_model.train(L_train, deps=deps, decay=0.95, step_size=0.1/n_train_rows, reg_param=0.0)

In [None]:
# Marginals computed by the trained generative model for L_train.
train_marginals = gen_model.marginals(L_train)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the training marginals, drawn on an explicit Axes and given a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(train_marginals, bins=20)
ax.set_title('Training marginals')
ax.set_xlabel('Marginal')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Display the labeling-function statistics learned by the generative model.
gen_model.learned_lf_stats()

In [None]:
from snorkel.annotations import save_marginals
# Save the training marginals through the session; a later cell reads them back with load_marginals.
save_marginals(session, L_train, train_marginals)

In [None]:
# Apply the existing labeler to split 1 (the development split).
# No gold labels are printed here: L_gold_dev is only loaded in a later cell,
# so referencing it at this point fails on a fresh kernel.
L_dev = labeler.apply_existing(split=1)

In [None]:
from load_external_annotations_new import load_external_labels
# Load external labels for BiomarkerMedium from the TSV file under annotator name 'gold'.
# NOTE(review): 'Biomarker' and 'Medium' are presumably column or argument names
# expected by the local loader - confirm in load_external_annotations_new.
load_external_labels(session, BiomarkerMedium, 'Biomarker', 'Medium', 'articles/medium_gold_labels.tsv', annotator_name='gold')

In [None]:
from snorkel.annotations import load_gold_labels
# Gold labels for split 1 stored under annotator name 'gold'; the bare name displays them.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

In [None]:
# Score the generative model on L_dev against L_gold_dev; the return value is discarded.
_ = gen_model.score(session, L_dev, L_gold_dev)

In [None]:
# Pass the 'Accuracy' entry of the generative model's learned LF statistics to
# lf_stats together with the dev gold labels.
learned_accuracies = gen_model.learned_lf_stats()['Accuracy']
L_dev.lf_stats(session, L_gold_dev, learned_accuracies)

In [None]:
import numpy as np

# Load dev labels and convert to [0, 1] range
from snorkel.annotations import load_gold_labels
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
# Divide by 2.0 so this is a true division even if the label matrix has an
# integer dtype and the kernel is Python 2, where int / int floors and would
# map a label of 0 to 0 instead of 0.5.
dev_labels = (np.ravel(L_gold_dev.todense()) + 1) / 2.0

In [None]:
from snorkel.annotations import FeatureAnnotator
# Create the feature annotator used for every split below.
featurizer = FeatureAnnotator()

# Build features for split 0 and time the call; the bare name displays the result.
%time F_train = featurizer.apply(split=0)
F_train

In [None]:
%%time
# Featurize splits 1 and 2 with apply_existing (presumably reusing the feature
# set created for split 0 - confirm against the Snorkel docs).
# NOTE(review): the candidate extraction cell only covers splits 0 and 1, so
# split 2 may contain no candidates - confirm.
F_dev  = featurizer.apply_existing(split=1)
F_test = featurizer.apply_existing(split=2)

In [None]:
from snorkel.annotations import load_marginals
# Reload the training marginals for split 0, passing F_train as the reference matrix.
train_marginals = load_marginals(session, F_train, split=0)

In [None]:
from snorkel.learning import SparseLogisticRegression
# Discriminative model that is tuned by the random search and scored further down.
disc_model = SparseLogisticRegression()

In [None]:
from snorkel.learning.utils import MentionScorer
from snorkel.learning import RandomSearch, ListParameter, RangeParameter

# Search over the learning rate and both penalties; all three parameters use
# the same range (1e-6 to 1e-2, step=1, log_base=10).
rate_param, l1_param, l2_param = [
    RangeParameter(param_name, 1e-6, 1e-2, step=1, log_base=10)
    for param_name in ('lr', 'l1_penalty', 'l2_penalty')
]

search_params = [rate_param, l1_param, l2_param]
searcher = RandomSearch(session, disc_model, F_train, train_marginals, search_params, n=20)

In [None]:
from snorkel.annotations import load_gold_labels
# Reload the dev gold labels (split 1) so this cell does not depend on the earlier load.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
# This bare name is not the last expression of the cell, so it is not displayed.
L_gold_dev

import numpy as np
# Fix the NumPy seed before fitting the search against the dev features and gold labels.
np.random.seed(1701)
searcher.fit(F_dev, L_gold_dev, n_epochs=50, rebalance=0.5, print_freq=25)

### Scoring on the test set

Finally, we'll evaluate our performance on the blind test set of 500 documents. We'll load the labels the same way we did for the development set, and use the `score` function of our extraction model to see how we did.

from load_external_annotations import load_external_labels
# Use BiomarkerMedium, the candidate class defined in this notebook.
# NOTE(review): this loader and its signature differ from the one used for the
# dev gold labels, and the candidate extraction cell only covers splits 0 and
# 1 - confirm the intended test-set source before relying on these scores.
load_external_labels(session, BiomarkerMedium, split=2, annotator='gold')

from snorkel.annotations import load_gold_labels
# Gold labels for split 2 stored under annotator name 'gold'.
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
L_gold_test

# Score the discriminative model on the test features; the four returned values are discarded.
_, _, _, _ = disc_model.score(session, F_test, L_gold_test)

In [None]:
from snorkel.contrib.rnn import reRNN

# The LSTM is trained on candidate objects, so fetch the candidates of the
# train (0) and dev (1) splits explicitly; no earlier cell defines them.
# NOTE(review): ordering by candidate id is assumed to match the row order of
# train_marginals and dev_labels - confirm against the Snorkel version in use.
train_cands = (
    session.query(BiomarkerMedium)
    .filter(BiomarkerMedium.split == 0)
    .order_by(BiomarkerMedium.id)
    .all()
)
dev_cands = (
    session.query(BiomarkerMedium)
    .filter(BiomarkerMedium.split == 1)
    .order_by(BiomarkerMedium.id)
    .all()
)

# Training hyperparameters forwarded to lstm.train as keyword arguments.
train_kwargs = {
    'lr':         0.01,
    'dim':        100,
    'n_epochs':   50,
    'dropout':    0.5,
    'rebalance':  0.25,
    'print_freq': 5
}

lstm = reRNN(seed=1701, n_threads=None)
lstm.train(train_cands, train_marginals, dev_candidates=dev_cands, dev_labels=dev_labels, **train_kwargs)