In [1]:
# Notebook setup: auto-reload edited modules and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
# Session handle passed to the queries and annotators in the cells below.
session = SnorkelSession()

import os
from snorkel.parser import XMLMultiDocPreprocessor

# The following line is for testing only. Feel free to ignore it.

# XPath selectors for the training corpus: one document per <article> element,
# with text taken from the abstract paragraphs.
train_xpaths = dict(
    doc='.//article',
    text='.//front/article-meta/abstract/p/text()',
    id='.//front/article-meta/article-id/text()',
)

# The development and test corpora share one layout: one document per
# <document> element, with text taken from the passages.
document_xpaths = dict(
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()',
)

file_path = 'articles/training.xml'
train_preprocessor = XMLMultiDocPreprocessor(path=file_path, **train_xpaths)

file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(path=file_path, **document_xpaths)

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(path=file_path, **document_xpaths)

In [None]:
from snorkel.parser import CorpusParser

# Parse the three corpora with one parser. The training corpus goes first with
# the default `clear` behaviour; the other two are added with clear=False.
# Note: parallel parsing can be run with a Postgres DBMS, but not with SQLite.
corpus_parser = CorpusParser()
train_docs = list(train_preprocessor)
corpus_parser.apply(train_docs)
for extra_preprocessor in (dev_preprocessor, test_preprocessor):
    corpus_parser.apply(list(extra_preprocessor), clear=False)

In [2]:
from six.moves.cPickle import load
from snorkel.models import Document, Sentence
import cPickle

# Load the document-ID lists that define the train / dev / test splits.
# NOTE(security): unpickling can execute arbitrary code; only load doc_ids.pkl
# from a trusted source.
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)
train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(train_ids))
print(len(dev_ids))
print(len(test_ids))

train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()
# Show a short preview and the count instead of dumping every document.
print(docs[:5])
print(len(docs))

# Route every sentence to the split its parent document belongs to. A document
# that has sentences but whose name is in none of the ID sets is a data error.
for doc in docs:
    for s in doc.sentences:
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        elif doc.name in test_ids:
            test_sents.add(s)
        else:
            raise Exception('ID <{0}> not found in any id set'.format(doc.name))

Parsing documents...
Parsing contexts...
SENTENCE_______________________


Exception Exception: Exception('Candidate must have word_start and word_end attributes.',) in <generator object apply at 0x109b4fa00> ignored
Exception Exception: Exception('Candidate must have word_start and word_end attributes.',) in <generator object apply at 0x109b4f910> ignored


IndexError: list index out of range

In [None]:
from snorkel.models import Candidate, candidate_subclass

# Candidate class for (biomarker, type) pairs; the two argument names are
# 'biomarker' and 'type3'.
BiomarkerType = candidate_subclass('BiomarkerType', ['biomarker', 'type3'])

from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher
import matchers
from snorkel.models import Document
from snorkel.viewer import SentenceNgramViewer
import os

# Candidate span spaces: n_max=1 for biomarkers, n_max=5 for types.
biomarker_ngrams = Ngrams(n_max=1)
type_ngrams = Ngrams(n_max=5)

# One matcher per argument, both supplied by the local `matchers` module.
bMatcher = matchers.getBiomarkerMatcher()
tMatcher = matchers.getTypeMatcher()

# Span spaces and matchers are listed in the same (biomarker, type) order.
candidate_extractor = CandidateExtractor(
    BiomarkerType,
    [biomarker_ngrams, type_ngrams],
    [bMatcher, tMatcher]
)

In [None]:
# Extract candidates per split: 0 = train, 1 = dev, 2 = test (list order below).
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    candidate_extractor.apply(sents, split=k)
    n_candidates = session.query(BiomarkerType).filter(BiomarkerType.split == k).count()
    # Format into one string: under Python 2, print("label", value) would print
    # the repr of a 2-tuple rather than the two values.
    print("Number of candidates: {0}".format(n_candidates))

In [None]:
# Negation cues looked up by presenceOfNot.
negationWords = ["not", "nor", "neither"]


def presenceOfNot(m):
    """Return True if one negation word occurs in both lemma windows of `m`.

    A word counts only when it is found in `m.post_window1('lemmas', 20)` and
    also in `m.pre_window2('lemmas', 20)`; the first such word ends the search.
    Returns False when no word in `negationWords` satisfies both tests.
    """
    return any(
        cue in m.post_window1('lemmas', 20) and cue in m.pre_window2('lemmas', 20)
        for cue in negationWords
    )
        
def LF_distance(m):
    """Labeling function based on the distance between the two mentions.

    The distance is the absolute difference between the first entries of
    `m.e2_idxs` and `m.e1_idxs`. Returns 0 when it is below 8, otherwise -1.
    """
    gap = abs(m.e2_idxs[0] - m.e1_idxs[0])
    return 0 if gap < 8 else -1

def LF_isAMemberOf(m):
    """Return 1 if 'is', 'a', 'member' and 'of' all occur in both lemma windows.

    Both `m.post_window1('lemmas', 20)` and `m.pre_window2('lemmas', 20)` must
    contain every one of the four tokens (in any order); otherwise returns 0.

    NOTE(review): the literal token 'is' is matched against lemmas; if the
    lemmatizer maps 'is' to 'be' this can never fire -- confirm.
    """
    phrase = ('is', 'a', 'member', 'of')
    if not all(tok in m.post_window1('lemmas', 20) for tok in phrase):
        return 0
    if not all(tok in m.pre_window2('lemmas', 20) for tok in phrase):
        return 0
    return 1

def LF_membersOf(m):
    """Return 1 if 'member' and 'of' both occur in both lemma windows, else 0.

    The windows are `m.post_window1('lemmas', 20)` and
    `m.pre_window2('lemmas', 20)`.

    Fixes a misplaced parenthesis: the previous code evaluated
    `('member' in m.post_window1('lemmas'), 20)`, a two-element tuple that is
    always truthy, so the 'member' test never influenced the result.
    """
    after_first = m.post_window1('lemmas', 20)
    before_second = m.pre_window2('lemmas', 20)
    phrase = ('member', 'of')
    found = all(tok in after_first and tok in before_second for tok in phrase)
    return 1 if found else 0

def LF_family(m):
    """Return 1 if 'family' occurs in both lemma windows checked below, else 0.

    The windows are `m.post_window1('lemmas', 20)` and
    `m.post_window2('lemmas', 20)`.

    NOTE(review): the sibling labeling functions pair post_window1 with
    pre_window2; this one uses post_window2 -- confirm that is intended.
    """
    if 'family' not in m.post_window1('lemmas', 20):
        return 0
    return 1 if 'family' in m.post_window2('lemmas', 20) else 0

def LF_cancerrelated(m):
    """Return 1 if 'cancer', '-' and 'related' all occur in both lemma windows.

    The windows are `m.post_window1('lemmas', 20)` and
    `m.pre_window2('lemmas', 20)`; returns 0 otherwise.

    Fixes an inconsistent window: the previous code looked for 'cancer' in
    pre_window2 but for '-' and 'related' in pre_window1, unlike the sibling
    labeling functions, which check post_window1 and pre_window2 throughout.

    NOTE(review): 'related' is matched against lemmas; if the lemmatizer maps
    it to 'relate' this can never fire -- confirm.
    """
    after_first = m.post_window1('lemmas', 20)
    before_second = m.pre_window2('lemmas', 20)
    phrase = ('cancer', '-', 'related')
    found = all(tok in after_first and tok in before_second for tok in phrase)
    return 1 if found else 0

def LF_isA(m):
    """Return 1 if 'is' and 'a' both occur in both lemma windows, else 0.

    The windows are `m.post_window1('lemmas', 20)` and
    `m.pre_window2('lemmas', 20)`.

    NOTE(review): the literal token 'is' is matched against lemmas; if the
    lemmatizer maps 'is' to 'be' this can never fire -- confirm.
    """
    if not ('is' in m.post_window1('lemmas', 20) and 'a' in m.post_window1('lemmas', 20)):
        return 0
    if not ('is' in m.pre_window2('lemmas', 20) and 'a' in m.pre_window2('lemmas', 20)):
        return 0
    return 1

def LF_types(m):
    """Return 1 if 'type' occurs in either lemma window, else 0.

    `m.post_window1('lemmas', 20)` is checked first; `m.pre_window2('lemmas', 20)`
    is consulted only when the first window has no match.
    """
    if 'type' in m.post_window1('lemmas', 20):
        return 1
    if 'type' in m.pre_window2('lemmas', 20):
        return 1
    return 0

def LF_isaBiomarker(m):
    """Labeling function for an "X is a biomarker/marker/indicator" pattern.

    Returns 1 only when all of the following hold, and 0 otherwise:
      * one of 'biomarker', 'marker', 'indicator' occurs in both
        `m.post_window1('lemmas', 20)` and `m.pre_window2('lemmas', 20)`;
      * 'cop' occurs in `m.post_window1('dep_labels', 20)`;
      * 'nsubj' occurs in `m.mention1(attribute='dep_labels')`;
      * (marker index - 'cop' index) within post_window1 is less than 4.

    Changes from the previous version: the function now returns 0 instead of
    falling through to an implicit None when no marker word is present, and
    the bare `except: pass` blocks are replaced by explicit membership tests.
    """
    post_window1_lemmas = m.post_window1('lemmas', 20)
    pre_window2_lemmas = m.pre_window2('lemmas', 20)
    markers = ['biomarker', 'marker', 'indicator']

    if not any(w in post_window1_lemmas and w in pre_window2_lemmas for w in markers):
        return 0

    # As before, when several marker words are present the one listed last in
    # `markers` determines the index.
    marker_idx_post_window1 = -1
    for marker in markers:
        if marker in post_window1_lemmas:
            marker_idx_post_window1 = post_window1_lemmas.index(marker)

    post_window1_deps = m.post_window1('dep_labels', 20)
    if 'cop' not in post_window1_deps:
        return 0
    cop_idx_post_window1 = post_window1_deps.index('cop')

    # NOTE(review): the difference is signed, so a marker that precedes the
    # copula also satisfies "< 4" -- confirm that is intended.
    if ('nsubj' in m.mention1(attribute='dep_labels')
            and marker_idx_post_window1 - cop_idx_post_window1 < 4):
        return 1
    return 0

In [None]:
# Labeling functions handed to the LabelAnnotator. The list was previously
# empty, so none of the functions defined above was ever applied.
# presenceOfNot is a boolean helper, not a labeling function, so it is omitted.
LFs = [
    LF_distance,
    LF_isAMemberOf,
    LF_membersOf,
    LF_family,
    LF_cancerrelated,
    LF_isA,
    LF_types,
    LF_isaBiomarker,
]

In [None]:
from snorkel.annotations import LabelAnnotator
# Annotator that applies every function in `LFs`.
labeler = LabelAnnotator(lfs=LFs)

In [None]:
# Apply the labeler to split 0 (training), timing the call, then display the result.
%time L_train = labeler.apply(split=0)
L_train

In [None]:
# Display the statistics reported by lf_stats for the training label matrix.
L_train.lf_stats(session)

In [None]:
from snorkel.learning.structure import DependencySelector
# Select dependencies from the training label matrix (threshold 0.1) and show
# how many were selected.
ds = DependencySelector()
deps = ds.select(L_train, threshold=0.1)
len(deps)

In [None]:
# Display the selected dependencies.
deps

In [None]:
# NOTE(review): this discards the dependencies selected above, so the
# generative model below is trained with no dependencies -- confirm intended.
deps = set()

In [None]:
from snorkel.learning import GenerativeModel

# Train the generative model on the training label matrix.
gen_model = GenerativeModel(lf_propensity=True)
# The step size is scaled by the number of rows in L_train; this division
# raises ZeroDivisionError if L_train has no rows.
gen_model.train(
    L_train, deps=deps, decay=0.95, step_size=0.1/L_train.shape[0], reg_param=0.0
)

In [None]:
# Marginals produced by the generative model for the training label matrix.
train_marginals = gen_model.marginals(L_train)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the training marginals, drawn on an explicit Axes and labelled
# so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(train_marginals, bins=20)
ax.set_title('Distribution of training marginals')
ax.set_xlabel('Marginal')
ax.set_ylabel('Number of candidates')
plt.show()

In [None]:
# Display the LF statistics learned by the generative model.
gen_model.learned_lf_stats()

In [None]:
from snorkel.annotations import save_marginals
# Save the training marginals through the session; they are read back with
# load_marginals in a later cell.
save_marginals(session, L_train, train_marginals)

In [None]:
# Apply the existing labeling functions to split 1 (development) and display
# the result. The debug prints of L_gold_dev were removed: that name is not
# defined until the gold labels are loaded in a later cell, so they raised a
# NameError on a fresh kernel.
L_dev = labeler.apply_existing(split=1)
L_dev

In [None]:
from load_external_annotations_new import load_external_labels
# Development candidates (split 1). `dev_cands` was previously used without
# being defined anywhere in the notebook.
dev_cands = session.query(BiomarkerType).filter(BiomarkerType.split == 1).all()
# Load the external gold labels for the development candidates under the
# 'gold' annotator name.
load_external_labels(session, BiomarkerType, 'Biomarker', 'Type', 'articles/type_gold_labels.tsv', dev_cands, annotator_name='gold')

In [None]:
from snorkel.annotations import load_gold_labels
# Load the 'gold' annotator's labels for split 1 (development) and display them.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

In [None]:
# Score the generative model on the development label matrix against the gold
# labels; the return value is discarded.
L_dev = labeler.apply_existing(split=1)
_ = gen_model.score(session, L_dev, L_gold_dev)

In [None]:
# Development-set LF statistics, passing the gold labels and the generative
# model's learned 'Accuracy' column.
L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy'])

In [None]:
import numpy as np

# Load dev labels and convert to [0, 1] range
from snorkel.annotations import load_gold_labels
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
# Flatten to 1-D and map each label x to (x + 1) / 2, so -1 -> 0 and 1 -> 1.
# NOTE(review): under Python 2, `/` on an integer array is floor division, so
# a label of 0 would map to 0 rather than 0.5 -- confirm the matrix dtype.
dev_labels = (np.ravel(L_gold_dev.todense()) + 1) / 2

In [None]:
from snorkel.annotations import FeatureAnnotator
# Feature annotator, applied below to split 0 (training).
featurizer = FeatureAnnotator()

# Time the feature extraction, then display the resulting matrix.
%time F_train = featurizer.apply(split=0)
F_train

In [None]:
%%time
# Apply the existing features to split 1 (development) and split 2 (test).
F_dev  = featurizer.apply_existing(split=1)
F_test = featurizer.apply_existing(split=2)

In [None]:
from snorkel.annotations import load_marginals
# Reload the saved training marginals for split 0, keyed to F_train. This
# rebinds the `train_marginals` name computed from the generative model.
train_marginals = load_marginals(session, F_train, split=0)

In [None]:
from snorkel.learning import SparseLogisticRegression
# Discriminative model that is tuned by the random search below.
disc_model = SparseLogisticRegression()

In [None]:
from snorkel.learning.utils import MentionScorer
from snorkel.learning import RandomSearch, ListParameter, RangeParameter

# Search space: learning rate plus L1 and L2 penalties, each ranging over
# 1e-6 .. 1e-2 with log_base=10.
rate_param = RangeParameter('lr', 1e-6, 1e-2, step=1, log_base=10)
l1_param  = RangeParameter('l1_penalty', 1e-6, 1e-2, step=1, log_base=10)
l2_param  = RangeParameter('l2_penalty', 1e-6, 1e-2, step=1, log_base=10)

# Random search (n=20) over the three parameters, using the training features
# and marginals.
searcher = RandomSearch(session, disc_model, F_train, train_marginals, [rate_param, l1_param, l2_param], n=20)

In [None]:
from snorkel.annotations import load_gold_labels
# Reload the development gold labels used to evaluate the search.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
# NOTE(review): this bare expression is not the cell's last statement, so it
# displays nothing.
L_gold_dev

import numpy as np
# Fix the NumPy seed before fitting.
np.random.seed(1701)
searcher.fit(F_dev, L_gold_dev, n_epochs=50, rebalance=0.5, print_freq=25)

### Scoring on the test set



In [None]:
# Candidates of the test split (2) and the training split (0).
test_cands = session.query(BiomarkerType).filter(BiomarkerType.split == 2).all()
train_cands = session.query(BiomarkerType).filter(BiomarkerType.split == 0).all()

In [None]:
from load_external_annotations_new import load_external_labels
# Load the external gold labels for the test candidates. This notebook's
# candidate class is BiomarkerType (test_cands are queried from it); the
# previous `BiomarkerMedium` / 'Medium' arguments referred to a class that is
# never defined here and raised a NameError. The arguments now mirror the
# development-set call.
# NOTE(review): the labels path still points at 'medium_test_labels.tsv';
# confirm the correct test-label file for biomarker/type candidates.
load_external_labels(session, BiomarkerType, 'Biomarker', 'Type', 'articles/medium_test_labels.tsv', test_cands, annotator_name='gold')

from snorkel.annotations import load_gold_labels
# Load the 'gold' annotator's labels for split 2 (test).
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
# NOTE(review): this bare expression is not the cell's last statement, so it
# displays nothing.
L_gold_test

# Score the discriminative model on the test features; the result unpacks into
# tp, fp, tn, fn.
tp, fp, tn, fn = disc_model.score(session, F_test, L_gold_test)

In [None]:
# Show the `fp` value returned by disc_model.score. The print(...) form with a
# single argument works on both Python 2 and Python 3.
print(fp)

In [None]:
from snorkel.contrib.rnn import reRNN

# Training settings forwarded to lstm.train(...) via **train_kwargs.
train_kwargs = {
    'lr':         0.01,
    'dim':        100,
    'n_epochs':   50,
    'dropout':    0.5,
    'rebalance':  0.25,
    'print_freq': 5
}

# Candidates for the training (0) and development (1) splits. `train` and
# `dev` were previously used without being defined anywhere in the notebook.
# NOTE(review): ordering by candidate id is assumed to match the row order of
# train_marginals and dev_labels -- confirm against the annotation matrices.
train = session.query(BiomarkerType).filter(BiomarkerType.split == 0).order_by(BiomarkerType.id).all()
dev = session.query(BiomarkerType).filter(BiomarkerType.split == 1).order_by(BiomarkerType.id).all()

# Train the LSTM on the training candidates and their marginals, evaluating
# against the development candidates and labels.
lstm = reRNN(seed=1701, n_threads=None)
lstm.train(train, train_marginals, dev_candidates=dev, dev_labels=dev_labels, **train_kwargs)

In [None]:
# Save the trained LSTM under the name "biomarkertype.lstm".
lstm.save("biomarkertype.lstm")