In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import matplotlib.pyplot as plt
from six.moves.cPickle import load
import cPickle
import numpy as np

from snorkel import SnorkelSession
from snorkel.parser import XMLMultiDocPreprocessor, CorpusParser
from snorkel.models import Document, Sentence, Candidate, candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.viewer import SentenceNgramViewer
from snorkel.annotations import LabelAnnotator, load_gold_labels, FeatureAnnotator, save_marginals, load_marginals
from snorkel.learning import SparseLogisticRegression, GenerativeModel, RandomSearch, ListParameter, RangeParameter
from snorkel.learning.structure import DependencySelector
from snorkel.learning.utils import MentionScorer
from snorkel.contrib.rnn import reRNN

import matchers
import LF
from candidate_adjective_fixer import *
from load_external_annotations_new import load_external_labels

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


IOError: [Errno 2] No such file or directory: 'databases/abbreviations.com.pkl'

In [None]:
#------------------
# Helper Functions
#------------------

def grabCandidates(extractor, schema, split_sents=None, db_session=None):
    """Extract candidates for every split and return them grouped by split.

    For each split index ``k`` (0 = train, 1 = dev, 2 = test) this calls
    ``extractor.apply(sents, split=k)`` on that split's sentences and prints
    how many ``schema`` rows are stored under that split. Once every split
    has been processed, the rows of splits 0, 1 and 2 are queried back out.

    Args:
        extractor: object exposing ``apply(sentences, split=k)``.
        schema: candidate class whose ``split`` attribute is used to filter
            the query.
        split_sents: optional sequence of sentence collections, one per
            split, in split order. Defaults to the notebook-level
            ``[train_sents, dev_sents, test_sents]``.
        db_session: optional session object used for the queries. Defaults
            to the notebook-level ``session``.

    Returns:
        list: ``[train_cands, dev_cands, test_cands]`` -- always three
        lists, for splits 0, 1 and 2.
    """
    # Fall back to the notebook globals so existing two-argument calls
    # behave exactly as before.
    if split_sents is None:
        split_sents = [train_sents, dev_sents, test_sents]
    if db_session is None:
        db_session = session

    def cands_in_split(k):
        # Query for all `schema` rows that belong to split k.
        return db_session.query(schema).filter(schema.split == k)

    # Candidate Counts
    for k, sents in enumerate(split_sents):
        extractor.apply(sents, split=k)
        # Single-argument print() is valid on both Python 2 and 3; the
        # "{0} {1}" template reproduces the separator the old
        # `print a, b` statement inserted between its operands.
        print("{0} {1}".format("Number of candidates: ",
                               cands_in_split(k).count()))

    return [cands_in_split(k).all() for k in range(3)]

In [None]:
#-----------------------
# Setup & Preprocessing
#-----------------------

# One Snorkel session, shared by every later cell
session = SnorkelSession()


# Doc Preprocessing -- one preprocessor per corpus file.
# Training corpus: documents are <article> nodes and the text is taken from
# the abstract paragraphs.
file_path = 'articles/training.xml'
train_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                             doc='.//article',
                                             text='.//front/article-meta/abstract/p/text()',
                                             id='.//front/article-meta/article-id/text()')

# Development corpus: documents are <document> nodes (same XPaths as the
# test corpus below).
file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                           doc='.//document',
                                           text='.//passage/text/text()',
                                           id='.//id/text()')

# Test corpus
file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                            doc='.//document',
                                            text='.//passage/text/text()',
                                            id='.//id/text()')


# Parsing -- run the corpus parser over each corpus in turn.
# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
corpus_parser = CorpusParser()

# The first call leaves `clear` at its default; the two follow-up calls pass
# clear=False explicitly.
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)


# Retrieving Stable IDs for each of the candidate sentences
# NOTE(security): unpickling can execute arbitrary code -- only load a
# doc_ids.pkl that comes from a trusted source.
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)

# Sets give constant-time membership tests in the loop below
train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()


# Assigning each sentence to {train,dev,test}-set based on Stable ID
for doc in docs:
    doc_sentences = doc.sentences
    if not doc_sentences:
        # A document without sentences contributes nothing and, as before,
        # never triggers the "not found" error below.
        continue

    # The split depends only on the document, so decide it once per
    # document instead of once per sentence.
    if doc.name in train_ids:
        train_sents.update(doc_sentences)
    elif doc.name in dev_ids:
        dev_sents.update(doc_sentences)
    elif doc.name in test_ids:
        test_sents.update(doc_sentences)
    else:
        raise Exception(
            'ID <{0}> not found in any id set'.format(doc.name))



In [None]:
#----------------------
# Candidate Extraction
#----------------------

# Defining the Candidate Schemas
# Each schema is a binary relation: a biomarker mention paired with one other
# entity mention.
BiomarkerCondition = candidate_subclass(
    'BiomarkerCondition', ['biomarker', 'condition'])
BiomarkerDrug = candidate_subclass('BiomarkerDrug', ['biomarker', 'drug'])
BiomarkerMedium = candidate_subclass(
    'BiomarkerMedium', ['biomarker', 'medium'])
# BiomarkerType is used when the extractors are built later in this cell,
# but no visible cell defines it, so a fresh-kernel run would stop with a
# NameError. Defined here following the same pattern as the schemas above.
# NOTE(review): the second argument is named 'biomarker_type' rather than
# 'type' to stay clear of the `type` attribute the base Candidate class is
# believed to declare -- confirm the attribute name the labeling functions
# expect, and drop this definition if one of the star-imported project
# modules already provides BiomarkerType.
BiomarkerType = candidate_subclass(
    'BiomarkerType', ['biomarker', 'biomarker_type'])


# N-grams: the probabilistic search space of our entities.
# n_max differs per entity type.
biomarker_ngrams, condition_ngrams = Ngrams(n_max=1), Ngrams(n_max=7)
drug_ngrams, medium_ngrams = Ngrams(n_max=5), Ngrams(n_max=5)
# Open question carried over from the original author:
# "should we cut these down?"
type_ngrams = Ngrams(n_max=5)


# Construct our Matchers -- one per entity type, each built by the
# project-local `matchers` module.
bMatcher = matchers.getBiomarkerMatcher()   # biomarker
cMatcher = matchers.getDiseaseMatcher()     # condition
dMatcher = matchers.getDrugMatcher()        # drug
mMatcher = matchers.getMediumMatcher()      # medium
tMatcher = matchers.getTypeMatcher()        # type


# Building the CandidateExtractors.
# Each extractor receives a schema, the n-gram spaces for its two arguments,
# and the matchers for those arguments, in the same order.
candidate_extractor_BC = CandidateExtractor(
    BiomarkerCondition,
    [biomarker_ngrams, condition_ngrams],
    [bMatcher, cMatcher],
)
candidate_extractor_BD = CandidateExtractor(
    BiomarkerDrug,
    [biomarker_ngrams, drug_ngrams],
    [bMatcher, dMatcher],
)
candidate_extractor_BM = CandidateExtractor(
    BiomarkerMedium,
    [biomarker_ngrams, medium_ngrams],
    [bMatcher, mMatcher],
)
candidate_extractor_BT = CandidateExtractor(
    BiomarkerType,
    [biomarker_ngrams, type_ngrams],
    [bMatcher, tMatcher],
)


# Run each extractor over all splits.
# Every cands_* value is the list returned by grabCandidates:
# [train, dev, test]
cands_BC = grabCandidates(candidate_extractor_BC, BiomarkerCondition)
cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
