# Setup: run this cell first, before any other cell

In [1]:
# Enable the autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

from snorkel import SnorkelSession
# Session handle reused by the later cells for every session.query(...) call.
session = SnorkelSession()

# Parse the PubMed Abstracts

In [2]:
from snorkel.parser import XMLMultiDocPreprocessor

# Preprocessor over the epilepsy XML file. The doc/text/id arguments are
# XPath-style selectors -- presumably one document per <document> element,
# its text taken from <passage>/<text> and its identifier from <id>.
# TODO confirm against XMLMultiDocPreprocessor.
xml_parser = XMLMultiDocPreprocessor(
    path='data/Epilepsy/epilepsy_data.xml',
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

In [3]:
from epilepsy_utils import Tagger
from snorkel.parser import CorpusParser

# Project-local tagger whose `tag` method is handed to the parser as `fn`.
# NOTE(review): presumably this pre-tags Disease/Gene mentions while parsing --
# confirm in epilepsy_utils.
dg_tagger = Tagger()
corpus_parser = CorpusParser(fn=dg_tagger.tag)
# list(...) materializes everything the preprocessor yields before parsing;
# %time reports CPU and wall-clock time for the whole run.
%time corpus_parser.apply(list(xml_parser))

Clearing existing...
Running UDF...

CPU times: user 24.3 s, sys: 392 ms, total: 24.7 s
Wall time: 4min 11s


In [4]:
from snorkel.models import Document, Sentence

print "Documents: ", session.query(Document).count()
print "Sentences: ", session.query(Sentence).count()

Documents:  1500
Sentences:  14406


# Get each candidate relation

In [5]:
import cPickle

# Load the predefined train/dev/test document-name splits. The pickle holds
# three collections; each is converted to a set for fast membership tests.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open('data/Epilepsy/doc_ids.pkl', 'rb') as ids_file:
    train_ids, dev_ids, test_ids = map(set, cPickle.load(ids_file))

# Bucket every sentence into the split its parent document belongs to.
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).all()
for doc in docs:
    # Split membership depends only on the document, so resolve the target
    # set once per document rather than once per sentence.
    if doc.name in train_ids:
        split_sents = train_sents
    elif doc.name in dev_ids:
        split_sents = dev_sents
    elif doc.name in test_ids:
        split_sents = test_sents
    else:
        split_sents = None

    if split_sents is None:
        # The document is in none of the three splits: emit one FAIL line
        # per sentence so the gap is visible in the cell output.
        for s in doc.sentences:
            print("FAIL")
    else:
        for s in doc.sentences:
            split_sents.add(s)

In [7]:
from snorkel.models import candidate_subclass

# Candidate class for Disease-Gene relations. Later cells query it through
# the session and filter on its `split` attribute.
# NOTE(review): presumably the list names the candidate's two argument
# fields -- confirm against candidate_subclass.
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

In [7]:
from snorkel.candidates import PretaggedCandidateExtractor

# Extractor that produces DiseaseGene candidates from sentences.
# NOTE(review): the second argument presumably lists the entity types to pair
# up, matching tags assigned at parse time -- confirm against
# PretaggedCandidateExtractor.
ce = PretaggedCandidateExtractor(DiseaseGene, ['Disease', 'Gene'])

In [8]:
for k,sents in enumerate([train_sents,dev_sents,test_sents]):
    ce.apply(sents,split=k)
    print "Number of Candidates: ", session.query(DiseaseGene).filter(DiseaseGene.split == k).count()

Clearing existing...
Running UDF...

Number of Candidates:  1431
Clearing existing...
Running UDF...

Number of Candidates:  1406
Clearing existing...
Running UDF...

Number of Candidates:  1075


# Look at the Possible Candidates below

In [9]:
from snorkel.viewer import SentenceNgramViewer

# Split 0 is the training split (index 0 of the enumerate over
# train/dev/test sentences in the extraction cell).
candidates = session.query(DiseaseGene).filter(DiseaseGene.split==0).all()
# Build the viewer over those candidates; it is displayed in the next cell.
sv = SentenceNgramViewer(candidates, session)

<IPython.core.display.Javascript object>

In [10]:
# Bare expression as the last line of the cell: displays the viewer as the cell output.
sv