# MUST RUN AT THE START OF EVERYTHING

In [None]:
# IPython setup: enable the autoreload extension (mode 2) and inline
# matplotlib output for this notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Optional override (disabled): sets the SNORKELDB environment variable to a
# SQLite URL before Snorkel is imported.
#import os
#os.environ['SNORKELDB'] = 'sqlite:///pubmed.db'

# Create the Snorkel session object; the cells below run every database
# query through `session`.
from snorkel import SnorkelSession
session = SnorkelSession()

# Parse the PubMed Abstracts

The code below reads and parses data gathered from PubTator. PubTator outputs its annotated text in XML format, so that is the standard file format we are going to use.

In [None]:
from snorkel.parser import XMLMultiDocPreprocessor

# Preprocessor over the epilepsy XML file. The three path expressions tell it
# where to find each document, its passage text, and its id inside the XML.
# NOTE(review): presumably `text` and `id` are evaluated relative to each
# element matched by `doc` -- confirm against the Snorkel parser docs.
xml_parser = XMLMultiDocPreprocessor(
    path='../data/Epilepsy/epilepsy_data.xml',
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

In [None]:
from epilepsy_utils import Tagger
from snorkel.parser import CorpusParser

# Build the project-specific tagger from the epilepsy tag file and pass its
# `tag` method to the corpus parser as `fn`.
dg_tagger = Tagger("../data/Epilepsy/epilepsy_tags.txt")
corpus_parser = CorpusParser(fn=dg_tagger.tag)
# Materialize the preprocessor output into a list and parse it; `%time`
# reports how long the whole line (list building included) takes.
%time corpus_parser.apply(list(xml_parser))

In [None]:
from snorkel.models import Document, Sentence

print "Documents: ", session.query(Document).count()
print "Sentences: ", session.query(Sentence).count()

# Get each candidate relation

The code below gathers and tags each sentence found. **Note**: This includes the title of each abstract.

In [None]:
from snorkel.models import Sentence
test = session.query(Sentence)
print test[0].entity_types

In [None]:
#This line is for loading pre-sorted labels
#import cPickle
#train_ids,dev_ids,test_ids = map(set,cPickle.load(open('data/Epilepsy/doc_ids.pkl','rb')))

# Quick split of the documents into train/dev/test without checking whether
# they have gold-standard labels or not.
from snorkel.models import Document

# Load the documents once so the id list and the sentence loop below work on
# the same rows in the same order.
# NOTE(review): the query has no ORDER BY, so which document lands in which
# split depends on the order the database returns rows.
docs = session.query(Document).all()
ids = [doc.name for doc in docs]

# Each split takes the next `split_size` documents. Documents past the third
# slice belong to no split and are reported as "FAIL" below.
split_size = 4333
train_ids = ids[0:split_size]
dev_ids = ids[split_size:2 * split_size]
test_ids = ids[2 * split_size:3 * split_size]

#Grab the sentences!!!
train_sents,dev_sents,test_sents = set(),set(),set()

# Map each document name to the sentence set of its split so the lookup is a
# single dict access per document instead of a list scan per sentence.
# `setdefault` keeps the train > dev > test precedence of the original
# if/elif chain should a name ever appear in more than one id list.
sents_by_doc_name = {}
for names, split_sents in ((train_ids, train_sents),
                           (dev_ids, dev_sents),
                           (test_ids, test_sents)):
    for name in names:
        sents_by_doc_name.setdefault(name, split_sents)

for doc in docs:
    # None means the document is in none of the three splits. Compare with
    # `is None` because an empty split set is falsy.
    target = sents_by_doc_name.get(doc.name)
    for s in doc.sentences:
        if target is None:
            print("FAIL")
        else:
            target.add(s)

In [None]:
from snorkel.models import candidate_subclass

# Declare the candidate type used in the rest of the notebook: a candidate
# pairs a disease mention with a gene mention found in the same sentence.
# The first argument names the class; the list names its two fields.
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

In [None]:
from snorkel.candidates import PretaggedCandidateExtractor

# Extractor that builds DiseaseGene candidates from sentences that were
# already tagged; the list gives the entity types to pair up.
# NOTE(review): presumably these names must match the entity types the custom
# tagger writes onto each sentence -- confirm against epilepsy_utils.Tagger.
ce = PretaggedCandidateExtractor(DiseaseGene, ['Disease', 'Gene'])

In [None]:
#Get the candidates from my custom tagger and then print number of candidates found
for k,sents in enumerate([train_sents,dev_sents,test_sents]):
    ce.apply(sents,split=k)
    print "Number of Candidates: ", session.query(DiseaseGene).filter(DiseaseGene.split == k).count()

# Look at the Potential Candidates

One cool thing about Jupyter is that you can use it to look at the candidates directly in the notebook. Check it out after everything above has finished running.

In [None]:
from snorkel.viewer import SentenceNgramViewer

# Build a viewer over the candidates stored under split 0 (the training
# sentences).
in_train_split = DiseaseGene.split == 0
candidates = session.query(DiseaseGene).filter(in_train_split)
sv = SentenceNgramViewer(candidates, session)

In [None]:
# Leaving the viewer as the last expression of the cell makes the notebook
# display it.
sv