# MUST RUN AT THE START OF EVERYTHING

In [9]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
database_str = "sqlite:///" + os.environ['WORKINGPATH'] + "/Database/epilepsy.db"
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Parse the Pubmed Abstracts

The code below is designed to read and parse data gathered from pubtator. Pubtator outputs their annotated text in xml format, so that is the standard file format we are going to use. 

In [10]:
from epilepsy_utils import XMLMultiDocPreprocessor
import os
working_path = os.environ['WORKINGPATH']
xml_parser = XMLMultiDocPreprocessor(
    path= working_path + '/Database/epilepsy_data.xml',
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

In [12]:
from epilepsy_utils import Tagger
from snorkel.parser import CorpusParser
import os
working_path = os.environ['WORKINGPATH']
dg_tagger = Tagger(working_path + "/Database/epilepsy_tags_shelve")
corpus_parser = CorpusParser(fn=dg_tagger.tag)
%time corpus_parser.apply(list(xml_parser))

Clearing existing...
Running UDF...

CPU times: user 3min, sys: 3.86 s, total: 3min 4s
Wall time: 30min 52s


In [13]:
from snorkel.models import Document, Sentence

print "Documents: ", session.query(Document).count()
print "Sentences: ", session.query(Sentence).count()

Documents:  12999
Sentences:  115466


# Get each candidate relation

This block of code below is designed to gather and tag each sentence found. **Note**: This does include the title of each abstract.

In [7]:
#This is a quick divide the documents without checking if they have gold standard or not
from snorkel.models import Document
ids = [doc.name for doc in session.query(Document).distinct()]
train_ids,dev_ids,test_ids = ids[0:4333],ids[4333:8666],ids[8666:12999]

#Grab the sentences!!!
train_sents,dev_sents,test_sents = set(),set(),set()
docs = session.query(Document).all()
for doc in docs:
    for s in doc.sentences:
        print s
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        elif doc.name in test_ids:
            test_sents.add(s)
        else:
            print "FAIL"

In [None]:
from snorkel.models import Document
test_doc = [doc for doc in session.query(Document) if doc.name == '18076640']
for s in test_doc[0].sentences:
    print s
    print s.entity_types

In [3]:
from snorkel.models import candidate_subclass

#This specifies that I want candidates that have a disease and gene mentioned in a given sentence
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

In [4]:
from snorkel.candidates import PretaggedCandidateExtractor

ce = PretaggedCandidateExtractor(DiseaseGene, ['Disease', 'Gene'])

In [6]:
#Get the candidates from my custom tagger and then print number of candidates found
for k,sents in enumerate([train_sents,dev_sents,test_sents]):
    print sents
    print sys.exit(1)
    ce.apply(sents,split=k)
    print "Number of Candidates: ", session.query(DiseaseGene).filter(DiseaseGene.split == k).count()

set([])


NameError: name 'sys' is not defined

In [None]:
ce.apply(session.query(Sentence).all(),split=0)

# Look at the Potential Candidates

The one cool thing about jupyter is that you can use this tool to look at candidates. Check it out after everything above has finished running

In [None]:
from snorkel.viewer import SentenceNgramViewer

candidates = session.query(DiseaseGene).filter(DiseaseGene.split==0)
sv = SentenceNgramViewer(candidates, session)

In [None]:
sv