# MarkerSub: a Snorkel-powered Biomarker Data Mining Pipeline


# 0) Initial Setup
Let's begin with imports, as well as environment and figure configuration.

In [1]:
# cleanup if setup is reinitialized
import os
from snorkel import *
from snorkel import SnorkelSession
import extractions
import re
import cPickle, sys, matplotlib
import unicodedata
    
# Clear the old DB -- comment out in most instances
# try:
#     os.remove('snorkel.db')
# except OSError:
#     pass

# IPython magics: load the autoreload extension and render matplotlib figures inline
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Default figure size used by the plots in this notebook
matplotlib.rcParams['figure.figsize'] = (18,6)

# We initialize the Snorkel Session, which initializes a SQLite DB by default.
# Later cells use `session` for every database query.
session = SnorkelSession()



# 1) Corpus Preprocessing
Preprocessing our raw data files (includes entity tagging) from XML
* [1a] Configuring a DocPreprocessor "sub-parser"
* [1b] Creating a CorpusParser
* [1c] Loading our Corpus with a CorpusParser

In [2]:
# Step 1a
#------------------------------------------------------------------------------
# We begin by initializing a Document Preprocessor
#------------------------------------------------------------------------------
from snorkel.parser import XMLMultiDocPreprocessor
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence

# training corpus
# Documents are the <article> elements; the text is taken from the abstract
# paragraphs and the identifier from the article-id element.
file_path = 'articles/training.xml'
train_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//article',
    text='.//front/article-meta/abstract/p/text()',
    id=  './/front/article-meta/article-id/text()'
)

# dev corpus
# This file uses a different XML layout (<document>/<passage>/<text>), hence
# the different XPath expressions. Note that `file_path` is rebound here.
file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,    
    doc='.//document',    
    text='.//passage/text/text()',    
    id='.//id/text()'
)

# Steps 1b-c
#------------------------------------------------------------------------------
# Next, we set up a CorpusParser and run it on both corpora
#------------------------------------------------------------------------------
corpus_parser = CorpusParser()
# Parallelism can be run with a Postgres DBMS, but not SQLite.
corpus_parser.apply(list(train_preprocessor))
# clear=False: do not wipe the training documents parsed by the call above
# (the first call is the one that logs "Clearing existing...").
corpus_parser.apply(list(dev_preprocessor), clear=False)

# Let's now analyze the counts of documents and sentences in our corpus
# ... Go ahead and ignore the "empty document" message - they're being read,
#     but they're in a non-standard structure

# Format each count into a single string: under Python 2, print("label:", n)
# prints a tuple such as ('Documents:', 1438) instead of a readable line.
print("Documents: {}".format(session.query(Document).count()))
print("Sentences: {}".format(session.query(Sentence).count()))


Clearing existing...
Running UDF...
[=                                       ] 0%



[=                                       ] 1%



[=                                       ] 2%



[==                                      ] 3%



[==                                      ] 4%



[===                                     ] 5%



[===                                     ] 7%



[====                                    ] 8%



[====                                    ] 9%



[=====                                   ] 11%



[=====                                   ] 12%




































































































































































































































































































Running UDF...

('Documents:', 1438)
('Sentences:', 13625)


# 2) Candidate (Relation) Extraction
The next step is to extract _candidates_ from our corpus. A `candidate` in Snorkel is an object for which we want to make a prediction. In this case, the candidates are pairs of a biomarker mention and a condition mention found in the same sentence, and our task is to predict which pairs the associated text describes as a true biomarker-condition relation.

* [2a] Defining a Candidate Schema
* [2b] Constructing a Candidate Extractor
* [2c] Loading Sentences and Partitioning by Document
* [2d] Candidate Extraction
* [2e] Inspecting Candidates in the Viewer
* [2f] Loading External "Gold" Labels
* [2g] Relations Generation.

In [3]:
from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher
import matchers
from snorkel.models import Document
from snorkel.viewer import SentenceNgramViewer
import os

# Step 2a
#------------------------------------------------------------------------------
# We define the schema of the relation mention we want to extract:
# a binary candidate whose arguments are named 'biomarker' and 'condition'.
#------------------------------------------------------------------------------
BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition'])

# Step 2b
#------------------------------------------------------------------------------
# We build a CandidateExtractor for candidate relation (biomarker-condition)
# mentions. It combines one n-gram space and one matcher per argument,
# listed in the same order as the schema arguments above.
#------------------------------------------------------------------------------
# N-gram creation (n_max presumably caps the n-gram length in tokens:
# 1 for biomarkers, 5 for conditions - confirm against the Snorkel docs)
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=5)

# Create our two Matchers (defined in the project-local `matchers` module)
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getDiseaseMatcher()
    
# Building the CandidateExtractor 
cand_extractor = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher])

# Step 2c
#------------------------------------------------------------------------------
# Load every document (ordered by name) and partition its sentences:
# the last 160 documents feed the dev set, all earlier ones the training set.
#------------------------------------------------------------------------------
docs = session.query(Document).order_by(Document.name).all()
ld = len(docs)
print(ld)
train_sents = set()
dev_sents = set()
for i, doc in enumerate(docs):
    # Pick the destination set by document position, then add all sentences.
    (dev_sents if i >= ld - 160 else train_sents).update(doc.sentences)
# TODO: insert dev and test splits


1438


In [5]:
# Step 2d
#------------------------------------------------------------------------------
# Now, we run our CandidateExtractor and pull the candidate relations
#------------------------------------------------------------------------------
# %time cand_extractor.apply(train_sents, split=0)
# train_cands = session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 0).all()
# print("Number of candidates:", len(train_cands))


# %time cand_extractor.apply(dev_sents, split=0)
# dev_cands = session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 0).all()
# print("Number of candidates:", len(dev_cands))

# Read the stored candidates back from the database instead of re-running the
# extractor (the cand_extractor.apply calls above are commented out):
# split 0 is read as the training candidates, split 1 as the dev candidates.
# NOTE(review): the commented-out dev extraction above writes to split=0, not 1 -
# confirm which split the stored dev candidates actually live in.
train_cands = session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 0).all()
dev_cands = session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 1).all()


# %time
# for i, sents in enumerate([dev_sents, train_sents]):
#     cand_extractor.apply(sents, split=i+1)
#     train_cands = session.query(BiomarkerCondition).filter(BiomarkerCondition.split == i+1)


# Step 2e
#------------------------------------------------------------------------------
# Time for inspection! Manually generating labels with the viewer.
#------------------------------------------------------------------------------

# for some reason, the viewer isn't responding....


In [6]:
import re
#from snorkel.lf_terms import *
from snorkel.lf_helpers import get_doc_candidate_spans
from snorkel.lf_helpers import get_sent_candidate_spans
from snorkel.lf_helpers import get_left_tokens, get_right_tokens

#umls_dict              = load_umls_dictionary()
#chemicals              = load_chemdner_dictionary()
#abbrv2text, text2abbrv = load_specialist_abbreviations()

# Relation-indicating trigger terms (verbs and nouns).
# NOTE(review): no active labeling function in this cell references keyWords.
keyWords = ["associate", "express", "marker", "biomarker", "elevated", "decreased",
            "correlation", "correlates", "found", "diagnose", "variant", "appear",
            "connect", "relate", "exhibit", "indicate", "signify", "show", "demonstrate",
            "reveal", "suggest", "evidence", "elevation", "indication", "diagnosis",
            "variation", "modification", "suggestion", "link", "derivation", "denote",
            "denotation", "demonstration", "magnification", "depression", "boost", "level",
            "advance", "augmentation", "lessening", "enhancement", "expression", "buildup",
            "diminishing", "diminishment", "reduction", "drop", "dwindling", "lowering"]

# Negation cues; in this cell they are only referenced by presenceOfNot, which
# lives inside the disabled string literal `x` below.
negationWords = ["not", "nor", "neither"]

# Document-level LFs:
#--------------------
#def LF_undefined_abbreviation(c):
#    '''Candidate is a known abbreviation, but no corresponding full name in document'''
#    doc_spans = get_doc_candidate_spans(c)
#    phrase = c[0].get_span().lower()
#    mentions = set([s.get_span().lower() for s in doc_spans])
#    if len(phrase) > 1 and phrase in abbrv2text and not set(abbrv2text[phrase].keys()).intersection(mentions):
#        return -
    
# Sentence-level LFs:
#---------------------
def LF_contiguous_mentions(c):
    '''Vote -1 when a candidate span in the same sentence directly abuts c[0].

    A span abuts c[0] when it ends on the word just before c[0] starts or
    starts on the word just after c[0] ends. Returns 0 otherwise.
    '''
    others = get_sent_candidate_spans(c)
    first_word, last_word = c[0].get_word_start(), c[0].get_word_end()
    adjacent = any(
        other.get_word_end() + 1 == first_word
        or other.get_word_start() - 1 == last_word
        for other in others
    )
    return -1 if adjacent else 0

# Mention-level LFs:
#-------------------
def LF_tumors_growths(c):
    '''Vote +1 when the lemmas of c[0] end in a tumor/growth head noun.

    The lemmas of the first candidate argument are joined with single spaces.
    The whole phrase must be at most two leading words, optional "'" tokens,
    and then one of: tumor, tumour, polyp, pilomatricoma, cyst, lipoma.
    Returns 1 on a match and 0 otherwise.
    '''
    phrase = " ".join(c[0].get_attrib_tokens('lemmas'))
    # Raw string: "\w" inside a plain string literal is an invalid escape sequence.
    pattern = r"^(\w* ){0,2}(['] )*(tumor|tumour|polyp|pilomatricoma|cyst|lipoma)$"
    return 1 if re.search(pattern, phrase) else 0

def LF_cancer(c):
    '''<TYPE> cancer

    Votes +1 when the space-joined lemmas of c[0] contain " cancer", i.e. the
    word "cancer" preceded by a space; a phrase that is only "cancer" does not
    match. Returns 0 otherwise. Lemmas are matched as-is, without lowercasing.
    '''
    phrase = " ".join(c[0].get_attrib_tokens('lemmas'))
    # Raw string: "\w" inside a plain string literal is an invalid escape sequence.
    return 1 if re.search(r"\w* cancer", phrase) else 0

def LF_disease_syndrome(c):
    '''<TYPE> disease or <TYPE> syndrome

    Votes +1 when the space-joined lemmas of c[0] contain "disease" or
    "syndrome" preceded by a space; either word on its own does not match.
    Returns 0 otherwise. Lemmas are matched as-is, without lowercasing.
    '''
    phrase = " ".join(c[0].get_attrib_tokens('lemmas'))
    # Raw string: "\w" inside a plain string literal is an invalid escape sequence.
    return 1 if re.search(r"\w* (disease|syndrome)+", phrase) else 0

#def LF_indicators(c):
#    '''Indicator words'''
#    return 1 if " ".join(c[0].get_attrib_tokens()).lower() in indicators else 0

#def LF_common_disease(c):
#    '''Common disease'''
#    return 1 if " ".join(c[0].get_attrib_tokens()).lower() in common_disease else 0

#def LF_common_disease_acronyms(c):
#    '''Common disease acronyms'''
#    return 1 if " ".join(c[0].get_attrib_tokens()) in common_disease_acronyms else 0

def LF_deficiency_of(c):
    '''Deficiency / dysfunction phrases.

    Votes +1 when the lowercased, space-joined tokens of c[0] start with
    "deficiency" or end with "deficiency" or "dysfunction"; 0 otherwise.
    '''
    text = " ".join(c[0].get_attrib_tokens()).lower()
    if text.startswith('deficiency'):
        return 1
    if text.endswith(('deficiency', 'dysfunction')):
        return 1
    return 0

#def LF_positive_indicator(c):
#    flag = False
#    for i in c[0].get_attrib_tokens():
#        if i.lower() in positive_indicator:
#            flag = True
#            break
#    return 1 if flag else 0

def LF_left_positive_argument(c):
    '''Vote +1 for "<modifier(s)> <pathology noun>" phrases.

    The lowercased, space-joined lemmas of c[0] must end with one or two
    words followed by a pathology head noun (infection, lesion, carcinoma,
    disorder, ...). The head noun on its own does not match.
    Returns 0 otherwise.
    '''
    phrase = " ".join(c[0].get_attrib_tokens('lemmas')).lower()
    # Raw string: "\w" inside a plain string literal is an invalid escape sequence.
    pattern = r"(\w+ ){1,2}(infection|lesion|neoplasm|attack|defect|anomaly|abnormality|degeneration|carcinoma|lymphoma|tumor|tumour|deficiency|malignancy|hypoplasia|disorder|deafness|weakness|condition|dysfunction|dystrophy)$"
    return 1 if re.search(pattern, phrase) else 0

def LF_right_negative_argument(c):
    '''Vote +1 for phrases starting with a fixed prefix plus one or two words.

    The lowercased, space-joined lemmas of c[0] must start with
    "history of", "mitochondrial" or "amino acid", followed by at least one
    more word. Returns 0 otherwise.

    NOTE(review): the name says "negative" but the vote is +1 - confirm the
    intended polarity.
    '''
    phrase = " ".join(c[0].get_attrib_tokens('lemmas')).lower()
    # Raw string: "\w" inside a plain string literal is an invalid escape sequence.
    pattern = r"^(history of|mitochondrial|amino acid)( \w+){1,2}"
    return 1 if re.search(pattern, phrase) else 0

def LF_medical_afixes(c):
    '''Vote +1 for phrases carrying a typical medical suffix or prefix.

    The lowercased, space-joined lemmas of c[0] match when the phrase ends in
    a word with one of the suffixes -pathy, -stasis, -trophy, -plasia, -itis,
    -osis, -oma, -asis, -asia, or when it starts with hyper-/hypo- followed by
    at least one more word character. Returns 0 otherwise.
    '''
    # Raw string: "\w" inside a plain string literal is an invalid escape sequence.
    pattern = r"(\w+(pathy|stasis|trophy|plasia|itis|osis|oma|asis|asia)$|^(hyper|hypo)\w+)"
    phrase = " ".join(c[0].get_attrib_tokens('lemmas')).lower()
    return 1 if re.search(pattern, phrase) else 0

#def LF_adj_diseases(c):
#    return 1 if ' '.join(c[0].get_attrib_tokens()) in adj_diseases else 0


# Dictionary LFs:
#----------------
#def LF_SNOWMED_CT_sign_or_symptom(c):
#    return 1 if c[0].get_span() in umls_dict["snomedct"]["sign_or_symptom"] else 0

#def LF_SNOWMED_CT_disease_or_syndrome(c):
#    return 1 if c[0].get_span() in umls_dict["snomedct"]["disease_or_syndrome"] else 0

#def LF_MESH_disease_or_syndrome(c):
#    return 1 if c[0].get_span() in umls_dict["mesh"]["disease_or_syndrome"] else 0

#def LF_MESH_sign_or_symptom(c):
#    return 1 if c[0].get_span() in umls_dict["mesh"]["sign_or_symptom"] else 0


# Negative LFs:
#--------------
#def LF_organs(c):
#    phrase = " ".join(c[0].get_attrib_tokens()).lower()
#    return -1 if phrase in organs else 0      

#def LF_chemical_name(c):
#    phrase = " ".join(c[0].get_attrib_tokens())
#    return -1 if phrase in chemicals and not phrase.isupper() else 0

#def LF_bodysym(c):
#    phrase = " ".join(c[0].get_attrib_tokens()).lower()
#    return -1 if phrase in bodysym else 0  

def LF_protein_chemical_abbrv(c):
    '''Gene/protein/chemical name

    Votes -1 when the space-joined lemmas of c[0] contain at least one digit;
    0 otherwise. The digit is the only signal checked here.
    '''
    lemma = " ".join(c[0].get_attrib_tokens('lemmas'))
    # Raw string: "\d" inside a plain string literal is an invalid escape sequence.
    return -1 if re.search(r"\d+", lemma) else 0

def LF_base_pair_seq(c):
    '''Vote -1 when c[0] looks like a nucleotide sequence.

    The space-joined lemmas must consist solely of two or more of the
    uppercase letters G, A, C, T. Returns 0 otherwise.
    '''
    joined = " ".join(c[0].get_attrib_tokens('lemmas'))
    if re.search("^[GACT]{2,}$", joined):
        return -1
    return 0

#def LF_too_vague(c):
#    phrase = " ".join(c[0].get_attrib_tokens('lemmas')).lower()
#    phrase_ = " ".join(c[0].get_attrib_tokens()).lower()
#    return -1 if phrase in vague or phrase_ in vague else 0

def LF_neg_surfix(c):
    '''Vote -1 when the lemma right after the candidate is a blocked term.

    Looks at a one-token window to the right (via get_right_tokens) and
    compares its first entry, lowercased, against a short list of terms.
    Returns 0 when the window is empty or the term is not listed.
    '''
    following = get_right_tokens(c, window=1, attrib='lemmas')
    if len(following) == 0:
        return 0
    blocked = ('deficiency', 'the', 'of', 'to', 'a')
    return -1 if following[0].lower() in blocked else 0

#def LF_non_common_disease(c):
#    '''Non common diseases'''
#    return -1 if " ".join(c[0].get_attrib_tokens()).lower() in non_common_disease else 0

#def LF_non_disease_acronyms(c):
#    '''Non common disease acronyms'''
#    return -1 if " ".join(c[0].get_attrib_tokens()) in non_disease_acronyms else 0

def LF_pos_in(c):
    '''Candidates beginning with a preposition or subordinating conjunction.

    Votes -1 when the first POS tag of c[0] is "IN"; 0 otherwise, including
    when there are no tags at all.
    '''
    leading = c[0].get_attrib_tokens('pos_tags')[:1]
    if "IN" in leading:
        return -1
    return 0


#def LF_right_window_incomplete(c):
#    return -1 if right_terms.intersection(get_right_tokens(c, window=2, attrib='lemmas')) else 0

#def LF_negative_indicator(c):
#    flag = False
#    for i in c[0].get_attrib_tokens():
#        if i.lower() in negative_indicator:
#            flag = True
#            break
#    return -1 if flag else 0

# Disabled legacy labeling functions, kept inside a string literal so that they
# are never executed. They rely on accessors (m.pre_window1, m.post_window2,
# m.mention1, ...) that the active LFs in this cell do not use, and the text
# contains at least one syntax error (see LF_marker), so it cannot simply be
# un-quoted. NOTE(review): `x` is not referenced by any other visible cell.
x = '''
# extra custom
#--------------
def presenceOfNot(m):
    for word in negationWords:
        if (word in m[0].get_right_tokens('lemmas', 20)) and (word in m.pre_window2('lemmas', 20)):
            return True
    return False
# 1
def LF_remove_same_word(m):
    if(m.mention1(attribute='words')[0] == m.mention2(attribute='words')[0]):
        return -1
    
def LF_distance(m):
    print "FIRST"
    print type(m)
    # if 'neuroendocrine' in m.lemmas:
    #     print m.lemmas
    # print m.dep_labels
    distance = abs(m.e2_idxs[0] - m.e1_idxs[0])
    count = 0
    for lemma in m.lemmas:
        if lemma == ',':
            count += 1
    if count > 1 and ',' in m.pre_window1('lemmas', 1):
        print m
        return 0
    if distance == 0:
        return -1
    if distance < 8:
        # print "RETURNING ONE"
        return 0
    else:
        return -1
    
def LF_roman_numeral(m):
    biomarker = (m.mention1(attribute='words')[0])
    unicodedata.normalize('NFKD', biomarker).encode('ascii','ignore')
    if re.match(r'((?<=\s)|(?<=^))(M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{1,3}))(?=\s)',
                biomarker):
        print "MATCHED ROMAN"
        print m.mention1(attribute='words')
        return -1

# 4
def LF_marker(m):
    return 1 if ( ('marker' in m[0].get_attrib_tokens('lemmas', 6) or 'biomarker' in m[.post_window1('lemmas', 6)) and (
        'marker' in m.pre_window2('lemmas', 6) or 'biomarker' in m.pre_window2('lemmas', 6)) ) or (('marker' in m.pre_window1('lemmas', 6) or 'biomarker' in m.pre_window1('lemmas', 6)) and (
        'marker' in m.post_window2('lemmas', 6) or 'biomarker' in m.post_window2('lemmas', 6)))  else 0

# 9 (-1 if biomarker is confused with a name of a person)
def LF_People(m):
    return -1 if ('NNP' in m.mention1(attribute='poses')) else 0
# 51
def LF_possible(m):
    return -1 if ('possible' in m.pre_window1('lemmas', 20)) else 0
# 52
def LF_explore(m):
    return -1 if ('explore' in m.pre_window1('lemmas', 20)) else 0
# 53
def LF_key(m):
    # print m.pre_window1('lemmas', 20)
    return -1 if ('abbreviation' in m.pre_window1('lemmas', 20) or (
        'word' in m.pre_window1('lemmas', 20) and 'key' in m.pre_window1('lemmas', 20))) else 0
# 54
def LF_investigate(m):
    return -1 if ('investigate' in m.pre_window1('lemmas', 20)) else 0
# 55
def LF_yetToBeConfirmed(m):
    return -1 if ('yet' in m.post_window1('lemmas', 20) and 'to' in m.post_window1('lemmas', 20) and 'be' in m.post_window1(
        'lemmas', 20) and 'confirmed' in m.post_window1('lemmas', 20)) else 0
# 56
def LF_notAssociated(m):
    return -1 if ('not' in m.post_window1('lemmas', 20) and 'associated' in m.post_window2('lemmas', 20)) else 0
# 56
def LF_notRelated(m):
    return -1 if ('not' in m.post_window1('lemmas', 20) and 'related' in m.post_window2('lemmas', 20)) else 0
# 57
def LF_doesNotShow(m):
    return -1 if (
        'does' in m.post_window1('lemmas', 20) and 'not' in m.post_window1('lemmas', 20) and 'show' in m.post_window2(
            'lemmas', 20)) else 0
# 58
def LF_notLinked(m):
    return -1 if ('not' in m.post_window1('lemmas', 20) and 'linked' in m.post_window2('lemmas', 20)) else 0
# 59
def LF_notCorrelated(m):
    return -1 if ('not' in m.post_window1('lemmas', 20) and 'correlated' in m.post_window2('lemmas', 20)) else 0
# 60
def LF_disprove(m):
    return -1 if ('disprove' in m.post_window1('lemmas', 20)) else 0
# 62
def LF_doesNotSignify(m):
    return -1 if (
        'does' in m.post_window1('lemmas', 20) and 'not' in m.post_window1('lemmas', 20) and 'signify' in m.post_window(
            'lemmas', 20)) else 0
# 63
def LF_doesNotIndicate(m):
    print "SECOND"
    return -1 if (
        'does' in m.post_window1('lemmas', 20) and 'not' in m.post_window1('lemmas', 20) and 'indicate' in m.post_window(
            'lemmas', 20)) else 0
# 64
def LF_doesNotImply(m):
    print "THIRD"
    return -1 if (
        'does' in m.post_window1('lemmas', 20) and 'not' in m.post_window1('lemmas', 20) and 'imply' in m.post_window(
            'lemmas', 20)) else 0
# 65
def LF_studies(m):
    return 1 if (
        'studies' in m.pre_window1('lemmas', 20) and 'have' in m.pre_window1('lemmas', 20) and'reported' in m.pre_window1(
            'lemmas', 20)) else 0
# 66
def LF_studies2(m):
    return 1 if (
        'studies' in m.pre_window1('lemmas', 20) and 'have' in m.pre_window1('lemmas', 20) and 'disclosed' in m.pre_window1(
            'lemmas', 20)) else 0
# 67
def LF_studies3(m):
    return 1 if (
        'studies' in m.pre_window1('lemmas', 20) and 'have' in m.pre_window1('lemmas', 20) and'disclosed' in m.pre_window1('lemmas', 20)) else 0
# 68
def LF_studies4(m):
    return 1 if (
        'studies' in m.pre_window1('lemmas', 20) and 'have' in m.pre_window1('lemmas', 20) and 'expressed' in m.pre_window1(
            'lemmas', 20)) else 0
# 69
def LF_interesting(m):
    return 1 if (
        'is' in m.post_window1('lemmas', 20) and 'an' in m.post_window1('lemmas', 20) and 'interesting' in m.post_window1(
            'lemmas', 20) and 'target' in m.post_window1('lemmas', 20) and 'is' in m.pre_window2('lemmas', 20) and 'an' in
        m.pre_window2('lemmas', 20) and 'interesting' in m.pre_window2('lemmas', 20) and 'target' in m.pre_window2(
            'lemmas', 20)) else 0
# 70
def LF_discussion(m):
    return 1 if (
        'discussion' in m.pre_window1('lemmas', 20)) else 0
# 71
def LF_conclusion(m):
    if ('conclusion' in m.pre_window1('lemmas', 20) or 'conclusion' in m.pre_window2('lemmas', 20)):
        # print "FOUND"
        return 1
    else:
        return 0
# 72
def LF_recently(m):
    return 1 if (
        'recently' in m.pre_window1('lemmas', 20) or 'recently' in m.post_window1('lemmas', 20)) else 0
# 73
def LF_induced(m):
    return 1 if (
        'induced' in m.post_window1('lemmas', 20) and 'induced' in m.pre_window2('lemmas', 20)) else 0
# 74
def LF_treatment(m):
    return 1 if (
        'treatment' in m.pre_window1('lemmas', 20) or 'treatment' in m.post_window1('lemmas', 20)) else 0
# 75
def LF_auxpass(m):
    if not ('auxpass' and 'aux') in (m.post_window1('dep_labels', 20) and m.pre_window2('dep_labels', 20)):
        return -1
    else:
        return 0
# 75
def LF_inbetween(m):
    # with open('diseaseDatabase.pickle', 'rb') as f:
    #     diseaseDictionary = pickle.load(f)
    # with open('diseaseAbbreviationsDatabase.pickle', 'rb') as f:
    #     diseaseAbb = pickle.load(f)
    # with open('markerData.pickle', 'rb') as f:
    #     markerDatabase = pickle.load(f)
    # for marker in markerDatabase:
    #     if(marker in list[m.e1_idxs[0] : m.e2_idxs[0]]):
    #         return -1
    # for disease in diseaseDictionary:
    #     if (disease in list[m.e1_idxs[0]: m.e2_idxs[0]]):
    #         return -1
    # for disease in diseaseAbb:
    #     if (marker in list[m.e1_idxs[0]: m.e2_idxs[0]]):
    #         return -1
    return 0
# 76
def LF_patientsWith(m):
    return 1 if ('patient' in m.post_window2('lemmas', 3)) and ('with' in m.post_window2('lemmas',2)) else 0
# 77
def LF_isaBiomarker(m):
    post_window1_lemmas = m.post_window1('lemmas',20)
    pre_window2_lemmas = m.pre_window2('lemmas',20)
    if ('biomarker' in post_window1_lemmas and 'biomarker' in pre_window2_lemmas) or ('marker' in post_window1_lemmas and 'marker' in pre_window2_lemmas) or ('indicator' in post_window1_lemmas and 'indicator' in pre_window2_lemmas):
        marker_idx_post_window1 = -1
        markers = ['biomarker','marker','indicator']
        for marker in markers:
            try:
                # print post_window1_lemmas
                findMarker = post_window1_lemmas.index(marker)
                if not findMarker == -1:
                    marker_idx_post_window1 = findMarker
                    print marker
            except:
                pass
        if 'cop' in m.post_window1('dep_labels',20):
            try:
                cop_idx_post_window1 = m.post_window1('dep_labels',20).index('cop')
            except:
                pass
            
            print "MarkerIdx:"
            print marker_idx_post_window1
            print "ROOTIdx:"
            try:
                print  m.post_window1('dep_labels',marker_idx_post_window1)
                print  m.post_window1('dep_labels',marker_idx_post_window1).index('ROOT')
            except:
                pass
            print '\n'
            
            return 1 if ('nsubj' in m.mention1(attribute='dep_labels')) and (marker_idx_post_window1-cop_idx_post_window1 < 4)  else 0
    return 0
# 78
def LF_suspect(m):
    return -1 if ('suspect' in m.pre_window1('lemmas', 20) or 'suspect' in m.post_window1('lemmas', 20)) else 0
# 79
def LF_mark(m):
    return -1 if ( 'vmod' in m.post_window1('dep_labels', 20) and 'mark' in m.post_window1('dep_labels', 20) or'vmod' in m.pre_window1('dep_labels', 20) and 'mark' in m.pre_window1('dep_labels', 20)) else 0
'''
        

Here are some CANDIDATES extracted: 
[]

 Here are some SENTENCES extracted: 

Sentence(Document 21993267,0,In an effort to identify neuronal repair mechanisms of the major pelvic ganglion (MPG), we evaluated changes in the expression of nestin, an intermediate filament protein and neural stem cell marker following cavernous nerve crush injury (CNI).)
Sentence(Document 24946761,0,Malignant gliomas are devastating tumours that frequently kill patients within 1 year of diagnosis.)
Sentence(Document 25404199,0,Epidermal growth factor receptor (EGFR) tyrosine kinase inhibitors (TKIs) such as gefitinib are one of gold standard treatment options for nonsmall-cell lung cancer (NSCLC) patients, which eventually fail due to the acquired resistance and relapse because of the development of secondary activating mutations such as T790M in EGFR.)
Sentence(Document 26206605,0,Neuropsychological research on adults with ADHD showed deficits in various aspects of attention.)
Sentence(Document 26245740,

In [None]:
# The active labeling functions defined in the cell above; each one takes a
# candidate and returns -1, 0 or 1.
LFs = [LF_contiguous_mentions, LF_tumors_growths, LF_cancer, LF_disease_syndrome, LF_deficiency_of, LF_left_positive_argument, LF_right_negative_argument, LF_medical_afixes, LF_protein_chemical_abbrv, LF_base_pair_seq, LF_neg_surfix, LF_pos_in]
from snorkel.annotations import LabelAnnotator
labeler = LabelAnnotator(lfs=LFs)
# Apply the LFs to split 0 (the split queried earlier as the training
# candidates); %time reports how long the call takes.
%time L_train = labeler.apply(split=0)
# A bare expression in the middle of a cell is not displayed by the notebook;
# only the lf_stats call on the last line is.
L_train
L_train.lf_stats(session)

## Training the Generative Model

In [None]:
from snorkel.learning.structure import DependencySelector
# Select dependencies between labeling functions from the training label matrix
ds = DependencySelector()
deps = ds.select(L_train, threshold=0.1)
# Number of dependencies selected at this threshold
len(deps)

In [None]:
# Display the dependencies selected in the previous cell
deps

In [None]:
# NOTE: this discards the dependencies selected above, so the generative model
# trained in the next cell receives an empty dependency set.
deps = set()

In [None]:
from snorkel.learning import GenerativeModel

# Train the generative model on the training label matrix.
gen_model = GenerativeModel(lf_propensity=True)
# The step size is scaled by the number of rows of L_train
# (presumably one row per candidate - confirm).
gen_model.train(
    L_train, deps=deps, decay=0.95, step_size=0.1/L_train.shape[0], reg_param=0.0
)

In [None]:
# Marginals produced by the trained generative model for the training label matrix
train_marginals = gen_model.marginals(L_train)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the training marginals in 20 bins
plt.hist(train_marginals, bins=20)
plt.show()

In [None]:
# Per-LF statistics learned by the generative model; a later cell reads the
# 'Accuracy' column of this result.
gen_model.learned_lf_stats()

In [None]:
from snorkel.annotations import save_marginals
# Persist the training marginals through the session
save_marginals(session, L_train, train_marginals)

## Now, gold labels

In [9]:
# Now, we load external labels
from snorkel.annotations import load_gold_labels
from load_external_annotations import load_external_labels

load_external_labels(session, BiomarkerCondition, split=1, annotator='gold')
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

defaultdict(<type 'set'>, {'28261310': set([('b1', 'c2'), ('b1', 'c1'), ('b1', 'c3')]), '28263676': set([('A@1645-1664@@Biomarker@protective seed coat', 'A@1708-1726@@Condition@longer space travel')]), '28260796': set([('b4', 'b7')]), '28260391': set([('b1', 'b2')]), '28261953': set([('b1', 'd1'), ('b2', 'd1')]), '28261797': set([('b1', 'c1')]), '28262777': set([('b2', 'c1'), ('b1', 'c1'), ('b3', 'c1')]), '28263447': set([('b3', 'cond1'), ('b2', 'cond1'), ('b1', 'cond1'), ('b5', 'cond1'), ('b4', 'cond1')]), '28262773': set([('b2', 'c1'), ('b1', 'c1')]), '28263037': set([('b', 'd')]), '28261220': set([('b1', 'c2'), ('b2', 'c2'), ('b1', 'c1'), ('b2', 'c1'), ('b3', 'c1'), ('b3', 'c2')]), '28261422': set([('b1', 'b2')]), '28261819': set([('b1', 'c1')]), '28263172': set([('b1', 'c1')]), '28261562': set([('b1', 'b2')]), '28263701': set([('b1', 'b2')]), '28262555': set([('b2', 'c1'), ('b1', 'c1')]), '28262798': set([('b2', 'c1'), ('b1', 'c1')]), '28260718': set([('b1', 'c1')]), '28261963': se

In [10]:
# Apply the already-fitted labeler to split 1 (dev), then score the generative
# model against the gold dev labels.
L_dev = labeler.apply_existing(split=1)
temp = gen_model.score(session, L_dev, L_gold_dev)




In [None]:
# LF statistics on the dev split against the gold labels, passing along the
# accuracies learned by the generative model
L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy'])


# 4) IGNORE BELOW
Training Model via Data Programming
We train our statistical model to differentiate between TRUE and FALSE Disease mentions.
This training will be achieved via data programming, enabling us to train a model using only a modest amount of hand-labeled data for validation and testing. To simulate a more realistic scenario, we will not use any of the training labels provided with the training data.






In [22]:
import numpy as np

# set up
# NOTE: CandidateSet is not imported in any visible cell; the recorded output
# of this cell is "NameError: name 'CandidateSet' is not defined", so `train`
# is never assigned here.
train = session.query(CandidateSet).filter(CandidateSet.name == 'BD Training Candidates').one()
#dont have yet:
#dev = session.query(CandidateSet).filter(CandidateSet.name == 'BD Development Candidates').one()


NameError: name 'CandidateSet' is not defined


# Generate Features 
We can do so automatically via Snorkel. Recall that the goal is to distinguish TRUE vs FALSE mentions of biomarker-Disease Relations. Hence, we embed our `BD Training Candidates` in a feature space.


In [None]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# create a feature set
# NOTE: depends on `train`, which the CandidateSet cell above is meant to assign.
%time F_train = feature_manager.create(session, train, 'Train Features')

# stuff that we have with features...can load interns' features as well (via mindtagger)
#%time F_train2 = feature_manager.load(session, train, 'Train Features')

# Inspect the feature matrix, then its first candidate and first key
F_train
F_train.get_candidate(0)
F_train.get_key(0)

#maybe write to another features file: articles


# Creating Labeling Functions
Labeling functions are a core tool of data programming. These are heuristic functions that inform the search about the direction to a goal, and therefore aim to classify candidates correctly. Their outputs will be automatically combined and denoised to estimate the probabilities of training labels for the training data.

*Note: We should be creating document-, sentence-, and mention-level labeling functions.*


# Apply the Labeling Functions


In [None]:
# LFs = LFs_mention + LFs_dicts + LFs_false

# Earlier LF selection, kept for reference; these names only exist inside the
# disabled string literal `x`, so they are not defined at runtime.
#LFs = [LF_investigate, LF_key, LF_distance, LF_auxpass, LF_inbetween,
#       LF_possible, LF_explore, LF_key, LF_investigate, LF_yetToBeConfirmed, LF_notAssociated, LF_notRelated,
#       LF_doesNotShow, LF_notLinked, LF_notCorrelated, LF_disprove, LF_doesNotSignify,
#       LF_doesNotIndicate, LF_doesNotImply, LF_studies, LF_studies2, LF_studies3, LF_studies4, LF_interesting,
#       LF_discussion, LF_conclusion, LF_recently, LF_induced, LF_treatment, LF_isaBiomarker, LF_marker, LF_suspect, LF_mark, LF_People]

LFs = [LF_contiguous_mentions, LF_tumors_growths, LF_cancer, LF_disease_syndrome, LF_deficiency_of, LF_left_positive_argument, LF_right_negative_argument, LF_medical_afixes, LF_protein_chemical_abbrv, LF_base_pair_seq, LF_neg_surfix, LF_pos_in]

# First, we construct a `LabelManager`.
from snorkel.annotations import LabelManager
label_manager = LabelManager()

# Next we run the `LabelManager` to apply the labeling functions to the training `CandidateSet`.
# We'll start with some of our labeling functions:
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train

# statistical summary:
L_train.lf_stats()


# Fitting the Generative Model
We estimate the accuracies of the labeling functions without supervision. Specifically, we estimate the parameters of a NaiveBayes generative model.


In [None]:
from snorkel.learning import NaiveBayes

# Fit the NaiveBayes generative model on the label matrix, then save it
# under the name 'Generative Params'.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=3000, rate=1e-5)
gen_model.save(session, 'Generative Params')

# We now apply the generative model to the training candidates.
train_marginals = gen_model.marginals(L_train)


# Training the Discriminative Model
We use the estimated probabilities to train a discriminative model that classifies each `Candidate` as a true or false mention.


In [None]:
from snorkel.learning import LogReg

# Train the discriminative model on the feature matrix, using the generative
# model's marginals as training targets.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)
# Shape of the learned weights
disc_model.w.shape
%time disc_model.save(session, "Discriminative Params")

# Calibration and Accuracy