# MUST RUN AT THE START OF EVERYTHING

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os

# Point Snorkel at the project's SQLite database. WORKINGPATH must already be
# set in the environment (a KeyError is raised otherwise).
# The variable is exported *before* the snorkel import below -- keep this
# ordering (presumably snorkel picks up SNORKELDB at import time; confirm).
database_str = "".join(["sqlite:///", os.environ['WORKINGPATH'], "/Database/epilepsy.db"])
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession

# Database session shared by the loading, training and scoring cells below.
session = SnorkelSession()

# Load preprocessed data 

To save time, this code will automatically load our labels that were generated in the previous file.

In [None]:
from snorkel.annotations import LabelAnnotator
# The annotator is created without labeling functions (f=None); it is only
# used here to read back label matrices, not to apply any labels.
labeler = LabelAnnotator(f=None)

# One label matrix per split, loaded in split order 0, 1, 2
# (bound to the train / dev / test names respectively).
L_train, L_dev, L_test = [
    labeler.load_matrix(session, split=split_id) for split_id in (0, 1, 2)
]

In [None]:
# Label matrices in train / dev / test order.
label_matrices = (L_train, L_dev, L_test)

print("Total Data Shape:")
for label_matrix in label_matrices:
    print(label_matrix.shape)
print("")  # blank separator line

# Shape of the sub-matrix made of rows whose first column is positive.
print("The number of positive candiadtes (in KB) for each division:")
for label_matrix in label_matrices:
    print(label_matrix[label_matrix[:,0] > 0].shape)

In [None]:
from snorkel.annotations import FeatureAnnotator
# Used only to load feature matrices for each split.
featurizer = FeatureAnnotator()

# One feature matrix per split, loaded in split order 0, 1, 2
# (bound to the train / dev / test names respectively).
F_train, F_dev, F_test = [
    featurizer.load_matrix(session, split=split_id) for split_id in (0, 1, 2)
]

# Run the machine learning models below

Since we are still in the development stage, the cells below train just two generative models designed to model p(Labels, y). Until we can discuss which classifiers we want to use, feel free to run the code below and see some cool output.

In [None]:
from snorkel.learning import NaiveBayes
# Two views of the training labels: the first column on its own, and the
# complete label matrix.
KB = L_train[:,0]
KB_CONTEXT = L_train

gen_model = NaiveBayes()
train_marginals = []
for label_view in (KB, KB_CONTEXT):
    # The same model object is re-trained for each view, so its marginals are
    # read out immediately after training, before the next fit.
    gen_model.train(label_view)
    view_marginals = gen_model.marginals(label_view)
    train_marginals.append(view_marginals)

In [None]:
import matplotlib.pyplot as plt
# One 20-bin histogram per generative model, in train_marginals order.
for view_idx, view_title in enumerate(("KB", "KB + Context")):
    plt.hist(train_marginals[view_idx], bins=20)
    plt.title(view_title)
    plt.show()

# Disc Model With Hyper-Param Tuning

In [None]:
from snorkel.learning.utils import MentionScorer
from snorkel.learning import RandomSearch, ListParameter, RangeParameter

# Hyper-parameters to search over: the learning rate plus the L1 and L2
# penalties. All three are declared with the same range arguments
# (1e-6, 1e-2, step=1, log_base=10).
rate_param, l1_param, l2_param = [
    RangeParameter(param_name, 1e-6, 1e-2, step=1, log_base=10)
    for param_name in ('lr', 'l1_penalty', 'l2_penalty')
]

In [None]:
from snorkel.models import candidate_subclass
# Declare the DiseaseGene candidate type, made of a 'Disease' and a 'Gene'
# argument.
# NOTE(review): presumably required so that Candidate rows queried later are
# loaded as DiseaseGene objects -- confirm.
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

In [None]:
from snorkel.learning import SparseLogisticRegression
import numpy as np
np.random.seed(1701)  # fixed seed for NumPy's global random state

# Per-model results, filled in the same order as train_marginals.
disc_models = []
weights = []
test_marginals = []

# L_classes itself is not used in the loop body; only the position matters,
# to pick the matching entry of train_marginals.
for model_idx, L_classes in enumerate([KB, KB_CONTEXT]):
    print(model_idx)

    # A fresh model per label view, tuned by random search (n=20) over the
    # three range parameters.
    disc_model = SparseLogisticRegression()
    search_space = [rate_param, l1_param, l2_param]
    searcher = RandomSearch(session, disc_model, F_train,
                            train_marginals[model_idx], search_space, n=20)
    searcher.fit(F_dev, L_dev, n_epochs=50, rebalance=0.5, print_freq=25)
    disc_models.append(disc_model)

    # Read the value of the saved 'w' entry, evaluated in the model's own
    # session, and keep it for the weight report.
    weight_values = disc_model.save_dict['w'].read_value().eval(session=disc_model.session)
    weights.append(weight_values)

    test_marginals.append(disc_model.marginals(F_test))

In [None]:
import numpy as np
# For each tuned model, report its largest weight and where it sits
# (np.argmax without an axis indexes into the flattened array).
for model_id, weight_arr in enumerate(weights):
    max_line = "Model {} max: {}".format(model_id, np.max(weight_arr))
    print(max_line)
    location_line = "Location: {}".format(np.argmax(weight_arr))
    print(location_line)
    print("")  # blank separator line

In [None]:
from snorkel.models import Candidate
import pandas as pd
# Build one row per test candidate: the spans of its two arguments plus the
# marginal each of the two tuned models assigned to it.
cand_probs = []
# candidate_index maps a candidate id to its row in L_test; iterating the
# pairs avoids a second lookup per candidate.
for candidate_id, index in L_test.candidate_index.items():
    cand = session.query(Candidate).filter(Candidate.id == candidate_id).one()
    cand_probs.append([
        cand[0].get_span(),
        cand[1].get_span(),
        test_marginals[0][index],
        test_marginals[1][index],
    ])
cand_stats = pd.DataFrame(cand_probs, columns = ["Disease", "Gene", "Model(KB)", "Model(KB+CONTEXT)"])

In [None]:
# Score every tuned model on the test split and keep each result; previously
# the first model's result was overwritten by the second right away.
model_scores = [tuned_model.score(session, F_test, L_test) for tuned_model in disc_models]

# As before, tp/fp/tn/fn end up holding the result of the last model scored.
tp, fp, tn, fn = model_scores[-1]

In [None]:
from sklearn.metrics import average_precision_score, precision_recall_curve, roc_curve, auc
import numpy as np

# Reference labels: the first column of the test label matrix, densified once
# into a flat vector and shared by all three metric calls. (Calling list() on
# the sparse column yields one tiny sparse matrix per row rather than plain
# label values, so it is no longer used.)
kb_labels = np.asarray(L_test[:,0].todense()).ravel()
# Positive means "> 0", the same rule used when counting positives above;
# everything else is treated as the negative class.
y_true = (kb_labels > 0).astype(int)

pr_models = []   # (recall, precision, average precision) per model
roc_models = []  # (fpr, tpr, auc) per model
for marginal in test_marginals:
    fpr, tpr, thresholds = roc_curve(y_true, marginal, pos_label=1)
    roc_auc = auc(fpr, tpr)

    precision, recall, thresholds = precision_recall_curve(y_true, marginal, pos_label=1)
    avg_precision = average_precision_score(y_true, marginal)

    roc_models.append((fpr, tpr, roc_auc))
    pr_models.append((recall, precision, avg_precision))

In [None]:
import matplotlib.pyplot as plt
model_names = ["KB", "KB+Context"]
color_vals = ["darkorange", "cyan", "red"]

# Each pr_models entry is (recall, precision, average precision); draw one
# curve per model with the average precision shown in the legend.
for i, (recall_vals, precision_vals, avg_prec) in enumerate(pr_models):
    curve_label = '%s (area = %0.2f)' % (model_names[i], avg_prec)
    plt.plot(recall_vals, precision_vals, color=color_vals[i], lw=2, label=curve_label)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
model_names = ["KB", "KB+Context"]
color_vals = ["darkorange", "cyan", "red"]

# Each roc_models entry is (fpr, tpr, auc); draw one curve per model with the
# AUC shown in the legend.
for i, model in enumerate(roc_models):
    plt.plot(model[0], model[1], color=color_vals[i],
             lw=2, label='%s (area = %0.2f)' % (model_names[i], model[2]))

# Dashed diagonal as a reference line; drawn once, not once per model.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Disc Model Without Hyper-Param Tuning

In [None]:
from snorkel.annotations import load_marginals
# Load previously saved marginals through the session (second argument 0 --
# presumably the training split; confirm against the snorkel version in use).
# NOTE(review): this rebinds `train_marginals`, which the generative-model
# cell earlier in this notebook bound to a list of two marginal arrays. Code
# further down that indexes `train_marginals[1]` will see this value instead
# of that list -- confirm which one is intended.
train_marginals = load_marginals(session,0)

In [None]:
# Notebook display cell: echoes the current value of train_marginals.
train_marginals

In [None]:
from snorkel.learning import SparseLogisticRegression
# Fresh, untuned model. This rebinds `disc_model`; the tuned models from the
# search loop remain reachable through the `disc_models` list.
disc_model = SparseLogisticRegression()

In [None]:
# `train_marginals` can hold two different things depending on which cell ran
# last: the list of per-view marginals built by the generative-model cell, or
# the single object returned by load_marginals. Indexing [1] is only right for
# the list (it selects the KB + Context marginals); otherwise it would pick
# out one element instead of the whole set, so use the loaded value as-is.
if isinstance(train_marginals, list):
    untuned_train_marginals = train_marginals[1]
else:
    untuned_train_marginals = train_marginals
disc_model.train(F_train, untuned_train_marginals, n_epochs=20, lr=0.001)

In [None]:
# Add the untuned model's test marginals after the tuned models' entries.
# Requires `test_marginals` to still be the list built in the tuning cell.
test_marginals.append(disc_model.marginals(F_test))

In [None]:
import matplotlib.pyplot as plt

# Keep the untuned model's marginals under their own name. Rebinding
# `test_marginals` here would throw away the per-model list that the tuning
# and candidate-statistics cells index into.
untuned_test_marginals = disc_model.marginals(F_test)

plt.hist(untuned_test_marginals, bins=20)
plt.show()

In [None]:
from snorkel.models import candidate_subclass

# Declare the DiseaseGene candidate type, made of a 'Disease' and a 'Gene'
# argument (same declaration as in the tuning section of this notebook).
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

In [None]:
# Score the untuned model on the test split; the result is unpacked into four
# values. This rebinds tp/fp/tn/fn from any earlier scoring cell.
tp,fp,tn,fn = disc_model.score(session, F_test, L_test)

In [None]:
# Dump the first two score results (tp, then fp), separated by a blank line.
print("TP")
print(tp)
print("")  # blank separator line

print("FP")
print(fp)