# MUST RUN AT THE START OF EVERYTHING

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from collections import Counter
from collections import defaultdict
import os
import tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import average_precision_score, precision_recall_curve, roc_curve, auc

In [2]:
#Set up the environment
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook. The fallbacks keep
# the original local-development values, so default behaviour is unchanged.
username = os.environ.get("SNORKEL_DB_USER", "danich1")
password = os.environ.get("SNORKEL_DB_PASSWORD", "snorkel")
dbname = os.environ.get("SNORKEL_DB_NAME", "pubmeddb")

#Path subject to change for different os
# The host parameter points at a directory (/var/run/postgresql), i.e. a local
# PostgreSQL unix-socket location rather than a TCP host name.
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(username, password, dbname)
os.environ['SNORKELDB'] = database_str

# NOTE(review): the import is kept after SNORKELDB is set -- presumably snorkel
# reads that variable at import time; confirm before moving it to the import cell.
from snorkel import SnorkelSession
session = SnorkelSession()

In [3]:
from snorkel import SnorkelSession
from snorkel.annotations import FeatureAnnotator, LabelAnnotator, save_marginals
from snorkel.learning import GenerativeModel
from snorkel.learning.utils import MentionScorer
from snorkel.models import Candidate, FeatureKey, candidate_subclass
from snorkel.utils import get_as_dict
from tree_structs import corenlp_to_xmltree
from treedlib import compile_relation_feature_generator

In [4]:
# Selects which candidate relation type the next cell defines.
# Values handled there: "dg", "gg", "cg", "cd".
edge_type = "dg"

In [5]:
# Lookup table: edge type -> (candidate class name, entity field names).
candidate_schemas = {
    "dg": ('DiseaseGene', ['Disease', 'Gene']),
    "gg": ('GeneGene', ['Gene1', 'Gene2']),
    "cg": ('CompoundGene', ['Compound', 'Gene']),
    "cd": ('CompoundDisease', ['Compound', 'Disease']),
}

if edge_type in candidate_schemas:
    candidate_name, candidate_fields = candidate_schemas[edge_type]
    # Bind the generated class under its own name in the notebook namespace,
    # e.g. DiseaseGene when edge_type is "dg".
    globals()[candidate_name] = candidate_subclass(candidate_name, candidate_fields)
else:
    print("Please pick a valid edge type")

# Load preprocessed data 

To save time, this section loads the labels that were already generated in the previous file instead of recomputing them.

In [6]:
%%time
# No labeling functions are passed (lfs=[]); the annotator is used here only to load a matrix.
# NOTE(review): presumably the labels were stored by an earlier step -- confirm.
labeler = LabelAnnotator(lfs=[])

# Only split=0 is loaded (as L_train); the split=1 / split=2 loads below are disabled.
L_train = labeler.load_matrix(session,split=0)
#L_dev = labeler.load_matrix(session,split=1)
#L_test = labeler.load_matrix(session,split=2)

  0%|          | 8449/2683871 [00:06<35:49, 1244.74it/s]


KeyboardInterrupt: 

In [7]:
# Report the dimensions of the loaded training label matrix.
# Single-argument print(...) calls produce the same output under Python 2's
# print statement and Python 3's print function, matching the print(...) style
# already used in the edge-type cell.
print("Total Data Shape:")
print(L_train.shape)
#print(L_dev.shape)
#print(L_test.shape)
print("")

Total Data Shape:
(2683871, 1)
(763802, 1)
(382464, 1)



# Run the machine learning models below

## Generative Model

Since we are still in the development stage, below are just two generative models designed to model $p(\text{Labels}, y)$. Until we can discuss more about the classifiers we want to use, feel free to run the code below and see some cool output.

In [8]:
# NOTE(review): GenerativeModel is also imported in an earlier import cell; this import is redundant.
from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()
# Train on the training label matrix (timed with %time).
# The step size is 0.1 divided by the number of rows in L_train, so it shrinks as the training set grows.
%time gen_model.train(L_train, epochs=10, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6, threads=50, verbose=True)

Inferred cardinality: 2
FACTOR 0: STARTED BURN-IN...
FACTOR 0: DONE WITH BURN-IN
FACTOR 0: STARTED LEARNING
FACTOR 0: EPOCH #0
Current stepsize = 3.72596149368e-08
Learning epoch took 0.000 sec.
Weights:
    weightId: 0
        isFixed: True
        weight:  1.0

    weightId: 1
        isFixed: False
        weight:  0.0

FACTOR 0: EPOCH #1
Current stepsize = 3.53966341899e-08
Learning epoch took 3.712 sec.
Weights:
    weightId: 0
        isFixed: True
        weight:  1.0

    weightId: 1
        isFixed: False
        weight:  -0.0322550152375

FACTOR 0: EPOCH #2
Current stepsize = 3.36268024804e-08
Learning epoch took 3.709 sec.
Weights:
    weightId: 0
        isFixed: True
        weight:  1.0

    weightId: 1
        isFixed: False
        weight:  0.00250030273406

FACTOR 0: EPOCH #3
Current stepsize = 3.19454623564e-08
Learning epoch took 3.705 sec.
Weights:
    weightId: 0
        isFixed: True
        weight:  1.0

    weightId: 1
        isFixed: False
        weight:  0.0

In [9]:
# Compute marginals for the training label matrix with the trained generative model (timed).
%time train_marginals = gen_model.marginals(L_train)

100%|██████████| 2683871/2683871 [05:06<00:00, 8759.35it/s]

CPU times: user 5min 6s, sys: 912 ms, total: 5min 7s
Wall time: 5min 6s





In [None]:
# Show the result of the generative model's learned_lf_stats() as this cell's output.
gen_model.learned_lf_stats()

In [None]:
# Histogram of the training marginals, grouped into 20 bins.
fig, ax = plt.subplots()
ax.hist(train_marginals, bins=20)
ax.set_title("Training Marginals for Gibbs Sampler")
plt.show()

# Save the training marginals for continued use down the road

In [10]:
# Save the training marginals through the database session so later steps can reuse them (timed).
%time save_marginals(session, L_train, train_marginals)

100%|██████████| 2683871/2683871 [26:47<00:00, 1669.47it/s]


Saved 2683871 marginals
CPU times: user 25min 42s, sys: 1min 14s, total: 26min 57s
Wall time: 3h 51s
