# MUST RUN AT THE START OF EVERYTHING

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import csv
import os

import numpy as np
import tqdm

In [2]:
# Set up the environment.
# The connection settings can be overridden through environment variables so
# that credentials do not have to be edited into (or shared with) the notebook;
# the fallback values reproduce the original local development setup.
username = os.environ.get("SNORKEL_DB_USER", "danich1")
password = os.environ.get("SNORKEL_DB_PASSWORD", "snorkel")
dbname = os.environ.get("SNORKEL_DB_NAME", "pubmeddb")

# Path subject to change for different OS: the host parameter points at the
# local PostgreSQL socket directory.
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(username, password, dbname)
os.environ['SNORKELDB'] = database_str

# Imported only after SNORKELDB has been set (presumably snorkel reads the
# variable when it is imported -- TODO confirm).
from snorkel import SnorkelSession
session = SnorkelSession()

In [3]:
from snorkel.annotations import FeatureAnnotator, LabelAnnotator, load_marginals
from snorkel.learning import SparseLogisticRegression
from snorkel.learning.disc_models.rnn import reRNN
from snorkel.learning.utils import RandomSearch
from snorkel.models import Candidate, FeatureKey, candidate_subclass

In [4]:
# Relationship type to process; the next cell recognises "dg", "gg", "cg" and "cd".
edge_type = "dg"

In [5]:
# Define the candidate class that matches the selected edge type.
# Each branch binds the class under its own name because later cells refer to
# the class directly (e.g. DiseaseGene in the candidate export queries).
if edge_type == "dg":
    DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])
elif edge_type == "gg":
    GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])
elif edge_type == "cg":
    CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])
elif edge_type == "cd":
    CompoundDisease = candidate_subclass('CompoundDisease', ['Compound', 'Disease'])
else:
    # Fail fast: without a candidate class none of the following cells can
    # run, so stop here with a clear message instead of printing and letting
    # a later cell fail with an unrelated NameError.
    raise ValueError(
        "Please pick a valid edge type "
        "(got {!r}; expected one of 'dg', 'gg', 'cg', 'cd')".format(edge_type)
    )

# Load preprocessed data 

To save time, this notebook loads the labels that were generated in the previous notebook instead of recomputing them.

In [6]:
%%time
# The annotator is created with an empty list of labeling functions; it is
# only used below to load a label matrix through the session.
labeler = LabelAnnotator(lfs=[])

# The split=0 (training) label matrix load is left disabled; only the split=1
# (development) matrix is loaded.
#L_train = labeler.load_matrix(session,split=0)
L_dev = labeler.load_matrix(session,split=1)

3830137it [00:06, 582573.86it/s]


CPU times: user 8.82 s, sys: 1.23 s, total: 10 s
Wall time: 10.4 s


In [None]:
# Report the shape of the loaded label matrix.
# Only L_dev is printed: the L_train load is commented out in the loading
# cell, so referencing L_train here would raise a NameError on a fresh kernel.
print("Total Data Shape:")
print(L_dev.shape)
print("")

In [None]:
%%time
featurizer = FeatureAnnotator()

# Load the feature matrices for split 0 (training) and split 1 (development).
F_train = featurizer.load_matrix(session, split=0)
F_dev = featurizer.load_matrix(session, split=1)

In [None]:
# Show the dimensions of the training and development feature matrices,
# followed by a blank line.
print("Total Data Shape:")
print(F_train.shape)
print(F_dev.shape)
print("")

# Run Disc Model Classification of Candidates

## Train Logistic Regression Disc Model

In [None]:
# Load the marginals for split 0; they are passed as Y_train to the random search below.
%time train_marginals = load_marginals(session, split=0)

In [None]:
# Search grid: candidate values for the learning rate and for the L1 and L2 penalties
param_ranges = {
    'lr' : [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    'l1_penalty' : [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    'l2_penalty' : [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
}
# Fixed settings handed to RandomSearch as model_hyperparams (not part of the grid above)
model_hyperparams = {
    'n_epochs' : 50,
    'rebalance' : 0.5,
    'print_freq' : 25
}
# Search over SparseLogisticRegression models using the training features and marginals.
# n=5 presumably caps the search at five sampled configurations -- TODO confirm against RandomSearch.
searcher = RandomSearch(SparseLogisticRegression, param_ranges, F_train,
                        Y_train=train_marginals, n=5, model_hyperparams=model_hyperparams)

In [None]:
%%time
# Seed NumPy's global random number generator before running the search.
np.random.seed(100)
# NOTE(review): L_dev is the matrix returned by labeler.load_matrix(session, split=1);
# confirm it holds the development labels that searcher.fit expects as its second argument.
disc_model, run_stats = searcher.fit(F_dev, L_dev, n_threads=4)

In [None]:
# Unpack the two values returned by get_weights(); w is written to LR_model.csv in the next cell.
w, b = disc_model.get_weights()

In [None]:
# Dump every model weight next to its feature key so the pairs can be
# processed outside the notebook.
# Feature keys are read in ascending id order and paired positionally with w.
annot_select_query = FeatureKey.__table__.select().order_by(FeatureKey.id)
fieldnames = ["Weight", "Feature"]
with open("LR_model.csv", "w") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    feature_rows = session.execute(annot_select_query)
    # zip stops at the shorter of the two sequences.
    for weight, feature in tqdm.tqdm(zip(w, feature_rows)):
        # feature[1] is the second column of the FeatureKey row
        # (presumably the feature name -- verify against the table definition).
        writer.writerow({"Weight": weight, "Feature": feature[1]})

## Train Recurrent Neural Net Disc Model

In [6]:
# Load the marginals for split 0 and save them as text to pmacs/train_marginals.
%time train_marginals = load_marginals(session, split=0)
np.savetxt("pmacs/train_marginals", train_marginals)

CPU times: user 11.4 s, sys: 512 ms, total: 11.9 s
Wall time: 19.4 s


In [7]:
%%time
# Keyword arguments for lstm.train(); in this cell they are only referenced by the disabled call below.
train_kwargs = {
    'lr':         0.001,
    'dim':        100,
    'n_epochs':   10,
    'dropout':    0.5,
    'print_freq': 1,
    'max_sentence_length': 1000,
}

lstm = reRNN(seed=100, n_threads=4)
# Training is left disabled here; the following cells use lstm to preprocess
# candidates and to export its word dictionary.
#lstm.train(train_cands, train_marginals[0:10], X_dev=dev_cands, Y_dev=L_dev[0:10], **train_kwargs)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 98 µs


In [8]:
import csv

chunksize = 100000
start = 0

# Export the preprocessed split-0 DiseaseGene candidates in chunks of
# `chunksize` rows: one CSV row of data[0] per candidate goes to the offsets
# file and the matching ends[0] value goes, one per line, to the ends file.
with open('pmacs/train_candidates_ends.csv', 'wb') as g, \
        open("pmacs/train_candidates_offsets.csv", "wb") as f:
    output = csv.writer(f)
    while True:
        # Page through the candidates in id order so chunks never overlap.
        train_cands = (
            session.query(DiseaseGene)
            .filter(DiseaseGene.split == 0)
            .order_by(DiseaseGene.id)
            .limit(chunksize)
            .offset(start)
            .all()
        )

        # An empty page means every candidate has been exported.
        if not train_cands:
            break

        for c in tqdm.tqdm(train_cands):
            # Candidates are preprocessed one at a time with extend=True.
            data, ends = lstm._preprocess_data([c], extend=True)
            output.writerow(data[0])
            g.write("{}\n".format(ends[0]))

        start += chunksize

100%|██████████| 100000/100000 [04:52<00:00, 342.14it/s]
100%|██████████| 100000/100000 [04:48<00:00, 346.14it/s]
100%|██████████| 100000/100000 [04:42<00:00, 354.45it/s]
100%|██████████| 100000/100000 [04:36<00:00, 361.64it/s]
100%|██████████| 100000/100000 [04:26<00:00, 374.95it/s]
100%|██████████| 100000/100000 [04:21<00:00, 382.21it/s]
100%|██████████| 100000/100000 [04:22<00:00, 380.31it/s]
100%|██████████| 100000/100000 [04:29<00:00, 371.71it/s]
100%|██████████| 100000/100000 [04:25<00:00, 377.00it/s]
100%|██████████| 100000/100000 [04:24<00:00, 378.30it/s]
100%|██████████| 100000/100000 [04:33<00:00, 365.49it/s]
100%|██████████| 100000/100000 [04:26<00:00, 375.65it/s]
100%|██████████| 100000/100000 [04:28<00:00, 373.04it/s]
100%|██████████| 100000/100000 [04:23<00:00, 379.84it/s]
100%|██████████| 100000/100000 [04:16<00:00, 389.49it/s]
100%|██████████| 100000/100000 [04:13<00:00, 394.21it/s]
100%|██████████| 100000/100000 [04:12<00:00, 395.55it/s]
100%|██████████| 100000/100000 

In [11]:
import csv

# Save every key/value pair of the RNN's word dictionary as a two-column CSV.
with open("pmacs/train_word_dict.csv", 'w') as f:
    output = csv.DictWriter(f, fieldnames=["Key", "Value"])
    output.writeheader()
    entries = lstm.word_dict.d
    for key in tqdm.tqdm(entries):
        output.writerow({'Key': key, 'Value': entries[key]})


  0%|          | 0/443430 [00:00<?, ?it/s][A
 10%|█         | 45258/443430 [00:00<00:00, 452579.35it/s][A
 21%|██        | 91831/443430 [00:00<00:00, 456443.15it/s][A
 32%|███▏      | 140205/443430 [00:00<00:00, 464302.02it/s][A
 42%|████▏     | 188022/443430 [00:00<00:00, 467369.15it/s][A
 54%|█████▎    | 237375/443430 [00:00<00:00, 473739.61it/s][A
 64%|██████▍   | 285669/443430 [00:00<00:00, 476461.50it/s][A
 76%|███████▌  | 335339/443430 [00:00<00:00, 481497.66it/s][A
 87%|████████▋ | 384756/443430 [00:00<00:00, 485215.05it/s][A
 98%|█████████▊| 433953/443430 [00:00<00:00, 487221.77it/s][A
100%|██████████| 443430/443430 [00:00<00:00, 480707.33it/s][A

In [16]:
%%time
# Fetch every split-1 (development) DiseaseGene candidate, ordered by id.
dev_cands = session.query(DiseaseGene).filter(DiseaseGene.split == 1).order_by(DiseaseGene.id).all()

CPU times: user 10.6 s, sys: 316 ms, total: 10.9 s
Wall time: 12.8 s


In [19]:
import csv

# Export the preprocessed development candidates: one CSV row of data[0] per
# candidate goes to the offsets file and the matching ends[0] value goes, one
# per line, to the ends file.
with open('pmacs/dev_candidates_ends.csv', 'wb') as g, \
        open("pmacs/dev_candidates_offsets.csv", "wb") as f:
    output = csv.writer(f)
    for c in tqdm.tqdm(dev_cands):
        # Unlike the training export, extend=True is not passed here.
        data, ends = lstm._preprocess_data([c])
        output.writerow(data[0])
        g.write("{}\n".format(ends[0]))

100%|██████████| 763802/763802 [33:11<00:00, 383.56it/s]  
