Now, we're going to try to incorporate the additional information in the true dev set labels directly to the final discriminative model using multi-fidelity modeling. For simplicity and interpretability, our end classifier(s) will be simpler than a neural network, but they'll use features from a trained neural network. Because of this, I've modified the neural network architecture to provide an additional layer before class probabilities with 16 nodes. The values that this layer will be used as features of each candidate.

Before going too far: what model do I want to use? GPC doesn't seem to offer many benefits, is there anything else I could use? Why not just use a NN? Might be worth talking to Aidan for this one-- some of the same ideas come into play for the other GP project. Lastly, might be useful to actually read one or two of the papers about MF classification and see what they did, see if I can recreate it .

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from snorkel import SnorkelSession

session = SnorkelSession()

In [2]:
from snorkel.models import candidate_subclass

ChemicalDisease = candidate_subclass('ChemicalDisease', ['chemical', 'disease'])

train = session.query(ChemicalDisease).filter(ChemicalDisease.split == 0).all()
dev = session.query(ChemicalDisease).filter(ChemicalDisease.split == 1).all()
test = session.query(ChemicalDisease).filter(ChemicalDisease.split == 2).all()

print('Training set:\t{0} candidates'.format(len(train)))
print('Dev set:\t{0} candidates'.format(len(dev)))
print('Test set:\t{0} candidates'.format(len(test)))

Training set:	8433 candidates
Dev set:	920 candidates
Test set:	4683 candidates


In [417]:
train_marginals_orig = np.fromfile("train_marginals_orig.txt")
train_marginals = train_marginals_orig[:8433].copy()
total = train.copy()
total.extend(dev.copy())

In [4]:
from snorkel.annotations import load_gold_labels
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

In [5]:
from snorkel.learning.pytorch import LSTM # includes an extra layer for smaller set of features

In [9]:
train_kwargs = {
    'lr':              0.01,
    'embedding_dim':   100,
    'hidden_dim':      100,
    'n_epochs':        20,
    'dropout':         0.5,
    'rebalance':       0.25,
    'print_freq':      5,
    'seed':            1701
}

lstm = LSTM(n_threads=None)
lstm.train(total, train_marginals_orig, X_dev=dev, Y_dev=L_gold_dev, **train_kwargs)
lstm.save(model_name="lstm_with_features")

[LSTM] Training model
[LSTM] n_train=4112  #epochs=20  batch size=64




[LSTM] Epoch 1 (18.60s)	Average loss=0.689290	Dev F1=50.33
[LSTM] Epoch 6 (111.65s)	Average loss=0.667892	Dev F1=53.23
[LSTM] Epoch 11 (206.76s)	Average loss=0.664174	Dev F1=52.80
[LSTM] Epoch 16 (302.67s)	Average loss=0.662036	Dev F1=53.32
[LSTM] Epoch 20 (380.19s)	Average loss=0.660924	Dev F1=55.15
[LSTM] Model saved as <LSTM>
[LSTM] Training done (381.27s)
[LSTM] Loaded model <LSTM>
[LSTM] Model saved as <lstm_with_features>


In [10]:
lstm.score(test,L_gold_test)

(0.3968514266972778, 0.79920739762219284, 0.53035283804514566)

In [11]:
dev_features = lstm.feature_outputs(dev, 100).detach().numpy().reshape(920,10)
train_features = lstm.feature_outputs(train, 100).detach().numpy().reshape(8433,10)
test_features = lstm.feature_outputs(test, 100).detach().numpy().reshape(4683,10)

In [66]:
train_preds = train_marginals_orig[:8433].copy()



In [102]:
dev_true = L_gold_dev.toarray().reshape(920,)
dev_true[dev_true == -1] = 0

In [67]:
train_preds

array([ 0.,  1.,  1., ...,  1.,  1.,  1.])

In [50]:
import torch.nn as nn

In [51]:
import torch

In [371]:
class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(10, 50)
        self.l2 = nn.Linear(50, 50)
        self.l3 = nn.Linear(50,50)
        self.l4 = nn.Linear(50,2)

    def forward(self,x):
        x = nn.functional.relu(self.l1(x))
        x = nn.functional.relu(self.l2(x))
        x = nn.functional.relu(self.l3(x))
        x = nn.functional.softmax(self.l4(x))
        return x

In [418]:
train_marginals = np.stack([1-train_marginals,train_marginals], axis=1)

In [431]:
from torch.optim import Adam

model = SimpleNet()
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(model.parameters(),lr = 0.001)

for i in range(500):
    model.zero_grad()
    output = model(torch.from_numpy(train_features))
    loss = criterion(output,torch.from_numpy(train_marginals).type(torch.float))
    loss.backward()
    if i % 50 == 0:
        print(loss)
    optimizer.step()

  del sys.path[0]


tensor(0.7223, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7035, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7031, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7029, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7026, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7023, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7020, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7017, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7013, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.7011, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


In [432]:
probs = model(torch.from_numpy(test_features))

  del sys.path[0]


In [433]:
probs = probs.detach().numpy()

In [434]:
probs.mean(axis=0)

array([ 0.36597148,  0.63402838], dtype=float32)

In [452]:
devmodel = SimpleNet()
criterion = nn.CrossEntropyLoss()
optimizer = Adam(devmodel.parameters(),lr = 0.001)

for i in range(50):
    devmodel.zero_grad()
    output = devmodel(torch.from_numpy(dev_features))
    loss = criterion(output,torch.from_numpy(dev_true).type(torch.long))
    loss.backward()
    optimizer.step()

  del sys.path[0]


In [453]:
probsdev = devmodel(torch.from_numpy(test_features)).detach().numpy()

  del sys.path[0]


In [502]:
c = 0
res = L_gold_test.toarray()
tp,tn,fp,fn = 0,0,0,0
for i in range(len(probsdev)):
    pred = 0
    dc = max(probsdev[i])
    tc = max(probs[i])
    if dc > tc:
        c += 1
        if probsdev[i][0] > 0.5:
            pred = 0
        else:
            pred = 1
    else:
        if probs[i][0] > .5:
            pred = 0
        else:
            pred = 1
    if res[i][0] == 1:
        if pred == 1:
            tp += 1
        else:
            fn += 1
    else:
        if pred == 1:
            fp += 1
        else:
            tn += 1

In [503]:
print (tp,fp,tn,fn)

1225 1954 1215 289


In [504]:
prec = tp / (tp + fp)
rec = tp / (tp + fn)

In [505]:
2/(1/prec + 1/rec)

0.5220541231621564

In [506]:
print ((tp + tn)/(tp+tn+fp+fn))

0.5210335255178304


In [507]:
c

2652

In [265]:
from sklearn.linear_model import LogisticRegression

In [266]:
dev_model = LogisticRegression()
train_model = LogisticRegression()
dev_model.fit(dev_features, dev_true)
train_model.fit(train_features, train_preds)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [174]:
probsdev = dev_model.predict_proba(test_features)
probstrain = train_model.predict_proba(train_features)

In [188]:
res = L_gold_test.toarray()
tp,tn,fp,fn = 0,0,0,0
for i in range(len(probsdev)):
    pred = 0
    if max(probsdev[i]) > max(probstrain[i]):
        if probstrain[i][0] > .5:
            pred = 0
        else:
            pred = 1
    else:
        if probstrain[i][0] > .5:
            pred = 0
        else:
            pred = 1
    if res[i][0] == 1:
        if pred == 1:
            tp += 1
        else:
            fn += 1
    else:
        if pred == 1:
            fp += 1
        else:
            tn += 1

In [189]:
print (tp,fp,tn,fn)

1030 2295 874 484


In [190]:
prec = tp / (tp + fp)
rec = tp / (tp + fn)

In [191]:
2/(1/prec + 1/rec)

0.4257077908658814

In [192]:
print ((tp + tn)/(tp+tn+fp+fn))

0.40657698056801195
