# Global settings

In [1]:
device = 'cuda'
bs = 64
test_pct = .10
val_pct = .60
epochs = 2

# Data proc

In [2]:
import pandas as pd
import numpy as np
import torch
import itertools
from make_matrix import *
from torch.utils.data import TensorDataset, DataLoader, Sampler
import torch.nn as nn
import torch.nn.functional as F

In [3]:
fludf = pd.read_csv('flu_train.csv')

In [4]:
fludf.head()

Unnamed: 0.1,Unnamed: 0,virusUprot,humanUprot,edge
0,0,P03433,P49736,1.0
1,1,P03433,P15311,0.0
2,2,P03433,P11142,0.0
3,3,P03433,Q86U42,0.0
4,4,P03433,P33992,1.0


### Numericalize proteins

In [5]:
def numericalize(df):
    vs = df['virusUprot'].unique()
    hs = df['humanUprot'].unique()
    vs_map = {v:i for i,v in enumerate(vs)}
    hs_map = {h:i for i,h in enumerate(hs)}
    
    df.virusUprot = df.virusUprot.apply(lambda x : vs_map[x])
    df.humanUprot = df.humanUprot.apply(lambda x : hs_map[x])
    df = df.sample(frac=1).reset_index()
    return df[['virusUprot', 'humanUprot', 'edge']], vs_map, hs_map

In [6]:
fludf, flu_vsmap, flu_hsmap = numericalize(fludf)
fludf.head()

Unnamed: 0,virusUprot,humanUprot,edge
0,56,1407,0.0
1,2,2298,0.0
2,51,534,0.0
3,109,245,0.0
4,53,876,1.0


### Make a dataloaders

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
def arr_to_torch(arr, dtype):
    t =  torch.from_numpy(np.array(arr)).type(dtype).to(device)
    return t

In [13]:
def getLoaders(df):
    X = list(zip(df.virusUprot.values, df.humanUprot.values))
    y = df['edge'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pct, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_pct, random_state=1)
    
    train_dset = TensorDataset(arr_to_torch(X_train, torch.long), arr_to_torch(y_train, torch.float32))
    train_loader = DataLoader(train_dset, batch_size=bs, shuffle=True)
    
    val_dset = TensorDataset(arr_to_torch(X_val, torch.long), arr_to_torch(y_val, torch.float32))
    val_loader = DataLoader(val_dset, batch_size=bs, shuffle=True)
    
    test_dset = TensorDataset(arr_to_torch(X_test, torch.long), arr_to_torch(y_test, torch.float32))
    test_loader = DataLoader(test_dset, batch_size=bs, shuffle=True)
    
    return train_loader, val_loader, test_loader

In [14]:
flutrain_loader, fluval_loader, flutest_loader = getLoaders(fludf)

In [15]:
hepctrain_loader, hepcval_loader, hepctest_loader = getLoaders(hepcdf)

In [16]:
ebolatrain_loader, ebolaval_loader, ebolatest_loader = getLoaders(eboladf)

# Model

In [17]:
def l2_regularize(array):
    loss = torch.sum(array ** 2.0)
    return loss

In [18]:
class MF(nn.Module):
    def __init__(self, n_virus, n_human, k=18, c_vector=1.0, c_bias=1.0, writer=None):
        super(MF, self).__init__()
        self.writer = writer
        self.k = k
        self.n_virus = n_virus
        self.n_human = n_human
        self.c_bias = c_bias
        self.c_vector = c_vector
        
        self.virus = nn.Embedding(n_virus, k)
        self.human = nn.Embedding(n_human, k)
        
        # We've added new terms here:
        self.bias_virus = nn.Embedding(n_virus, 1)
        self.bias_human = nn.Embedding(n_human, 1)
        self.bias = nn.Parameter(torch.ones(1))
    
    def forward(self, train_x):
        virus_id = train_x[:, 0]
        human_id = train_x[:, 1]
        vector_virus = self.virus(virus_id)
        vector_human = self.human(human_id)
        
        # Pull out biases
        bias_virus = self.bias_virus(virus_id).squeeze()
        bias_human = self.bias_human(human_id).squeeze()
        biases = (self.bias + bias_virus + bias_human)
        
        ui_interaction = torch.sum(vector_virus * vector_human, dim=1)
        
        prediction = ui_interaction + biases
        return prediction
    
    def loss(self, prediction, target):
#         loss_mse = F.binary_cross_entropy_with_logits(prediction, target.squeeze())
        loss_mse = F.mse_loss(prediction, target.squeeze())
    
        # Add new regularization to the biases
        prior_bias_virus =  l2_regularize(self.bias_virus.weight) * self.c_bias
        prior_bias_human = l2_regularize(self.bias_human.weight) * self.c_bias
        
        prior_virus =  l2_regularize(self.virus.weight.data) * self.c_vector
        prior_human = l2_regularize(self.human.weight.data) * self.c_vector
        total = loss_mse + prior_virus + prior_human + prior_bias_virus + prior_bias_human
        return total

# Trainer

In [19]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss, Accuracy, Precision, Recall
from tensorboardX import SummaryWriter
from ignite.metrics import MeanSquaredError, Loss
from ignite.contrib.metrics import AveragePrecision, ROC_AUC

from datetime import datetime

In [26]:
class Trainer:
    
    def __init__(self, model, crit, optim, writer, train_loader, val_loader, test_loader, modelname):
        self.model = model
        self.crit = crit
        self.optim = optim
        self.writer = writer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.best_model = None
        self.best_AP = -1.0
        self.modelname = modelname
        
        self.trainer = create_supervised_trainer(model, optim, crit)
        self.metrics = {'loss': Loss(crit), 'ap': AveragePrecision(), "roc": ROC_AUC()}
        self.evaluator = create_supervised_evaluator(model, metrics=self.metrics)
        
        ## Add events
        self.trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED, handler=self.log_training_loss)
        self.trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=self.log_validation_results)
        self.trainer.add_event_handler(event_name=Events.COMPLETED, handler=self.log_test_results)
        
        print(model)
        
    def log_training_loss(self, engine, log_interval=400):
        epoch = engine.state.epoch
        itr = engine.state.iteration
        fmt = "TRAIN: Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
        msg = fmt.format(epoch, itr, len(self.train_loader), engine.state.output)
        self.model.itr = itr
        if itr % log_interval == 0:
            print(msg)
#             self.evaluator.run(self.train_loader)
            
#             metrics = self.evaluator.state.metrics
#             mse = metrics['loss']
#             ap = metrics['ap']
#             roc = metrics['roc']
            
#             print("Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} ROC: {:.2f} "
#                   .format(engine.state.epoch, mse, ap, roc))
            
#             self.writer.add_scalar("training/mse", mse, engine.state.epoch)
#             self.writer.add_scalar("training/ap", ap, engine.state.epoch)
#             self.writer.add_scalar("training/roc", roc, engine.state.epoch)
            
    def log_validation_results(self, engine):
        self.evaluator.run(self.val_loader)
        
        metrics = self.evaluator.state.metrics
        mse = metrics['loss']
        ap = metrics['ap']
        roc = metrics['roc']
        
        if ap > self.best_AP:
            self.best_AP = ap
            self.best_model = model.state_dict()
        
        print("VALIDATION Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} ROC: {:.2f} "
              .format(engine.state.epoch, mse, ap, roc))
        self.writer.add_scalar("validation/mse", mse, engine.state.epoch)
        self.writer.add_scalar("validation/ap", ap, engine.state.epoch)
        self.writer.add_scalar("validation/roc", roc, engine.state.epoch)

    def log_test_results(self, engine):
        self.evaluator.run(self.test_loader)
        
        metrics = self.evaluator.state.metrics
        mse = metrics['loss']
        ap = metrics['ap']
        roc = metrics['roc']

        print("TEST: Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} ROC: {:.2f}"
              .format(engine.state.epoch, mse, ap, roc))
        
        print("BEST AP: ", self.best_AP)
        torch.save(self.best_model, './{}.pt'.format(self.modelname))
        
    def run(self, epochs):
        self.trainer.run(self.train_loader, max_epochs=epochs)

## Hyperparametrs

In [38]:
lr = 1e-3
k =20
# regularizing bias
c_bias = 1e-4
c_vector = 1e-4
batchsize = bs

## Flu

In [28]:
log_dir = 'runs/FLU_simple_mf_02_bias_' + str(datetime.now()).replace(' ', '_')
print(log_dir)

runs/FLU_simple_mf_02_bias_2019-05-18_23:30:03.213090


In [29]:
writer = SummaryWriter(log_dir=log_dir)
model = MF(len(flu_vsmap), len(flu_hsmap), writer=writer, k=k, c_bias=c_bias, c_vector=c_vector)
crit = model.loss
model.cuda()
optim = torch.optim.Adam(model.parameters())

In [30]:
flutrainer = Trainer(model, crit, optim, writer, flutrain_loader, fluval_loader, flutrain_loader, 'flumodel')

MF(
  (virus): Embedding(203, 20)
  (human): Embedding(2727, 20)
  (bias_virus): Embedding(203, 1)
  (bias_human): Embedding(2727, 1)
)


In [33]:
flutrainer.run(2)

TRAIN: Epoch[1] Iteration[400/3114] Loss: 3.48
TRAIN: Epoch[1] Iteration[800/3114] Loss: 3.48
TRAIN: Epoch[1] Iteration[1200/3114] Loss: 3.46
TRAIN: Epoch[1] Iteration[1600/3114] Loss: 3.46
TRAIN: Epoch[1] Iteration[2000/3114] Loss: 3.45
TRAIN: Epoch[1] Iteration[2400/3114] Loss: 3.44
TRAIN: Epoch[1] Iteration[2800/3114] Loss: 3.42
VALIDATION Epoch[1] Validation MSE: 3.42 Avg Prec: 0.16 ROC: 0.85 
TRAIN: Epoch[2] Iteration[3200/3114] Loss: 3.43
TRAIN: Epoch[2] Iteration[3600/3114] Loss: 3.41
TRAIN: Epoch[2] Iteration[4000/3114] Loss: 3.38
TRAIN: Epoch[2] Iteration[4400/3114] Loss: 3.38
TRAIN: Epoch[2] Iteration[4800/3114] Loss: 3.40
TRAIN: Epoch[2] Iteration[5200/3114] Loss: 3.36
TRAIN: Epoch[2] Iteration[5600/3114] Loss: 3.34
TRAIN: Epoch[2] Iteration[6000/3114] Loss: 3.31


  recall = tps / tps[-1]
 Only one class present in y_true. ROC AUC score is not defined in that case.


VALIDATION Epoch[2] Validation MSE: 3.32 Avg Prec: 0.18 ROC: 0.85 


  recall = tps / tps[-1]
 Only one class present in y_true. ROC AUC score is not defined in that case.


TEST: Epoch[2] Validation MSE: 3.32 Avg Prec: 0.37 ROC: 0.92
BEST AP:  0.1808188976460829


## Hepc

In [39]:
# Hyperparameters
log_dir = 'runs/HEPC_simple_mf_02_bias_' + str(datetime.now()).replace(' ', '_')
print(log_dir)

runs/HEPC_simple_mf_02_bias_2019-05-18_23:33:24.186835


In [40]:
writer = SummaryWriter(log_dir=log_dir)
model = MF(len(hepc_vsmap), len(hepc_hsmap), writer=writer, k=k, c_bias=c_bias, c_vector=c_vector)
crit = model.loss
model.cuda()
optim = torch.optim.Adam(model.parameters())

In [41]:
hepctrainer = Trainer(model, crit, optim, writer, hepctrain_loader, hepcval_loader, hepctrain_loader, 'hepcmodel')

MF(
  (virus): Embedding(174, 20)
  (human): Embedding(1002, 20)
  (bias_virus): Embedding(174, 1)
  (bias_human): Embedding(1002, 1)
)


In [42]:
hepctrainer.run(epochs)

TRAIN: Epoch[1] Iteration[400/981] Loss: 17.08
TRAIN: Epoch[1] Iteration[800/981] Loss: 12.33


  recall = tps / tps[-1]
 Only one class present in y_true. ROC AUC score is not defined in that case.


VALIDATION Epoch[1] Validation MSE: 17.95 Avg Prec: 0.01 ROC: 0.50 
TRAIN: Epoch[2] Iteration[1200/981] Loss: 14.16
TRAIN: Epoch[2] Iteration[1600/981] Loss: 12.07


  recall = tps / tps[-1]
 Only one class present in y_true. ROC AUC score is not defined in that case.


VALIDATION Epoch[2] Validation MSE: 13.65 Avg Prec: 0.01 ROC: 0.50 
TEST: Epoch[2] Validation MSE: 12.88 Avg Prec: 0.01 ROC: 0.51
BEST AP:  0.012158489177327269


# Ebola

In [43]:
# Hyperparameters
log_dir = 'runs/EBOLA_simple_mf_02_bias_' + str(datetime.now()).replace(' ', '_')
print(log_dir)

runs/EBOLA_simple_mf_02_bias_2019-05-18_23:33:37.565642


In [44]:
writer = SummaryWriter(log_dir=log_dir)
model = MF(len(ebola_vsmap), len(ebola_hsmap), writer=writer, k=k, c_bias=c_bias, c_vector=c_vector)
crit = model.loss
model.cuda()
optim = torch.optim.Adam(model.parameters())

In [45]:
ebolatrainer = Trainer(model, crit, optim, writer, ebolatrain_loader, ebolaval_loader, ebolatest_loader, 'ebolamodel')

MF(
  (virus): Embedding(7, 20)
  (human): Embedding(151, 20)
  (bias_virus): Embedding(7, 1)
  (bias_human): Embedding(151, 1)
)


In [46]:
ebolatrainer.run(epochs)

VALIDATION Epoch[1] Validation MSE: 24.54 Avg Prec: 0.20 ROC: 0.52 
VALIDATION Epoch[2] Validation MSE: 24.39 Avg Prec: 0.20 ROC: 0.52 
TEST: Epoch[2] Validation MSE: 24.79 Avg Prec: 0.22 ROC: 0.51
BEST AP:  0.19557238544981628
