# Global settings

In [1]:
device = 'cuda'
bs = 64

# Data proc

In [2]:
import pandas as pd
import numpy as np
import torch
import itertools
from make_matrix import *
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [3]:
df = pd.read_csv('flu_train.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,virusUprot,humanUprot,edge
0,0,P03433,P49736,1.0
1,1,P03433,P15311,0.0
2,2,P03433,P11142,0.0
3,3,P03433,Q86U42,0.0
4,4,P03433,P33992,1.0


In [5]:
vs = df['virusUprot'].unique()
hs = df['humanUprot'].unique()

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,virusUprot,humanUprot,edge
0,0,P03433,P49736,1.0
1,1,P03433,P15311,0.0
2,2,P03433,P11142,0.0
3,3,P03433,Q86U42,0.0
4,4,P03433,P33992,1.0


In [7]:
vs.sort()
hs.sort()
vs_map = {v:i for i,v in enumerate(vs)}
hs_map = {h:i for i, h in enumerate(hs)}

In [8]:
df.virusUprot = df.virusUprot.apply(lambda x : vs_map[x])
df.humanUprot = df.humanUprot.apply(lambda x : hs_map[x])
df.head()

Unnamed: 0.1,Unnamed: 0,virusUprot,humanUprot,edge
0,0,52,946,1.0
1,1,52,634,0.0
2,2,52,577,0.0
3,3,52,1785,0.0
4,4,52,807,1.0


In [9]:
len(df.loc[df['edge'] == 1]) / len(df)

0.018504970365673678

### Make a dataloader

In [10]:
X = list(zip(df.virusUprot.values, df.humanUprot.values))

In [11]:
y = df['edge'].values

In [12]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [46]:
y_train[y_train == 1]

array([1., 1., 1., ..., 1., 1., 1.])

In [31]:
def arr_to_torch(arr, dtype):
    t =  torch.from_numpy(np.array(arr)).type(dtype).to(device)
    return t

In [32]:
train_dset = TensorDataset(arr_to_torch(X_train, torch.long), arr_to_torch(y_train, torch.float32))
train_loader = DataLoader(train_dset, batch_size=bs)

In [33]:
val_dset = TensorDataset(arr_to_torch(X_val, torch.long), arr_to_torch(y_val, torch.float32))
val_loader = DataLoader(val_dset, batch_size=bs)

In [34]:
test_dset = TensorDataset(arr_to_torch(X_test, torch.long), arr_to_torch(y_test, torch.float32))
test_loader = DataLoader(test_dset, batch_size=bs)

# Model

In [35]:
def l2_regularize(array):
    loss = torch.sum(array ** 2.0)
    return loss

In [36]:
class MF(nn.Module):
    def __init__(self, n_virus, n_human, k=18, c_vector=1.0, c_bias=1.0, writer=None):
        super(MF, self).__init__()
        self.writer = writer
        self.k = k
        self.n_virus = n_virus
        self.n_human = n_human
        self.c_bias = c_bias
        self.c_vector = c_vector
        
        self.virus = nn.Embedding(n_virus, k)
        self.human = nn.Embedding(n_human, k)
        
        # We've added new terms here:
        self.bias_virus = nn.Embedding(n_virus, 1)
        self.bias_human = nn.Embedding(n_human, 1)
        self.bias = nn.Parameter(torch.ones(1))
        
#         self.affine = nn.Linear(in_features=self.k, out_features=1)
#         self.logistic = nn.Sigmoid()
    
    def forward(self, train_x):
        virus_id = train_x[:, 0]
        human_id = train_x[:, 1]
        vector_virus = self.virus(virus_id)
        vector_human = self.human(human_id)
        
        # Pull out biases
        bias_virus = self.bias_virus(virus_id).squeeze()
        bias_human = self.bias_human(human_id).squeeze()
        biases = (self.bias + bias_virus + bias_human)
        
        ui_interaction = torch.sum(vector_virus * vector_human, dim=1)
        
        prediction = ui_interaction + biases
        return prediction
    
    def loss(self, prediction, target):
#         loss_mse = F.binary_cross_entropy_with_logits(prediction, target.squeeze())
        loss_mse = F.mse_loss(prediction, target.squeeze())
    
        # Add new regularization to the biases
        prior_bias_virus =  l2_regularize(self.bias_virus.weight) * self.c_bias
        prior_bias_human = l2_regularize(self.bias_human.weight) * self.c_bias
        
        prior_virus =  l2_regularize(self.virus.weight.data) * self.c_vector
        prior_human = l2_regularize(self.human.weight.data) * self.c_vector
        total = loss_mse + prior_virus + prior_human + prior_bias_virus + prior_bias_human
        return total

# Trainer

In [37]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss, Accuracy, Precision, Recall
from tensorboardX import SummaryWriter
from ignite.metrics import MeanSquaredError, Loss
from ignite.contrib.metrics import AveragePrecision, ROC_AUC

from datetime import datetime

In [38]:
# Hyperparameters
lr = 1e-3
k =20
# New parameter for regularizing bias
c_bias = 1e-4
c_vector = 1e-4
batchsize = bs
log_dir = 'runs/simple_mf_02_bias_' + str(datetime.now()).replace(' ', '_')
print(log_dir)

runs/simple_mf_02_bias_2019-05-18_21:33:06.870381


In [39]:
writer = SummaryWriter(log_dir=log_dir)
# crit = nn.MSELoss()

model = MF(len(vs_map), len(hs_map), writer=writer, k=k, c_bias=c_bias, c_vector=c_vector)
crit = model.loss
model.cuda()
optimizer = torch.optim.Adam(model.parameters())
trainer = create_supervised_trainer(model, optimizer, model.loss)
metrics = {'loss': Loss(crit), 'ap': AveragePrecision(), "acc": Accuracy(), "roc": ROC_AUC()}
evaluator = create_supervised_evaluator(model, metrics=metrics)

def log_training_loss(engine, log_interval=400):
    epoch = engine.state.epoch
    itr = engine.state.iteration
    fmt = "Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
    msg = fmt.format(epoch, itr, len(train_loader), engine.state.output)
    model.itr = itr
    if itr % log_interval == 0:
        print(msg)
#         metrics = trainer.state.metrics
#         mse = metrics['loss']
#         avg_precision = metrics['ap']
#         accuracy = metrics['acc']
#         roc = metrics['roc']
#         writer.add_scalar("training/mse", mse, engine.state.epoch)
#         writer.add_scalar("training/ap", avg_precision, engine.state.epoch)
#         writer.add_scalar("training/accuracy", accuracy, engine.state.epoch)
#         writer.add_scalar("training/roc", roc, engine.state.epoch)

trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED, handler=log_training_loss)

def log_validation_results(engine):
    evaluator.run(val_loader)
    metrics = evaluator.state.metrics
    mse = metrics['loss']
    avg_precision = metrics['ap']
    accuracy = metrics['acc']
    roc = metrics['roc']
    print("Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} acc: {:.2f} ROC: {:.2f} "
          .format(engine.state.epoch, mse, avg_precision, accuracy, roc))
    writer.add_scalar("validation/mse", mse, engine.state.epoch)
    writer.add_scalar("validation/avg_precision", avg_precision, engine.state.epoch)
    writer.add_scalar("validation/accuracy", accuracy, engine.state.epoch)
    writer.add_scalar("validation/roc", roc, engine.state.epoch)

    
def log_test_results(engine):
    evaluator.run(test_loader)
    metrics = evaluator.state.metrics
    mse = metrics['loss']
    avg_precision = metrics['ap']
    accuracy = metrics['acc']
    roc = metrics['roc']

    print("TEST: Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} acc: {:.2f} ROC: {:.2f}"
          .format(engine.state.epoch, mse, avg_precision, accuracy, roc))


trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=log_validation_results)
trainer.add_event_handler(event_name=Events.COMPLETED, handler=log_test_results)


model

MF(
  (virus): Embedding(203, 20)
  (human): Embedding(2727, 20)
  (bias_virus): Embedding(203, 1)
  (bias_human): Embedding(2727, 1)
)

In [40]:
model.to(device)

MF(
  (virus): Embedding(203, 20)
  (human): Embedding(2727, 20)
  (bias_virus): Embedding(203, 1)
  (bias_human): Embedding(2727, 1)
)

In [41]:
trainer.run(train_loader, max_epochs=500)

Epoch[1] Iteration[400/11072] Loss: 21.43
Epoch[1] Iteration[800/11072] Loss: 34.17
Epoch[1] Iteration[1200/11072] Loss: 23.30
Epoch[1] Iteration[1600/11072] Loss: 22.02
Epoch[1] Iteration[2000/11072] Loss: 15.63
Epoch[1] Iteration[2400/11072] Loss: 11.93
Epoch[1] Iteration[2800/11072] Loss: 17.66
Epoch[1] Iteration[3200/11072] Loss: 20.70
Epoch[1] Iteration[3600/11072] Loss: 12.47
Epoch[1] Iteration[4000/11072] Loss: 9.72
Epoch[1] Iteration[4400/11072] Loss: 12.59
Epoch[1] Iteration[4800/11072] Loss: 17.64
Epoch[1] Iteration[5200/11072] Loss: 9.59
Epoch[1] Iteration[5600/11072] Loss: 13.19
Epoch[1] Iteration[6000/11072] Loss: 11.99
Epoch[1] Iteration[6400/11072] Loss: 12.49
Epoch[1] Iteration[6800/11072] Loss: 8.99
Epoch[1] Iteration[7200/11072] Loss: 11.48
Epoch[1] Iteration[7600/11072] Loss: 9.27
Epoch[1] Iteration[8000/11072] Loss: 7.68
Epoch[1] Iteration[8400/11072] Loss: 7.80
Epoch[1] Iteration[8800/11072] Loss: 7.22
Epoch[1] Iteration[9200/11072] Loss: 7.29
Epoch[1] Iteration[96

  recall = tps / tps[-1]
 Only one class present in y_true. ROC AUC score is not defined in that case.


Epoch[1] Validation MSE: 5.90 Avg Prec: 0.02 acc: 0.33 ROC: 0.53 
Epoch[2] Iteration[11200/11072] Loss: 6.50
Epoch[2] Iteration[11600/11072] Loss: 4.94
Epoch[2] Iteration[12000/11072] Loss: 5.15
Epoch[2] Iteration[12400/11072] Loss: 4.72
Epoch[2] Iteration[12800/11072] Loss: 4.65
Epoch[2] Iteration[13200/11072] Loss: 5.13
Epoch[2] Iteration[13600/11072] Loss: 4.35
Epoch[2] Iteration[14000/11072] Loss: 4.44
Epoch[2] Iteration[14400/11072] Loss: 4.19
Epoch[2] Iteration[14800/11072] Loss: 4.23
Epoch[2] Iteration[15200/11072] Loss: 4.01
Epoch[2] Iteration[15600/11072] Loss: 3.88
Epoch[2] Iteration[16000/11072] Loss: 3.95
Epoch[2] Iteration[16400/11072] Loss: 3.73
Epoch[2] Iteration[16800/11072] Loss: 3.79
Epoch[2] Iteration[17200/11072] Loss: 3.76
Epoch[2] Iteration[17600/11072] Loss: 3.63
Epoch[2] Iteration[18000/11072] Loss: 3.61
Epoch[2] Iteration[18400/11072] Loss: 3.58
Epoch[2] Iteration[18800/11072] Loss: 3.51
Epoch[2] Iteration[19200/11072] Loss: 3.52
Epoch[2] Iteration[19600/11072]

  recall = tps / tps[-1]
 Only one class present in y_true. ROC AUC score is not defined in that case.


Epoch[2] Validation MSE: 3.43 Avg Prec: 0.08 acc: 0.97 ROC: 0.78 
Epoch[3] Iteration[22400/11072] Loss: 3.40
Epoch[3] Iteration[22800/11072] Loss: 3.39
Epoch[3] Iteration[23200/11072] Loss: 3.46
Epoch[3] Iteration[23600/11072] Loss: 3.38
Epoch[3] Iteration[24000/11072] Loss: 3.37
Epoch[3] Iteration[24400/11072] Loss: 3.36
Epoch[3] Iteration[24800/11072] Loss: 3.35
Epoch[3] Iteration[25200/11072] Loss: 3.36
Epoch[3] Iteration[25600/11072] Loss: 3.36
Epoch[3] Iteration[26000/11072] Loss: 3.34
Epoch[3] Iteration[26400/11072] Loss: 3.31
Epoch[3] Iteration[26800/11072] Loss: 3.31
Epoch[3] Iteration[27200/11072] Loss: 3.29
Epoch[3] Iteration[27600/11072] Loss: 3.28
Epoch[3] Iteration[28000/11072] Loss: 3.27
Epoch[3] Iteration[28400/11072] Loss: 3.26
Epoch[3] Iteration[28800/11072] Loss: 3.30
Epoch[3] Iteration[29200/11072] Loss: 3.26
Epoch[3] Iteration[29600/11072] Loss: 3.24
Epoch[3] Iteration[30000/11072] Loss: 3.22
Epoch[3] Iteration[30400/11072] Loss: 3.25
Epoch[3] Iteration[30800/11072]

  recall = tps / tps[-1]
 Only one class present in y_true. ROC AUC score is not defined in that case.


Epoch[3] Validation MSE: 3.12 Avg Prec: 0.16 acc: 0.98 ROC: 0.82 
Epoch[4] Iteration[33600/11072] Loss: 3.16
Epoch[4] Iteration[34000/11072] Loss: 3.10
Epoch[4] Iteration[34400/11072] Loss: 3.09
Epoch[4] Iteration[34800/11072] Loss: 3.09


KeyboardInterrupt: 