# Global settings

In [1]:
device = 'cuda'
bs = 32

# Data proc

In [2]:
import pandas as pd
import numpy as np
import torch
import itertools
from make_matrix import *
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [3]:
df = pd.read_csv('data.txt', delim_whitespace=True, header=None)

In [4]:
df.head()

Unnamed: 0,0,1,2,3
0,STAT1,ATF3,71,3
1,STAT1,NFYB,71,85
2,STAT1,SP1,71,70
3,ZFP238,EGR1,15,8
4,ZFP238,IRF9,15,83


In [5]:
vs = df[2].unique()
hs = df[3].unique()

In [6]:
len(df)

666

In [7]:
observed = []
for _, row in df.iterrows():
    observed.append((row[2], row[3]))

In [8]:
len(observed)

666

In [9]:
net_df = {'vs':[], 'hs':[], 'edge':[]}

In [10]:
for v in vs:
    for h in hs:
        net_df['vs'].append(v)
        net_df['hs'].append(h)
        if (v,h) in observed:
            net_df['edge'].append(1.0)
        else:
            net_df['edge'].append(0.0)

In [11]:
netdf = pd.DataFrame(net_df)

In [12]:
len(netdf.loc[netdf['edge'] == 1])

666

In [13]:
netdf = netdf.sample(frac=1).reset_index()[['vs', 'hs', 'edge']]

make sure indices are continuous

In [14]:
netdf.head()

Unnamed: 0,vs,hs,edge
0,85,20,1.0
1,94,35,0.0
2,53,94,1.0
3,93,0,0.0
4,44,81,1.0


In [15]:
vs.sort()
hs.sort()
vs_map = {v:i for i,v in enumerate(vs)}
hs_map = {h:i for i, h in enumerate(hs)}

In [16]:
netdf.vs = netdf.vs.apply(lambda x : vs_map[x])
netdf.hs = netdf.hs.apply(lambda x : hs_map[x])
netdf.head()

Unnamed: 0,vs,hs,edge
0,34,11,1.0
1,39,16,0.0
2,24,39,1.0
3,38,0,0.0
4,21,32,1.0


In [32]:
len(netdf.loc[netdf['edge'] == 1]) / len(netdf)

0.41625

### Make a dataloader

In [17]:
X = list(zip(netdf.vs.values, netdf.hs.values))

In [18]:
y = netdf['edge'].values

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

In [21]:
def arr_to_torch(arr, dtype):
    t =  torch.from_numpy(np.array(arr)).type(dtype).to(device)
    return t

In [22]:
train_dset = TensorDataset(arr_to_torch(X_train, torch.long), arr_to_torch(y_train, torch.float32))
train_loader = DataLoader(train_dset, batch_size=bs)

In [23]:
val_dset = TensorDataset(arr_to_torch(X_val, torch.long), arr_to_torch(y_val, torch.float32))
val_loader = DataLoader(val_dset, batch_size=bs)

In [24]:
test_dset = TensorDataset(arr_to_torch(X_test, torch.long), arr_to_torch(y_test, torch.float32))
test_loader = DataLoader(test_dset, batch_size=bs)

# Model

In [25]:
def l2_regularize(array):
    loss = torch.sum(array ** 2.0)
    return loss

In [26]:
class MF(nn.Module):
    def __init__(self, n_virus, n_human, k=18, c_vector=1.0, c_bias=1.0, writer=None):
        super(MF, self).__init__()
        self.writer = writer
        self.k = k
        self.n_virus = n_virus
        self.n_human = n_human
        self.c_bias = c_bias
        self.c_vector = c_vector
        
        self.virus = nn.Embedding(n_virus, k)
        self.human = nn.Embedding(n_human, k)
        
        # We've added new terms here:
        self.bias_virus = nn.Embedding(n_virus, 1)
        self.bias_human = nn.Embedding(n_human, 1)
        self.bias = nn.Parameter(torch.ones(1))
        
#         self.affine = nn.Linear(in_features=self.k, out_features=1)
#         self.logistic = nn.Sigmoid()
    
    def forward(self, train_x):
        virus_id = train_x[:, 0]
        human_id = train_x[:, 1]
        vector_virus = self.virus(virus_id)
        vector_human = self.human(human_id)
        
        # Pull out biases
        bias_virus = self.bias_virus(virus_id).squeeze()
        bias_human = self.bias_human(human_id).squeeze()
        biases = (self.bias + bias_virus + bias_human)
        
        ui_interaction = torch.sum(vector_virus * vector_human, dim=1)
        
        prediction = ui_interaction + biases
        return prediction
    
    def loss(self, prediction, target):
#         loss_mse = F.binary_cross_entropy_with_logits(prediction, target.squeeze())
        loss_mse = F.mse_loss(prediction, target.squeeze())
    
        # Add new regularization to the biases
        prior_bias_virus =  l2_regularize(self.bias_virus.weight) * self.c_bias
        prior_bias_human = l2_regularize(self.bias_human.weight) * self.c_bias
        
        prior_virus =  l2_regularize(self.virus.weight.data) * self.c_vector
        prior_human = l2_regularize(self.human.weight.data) * self.c_vector
        total = loss_mse + prior_virus + prior_human + prior_bias_virus + prior_bias_human
        return total

# Trainer

In [27]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss, Accuracy, Precision, Recall
from tensorboardX import SummaryWriter
from ignite.metrics import MeanSquaredError, Loss
from ignite.contrib.metrics import AveragePrecision, ROC_AUC

from datetime import datetime

In [28]:
# Hyperparameters
lr = 1e-3
k =20
# New parameter for regularizing bias
c_bias = 1e-4
c_vector = 1e-4
batchsize = bs
log_dir = 'runs/simple_mf_02_bias_' + str(datetime.now()).replace(' ', '_')
print(log_dir)

runs/simple_mf_02_bias_2019-05-16_21:16:41.891986


In [29]:
writer = SummaryWriter(log_dir=log_dir)
# crit = model.loss
crit = nn.MSELoss()

model = MF(len(vs_map), len(hs_map), writer=writer, k=k, c_bias=c_bias, c_vector=c_vector)
# model.cuda()
optimizer = torch.optim.Adam(model.parameters())
trainer = create_supervised_trainer(model, optimizer, model.loss)
metrics = {'loss': Loss(crit), 'ap': AveragePrecision(), "acc": Accuracy(), "roc": ROC_AUC()}
evaluator = create_supervised_evaluator(model, metrics=metrics)

def log_training_loss(engine, log_interval=400):
    epoch = engine.state.epoch
    itr = engine.state.iteration
    fmt = "Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
    msg = fmt.format(epoch, itr, len(train_loader), engine.state.output)
    model.itr = itr
    if itr % log_interval == 0:
        print(msg)
        metrics = evaluator.state.metrics
        mse = metrics['loss']
        avg_precision = metrics['ap']
        accuracy = metrics['acc']
        roc = metrics['roc']
        writer.add_scalar("training/mse", mse, engine.state.epoch)
        writer.add_scalar("training/ap", avg_precision, engine.state.epoch)
        writer.add_scalar("training/accuracy", accuracy, engine.state.epoch)
        writer.add_scalar("training/roc", roc, engine.state.epoch)

trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED, handler=log_training_loss)

def log_validation_results(engine):
    evaluator.run(val_loader)
    metrics = evaluator.state.metrics
    mse = metrics['loss']
    avg_precision = metrics['ap']
    accuracy = metrics['acc']
    roc = metrics['roc']
    print("Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} acc: {:.2f} ROC: {:.2f} "
          .format(engine.state.epoch, mse, avg_precision, accuracy, roc))
    writer.add_scalar("validation/mse", mse, engine.state.epoch)
    writer.add_scalar("validation/avg_precision", avg_precision, engine.state.epoch)
    writer.add_scalar("validation/accuracy", accuracy, engine.state.epoch)
    writer.add_scalar("validation/roc", roc, engine.state.epoch)

    
def log_test_results(engine):
    evaluator.run(test_loader)
    metrics = evaluator.state.metrics
    mse = metrics['loss']
    avg_precision = metrics['ap']
    accuracy = metrics['acc']
    roc = metrics['roc']

    print("TEST: Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} acc: {:.2f} ROC: {:.2f}"
          .format(engine.state.epoch, mse, avg_precision, accuracy, roc))


trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=log_validation_results)
trainer.add_event_handler(event_name=Events.COMPLETED, handler=log_test_results)


model

MF(
  (virus): Embedding(40, 20)
  (human): Embedding(40, 20)
  (bias_virus): Embedding(40, 1)
  (bias_human): Embedding(40, 1)
)

In [30]:
model.to(device)

MF(
  (virus): Embedding(40, 20)
  (human): Embedding(40, 20)
  (bias_virus): Embedding(40, 1)
  (bias_human): Embedding(40, 1)
)

In [31]:
trainer.run(train_loader, max_epochs=500)

Epoch[1] Validation MSE: 24.27 Avg Prec: 0.46 acc: 0.08 ROC: 0.51 
Epoch[2] Validation MSE: 23.71 Avg Prec: 0.46 acc: 0.08 ROC: 0.51 
Epoch[3] Validation MSE: 23.17 Avg Prec: 0.46 acc: 0.08 ROC: 0.51 
Epoch[4] Validation MSE: 22.65 Avg Prec: 0.46 acc: 0.07 ROC: 0.51 
Epoch[5] Validation MSE: 22.15 Avg Prec: 0.46 acc: 0.08 ROC: 0.51 
Epoch[6] Validation MSE: 21.68 Avg Prec: 0.46 acc: 0.08 ROC: 0.51 
Epoch[7] Validation MSE: 21.22 Avg Prec: 0.46 acc: 0.07 ROC: 0.51 
Epoch[8] Validation MSE: 20.78 Avg Prec: 0.46 acc: 0.08 ROC: 0.51 
Epoch[9] Validation MSE: 20.36 Avg Prec: 0.46 acc: 0.09 ROC: 0.51 
Epoch[10] Iteration[400/41] Loss: 13.10
Epoch[10] Validation MSE: 19.95 Avg Prec: 0.46 acc: 0.10 ROC: 0.51 
Epoch[11] Validation MSE: 19.56 Avg Prec: 0.46 acc: 0.10 ROC: 0.51 
Epoch[12] Validation MSE: 19.18 Avg Prec: 0.46 acc: 0.11 ROC: 0.51 
Epoch[13] Validation MSE: 18.82 Avg Prec: 0.46 acc: 0.10 ROC: 0.51 
Epoch[14] Validation MSE: 18.47 Avg Prec: 0.46 acc: 0.10 ROC: 0.51 
Epoch[15] Validat

<ignite.engine.engine.State at 0x7f5b162f9e10>