# Global settings

In [1]:
device = 'cuda'
bs = 64
test_pct = .10
val_pct = .60
epochs = 2

# Data proc

In [2]:
import pandas as pd
import numpy as np
import torch
import itertools
from make_matrix import *
from torch.utils.data import TensorDataset, DataLoader, Sampler
import torch.nn as nn
import torch.nn.functional as F

In [3]:
fludf = pd.read_csv('flu_train.csv')

In [4]:
fludf.head()

Unnamed: 0.1,Unnamed: 0,virusUprot,humanUprot,edge
0,0,P03433,P49736,1.0
1,1,P03433,P15311,0.0
2,2,P03433,P11142,0.0
3,3,P03433,Q86U42,0.0
4,4,P03433,P33992,1.0


## create features

In [5]:
import numpy as np
import pickle
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet
import json
import re

In [6]:
D = pickle.load(open('D.pkl', 'rb'))

In [7]:
classes = dict.fromkeys(['A', 'G','V'], 1)
classes.update(dict.fromkeys(['I', 'L', 'F', 'P'], 2))
classes.update(dict.fromkeys(['Y', 'M', 'T', 'S'], 3))
classes.update(dict.fromkeys(['H', 'N', 'Q', 'W'], 4))
classes.update(dict.fromkeys(['R', 'K'], 5))
classes.update(dict.fromkeys(['D', 'E'], 6))
classes.update(dict.fromkeys(['C', 'U'], 7))

In [8]:
def protToClass(p):
    if p == 'X':
        return '-1'
    else:
        return str(classes[p])

def seqToClass(seq):
    return ''.join(list(map(protToClass, seq)))

def normalize(Fi):
    return (Fi - min(Fi)) / max(Fi)

def getFi(D, seq):
    grptoi = {p:i for i,p in enumerate(D)} # group to index mappings
    Fi = np.zeros(len(grptoi.values()))
    
    classSeq = seqToClass(seq)
#     print(classSeq)
    
    for p in D:
        Fi[grptoi[p]] += classSeq.count(''.join(p))

    return normalize(Fi)

In [9]:
def proteinize(records):
    for r in records:
        r.seq.Alphabet = ProteinAlphabet

In [10]:
virus_records = list(SeqIO.parse('virus_prots.fasta', 'fasta'))
human_records = list(SeqIO.parse('human_prots.fasta', 'fasta'))

In [11]:
proteinize(virus_records)
proteinize(human_records)

In [12]:
print(virus_records[0].name.split('|')[1]) # to get names of viruses

Q9WMX2


In [13]:
def getProt(fasta):
    return fasta.name.split('|')[1]

### Make a dictionary with k:v gene:Fi

In [14]:
def getFeatMap(records):
    sidefeats_map = {}
    
    for fasta in records:
        prot = getProt(fasta)
        sidefeats_map[prot] = getFi(D, fasta.seq)
        
    return sidefeats_map

In [15]:
v_featmap = getFeatMap(virus_records)

In [16]:
h_featmap = getFeatMap(human_records)

### Numericalize proteins

In [17]:
def numericalize(df):
    vs = df['virusUprot'].unique()
    hs = df['humanUprot'].unique()
    vs_map = {v:i for i,v in enumerate(vs)}
    hs_map = {h:i for i,h in enumerate(hs)}
    
    return vs_map, hs_map

In [18]:
vs_map, hs_map = numericalize(fludf)

In [19]:
def createFeatMatrix(prots, ptoi, featmap):
    feats = np.zeros((len(prots), 2793))
    
    for p in prots:
#         print(p)
        feats[ptoi[p], :] =  featmap[p]
        
    return feats

In [20]:
vfeats = createFeatMatrix(fludf['virusUprot'].unique(), vs_map, v_featmap)

In [21]:
hfeats = createFeatMatrix(fludf['humanUprot'].unique(), hs_map, h_featmap)

### Make a dataloaders

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
def arr_to_torch(arr, dtype):
    t =  torch.from_numpy(np.array(arr)).type(dtype).to(device)
    return t

In [24]:
def getLoaders(df):
    X = list(zip(df.virusUprot.values, df.humanUprot.values))
    y = df['edge'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pct, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_pct, random_state=1)
    
    train_dset = TensorDataset(arr_to_torch(X_train, torch.long), arr_to_torch(y_train, torch.float32))
    train_loader = DataLoader(train_dset, batch_size=bs, shuffle=True)
    
    val_dset = TensorDataset(arr_to_torch(X_val, torch.long), arr_to_torch(y_val, torch.float32))
    val_loader = DataLoader(val_dset, batch_size=bs, shuffle=True)
    
    test_dset = TensorDataset(arr_to_torch(X_test, torch.long), arr_to_torch(y_test, torch.float32))
    test_loader = DataLoader(test_dset, batch_size=bs, shuffle=True)
    
    return train_loader, val_loader, test_loader

In [25]:
fludf.virusUprot = fludf.virusUprot.apply(lambda x : vs_map[x])
fludf.humanUprot = fludf.humanUprot.apply(lambda x : hs_map[x])

In [26]:
flutrain_loader, fluval_loader, flutest_loader = getLoaders(fludf)

# Model

In [27]:
def l2_regularize(array):
    loss = torch.sum(array ** 2.0)
    return loss

In [28]:
class BMF(nn.Module):
    def __init__(self, n_virus, n_human, vfeats, hfeats, c_vector=1.0, c_bias=1.0, writer=None):
        super(BMF, self).__init__()
        self.writer = writer
        self.k = vfeats.shape[1]
        self.n_virus = n_virus
        self.n_human = n_human
        self.c_bias = c_bias
        self.c_vector = c_vector
        
        self.virus = nn.Embedding(n_virus, 2793)
        self.human = nn.Embedding(n_human, 2793)
#         self.virus = nn.Parameter(n_virus, 2793)
#         self.human = nn.Parameter(n_human, 2793)
        
        # We've added new terms here:
        self.bias_virus = nn.Embedding(n_virus, 1)
        self.bias_human = nn.Embedding(n_human, 1)
        self.bias = nn.Parameter(torch.ones(1))
        
        self.vfeats = nn.Embedding(n_virus, 2793)
        self.vfeats.weight = nn.Parameter(torch.from_numpy(vfeats).to(device))
        self.vfeats.weight.requires_grad = False
        
        self.hfeats = nn.Embedding(n_human, 2793)
        self.hfeats.weight = nn.Parameter(torch.from_numpy(hfeats).to(device))
        self.hfeats.weight.requires_grad = False
    
    def forward(self, train_x):
        virus_id = train_x[:, 0]
        human_id = train_x[:, 1]
        
        virus_feats = self.virus(virus_id)
        human_feats = self.human(human_id)
        vector_virus = self.virus(virus_id)
        vector_human = self.human(human_id)
        
#         print('feats: ', virus_feats.shape, human_feats.shape)
#         print('vecs:', vector_virus.shape, vector_human.shape)
        
        # Pull out biases
        bias_virus = self.bias_virus(virus_id).squeeze()
        bias_human = self.bias_human(human_id).squeeze()
        biases = (self.bias + bias_virus + bias_human)
        
        xU = torch.sum(virus_feats * vector_virus, dim=1)
        xUV = torch.sum(xU * torch.t(vector_human), dim=1)
        xUVy = torch.sum(xUV * human_feats, dim=1)

        prediction = xUVy + biases
        return prediction
    
    def loss(self, prediction, target):
#         loss_mse = F.binary_cross_entropy_with_logits(prediction, target.squeeze())
        loss_mse = F.mse_loss(prediction, target.squeeze())
    
        # Add new regularization to the biases
        prior_bias_virus =  l2_regularize(self.bias_virus.weight) * self.c_bias
        prior_bias_human = l2_regularize(self.bias_human.weight) * self.c_bias
        
        prior_virus =  l2_regularize(self.virus.weight.data) * self.c_vector
        prior_human = l2_regularize(self.human.weight.data) * self.c_vector
        total = loss_mse + prior_virus + prior_human + prior_bias_virus + prior_bias_human
        return total

# Trainer

In [29]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss, Accuracy, Precision, Recall
from tensorboardX import SummaryWriter
from ignite.metrics import MeanSquaredError, Loss
from ignite.contrib.metrics import AveragePrecision, ROC_AUC

from datetime import datetime

In [30]:

class Trainer:
    
    def __init__(self, model, crit, optim, writer, train_loader, val_loader, test_loader, modelname):
        self.model = model
        self.crit = crit
        self.optim = optim
        self.writer = writer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.best_model = None
        self.best_AP = -1.0
        self.modelname = modelname
        
        self.trainer = create_supervised_trainer(model, optim, crit)
        self.metrics = {'loss': Loss(crit), 'ap': AveragePrecision(), "roc": ROC_AUC()}
        self.evaluator = create_supervised_evaluator(model, metrics=self.metrics)
        
        ## Add events
        self.trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED, handler=self.log_training_loss)
        self.trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=self.log_validation_results)
        self.trainer.add_event_handler(event_name=Events.COMPLETED, handler=self.log_test_results)
        
        print(model)
        
    def log_training_loss(self, engine, log_interval=400):
        epoch = engine.state.epoch
        itr = engine.state.iteration
        fmt = "TRAIN: Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
        msg = fmt.format(epoch, itr, len(self.train_loader), engine.state.output)
        self.model.itr = itr
        if itr % log_interval == 0:
            print(msg)
#             self.evaluator.run(self.train_loader)
            
#             metrics = self.evaluator.state.metrics
#             mse = metrics['loss']
#             ap = metrics['ap']
#             roc = metrics['roc']
            
#             print("Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} ROC: {:.2f} "
#                   .format(engine.state.epoch, mse, ap, roc))
            
#             self.writer.add_scalar("training/mse", mse, engine.state.epoch)
#             self.writer.add_scalar("training/ap", ap, engine.state.epoch)
#             self.writer.add_scalar("training/roc", roc, engine.state.epoch)
            
    def log_validation_results(self, engine):
        self.evaluator.run(self.val_loader)
        
        metrics = self.evaluator.state.metrics
        mse = metrics['loss']
        ap = metrics['ap']
        roc = metrics['roc']
        
        if ap > self.best_AP:
            self.best_AP = ap
            self.best_model = model.state_dict()
        
        print("VALIDATION Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} ROC: {:.2f} "
              .format(engine.state.epoch, mse, ap, roc))
        self.writer.add_scalar("validation/mse", mse, engine.state.epoch)
        self.writer.add_scalar("validation/ap", ap, engine.state.epoch)
        self.writer.add_scalar("validation/roc", roc, engine.state.epoch)

    def log_test_results(self, engine):
        self.evaluator.run(self.test_loader)
        
        metrics = self.evaluator.state.metrics
        mse = metrics['loss']
        ap = metrics['ap']
        roc = metrics['roc']

        print("TEST: Epoch[{}] Validation MSE: {:.2f} Avg Prec: {:.2f} ROC: {:.2f}"
              .format(engine.state.epoch, mse, ap, roc))
        
        print("BEST AP: ", self.best_AP)
        torch.save(self.best_model, './{}.pt'.format(self.modelname))
        
    def run(self, epochs):
        self.trainer.run(self.train_loader, max_epochs=epochs)

## Hyperparametrs

In [31]:
lr = 1e-3
k =2793
# regularizing bias
c_bias = 1e-4
c_vector = 1e-4
batchsize = bs

## Flu

In [32]:
log_dir = 'runs/FLU_simple_mf_02_bias_' + str(datetime.now()).replace(' ', '_')
print(log_dir)

runs/FLU_simple_mf_02_bias_2019-05-20_23:26:02.195029


In [33]:
writer = SummaryWriter(log_dir=log_dir)
model = BMF(len(vs_map), len(hs_map), vfeats, hfeats, writer=writer, c_bias=c_bias, c_vector=c_vector)
crit = model.loss

model.cuda()
optim = torch.optim.Adam(model.parameters())

In [34]:
flutrainer = Trainer(model, crit, optim, writer, flutrain_loader, fluval_loader, flutrain_loader, 'flumodel')

BMF(
  (virus): Embedding(203, 2793)
  (human): Embedding(2727, 2793)
  (bias_virus): Embedding(203, 1)
  (bias_human): Embedding(2727, 1)
  (vfeats): Embedding(203, 2793)
  (hfeats): Embedding(2727, 2793)
)


In [None]:
flutrainer.run(2)

TRAIN: Epoch[1] Iteration[400/3114] Loss: 36458681335808.00
