### Notebook to configure model

In [1]:
import time

from models.modules.multihead import *
from utils.prepare_data import *

import pandas as pd


import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context(context="talk")

import torch
import torch.nn as nn
import tensorflow as tf
import torch.nn.functional as F
from torchvision import datasets
import torchvision.transforms as transforms
from torch.autograd import Variable

import math, copy, time

from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [72]:
# Hyperparameters
# All tunable constants for the notebook live in this cell.

# --- sequence / model dimensions ---
MAX_SEQ_LENGTH = 100    # number of positions encoded per read
EMBEDDING_SIZE = 10     # embedding width used by the n-gram key/value networks
HIDDEN_SIZE = 512       # not referenced in the visible cells -- TODO confirm use
ATTENTION_SIZE = 64     # not referenced in the visible cells -- TODO confirm use

# --- optimisation ---
lr = 0.001              # learning rate passed to the optimizer
BATCH_SIZE = 256
KEEP_PROB = 0.5         # presumably a dropout keep-probability -- TODO confirm
LAMBDA = 1e-4           # presumably a regularisation weight -- TODO confirm

# --- vocabulary / labels ---
VOCAB_SIZE = 5          # same count as the symbols in word_to_ix (N, A, C, T, G)
MAX_LABEL = 2           # number of target classes

# --- genome extent ---
GENOME_LENGTH = 20000
CONTEXT_SIZE = GENOME_LENGTH   # the n-gram context spans the whole genome


In [None]:
def attention(query, key, value, dropout=None):
    """Scaled dot-product attention.

    Args:
        query: tensor whose last dimension is the feature size ``d_k``.
        key: tensor that is matrix-multiplied with ``query`` after its last
            two dimensions are swapped.
        value: tensor that is combined with the attention weights.
        dropout: optional callable applied to the attention weights before
            they are multiplied with ``value``.

    Returns:
        A tuple ``(weighted_values, weights)``: the weights are the softmax
        (over the last dimension) of ``query @ key^T / sqrt(d_k)``, after
        ``dropout`` if one was given; ``weighted_values`` is
        ``weights @ value``.
    """
    scale = math.sqrt(query.size(-1))
    weights = F.softmax(torch.matmul(query, key.transpose(-2, -1)) / scale,
                        dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    weighted_values = torch.matmul(weights, value)
    return weighted_values, weights


In [None]:
class NGramDenseEmbedding(nn.Module):
    """Embed a context of token ids and score every vocabulary entry.

    The embeddings of all input tokens are flattened into a single row,
    passed through a 128-unit ReLU layer and projected to ``vocab_size``
    log-probabilities.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output scores.
        embedding_dim: width of each token embedding.
        context_size: number of tokens expected per call; the first linear
            layer takes ``context_size * embedding_dim`` input features.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        flat_features = context_size * embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(flat_features, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape ``(1, vocab_size)``.

        All embeddings produced for ``inputs`` are flattened into one row,
        so the whole input is treated as a single example.
        """
        flat = self.embeddings(inputs).view(1, -1)
        hidden = F.relu(self.linear1(flat))
        scores = self.linear2(hidden)
        return F.log_softmax(scores, dim=1)


In [None]:
class AttentionLR(nn.Module):
    """Linear classifier applied to the output of one attention step.

    Keys and values come from two ``NGramDenseEmbedding`` networks built
    from the module-level ``VOCAB_SIZE``, ``EMBEDDING_SIZE`` and
    ``CONTEXT_SIZE``; the query is the index-encoded input sequence.

    Args:
        input_size: number of input features of the linear classifier.
        num_classes: number of output classes.
        dropout: dropout probability applied to the attention weights.

    Attributes set by ``forward``:
        KQ_attn: the attention weights of the most recent call.
        KQV_attn: the attention-weighted values of the most recent call.
    """
    
    def __init__(self, input_size, num_classes, dropout=0.1):
        super(AttentionLR, self).__init__()
        
        self.dropout = nn.Dropout(p=dropout)
        self.KQ_attn = None
        self.KQV_attn = None
        # NOTE(review): `self.linears` is never read by forward(); it is kept
        # so the module's registered parameters stay the same. `clones` is not
        # defined in this notebook -- presumably it comes from one of the
        # star imports; confirm.
        self.linears = clones(nn.Linear(input_size, num_classes),1)
        
        self.K = NGramDenseEmbedding(VOCAB_SIZE, EMBEDDING_SIZE, CONTEXT_SIZE)
        self.V = NGramDenseEmbedding(VOCAB_SIZE, EMBEDDING_SIZE, CONTEXT_SIZE)
        
        self.linear = nn.Linear(input_size, num_classes)
        
    def forward(self, query_seq):
        """Classify one sequence of symbols.

        Args:
            query_seq: iterable of symbols, each a key of the module-level
                ``word_to_ix`` mapping.

        Returns:
            Log-softmax (over dim 1) of the linear layer applied to the
            attention-weighted values.
        """
        Q_lookup = torch.tensor([word_to_ix[w] for w in query_seq], dtype=torch.long)
        
        K_lookup = self.K(Q_lookup)
        V_lookup = self.V(Q_lookup)
        
        # NOTE(review): Q_lookup is an integer index tensor while K_lookup and
        # V_lookup are float tensors of shape (1, VOCAB_SIZE); the matmul
        # inside attention() looks like it needs a float query of compatible
        # shape -- confirm the intended query representation.
        #
        # attention() returns (weighted_values, weights), in that order, so
        # the weighted values belong in KQV_attn and the weights in KQ_attn.
        self.KQV_attn, self.KQ_attn = attention(Q_lookup, K_lookup, V_lookup, dropout=self.dropout)
                
        return F.log_softmax(self.linear(self.KQV_attn), dim=1)
    

In [2]:
class SequenceAttentionClassifier(nn.Module):
    """Attention-weighted linear classifier over position-specific embeddings.

    Two embedding tables with ``vocab_size * genome_length`` rows hold one
    key (width ``embedding_size``) and one value (width ``query_size``) per
    (position, symbol) pair. Keys are projected by ``Q``, soft-maxed, and
    multiplied with the values; ``W`` turns the result into class logits.

    Args:
        genome_length: number of genome positions covered by the tables.
        read_length: stored on the instance; not used by ``forward``.
        vocab_size: number of symbols per position.
        query_size: width of the value embeddings and of the ``Q`` output.
        embedding_size: width of the key embeddings.
        num_classes: number of output logits.
    """

    def __init__(self, genome_length, read_length, vocab_size=5, query_size=64, embedding_size=128, num_classes=2):
        super().__init__()

        # Keep the configuration on the instance.
        self.genome_length = genome_length
        self.read_length = read_length
        self.vocab_size = vocab_size
        self.query_size = query_size
        self.embedding_size = embedding_size
        self.num_classes = num_classes

        # One table row per (genome position, symbol) combination.
        # Sub-modules are created in the order K, V, W, Q.
        table_rows = vocab_size * genome_length
        self.K = nn.Embedding(table_rows, embedding_size)
        self.V = nn.Embedding(table_rows, query_size)
        self.W = nn.Linear(query_size, num_classes)
        self.Q = nn.Linear(embedding_size, query_size)

    def forward(self, read):
        """Return the classifier logits for ``read``.

        ``read`` should hold flattened indices: a one-hot entry X_ij = 1 is
        expected to be mapped to ``i * vocab_size + j``.

        NOTE(review): the softmax output has ``query_size`` as its last
        dimension and is matrix-multiplied with the value lookup, whose
        second-to-last dimension is the number of indices in ``read``; the
        product is therefore only defined when those two sizes are equal --
        confirm this is intended.
        """
        keys = self.K(read)      # key embedding for every index in the read
        values = self.V(read)    # value embedding for every index in the read

        # Attention weights: projected keys, scaled, soft-maxed over the
        # last dimension.
        scale = math.sqrt(self.embedding_size)
        weights = F.softmax(self.Q(keys) / scale, dim=-1)

        # Covariates for the logistic regression. Only a linear transform
        # for now; non-linearities are planned for later.
        covariates = torch.matmul(weights, values)

        # Logits for the classifier.
        return self.W(covariates)
    

In [92]:
# Load the BRAF train and test CSVs through the project helper `load_data`.
# sample_ratio=1 is passed for both files (presumably "keep every row" --
# TODO confirm against the helper).
x_train, y_train = load_data(
    "../data/train-BRAF.csv",
    sample_ratio=1,
)
x_test, y_test = load_data(
    "../data/test-BRAF.csv",
    sample_ratio=1,
)


(2440198, 2)
(155244, 2)


In [58]:
word_to_ix = {'N':0, 'A':1, 'C':2, 'T':3, 'G':4}


def encode_reads(reads, max_len=MAX_SEQ_LENGTH, mapping=word_to_ix):
    """Encode every read as one row of numeric symbol codes.

    For each read, the characters at indices 1..max_len are looked up in
    ``mapping``; the character at index 0 is skipped.

    Args:
        reads: positionally indexable collection of reads, each iterable
            into single-character symbols.
        max_len: number of symbols encoded per read.
        mapping: dict from symbol to numeric code.

    Returns:
        A ``(len(reads), max_len)`` NumPy array with NumPy's default float
        dtype.

    Raises:
        ValueError: if a read has fewer than ``max_len + 1`` characters.
        KeyError: if a read contains a symbol that is not in ``mapping``.
    """
    # NOTE(review): the codes are stored as floats; torch.nn.Embedding needs
    # integer indices, so a cast will be required before these rows are used
    # as embedding lookups -- confirm where that should happen.
    encoded = np.empty((len(reads), max_len))
    for row in range(len(reads)):
        symbols = list(reads[row])[1:max_len + 1]
        if len(symbols) < max_len:
            raise ValueError(
                "read {} has only {} symbols after the first character; "
                "{} are required".format(row, len(symbols), max_len))
        # Fill the whole row at once instead of one element at a time.
        encoded[row] = [mapping[symbol] for symbol in symbols]
    return encoded


x_train_l = encode_reads(x_train)
x_test_l = encode_reads(x_test)

print(x_train_l.shape)
print(x_test_l.shape)
  

(2440198, 100)
(155244, 100)


In [86]:
# Sanity check of the encoded matrices: both shapes (one per line),
# followed by the first encoded training read.
print(x_train_l.shape, x_test_l.shape, sep="\n")

print(x_train_l[0])

(2440198, 100)
(155244, 100)
[3. 3. 1. 4. 3. 3. 1. 1. 1. 1. 1. 3. 4. 1. 4. 4. 3. 3. 4. 4. 1. 1. 1. 4.
 3. 1. 1. 1. 3. 2. 3. 4. 1. 2. 2. 3. 1. 4. 1. 3. 4. 1. 3. 3. 4. 4. 3. 3.
 3. 4. 1. 2. 1. 1. 3. 4. 1. 4. 4. 1. 3. 3. 1. 1. 2. 3. 3. 3. 1. 2. 3. 1.
 1. 3. 3. 1. 4. 1. 3. 3. 1. 3. 4. 3. 4. 4. 2. 3. 4. 1. 3. 4. 4. 4. 3. 4.
 3. 3. 3. 3.]


In [93]:
# Work on a small prefix of the data. Each split has a single size constant
# shared by its features and labels, so the two cannot be sliced to
# different lengths by accident.
TRAIN_SUBSET_SIZE = 10000
TEST_SUBSET_SIZE = 1000

# From here on x_train / x_test refer to the encoded matrices
# (x_train_l / x_test_l), no longer to the raw loaded reads.
x_train = x_train_l[:TRAIN_SUBSET_SIZE, :]
x_test = x_test_l[:TEST_SUBSET_SIZE, :]

y_train = y_train[:TRAIN_SUBSET_SIZE, :]
y_test = y_test[:TEST_SUBSET_SIZE, :]

print(x_train.shape)
print(x_test.shape)

(10000, 100)
(1000, 100)


In [88]:
# # data preprocessing
# x_train, x_test, vocab, VOCAB_SIZE = \
#     data_preprocessing(x_train, x_test, MAX_SEQ_LENGTH)
# print(vocab_size)


In [94]:
# Carve a dev set out of the test set using the project helper
# `split_dataset`, called with 0.1 as its third argument.
# x_test / y_test are overwritten by the call's results, so running this
# cell again splits the already-reduced test set a second time.
(x_test, x_dev,
 y_test, y_dev,
 dev_size, test_size) = split_dataset(x_test, y_test, 0.1)
print("Validation size: ", dev_size)


1000
100
Validation size:  100


Conv ret: (?, 100, 128)
(?, 100, 128)


In [None]:

# The model must exist before the optimizer is built, because the optimizer
# is constructed from model.parameters().
# AttentionLR takes the class count through its `num_classes` parameter.
model = AttentionLR(MAX_SEQ_LENGTH, num_classes=MAX_LABEL)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Training the model

num_epochs = 5

for epoch in range(num_epochs):

    # NOTE(review): `train_loader` is not defined in the visible cells of
    # this notebook -- presumably it is provided elsewhere; confirm before
    # running. Each item is expected to be a (sequences, labels) pair.
    for i, (sequences, labels) in enumerate(train_loader):

        # Forward -> Backprop -> Optimize
        optimizer.zero_grad()        # clear gradients left from the previous step
        outputs = model(sequences)   # predictions for the current training batch

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            # .item() extracts the Python number from the scalar loss tensor.
            print("Epoch {}, loss :{}".format(epoch + 1, loss.item()))