In [1]:
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.autograd import Variable

## Load Data

In [2]:
# Load sequences, labels, protein lengths for train/valid/test
train = torch.load('data/yst_train.pt')
valid = torch.load('data/yst_valid.pt')
test = torch.load('data/yst_test.pt')

train_lengths = torch.from_numpy(np.load('data/yst_train_lengths.npy'))
valid_lengths = torch.from_numpy(np.load('data/yst_valid_lengths.npy'))
test_lengths = torch.from_numpy(np.load('data/yst_test_lengths.npy'))

y_train = torch.from_numpy(np.load('data/yst_train_labels.npy'))
y_valid = torch.from_numpy(np.load('data/yst_valid_labels.npy'))
y_test = torch.from_numpy(np.load('data/yst_test_labels.npy'))

yst_go_terms = np.load('data/yst_go_terms.npy')

## Train Model

In [3]:
class FastText(nn.Module):
    """
    FastText model
    """
       
    def __init__(self, vocab_size, emb_dim):
       
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(FastText, self).__init__()
        # TODO: replace with your code
        self.embed = nn.Embedding(vocab_size+1, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,num_labels)
        
    
    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        # TODO: replace with your code
        out = torch.sum(self.embed(data.long()),dim=1)
        length = length.float().view((-1,1))
        out = self.linear(out/length)
        
        return nn.functional.sigmoid(out)
    
def train_test(tr_seq,tr_len,tr_lab,tst_seq,tst_len,tst_lab):
    criterion = nn.MultiLabelSoftMarginLoss()  
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
    train_sequences = Variable(tr_seq)
    train_lens = Variable(tr_len)
    train_labels = Variable(tr_lab)
    valid_sequences = Variable(tst_seq)
    valid_lens = Variable(tst_len)
    valid_label = tst_lab.numpy()
    
    for j in range(data_size//batch_size):
        batch = j*batch_size
        sequence = train_sequences[batch:batch+batch_size]
        length = train_lens[batch:batch+batch_size]
        optimizer.zero_grad()
        outputs = model(sequence, length)
        loss = criterion(outputs.float(),train_labels[batch:batch+batch_size].view((-1,1)).float())
        loss.backward()
        optimizer.step()

    model.eval()
    val_outputs = model(valid_sequences,valid_lens)
    model.train()
    
    predicted = np.round(val_outputs.data.numpy())
    total_predictions = valid_label.size
    accuracy = np.sum(predicted==valid_label)/total_predictions
    
    print("Accuracy: ",accuracy)

In [4]:
# (some of the) hyper parameters
learning_rate = 0.001
vocab_size = 26 # number words in the vocabulary base
emb_dim = 50 # dimension for n-gram embedding
num_epochs = 1 # number epoch to train
batch_size = 26
data_size = train.shape[0]
num_labels = len(yst_go_terms)

In [5]:
model = FastText(vocab_size, emb_dim)
criterion = nn.MultiLabelSoftMarginLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

In [6]:
train_test(train,train_lengths,y_train,valid,valid_lengths,y_valid)

  "Please ensure they have the same size.".format(target.size(), input.size()))


Accuracy:  0.864965252816
