# Convert Sequences to Tensors 

In [2]:
import numpy as np
import pandas as pd
import scipy.io as sio
from Bio import SeqIO
import collections


import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import random


pdir = '/Users/Brenton/Documents/Capstone/'

def load_test_sets(filename):
    """Read a temporal-holdout annotation *.mat file and unpack its contents.

    The file is read with scipy.io.loadmat (squeeze_me=True) and must hold
    the keys 'goTerm_labels', 'trainProts_label', 'validProts_label',
    'testProts_label', 'trainProts', 'validProts' and 'testProts'.

    Returns:
        A 7-tuple (train_inds, valid_inds, test_inds, train_annotations,
        valid_annotations, test_annotations, go_terms).  The three index
        arrays are shifted down by one (MATLAB 1-based -> Python 0-based);
        the three annotation matrices are expanded with .todense() and
        returned as plain numpy arrays; go_terms holds the names of the
        gene ontology function terms.
    """
    print("### Loading *.mat file...")
    go_data = sio.loadmat(filename, squeeze_me=True)

    def _dense(key):
        # The stored annotation matrix exposes .todense(); expand it to an ndarray.
        return np.asarray(go_data[key].todense())

    def _zero_based(key):
        # MATLAB counts from 1, Python from 0.
        return go_data[key] - 1

    go_terms = go_data['goTerm_labels']
    train_annotations, valid_annotations, test_annotations = [
        _dense(key)
        for key in ('trainProts_label', 'validProts_label', 'testProts_label')
    ]
    train_inds, valid_inds, test_inds = [
        _zero_based(key)
        for key in ('trainProts', 'validProts', 'testProts')
    ]

    return (train_inds, valid_inds, test_inds,
            train_annotations, valid_annotations, test_annotations,
            go_terms)

def load_FASTA(filename):
    """Load a FASTA file and return its sequences and record ids.

    Args:
        filename: path of the FASTA file to read.

    Returns:
        (sequences, names): two parallel lists of str holding, for every
        record in file order, the sequence string and the record id.
    """
    print("### Loading fasta file...")
    # Open with plain 'r': the legacy 'U' flag ('rU') is rejected with
    # ValueError on Python 3.11+, and text mode already translates newlines.
    # SeqIO.parse yields records lazily, so it is drained into a list while
    # the handle is still open; the with-block then closes the file.
    with open(filename, 'r') as infile:
        full_entries = list(SeqIO.parse(infile, 'fasta'))

    sequences = [str(entry.seq) for entry in full_entries]
    names = [str(entry.id) for entry in full_entries]

    return sequences, names

In [4]:
# Human sequences: read the FASTA file and the temporal-holdout annotations,
# then pick out the sequences that belong to each split.
fasta = pdir + 'human_sequences.fasta'
test_set_file = pdir + 'human_annotations_temporal_holdout.mat'

sequences, names = load_FASTA(fasta)
(train_inds, valid_inds, test_inds,
 y_trainH, y_validHuman, y_testHuman, go_termsHuman) = load_test_sets(test_set_file)

# Each index array selects positions in `sequences` for one split.
train_seqsHuman = [sequences[idx] for idx in train_inds]
print('Number of training prots: ' + str(len(train_seqsHuman)))

valid_seqsHuman = [sequences[idx] for idx in valid_inds]
print('Number of validation prots: ' + str(len(valid_seqsHuman)))

test_seqsHuman = [sequences[idx] for idx in test_inds]
print('Number of testing prots: ' + str(len(test_seqsHuman)))

### Loading fasta file...




### Loading *.mat file...
Number of training prots: 9751
Number of validation prots: 3871
Number of testing prots: 1647


In [5]:
# Yeast sequences: read the FASTA file and the molecular-function (MF)
# temporal-holdout annotations, then pick out the sequences of each split.
fasta = pdir + 'yeast_sequences.fasta'
test_set_file = pdir + 'yeast_MF_temporal_holdout.mat'

sequences, names = load_FASTA(fasta)
(train_inds, valid_inds, test_inds,
 y_trainYeast, y_validYeast, y_testYeast, go_termsYeast) = load_test_sets(test_set_file)

# Each index array selects positions in `sequences` for one split.
train_seqsYeast = [sequences[idx] for idx in train_inds]
print('Number of training prots: ' + str(len(train_seqsYeast)))

valid_seqsYeast = [sequences[idx] for idx in valid_inds]
print('Number of validation prots: ' + str(len(valid_seqsYeast)))

test_seqsYeast = [sequences[idx] for idx in test_inds]
print('Number of testing prots: ' + str(len(test_seqsYeast)))

### Loading fasta file...
### Loading *.mat file...
Number of training prots: 3447
Number of validation prots: 963
Number of testing prots: 206




------------

# Re-Format data into tensor objects

In [7]:
# Wrap the numpy annotation matrices as 64-bit integer (Long) tensors.
yTrainYeast = torch.from_numpy(y_trainYeast).long()
yValidYeast = torch.from_numpy(y_validYeast).long()
yTestYeast = torch.from_numpy(y_testYeast).long()
# Bare expression: as the last line of a notebook cell it echoes the tensor.
yTrainYeast


    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      1     0     1
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      1     0     1
[torch.LongTensor of size 3447x26]

## Vectorize all amino-acid chains in the list 
#### Each amino-acid string becomes one row in a tensor object.
#### This tensor object has dimension NxD, where N is the number of amino-acid strings and D is the length of the longest chain in the set; shorter chains are padded with trailing zeros.

In [8]:
# Lookup table from an uppercase letter to its 1-based alphabet position
# ('A' -> 1, ..., 'Z' -> 26).
ConvertCharToInt = {
    letter: code
    for code, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 1)
}


def vectorize_AAs(string):
    '''Encode an amino-acid string as a list of integers, one per residue.

    Every character is looked up in ConvertCharToInt, so a character that
    is not an uppercase letter A-Z raises KeyError.

    For example, 'BACEA' is converted to [2, 1, 3, 5, 1]
    '''
    return [ConvertCharToInt[residue] for residue in string]

def AddZeros_and_ConvertToTensor(oldvector, max_length):
    '''Convert a list of numbers to a FloatTensor, zero-padded at the end.

    oldvector  -- initial vector (list of numbers) for one amino-acid chain
    max_length -- length of the longest vector in the batch

    Returns a 1-D FloatTensor.  When oldvector is shorter than max_length,
    zeros are appended until it has max_length entries; otherwise the
    converted vector is returned as it is (it is never truncated).
    '''
    as_tensor = torch.FloatTensor(oldvector)
    n_missing = max_length - as_tensor.size(0)
    if n_missing <= 0:
        # Already long enough: nothing to append.
        return as_tensor
    return torch.cat((as_tensor, torch.zeros(n_missing)), 0)

def TransformAAsToTensor(OldListOfSequences):
    '''Encode a list of amino-acid strings as one zero-padded NxD FloatTensor.

    N is the number of strings and D is the length of the longest chain.
    Row i holds the integer codes of sequence i (see vectorize_AAs),
    followed by zeros up to length D (see AddZeros_and_ConvertToTensor).

    "OldListOfSequences" can be training, validation, or test sets.

    The result is always 2-D: a single sequence gives shape (1, D) and an
    empty input gives an empty tensor of shape (0, 0).
    '''
    # One list of integer codes per chain.
    integer_vectors = [vectorize_AAs(sequence) for sequence in OldListOfSequences]
    if not integer_vectors:
        # Nothing to encode; keep the NxD contract with N = D = 0.
        return torch.zeros(0, 0)

    max_length = max(len(vector) for vector in integer_vectors)
    padded_rows = [AddZeros_and_ConvertToTensor(vector, max_length)
                   for vector in integer_vectors]

    # Stack all rows in one call.  Growing the tensor with torch.cat inside a
    # loop re-copies it on every step (quadratic in N), and concatenating the
    # 2-D accumulator with a 1-D row is rejected by current PyTorch.
    return torch.stack(padded_rows, 0)

### Takes a couple minutes to run. 

In [9]:
# Encode each yeast split (train / validation / test) as a tensor with
# TransformAAsToTensor.
TrainSeqsYeast = TransformAAsToTensor(train_seqsYeast)
ValidSeqsYeast = TransformAAsToTensor(valid_seqsYeast)
TestSeqsYeast = TransformAAsToTensor(test_seqsYeast)
# Bare expression: as the last line of a notebook cell it echoes the tensor.
TestSeqsYeast


   13     3     7  ...      0     0     0
   13     4     1  ...      0     0     0
   13     9    11  ...      0     0     0
       ...          ⋱          ...       
   13    12     4  ...      0     0     0
   13    22    19  ...      0     0     0
   13    12    13  ...      0     0     0
[torch.FloatTensor of size 206x1592]

In [13]:
# Spot-check: show the first raw test sequence. Its leading residues 'MCG'
# correspond to the codes 13, 3, 7 that start the first row of TestSeqsYeast.
test_seqsYeast[0]

'MCGIFGYCNFLIEKTRGEIIDTLIEGLQALEYKEYDSSGISIQGDELESLNIYKQTGKISSLKEEIDLYNLNKNLPFISHCGIAHTRRATHGGLRRANCHPHNSDPSNEFVVVHNGVITNFANLKALLMAKGYVFKSDTDTECIPKLYKHIYDTSIELGYNLDFHVLTNLVLKELEGSYGLLCTSSHFPDEVVAARKGSPLVIGVKGKTDMDVNFVEVEYLDQEEDYLKLNTQTKSSGNVLAAAPVKYNTCLRKSPPFVHNT'