In [12]:
import torch
from biotite.sequence.io.fasta import FastaFile
from torch.utils.data import Dataset


# Seed torch's default random number generator once, up front, so the
# random permutation used for the train/test split is repeatable.
SEED = 13
torch.manual_seed(SEED)

<torch._C.Generator at 0x13fa67630>

In [17]:
# Load every sequence from the FASTA file; the headers are not used here.
input_file = "./hypf.fa"

fasta_file = FastaFile.read(input_file)
proteins = [sequence for _header, sequence in fasta_file.items()]

# Length of the longest sequence in the file.
max_protein_length = max(map(len, proteins))

# Show the first sequence and the maximum length as the cell output.
proteins[0], max_protein_length


('MCLLSLAAATVAARRTPLRLLGRGLAAAMSTAGPLKSVDYEVFGRVQGVCFRMYTEGEAKKIGVVGWVKNTSKGTVTGQVQGPEDKVNSMKSWLSKVGSPSSRIDRTNFSNEKTISKLEYSNFSIRY',
 127)

In [14]:
# Partition the input data into a training set and a test set.
test_set_size = int(len(proteins) * 0.1)  # hold out 10% of the whole dataset
rp = torch.randperm(len(proteins)).tolist()

# Slice at an explicit index instead of using `-test_set_size`: when the
# dataset has fewer than 10 proteins, test_set_size is 0 and `rp[:-0]` would be
# empty while `rp[-0:]` would be the entire permutation, swapping the two sets.
split_at = len(rp) - test_set_size
train_proteins = [proteins[i] for i in rp[:split_at]]
test_proteins = [proteins[i] for i in rp[split_at:]]
print(f"split up the dataset into {len(train_proteins)} training examples and {len(test_proteins)} test examples")


split up the dataset into 24171 training examples and 2685 test examples


In [18]:
# Vocabulary: every distinct character that occurs in any sequence, sorted.
chars = sorted(set(''.join(proteins)))
# Total character count summed over all sequences.
tokens = sum(map(len, proteins))

# One print call with newline-joined lines produces the same output as
# printing each line separately.
print("\n".join((
    "using characters as tokens",
    f"number of examples in the dataset: {len(proteins)}",
    f"max protein length: {max_protein_length}",
    f"number of unique characters in the vocabulary: {len(chars)}",
    "vocabulary:",
    ''.join(chars),
    f"total tokens: {tokens}",
)))

using characters as tokens
number of examples in the dataset: 26856
max protein length: 127
number of unique characters in the vocabulary: 21
vocabulary:
ACDEFGHIKLMNPQRSTVWXY
total tokens: 2526254


In [None]:
class ProteinDataset(Dataset):
    """Dataset for protein sequences with character-level tokenization.

    Every character in `chars` gets an integer id starting at 1. Id 0 is a
    special token: it sits at position 0 of every input (the <START> token)
    and fills the unused tail of the input. Each item is an `(x, y)` pair of
    fixed-length tensors where `y` is `x` shifted left by one position.
    """

    def __init__(self, proteins, chars, max_protein_length):
        """
        proteins: list of protein sequence as string
        chars: list of tokens in the vocabulary as string
        max_protein_length: int, length of the longest sequence to be served
        """
        self.proteins = proteins
        self.chars = chars
        self.max_protein_length = max_protein_length
        # ids start at 1 because 0 is reserved for the special token
        self.stoi = {ch: i + 1 for i, ch in enumerate(chars)}
        self.itos = {i: s for s, i in self.stoi.items()} # inverse mapping

    def __len__(self):
        """Number of sequences in the dataset."""
        return len(self.proteins)

    def contains(self, word):
        """Return True if `word` is one of the stored sequences (exact match)."""
        return word in self.proteins

    def get_vocab_size(self):
        """Number of distinct token ids, including the special 0 token."""
        return len(self.chars) + 1 # all the possible characters and special 0 token

    def get_output_length(self):
        """Length of the `x` and `y` tensors returned by `__getitem__`."""
        return self.max_protein_length + 1 # <START> token followed by proteins

    def encode(self, word):
        """Convert a string into a 1-D long tensor of token ids.

        Raises KeyError if `word` contains a character that is not in `chars`.
        """
        ix = torch.tensor([self.stoi[w] for w in word], dtype=torch.long)
        return ix

    def decode(self, ix):
        """Convert a sequence of token ids back into a string.

        ix: iterable of ids; may be a list of ints or a 1-D integer tensor
            such as the one returned by `encode`.
        Raises KeyError for ids that are not in the vocabulary. That includes
        the special 0 token, so strip it before decoding.
        """
        # Iterating a tensor yields 0-d tensors, which hash by identity and
        # therefore never match the int keys of `itos`. int() normalizes both
        # plain ints and tensor elements so decode(encode(word)) round-trips.
        word = ''.join(self.itos[int(i)] for i in ix)
        return word

    def __getitem__(self, idx):
        """Return the `(x, y)` training pair for the sequence at `idx`.

        Both tensors have length `max_protein_length + 1` and dtype long.
        x: 0 (<START>), then the encoded sequence, then 0s up to the end.
        y: the encoded sequence, then a single 0, then -1 up to the end.
        """
        word = self.proteins[idx]
        ix = self.encode(word)
        x = torch.zeros(self.max_protein_length + 1, dtype=torch.long)
        y = torch.zeros(self.max_protein_length + 1, dtype=torch.long)
        x[1:1+len(ix)] = ix  # position 0 stays 0: the <START> token
        y[:len(ix)] = ix     # targets are the inputs shifted left by one
        # y[len(ix)] is left at 0, so the target after the last character is
        # the special token; everything beyond it is marked as inactive.
        y[len(ix)+1:] = -1 # index -1 will mask the loss at the inactive locations
        return x, y