In [8]:
import torch 
from biotite.sequence.io.fasta import FastaFile


# Fix the global PyTorch RNG seed so the random steps below (e.g. the
# shuffled train/test split) are reproducible from run to run.
SEED = 13
torch.manual_seed(SEED)

<torch._C.Generator at 0x107f07210>

In [9]:
# Load every sequence from the FASTA file into a plain Python list.
input_file = "./fasta/hypf.fa"

fasta_file = FastaFile.read(input_file)
# items() yields (header, sequence) pairs; only the sequence text is kept.
proteins = [sequence for _, sequence in fasta_file.items()]

# Length of the longest sequence in the dataset.
max_protein_length = max(map(len, proteins))

# Show the first three sequences together with the maximum length.
proteins[:3], max_protein_length

(['MCLLSLAAATVAARRTPLRLLGRGLAAAMSTAGPLKSVDYEVFGRVQGVCFRMYTEGEAKKIGVVGWVKNTSKGTVTGQVQGPEDKVNSMKSWLSKVGSPSSRIDRTNFSNEKTISKLEYSNFSIRY',
  'MSSQIKKSKTTTKKLVKSAPKSVPNAAADDQIFCCQFEVFGHVQDFSGVFFRKHTQKKANELGITGWCMNTTRGTVQGMLEGSLDQMTDMKYWLQHKGSPRSVIEKAVFSENEALPINNFKMFSIRR',
  'MLTKLYLKIVLCLLVALPFLSEVTSQNTDTTMTKLVGVDFEVYGRVQGVFFRKYTQKHSTELGLKGWCMNTDKGTVVGRIEGEKEKVEQMKNWLRYTGSPQSAIDKAEFKNEKELSQPSFTNFEIKK'],
 127)

In [10]:
# Vocabulary: every distinct character that occurs in any sequence, sorted.
chars = sorted(set("".join(proteins)))
# Total number of characters (tokens) summed over all sequences.
tokens = sum(map(len, proteins))

# Summary statistics for the dataset.
print(f"Number of examples in the dataset: {len(proteins)}")
print(f"Max protein length: {max_protein_length}")
print(f"Number of unique characters in the vocabulary: {len(chars)}")
print(f"Vocabulary (amino acids): {''.join(chars)}")
print(f"Total tokens: {tokens}")

Number of examples in the dataset: 27246
Max protein length: 127
Number of unique characters in the vocabulary: 21
Vocabulary (amino acids): ACDEFGHIKLMNPQRSTVWXY
Total tokens: 2565728


In [11]:
# Partition the input data into a training set and a held-out test set.
test_set_size = int(len(proteins) * 0.1)  # 10% of the whole dataset, rounded down
# Slice on an explicit, non-negative boundary. With negative slicing, a
# test_set_size of 0 (fewer than 10 sequences) turns rp[:-0] into rp[:0] and
# rp[-0:] into rp[0:], which would leave the training set empty and put every
# example in the test set.
train_set_size = len(proteins) - test_set_size
rp = torch.randperm(len(proteins)).tolist()  # random permutation of the example indices
train_proteins = [proteins[i] for i in rp[:train_set_size]]
test_proteins = [proteins[i] for i in rp[train_set_size:]]

print(f"Split up the dataset into {len(train_proteins):,} training examples and {len(test_proteins):,} test examples")

Split up the dataset into 24,522 training examples and 2,724 test examples
