## Understanding Amino Acid Seq Data --> creating input Tensor

1. Loading the data 
2. Create a Tokenizer for amino acids
3. Create a Tensor object 


### 0. notebook init

In [1]:
import numpy as np
import pandas as pd
import torch

import string
from typing import Iterable, Tuple


### 1. Load the data

- The data is in .txt file, somewhat in a format for two columns. the first column is species-code and the next one is the amino-acid-seq  
- The simplest way to get the data is create lists. 

In [2]:
file_path = 'X_set.txt'

In [3]:
# Initialize lists to hold the phylogenetic position strings and amino acid sequences
specie_code = []
amino_acid_sequences = []

# Read the file
with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split(' ')
        specie_code.append(parts[0])
        amino_acid_sequences.append(parts[1])

In [4]:
specie_code[0:3]

['111133333333333333333333333333',
 '111211333333333333333333333333',
 '111212333333333333333333333333']

In [5]:
amino_acid_sequences[0:3]

['---LSQF--LLMLWVPGSKGEIVLTQSPASVSVSPGERVTISCQASESVGNTYLNWLQQKSGQSPRWLIYQVSKLESGIPARFRGSGSGTDFTFTISRVEAEDVAHYYSQQ-----',
 'MESLSQC--LLMLWVPVSRGAIVLTQSPALVSVSPGERVTISCKASQSVGNTYLSWFRQKPGQSPRGLIYKVSNLPSGVPSRFRGSGAEKDFTLTISRVEAVDGAVYYCAQASYSP',
 'MESLSQC--LLMLWVPVSRGAIVLTQSPASVSVSPGERVTISCKASQSLGNTYLHWFQQKPGQSPRRLIYQVSNLLSGVPSRFSGSGAGKDFSLTISSVEAGDGAVYYCFQGSYDP']

### 2. Create a Tokenizer for amino acids

- There are 20 amino acids, each letter in the chain represents one of them. 
- Converting them into 20 tokens, meaning each amino acid would get a number associated with it. 
- Would also need a special character token, which is "-", something related to multiple-sequence-alignment 

In [6]:
# Creating a set of all amino-acids

amino_acid_set = set()

for seq in amino_acid_sequences:
    for acid in seq:
        if acid != "-":
            amino_acid_set.add(acid)

# 20 amino acids
print(f"Num of Amino Acids: {len(amino_acid_set) }")
amino_acids_list = list(amino_acid_set)

Num of Amino Acids: 20


In [7]:
# Creating a Tokenzer class, which ennodes and decodes an amino acid sequence 

class Tokenizer:
    ''' 
    To encode and decode any amino acid string
    '''
    # class attribute 
    amino_acids = amino_acids_list

    def __init__(self, special_tokens = Iterable[str]):
        # define a vocab
        self.vocab = Tokenizer.amino_acids + list(special_tokens)
        # mapping each vocab to a token (a numeric value)
        self.token2idx = {token:i for i, token in enumerate(self.vocab)} 
        # mapping numeric value back to a token
        self.idx2token = {i:token for token, i  in self.token2idx.items()}

    def encode(self, inputs: Iterable[str]) -> Iterable[int]:
        return [self.token2idx[token] for token in inputs]
    
    def decode(self, inputs: Iterable[int]) -> Iterable[str]:
        return [self.idx2token[idx] for idx in inputs]

    def __len__(self):
        return len(self.vocab)

In [8]:
# creating an instance of the Tokenizer. 
amino_acid_tokenizer = Tokenizer(special_tokens=["-", "[MASK]"])

In [9]:
# let's encode the first amino-acid-sequence and see the first 10 positions
print(f"First 20 amino acids         : {[i for i in amino_acid_sequences[0][0:20]]}")
print(f"First 20 encoded amino acids : {amino_acid_tokenizer.encode(amino_acid_sequences[0])[0:20]}")
print(f"First 20 decoded amino acids : {amino_acid_tokenizer.decode(amino_acid_tokenizer.encode(amino_acid_sequences[0])[0:20])}")

First 20 amino acids         : ['-', '-', '-', 'L', 'S', 'Q', 'F', '-', '-', 'L', 'L', 'M', 'L', 'W', 'V', 'P', 'G', 'S', 'K', 'G']
First 20 encoded amino acids : [20, 20, 20, 1, 6, 9, 2, 20, 20, 1, 1, 14, 1, 0, 7, 17, 4, 6, 3, 4]
First 20 decoded amino acids : ['-', '-', '-', 'L', 'S', 'Q', 'F', '-', '-', 'L', 'L', 'M', 'L', 'W', 'V', 'P', 'G', 'S', 'K', 'G']


In [10]:
len(amino_acid_tokenizer)

22

In [11]:
print(amino_acid_tokenizer.token2idx)

{'W': 0, 'L': 1, 'F': 2, 'K': 3, 'G': 4, 'H': 5, 'S': 6, 'V': 7, 'N': 8, 'Q': 9, 'R': 10, 'A': 11, 'Y': 12, 'C': 13, 'M': 14, 'D': 15, 'E': 16, 'P': 17, 'I': 18, 'T': 19, '-': 20, '[MASK]': 21}


In [12]:
amino_acid_tokenizer.encode(["A", "[MASK]"])

[11, 21]

### 3. Creating a Tensor object

In [13]:
# making sure that the size of each amino-acid-seq is same

len_amino_acid_seq = set()
for seq in amino_acid_sequences:
    len_amino_acid_seq.add(len(seq))

# this set should have only one value 
len_amino_acid_seq
# perfect! all the seq are 116 character long

{116}

In [14]:

def create_amino_acids_tensor(amino_acid_sequences:list, my_tokenizer:Tokenizer):

    amino_acid_tensors = []

    for seq in amino_acid_sequences:
        amino_acid_tensors.append(torch.Tensor(my_tokenizer.encode(seq)).to(torch.int64))

    # stacking them 
    stacked_tensor =  torch.stack(amino_acid_tensors)

    return stacked_tensor


In [15]:
all_amino_acids_tensor = create_amino_acids_tensor(amino_acid_sequences, amino_acid_tokenizer)

In [16]:
all_amino_acids_tensor

tensor([[20, 20, 20,  ..., 20, 20, 20],
        [14, 16,  6,  ..., 12,  6, 17],
        [14, 16,  6,  ..., 12, 15, 17],
        ...,
        [20, 20, 20,  ..., 16, 15, 17],
        [20, 20, 20,  ..., 16, 15, 17],
        [20, 20, 20,  ..., 16, 15, 17]])

In [17]:
all_amino_acids_tensor.shape
# the shape is 1001 species * 116 amino acids

torch.Size([1001, 116])

## Create Training data

In [18]:
def create_training_data_old(input_tensor:torch.Tensor, batch_size:int, mask_token:int):

    rows, cols = input_tensor.shape

    idx = torch.randint(rows-1, (batch_size,))

    input_seqs = []
    target_amino_acids = []
    mask_positions = []
    for i in idx:
        # select one amino acid seq
        selected_amino_seq = input_tensor[i].clone()
        # randomly choose a position to mask
        mask_position = torch.randint(cols-1, (1,)) 
        target_amino_acid = selected_amino_seq[mask_position]
        # replace the mask posiiton with mask-token
        selected_amino_seq[mask_position] = mask_token
        train_input_seq = selected_amino_seq

        input_seqs.append(train_input_seq)
        target_amino_acids.append(target_amino_acid)
        mask_positions.append(mask_position)

    return input_seqs, target_amino_acids, mask_positions
        

# create_training_data(all_amino_acids_tensor, batch_size=64, mask_token=21)


In [19]:
def create_training_data(input_tensor: torch.Tensor, batch_size: int, mask_token: int):
    """
    Creates masked training data efficiently using vectorized operations.

    Args:
      input_tensor (torch.Tensor): Input tensor of shape (num_sequences, sequence_length)
      batch_size (int): The desired batch size.
      mask_token (int): The token used for masking.

    Returns:
      tuple: (input_seqs, target_amino_acids, mask_positions)
             - input_seqs: Tensor of shape (batch_size, sequence_length) with masked sequences.
             - target_amino_acids: Tensor of shape (batch_size,) containing the masked amino acids.
             - mask_positions: Tensor of shape (batch_size,) indicating mask positions.
    """

    rows = input_tensor.shape[0]
    seq_len = input_tensor.shape[1]
    # Randomly select 'batch_size' rows (amino acid sequences)
    idx = torch.randint(rows, size=(batch_size,))
    input_seqs = input_tensor[idx].clone()

    # Generate random mask positions within each selected sequence
    mask_positions = torch.randint(seq_len, size=(batch_size, 1))

    # Get the target amino acids at the mask positions
    target_amino_acids = input_seqs.gather(1, mask_positions).squeeze()

    # Create a mask for the selected positions 
    mask = torch.zeros(input_seqs.size(), dtype=torch.bool)
    mask.scatter_(1, mask_positions, 1)

    # Apply the mask to replace the target positions with the mask_token
    input_seqs[mask] = mask_token

    return input_seqs, target_amino_acids, mask_positions.squeeze()

In [20]:
input_seqs, targets, mask_pos = create_training_data(all_amino_acids_tensor, batch_size=32, mask_token=21)

In [21]:
input_seqs.shape

torch.Size([32, 116])

In [22]:
# exactly one masked value
(input_seqs[0] == 21).sum()

tensor(1)

In [23]:
input_seqs[0]

tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 14,  2, 17, 19,  6,  4, 11,
        10, 13, 15, 18,  9, 14, 19,  9,  6, 19,  6,  6,  1,  6, 11,  6,  7,  4,
        15, 10,  7, 21, 18, 19, 13, 10, 11,  6,  9,  4, 18, 20,  6,  8,  8,  1,
         8,  0, 12,  9,  9,  3, 17,  4,  3, 19, 17,  3,  1,  1, 18, 12, 11, 11,
         6,  6,  1,  9,  6,  4, 18,  1,  6, 10,  2,  6, 15,  6,  4,  6,  4, 19,
        15, 12, 19,  1, 19, 18,  6,  6,  1,  9, 17, 16, 15,  2, 11, 11, 12, 12,
        13,  9,  9,  6, 15,  6, 19, 17])

In [24]:
mask_pos

tensor([ 39, 114,  22,  26,  32, 103,  45,  58,  74,  12,  43,  17,  85,   5,
         24,  37, 100,  47,  86,  72,  60,  87,  83,  33,  10, 109,  97,  95,
         81,  96, 107,  72])

In [25]:
targets

tensor([19,  8,  9,  6,  7, 11,  6,  9, 10,  1, 10,  6,  6, 20, 19, 10, 11,  6,
         4,  6, 17,  6,  6, 19,  1,  9, 10, 18, 10,  6, 12, 19])

## Creating a Dataset Class

In [28]:
from torch.utils.data import Dataset, DataLoader

class MaskedAminoSeqDataset(Dataset):
    def __init__(self, input_tensor: torch.Tensor, mask_token: int):
            """
            Dataset for masked amino acid sequence prediction.

            Args:
            input_tensor (torch.Tensor): Input tensor of shape (num_sequences, sequence_length).
            mask_token (int): The token used for masking.
            """
            self.input_tensor = input_tensor
            self.mask_token = mask_token

    def __len__(self):
        return self.input_tensor.shape[0] 

    def __getitem__(self, idx):
        input_seqs, target_amino_acids, mask_positions = \
            self._create_training_data(self.input_tensor, batch_size=1, mask_token=self.mask_token)
        return input_seqs.squeeze(0), target_amino_acids.squeeze(0), mask_positions.squeeze(0)

    def _create_training_data(self, input_tensor: torch.Tensor, batch_size: int, mask_token: int):
        """
        Creates masked training data efficiently using vectorized operations.

        Args:
        input_tensor (torch.Tensor): Input tensor of shape (num_sequences, sequence_length)
        batch_size (int): The desired batch size.
        mask_token (int): The token used for masking.

        Returns:
        tuple: (input_seqs, target_amino_acids, mask_positions)
            - input_seqs: Tensor of shape (batch_size, sequence_length) with masked sequences.
            - target_amino_acids: Tensor of shape (batch_size,) containing the masked amino acids.
            - mask_positions: Tensor of shape (batch_size,) indicating mask positions.
        """
        rows = input_tensor.shape[0]
        seq_len = input_tensor.shape[1]
        # Randomly select 'batch_size' rows (amino acid sequences)
        idx = torch.randint(rows, size=(batch_size,))
        input_seqs = input_tensor[idx].clone()

        # Generate random mask positions within each selected sequence
        mask_positions = torch.randint(seq_len, size=(batch_size, 1))

        # Get the target amino acids at the mask positions
        target_amino_acids = input_seqs.gather(1, mask_positions).squeeze()

        # Create a mask for the selected positions 
        mask = torch.zeros(input_seqs.size(), dtype=torch.bool)
        mask.scatter_(1, mask_positions, 1)

        # Apply the mask to replace the target positions with the mask_token
        input_seqs[mask] = mask_token

        return input_seqs, target_amino_acids, mask_positions.squeeze()


In [30]:
# Assuming input_tensor is your tensor of amino acid sequences
masked_amino_seq_dataset = MaskedAminoSeqDataset(all_amino_acids_tensor, mask_token=21) # Assuming 0 is your mask token
masked_amino_seq_dataloader = DataLoader(masked_amino_seq_dataset, batch_size=32, shuffle=True)

In [32]:
## each iteration now gives a batch with 32 data points.
for i in masked_amino_seq_dataloader:
    print(i[0].shape)
    print(i[1].shape)
    print(i[2].shape)
    break

torch.Size([32, 116])
torch.Size([32])
torch.Size([32])


## Training part

In [None]:
import torch.nn as nn
import math

In [None]:
class ProteinPredictor(nn.Module):
    def init(self, num_variants, seq_length, num_amino_acids, emd_dim=128, nhead=8, num_layers=3):
        super().init()
        
        self.embed = nn.Embedding(num_amino_acids, emd_dim)
        self.pos_encoder = PositionalEncoding(emd_dim, seq_length)
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=emd_dim, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.fc = nn.Linear(emd_dim, num_amino_acids)

In [None]:
len(amino_acid_tokenizer)

22

In [None]:
vocab_size = len(amino_acid_tokenizer)
emb_dim = 8


embed = nn.Embedding(vocab_size, emb_dim)
pos_emb = nn.Embedding(vocab_size, )

In [None]:
input_seqs.shape

torch.Size([32, 116])

In [None]:
embed(input_seqs).shape

torch.Size([32, 116, 8])

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_dim, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2) * (-math.log(10000.0) / emb_dim))
        pe = torch.zeros(max_len, 1, emb_dim)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, emb_dim)
        Returns:
            Tensor of shape (batch_size, seq_len, emb_dim) with positional encodings added.
        """
        return x + self.pe[:x.size(1)]

In [None]:
pos_encoder = PositionalEncoding(emb_dim, max_len=116)

In [None]:
pos_encoder(embed(input_seqs))

RuntimeError: The size of tensor a (32) must match the size of tensor b (116) at non-singleton dimension 0