## Understanding Amino Acid Seq Data --> creating input Tensor

1. Loading the data 
2. Create a Tokenizer for amino acids
3. Create a Tensor object 


### 0. notebook init

In [44]:
import numpy as np
import pandas as pd
import torch

import string
from typing import Iterable, Tuple


### 1. Load the data

- The data is in .txt file, somewhat in a format for two columns. the first column is species-code and the next one is the amino-acid-seq  
- The simplest way to get the data is create lists. 

In [45]:
file_path = 'X_set.txt'

In [46]:
# Initialize lists to hold the phylogenetic position strings and amino acid sequences
specie_code = []
amino_acid_sequences = []

# Read the file
with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split(' ')
        specie_code.append(parts[0])
        amino_acid_sequences.append(parts[1])

In [47]:
specie_code[0:3]

['111133333333333333333333333333',
 '111211333333333333333333333333',
 '111212333333333333333333333333']

In [48]:
amino_acid_sequences[0:3]

['---LSQF--LLMLWVPGSKGEIVLTQSPASVSVSPGERVTISCQASESVGNTYLNWLQQKSGQSPRWLIYQVSKLESGIPARFRGSGSGTDFTFTISRVEAEDVAHYYSQQ-----',
 'MESLSQC--LLMLWVPVSRGAIVLTQSPALVSVSPGERVTISCKASQSVGNTYLSWFRQKPGQSPRGLIYKVSNLPSGVPSRFRGSGAEKDFTLTISRVEAVDGAVYYCAQASYSP',
 'MESLSQC--LLMLWVPVSRGAIVLTQSPASVSVSPGERVTISCKASQSLGNTYLHWFQQKPGQSPRRLIYQVSNLLSGVPSRFSGSGAGKDFSLTISSVEAGDGAVYYCFQGSYDP']

### 2. Create a Tokenizer for amino acids

- There are 20 amino acids, each letter in the chain represents one of them. 
- Converting them into 20 tokens, meaning each amino acid would get a number associated with it. 
- Would also need a special character token, which is "-", something related to multiple-sequence-alignment 

In [49]:
# Creating a set of all amino-acids

amino_acid_set = set()

for seq in amino_acid_sequences:
    for acid in seq:
        if acid != "-":
            amino_acid_set.add(acid)

# 20 amino acids
print(f"Num of Amino Acids: {len(amino_acid_set) }")
amino_acids_list = list(amino_acid_set)

Num of Amino Acids: 20


In [50]:
# Creating a Tokenzer class, which ennodes and decodes an amino acid sequence 

class Tokenizer:
    ''' 
    To encode and decode any amino acid string
    '''
    # class attribute 
    amino_acids = amino_acids_list

    def __init__(self, special_tokens = Iterable[str]):
        # define a vocab
        self.vocab = Tokenizer.amino_acids + list(special_tokens)
        # mapping each vocab to a token (a numeric value)
        self.token2idx = {token:i for i, token in enumerate(self.vocab)} 
        # mapping numeric value back to a token
        self.idx2token = {i:token for token, i  in self.token2idx.items()}

    def encode(self, inputs: Iterable[str]) -> Iterable[int]:
        return [self.token2idx[token] for token in inputs]
    
    def decode(self, inputs: Iterable[int]) -> Iterable[str]:
        return [self.idx2token[idx] for idx in inputs]

    def __len__(self):
        return len(self.vocab)

In [51]:
# creating an instance of the Tokenizer. 
amino_acid_tokenizer = Tokenizer(special_tokens=["-", "[MASK]"])

In [52]:
# let's encode the first amino-acid-sequence and see the first 10 positions
print(f"First 20 amino acids         : {[i for i in amino_acid_sequences[0][0:20]]}")
print(f"First 20 encoded amino acids : {amino_acid_tokenizer.encode(amino_acid_sequences[0])[0:20]}")
print(f"First 20 decoded amino acids : {amino_acid_tokenizer.decode(amino_acid_tokenizer.encode(amino_acid_sequences[0])[0:20])}")

First 20 amino acids         : ['-', '-', '-', 'L', 'S', 'Q', 'F', '-', '-', 'L', 'L', 'M', 'L', 'W', 'V', 'P', 'G', 'S', 'K', 'G']
First 20 encoded amino acids : [20, 20, 20, 1, 11, 5, 13, 20, 20, 1, 1, 7, 1, 4, 6, 10, 0, 11, 15, 0]
First 20 decoded amino acids : ['-', '-', '-', 'L', 'S', 'Q', 'F', '-', '-', 'L', 'L', 'M', 'L', 'W', 'V', 'P', 'G', 'S', 'K', 'G']


In [53]:
len(amino_acid_tokenizer)

22

In [54]:
print(amino_acid_tokenizer.token2idx)

{'G': 0, 'L': 1, 'Y': 2, 'T': 3, 'W': 4, 'Q': 5, 'V': 6, 'M': 7, 'N': 8, 'C': 9, 'P': 10, 'S': 11, 'A': 12, 'F': 13, 'D': 14, 'K': 15, 'R': 16, 'I': 17, 'E': 18, 'H': 19, '-': 20, '[MASK]': 21}


In [55]:
amino_acid_tokenizer.encode(["A", "[MASK]"])

[12, 21]

### 3. Creating a Tensor object

In [56]:
# making sure that the size of each amino-acid-seq is same

len_amino_acid_seq = set()
for seq in amino_acid_sequences:
    len_amino_acid_seq.add(len(seq))

# this set should have only one value 
len_amino_acid_seq
# perfect! all the seq are 116 character long

{116}

In [57]:

def create_amino_acids_tensor(amino_acid_sequences:list, my_tokenizer:Tokenizer):

    amino_acid_tensors = []

    for seq in amino_acid_sequences:
        amino_acid_tensors.append(torch.Tensor(my_tokenizer.encode(seq)).to(torch.int64))

    # stacking them 
    stacked_tensor =  torch.stack(amino_acid_tensors)

    return stacked_tensor


In [58]:
all_amino_acids_tensor = create_amino_acids_tensor(amino_acid_sequences, amino_acid_tokenizer)

In [59]:
all_amino_acids_tensor

tensor([[20, 20, 20,  ..., 20, 20, 20],
        [ 7, 18, 11,  ...,  2, 11, 10],
        [ 7, 18, 11,  ...,  2, 14, 10],
        ...,
        [20, 20, 20,  ..., 18, 14, 10],
        [20, 20, 20,  ..., 18, 14, 10],
        [20, 20, 20,  ..., 18, 14, 10]])

In [60]:
all_amino_acids_tensor.shape
# the shape is 1001 species * 116 amino acids

torch.Size([1001, 116])

## Create Training data

In [66]:
def create_training_data_old(input_tensor:torch.Tensor, batch_size:int, mask_token:int):

    rows, cols = input_tensor.shape

    idx = torch.randint(rows-1, (batch_size,))

    input_seqs = []
    target_amino_acids = []
    mask_positions = []
    for i in idx:
        # select one amino acid seq
        selected_amino_seq = input_tensor[i].clone()
        # randomly choose a position to mask
        mask_position = torch.randint(cols-1, (1,)) 
        target_amino_acid = selected_amino_seq[mask_position]
        # replace the mask posiiton with mask-token
        selected_amino_seq[mask_position] = mask_token
        train_input_seq = selected_amino_seq

        input_seqs.append(train_input_seq)
        target_amino_acids.append(target_amino_acid)
        mask_positions.append(mask_position)

    return input_seqs, target_amino_acids, mask_positions
        

# create_training_data(all_amino_acids_tensor, batch_size=64, mask_token=21)


In [67]:
def create_training_data(input_tensor: torch.Tensor, batch_size: int, mask_token: int):
    """
    Creates masked training data efficiently using vectorized operations.

    Args:
      input_tensor (torch.Tensor): Input tensor of shape (num_sequences, sequence_length)
      batch_size (int): The desired batch size.
      mask_token (int): The token used for masking.

    Returns:
      tuple: (input_seqs, target_amino_acids, mask_positions)
             - input_seqs: Tensor of shape (batch_size, sequence_length) with masked sequences.
             - target_amino_acids: Tensor of shape (batch_size,) containing the masked amino acids.
             - mask_positions: Tensor of shape (batch_size,) indicating mask positions.
    """

    rows = input_tensor.shape[0]
    seq_len = input_tensor.shape[1]
    # Randomly select 'batch_size' rows (amino acid sequences)
    idx = torch.randint(rows, size=(batch_size,))
    input_seqs = input_tensor[idx].clone()

    # Generate random mask positions within each selected sequence
    mask_positions = torch.randint(seq_len, size=(batch_size, 1))

    # Get the target amino acids at the mask positions
    target_amino_acids = input_seqs.gather(1, mask_positions).squeeze()

    # Create a mask for the selected positions 
    mask = torch.zeros(input_seqs.size(), dtype=torch.bool)
    mask.scatter_(1, mask_positions, 1)

    # Apply the mask to replace the target positions with the mask_token
    input_seqs[mask] = mask_token

    return input_seqs, target_amino_acids, mask_positions.squeeze()

In [71]:
input_seqs, targets, mask_pos = create_training_data(all_amino_acids_tensor, batch_size=32, mask_token=21)

In [72]:
input_seqs.shape

torch.Size([32, 116])

In [78]:
# exactly one masked value
(input_seqs[0] == 21).sum()

tensor(1)

In [79]:
input_seqs[0]

tensor([ 7,  6, 11,  5,  3,  5, 13, 17, 11, 20,  1,  1,  1,  4, 17, 21,  0, 12,
         2,  0, 14, 17,  6,  7,  3,  5, 11, 10, 14, 11,  1, 12,  6, 11,  1,  0,
        18, 16,  6,  3, 17,  8,  9, 15, 11, 11,  5, 11,  6,  8, 15,  8,  2,  1,
         8,  4,  2,  5,  5, 15, 10,  0,  5, 12, 10, 15,  1,  1, 17,  2,  4, 12,
        11,  3, 16, 18, 11,  0,  6, 10, 14, 16, 13, 11,  0, 11,  0, 11,  0,  3,
        14, 13,  3,  1,  3, 17, 11, 11,  1,  5, 12, 18, 14,  6, 12,  6,  2,  2,
         9,  5,  5, 11,  2, 11,  3, 10])

In [81]:
mask_pos

tensor([ 15,  78,  48, 115,  13,  44,  43,  70,  99, 101,  18,  21,   3,  52,
         68, 100,  28, 110,  28,  56,  68,  38,  48,  77,  18,  58,  78,  81,
         78,  43, 111, 108])