In [1]:
import numpy as np
import pandas as pd

In [2]:
file_path = 'X_set.txt'

In [3]:
# Initialize lists to hold the phylogenetic position strings and amino acid sequences
specie_code = []
amino_acid_sequences = []

# Read the file
with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split(' ')
        specie_code.append(parts[0])
        amino_acid_sequences.append([parts[1]])

In [4]:
specie_code[0:5]

['111133333333333333333333333333',
 '111211333333333333333333333333',
 '111212333333333333333333333333',
 '111221333333333333333333333333',
 '111222333333333333333333333333']

In [5]:
amino_acid_sequences[0:5]

[['---LSQF--LLMLWVPGSKGEIVLTQSPASVSVSPGERVTISCQASESVGNTYLNWLQQKSGQSPRWLIYQVSKLESGIPARFRGSGSGTDFTFTISRVEAEDVAHYYSQQ-----'],
 ['MESLSQC--LLMLWVPVSRGAIVLTQSPALVSVSPGERVTISCKASQSVGNTYLSWFRQKPGQSPRGLIYKVSNLPSGVPSRFRGSGAEKDFTLTISRVEAVDGAVYYCAQASYSP'],
 ['MESLSQC--LLMLWVPVSRGAIVLTQSPASVSVSPGERVTISCKASQSLGNTYLHWFQQKPGQSPRRLIYQVSNLLSGVPSRFSGSGAGKDFSLTISSVEAGDGAVYYCFQGSYDP'],
 ['MEAPSQF--LLMLWVPGSRGEVVLTQSPASVSVSPGERVTISCQASESVGNTYLNWFQQKPGQSPRLLIYQISKLDSGIPARFGGSGADRNFTFTISSVSAEDGADYYCFQDTFYP'],
 ['METLSQF--LLMLWVPGSRGEVVLTQSAASVSVSPGERVTISCQASQSVGRTYLDWIQQKPGQSPRLLIYQVSNLDSGIPARFSGSGADRDFTLTINSVSAEDGADYYCAQRSFYP']]

In [6]:
amino_acid_set = set()

for seq in amino_acid_sequences:
    for acid in seq[0]:
        if acid != "-":
            amino_acid_set.add(acid)

# 20 amino acids
print(len(amino_acid_set) )
amino_acids_list = list(amino_acid_set)

20


In [7]:
import string
from typing import Iterable, Tuple

class Tokenizer:
    ''' 
    To encode and decode any string using all ASCII letters 
    '''
    # class attribute 
    amino_acids = amino_acids_list

    def __init__(self, special_tokens = Iterable[str]):
        self.vocab = Tokenizer.amino_acids + list(special_tokens)
        self.token2idx = {token:i for i, token in enumerate(self.vocab)} 
        self.idx2token = {i:token for token, i  in self.token2idx.items()}

    def encode(self, inputs: Iterable[str]) -> Iterable[int]:
        return [self.token2idx[token] for token in inputs]
    
    def decode(self, inputs: Iterable[int]) -> Iterable[str]:
        return [self.idx2token[idx] for idx in inputs]

    def __len__(self):
        return len(self.vocab)

In [8]:
amino_acid_tokenizer = Tokenizer(special_tokens="-")

In [9]:
# encode some proiens

amino_acid_tokenizer.encode(amino_acid_sequences[0][0])

[20,
 20,
 20,
 8,
 16,
 18,
 6,
 20,
 20,
 8,
 8,
 10,
 8,
 14,
 17,
 13,
 1,
 16,
 11,
 1,
 5,
 0,
 17,
 8,
 9,
 18,
 16,
 13,
 12,
 16,
 17,
 16,
 17,
 16,
 13,
 1,
 5,
 3,
 17,
 9,
 0,
 16,
 19,
 18,
 12,
 16,
 5,
 16,
 17,
 1,
 2,
 9,
 15,
 8,
 2,
 14,
 8,
 18,
 18,
 11,
 16,
 1,
 18,
 16,
 13,
 3,
 14,
 8,
 0,
 15,
 18,
 17,
 16,
 11,
 8,
 5,
 16,
 1,
 0,
 13,
 12,
 3,
 6,
 3,
 1,
 16,
 1,
 16,
 1,
 9,
 7,
 6,
 9,
 6,
 9,
 0,
 16,
 3,
 17,
 5,
 12,
 5,
 7,
 17,
 12,
 4,
 15,
 15,
 16,
 18,
 18,
 20,
 20,
 20,
 20,
 20]

In [10]:
amino_acid_tokenizer.decode(amino_acid_tokenizer.encode(amino_acid_sequences[0][0]))

['-',
 '-',
 '-',
 'L',
 'S',
 'Q',
 'F',
 '-',
 '-',
 'L',
 'L',
 'M',
 'L',
 'W',
 'V',
 'P',
 'G',
 'S',
 'K',
 'G',
 'E',
 'I',
 'V',
 'L',
 'T',
 'Q',
 'S',
 'P',
 'A',
 'S',
 'V',
 'S',
 'V',
 'S',
 'P',
 'G',
 'E',
 'R',
 'V',
 'T',
 'I',
 'S',
 'C',
 'Q',
 'A',
 'S',
 'E',
 'S',
 'V',
 'G',
 'N',
 'T',
 'Y',
 'L',
 'N',
 'W',
 'L',
 'Q',
 'Q',
 'K',
 'S',
 'G',
 'Q',
 'S',
 'P',
 'R',
 'W',
 'L',
 'I',
 'Y',
 'Q',
 'V',
 'S',
 'K',
 'L',
 'E',
 'S',
 'G',
 'I',
 'P',
 'A',
 'R',
 'F',
 'R',
 'G',
 'S',
 'G',
 'S',
 'G',
 'T',
 'D',
 'F',
 'T',
 'F',
 'T',
 'I',
 'S',
 'R',
 'V',
 'E',
 'A',
 'E',
 'D',
 'V',
 'A',
 'H',
 'Y',
 'Y',
 'S',
 'Q',
 'Q',
 '-',
 '-',
 '-',
 '-',
 '-']