<a href="https://colab.research.google.com/github/fatemehalipour/DNA-Tokenization/blob/main/DNA_Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
import random

try:
  import sentencepiece
except:
  !pip install -q sentencepiece
  import sentencepiece
print(f"sentencepiece version: {sentencepiece.__version__}")

# install biopython, import it if available
try:
  import Bio
except:
  !pip install -q biopython
  import Bio

from Bio import SeqIO
print(f"biopython version: {Bio.__version__}")

sentencepiece version: 0.1.99
biopython version: 1.81


In [None]:
# Mount Google Drive inside the Colab filesystem (requires a Colab runtime,
# since google.colab is only available there).
from google.colab import drive
drive.mount("/content/drive/")

# Make the project folder on Drive the working directory; the relative paths
# used by later cells (the data/ folder, the mysp_model files) resolve against it.
%cd /content/drive/My Drive/DNA Dictionary

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/DNA Dictionary


In [None]:
# Folder holding the input data, relative to the current working directory
data_path = Path("data/")

# Parse the astrovirus FASTA file into sequence records
records = SeqIO.parse(data_path / "astrovirus.fasta", "fasta")

# Build a mapping of record name -> sequence string, skipping every sequence
# that contains an "N" (presumably the ambiguous-base code; the check is
# case-sensitive, so a lowercase "n" is not filtered).
dna_sequences = {}
for record in records:
  nucleotides = str(record.seq)
  if "N" in nucleotides:
    continue
  dna_sequences[record.name] = nucleotides

In [None]:
%%time
list_of_sequences = [seq for seq in dna_sequences.values()]
sentencepiece.SentencePieceTrainer.Train(
    sentence_iterator=iter(list_of_sequences),
    model_prefix="mysp_model",
    max_sentence_length= 10000,
    vocab_size=1000,
)

CPU times: user 9min 49s, sys: 2.71 s, total: 9min 52s
Wall time: 6min 4s


In [None]:
# Load the trained tokenizer from disk and collect every vocabulary piece,
# ordered by piece id (index in the list == piece id).
sp = sentencepiece.SentencePieceProcessor(model_file="mysp_model.model")
n_pieces = sp.get_piece_size()
vocabs = [sp.id_to_piece(piece_id) for piece_id in range(n_pieces)]
vocabs

['<unk>',
 '<s>',
 '</s>',
 'A',
 'U',
 'C',
 'CA',
 'UU',
 'G',
 'UG',
 'UA',
 'CU',
 'AA',
 'AU',
 'AC',
 'CC',
 'UC',
 'GA',
 'AG',
 'GG',
 'GC',
 'CG',
 'GU',
 'UAA',
 'UGA',
 'AGA',
 'AGG',
 'CAA',
 'UGU',
 'CCU',
 'UAG',
 'UUGA',
 'CUUU',
 'UGG',
 'GGA',
 'CUU',
 'AGGA',
 'CAC',
 'GAG',
 'UUC',
 'CUGA',
 'ACU',
 'GAA',
 'GAU',
 'CAAU',
 'UGC',
 'AACA',
 'CGA',
 'CAU',
 'UCC',
 'CACU',
 'GGU',
 'GUU',
 'AAU',
 'CUG',
 'AGU',
 'UUGG',
 'AGAU',
 'UUAU',
 'UGGUG',
 'ACC',
 'UUG',
 'CUGG',
 'GUA',
 'CUAU',
 'UGAUGA',
 'GGG',
 'CACCA',
 'GCU',
 'GGC',
 'UAC',
 'UUAA',
 'AGAA',
 'GCA',
 'UGUUA',
 'AAAAA',
 'CAGU',
 'UGGCU',
 'UGGA',
 'UGUC',
 'CAUC',
 'UGUUG',
 'CCGC',
 'CCAG',
 'AAAA',
 'UUUUG',
 'GGGU',
 'AGGG',
 'AUGA',
 'UUGGU',
 'AGCU',
 'CUGC',
 'CCAA',
 'CUGU',
 'UGAGGA',
 'AAAAU',
 'AAACA',
 'ACCAC',
 'CCUC',
 'AUGGU',
 'AACAA',
 'CGG',
 'UAUGG',
 'CUCU',
 'AAGAA',
 'CCAAA',
 'GGUU',
 'CAAAU',
 'GUCA',
 'CAGGC',
 'CAAGC',
 'CAAUG',
 'AUAA',
 'CCG',
 'GUGGU',
 'AUGGC',
 'CCAU',
 

In [None]:
# Pick one training sequence at random and display its token ids next to the
# raw sequence. No seed is set, so the chosen sequence varies between runs.
sequence = random.choice(list_of_sequences)
token_ids = sp.Encode(sequence)
(token_ids, sequence)

([1,
  74,
  448,
  97,
  330,
  103,
  154,
  258,
  182,
  273,
  138,
  10,
  815,
  336,
  690,
  6,
  33,
  328,
  385,
  51,
  221,
  283,
  582,
  6,
  66,
  18,
  281,
  128,
  698,
  519,
  720,
  192,
  4,
  60,
  717,
  368,
  11,
  68,
  387,
  215,
  163,
  269,
  437,
  849,
  63,
  440,
  295,
  169,
  10,
  24,
  171,
  619,
  6,
  819,
  136,
  97,
  794,
  374,
  32,
  112,
  573,
  103,
  17,
  199,
  584,
  656,
  197,
  106,
  257,
  100,
  638,
  44,
  250,
  557,
  16,
  301,
  14,
  168,
  379,
  70,
  868,
  165,
  496,
  222,
  160,
  246,
  27,
  195,
  538,
  273,
  372,
  223,
  394,
  133,
  563,
  4,
  56,
  133,
  413,
  353,
  111,
  739,
  230,
  487,
  358,
  95,
  14,
  568,
  413,
  32,
  159,
  72,
  11,
  358,
  101,
  107,
  129,
  632,
  96,
  857,
  280,
  82,
  19,
  11,
  470,
  504,
  528,
  42,
  22,
  676,
  427,
  92,
  76,
  285,
  709,
  451,
  80,
  300,
  92,
  267,
  4,
  22,
  138,
  243,
  467,
  173,
  810,
  563,
  441,
  209,
  