In [1]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

# Create a tokenizer instance
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

# Set pre-tokenizer to handle whitespace splitting
tokenizer.pre_tokenizer = Whitespace()

# Create a trainer instance with special tokens
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [13]:
import pandas as pd

df = pd.read_csv('s2-in.txt', sep='\t')
#text = """Machine learning is the study of computer algorithms that improve automatically through experience it is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or infeasible to develop conventional algorithms to perform the needed tasks."""
text = df['Text'].str.cat(sep='\n')

transliteration_dict = {
    '>': 'ܐ',
    'B': 'ܒ',
    'G': 'ܓ',
    'D': 'ܕ',
    'H': 'ܗ',
    'W': 'ܘ',
    'Z': 'ܙ',
    'X': 'ܚ',
    'V': 'ܛ',
    'J': 'ܝ',
    'K': 'ܟ',
    'L': 'ܠ',
    'M': 'ܡ',
    'N': 'ܢ',
    'S': 'ܣ',
    '<': 'ܥ',
    'P': 'ܦ',
    'Y': 'ܨ',
    'Q': 'ܩ',
    'R': 'ܪ',
    'C': 'ܫ',
    'T': 'ܬ',
    '"': '',
    '#': '',
    '^': ''
    }

def transliterate(text, transliteration_dict):
    transliterated_text = ""
    i = 0
    while i < len(text):
        match = False
        # Check for multi-character transcriptions
        for length in range(2, 0, -1):  # Check for matches of length 2, 1
            if text[i:i+length] in transliteration_dict:
                transliterated_text += transliteration_dict[text[i:i+length]]
                i += length
                match = True
                break
        if not match:
            # If no match found, keep the original character
            transliterated_text += text[i]
            i += 1
    return transliterated_text

transliterated_text = transliterate(text, transliteration_dict)
print (transliterated_text)

ܘܡܠܟܐ ܕܘܝܕ ܣܐܒ ܘܥܠ ܒܫܢܝܐ ܘܡܟܣܝܢ ܗܘܘ ܠܗ ܒܠܒܘܫܐ ܘܠܐ ܫܚܢ
ܘܐܡܪܘ ܠܗ ܥܒܕܘܗܝ ܗܐ ܥܒܕܝܟ ܩܕܡܝܟ ܢܒܥܘܢ ܠܡܪܢ ܡܠܟܐ ܥܠܝܡܬܐ ܒܬܘܠܬܐ ܘܬܩܘܡ ܩܕܡ ܡܠܟܐ ܘܬܗܘܐ ܠܗ ܡܫܡܫܢܝܬܐ ܘܬܫܟܒ ܒܥܘܒܟ ܘܢܫܚܢ ܠܡܪܢ ܡܠܟܐ
ܘܒܥܘ ܥܠܝܡܬܐ ܕܫܦܝܪܐ ܒܟܠܗ ܬܚܘܡܐ ܕܐܝܣܪܝܠ ܘܐܫܟܚܘ ܠܐܒܝܫܓ ܫܝܠܘܡܝܬܐ ܘܐܝܬܝܘܗ ܠܡܠܟܐ
ܘܥܠܝܡܬܐ ܫܦܝܪܐ ܗܘܬ ܒܚܙܘܗ ܛܒ ܘܗܘܬ ܠܡܠܟܐ ܡܫܡܫܢܝܬܐ ܘܡܫܡܫܐ ܠܗ ܘܡܠܟܐ ܠܐ ܝܕܥܗ
ܘܐܕܘܢܝܐ ܒܪ ܚܓܝܬ ܡܬܪܘܪܒ ܘܐܡܪ ܐܢܐ ܐܡܠܟ ܘܥܒܕ ܠܗ ܡܪܟܒܬܐ ܘܦܪܫܐ ܘܚܡܫܝܢ ܓܒܪܝܢ ܕܪܗܛܝܢ ܗܘܘ ܩܕܡܘܗܝ
ܘܠܐ ܟܐܐ ܒܗ ܐܒܘܗܝ ܡܢ ܝܘܡܘܗܝ ܘܐܡܪ ܠܗ ܡܛܠ ܡܢܐ ܗܟܢܐ ܥܒܕ ܐܢܬ ܘܐܦ ܗܘ ܫܦܝܪ ܗܘܐ ܒܚܙܘܗ ܛܒ ܘܠܗ ܝܠܕܬ ܒܬܪ ܐܒܫܠܘܡ
ܘܗܘܘ ܦܬܓܡܘܗܝ ܥܡ ܝܘܐܒ ܒܪ ܨܘܪܝܐ ܘܥܡ ܐܒܝܬܪ ܟܗܢܐ ܘܡܥܕܪܝܢ ܒܬܪ ܐܕܘܢܝܐ
ܘܨܕܘܩ ܟܗܢܐ ܘܒܢܝܐ ܒܪ ܝܘܝܕܥ ܘܢܬܢ ܢܒܝܐ ܘܫܡܥܝ ܘܕܥܝ ܘܓܢܒܪܐ ܕܕܘܝܕ ܠܐ ܗܘܘ ܒܬܪ ܐܕܘܢܝܐ
ܘܕܒܚ ܐܕܘܢܝܐ ܥܢܐ ܘܬܘܪܐ ܘܡܦܛܡܐ ܥܠ ܟܐܦܐ ܪܒܬܐ ܕܥܠ ܓܒ ܥܝܢ ܩܨܪܐ ܘܩܪܐ ܠܟܠܗܘܢ ܐܚܘܗܝ ܒܢܝ ܡܠܟܐ ܘܠܟܠܗܘܢ ܕܒܝܬ ܝܗܘܕܐ ܘܠܥܒܕܝ ܡܠܟܐ
ܘܠܢܬܢ ܢܒܝܐ ܘܠܒܢܝܐ ܒܪ ܝܘܝܕܥ ܘܠܓܢܒܪܐ ܕܕܘܝܕ ܘܠܫܠܝܡܘܢ ܐܚܘܗܝ ܠܐ ܩܪܐ
ܘܐܡܪ ܢܬܢ ܢܒܝܐ ܠܒܬܫܒܥ ܐܡܗ ܕܫܠܝܡܘܢ ܠܐ ܫܡܥܬܝ ܕܐܡܠܟ ܐܕܘܢܝܐ ܘܡܪܢ ܕܘܝܕ ܠܐ ܝܕܥ
ܗܫܐ ܬܝ ܐܡܠܟܟܝ ܡܠܟܐ ܘܦܠܛܝ ܢܦܫܟܝ ܘܢܦܫܐ ܕܒܪܟܝ ܫܠܝܡܘܢ
ܙܠܝ ܥܘܠܝ ܠܘܬ ܡܠܟܐ ܕܘܝܕ ܘܐܡܪܝ ܠܗ ܠܐ ܐܢܬ ܡܪܝ ܡܠ

In [20]:


import sentencepiece as spm
import io
import tempfile

texts = [text]
text_data = io.StringIO(text)

# Create a temporary file to write the text data
with tempfile.NamedTemporaryFile('w', delete=False) as temp_file:
    temp_file_name = temp_file.name
    temp_file.write(text)

# Train the SentencePiece model using the temporary file
spm.SentencePieceTrainer.train(input=temp_file_name, model_prefix='syriac', vocab_size=50, character_coverage=1.0, model_type='bpe')

sp = spm.SentencePieceProcessor(model_file='syriac.model')

# Tokenize the text
tokens = sp.encode("""ܘܡܠܟܐ ܕܘܝܕ ܣܐܒ ܘܥܠ ܒܫܢܝܐ ܘܡܟܣܝܢ ܗܘܘ ܠܗ ܒܠܒܘܫܐ ܘܠܐ ܫܚܢ""", out_type=str)
ids = sp.encode(text, out_type=int)



Tokens: ['▁', 'ܘܡܠܟܐ', '▁', 'ܕܘܝܕ', '▁', 'ܣܐܒ', '▁', 'ܘܥܠ', '▁', 'ܒܫܢܝܐ', '▁', 'ܘܡܟܣܝܢ', '▁', 'ܗܘܘ', '▁', 'ܠܗ', '▁', 'ܒܠܒܘܫܐ', '▁', 'ܘܠܐ', '▁', 'ܫܚܢ']
IDs: [3, 30, 29, 36, 25, 4, 26, 27, 31, 24, 43, 25, 33, 3, 37, 29, 8, 39, 38, 28, 27, 25, 3, 30, 36, 43, 10, 13, 26, 26, 6, 35, 8, 29, 33, 26, 39, 38, 25, 3, 29, 25, 24, 39, 40, 28, 3, 25, 21, 26, 6, 35, 11, 33, 38, 31, 26, 20, 13, 25, 11, 33, 38, 31, 27, 36, 24, 41, 31, 30, 27, 36, 15, 33, 37, 9, 6, 21, 28, 7, 29, 36, 25, 11, 29, 27, 30, 34, 25, 8, 34, 26, 29, 34, 25, 3, 34, 41, 26, 30, 24, 41, 31, 30, 7, 29, 36, 25, 3, 34, 35, 26, 25, 6, 35, 7, 39, 30, 39, 28, 14, 25, 3, 34, 39, 36, 33, 8, 37, 26, 33, 36, 3, 28, 39, 40, 28, 6, 21, 28, 7, 29, 36, 25, 3, 33, 37, 26, 11, 29, 27, 30, 34, 25, 4, 39, 42, 27, 32, 25, 8, 36, 17, 18, 40, 26, 30, 25, 4, 25, 27, 43, 12, 29, 3, 25, 39, 36, 40, 26, 6, 25, 33, 27, 39, 44, 24, 39, 27, 29, 26, 30, 14, 25, 3, 25, 14, 27, 26, 35, 6, 30, 29, 36, 25, 3, 37, 29, 27, 30, 34, 25, 24, 39, 42, 27, 32, 25, 13, 

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /var/folders/39/_gjk_0hs4pdf8rktbslyljxr0000gn/T/tmpw6rs1wk7
  input_format: 
  model_prefix: syriac
  model_type: BPE
  vocab_size: 50
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  ena

In [21]:
print("Tokens:", tokens)
print("IDs:", ids)

Tokens: ['▁', 'ܘܡܠܟܐ', '▁', 'ܕܘܝܕ', '▁', 'ܣܐܒ', '▁', 'ܘܥܠ', '▁', 'ܒܫܢܝܐ', '▁', 'ܘܡܟܣܝܢ', '▁', 'ܗܘܘ', '▁', 'ܠܗ', '▁', 'ܒܠܒܘܫܐ', '▁', 'ܘܠܐ', '▁', 'ܫܚܢ']
IDs: [3, 30, 29, 36, 25, 4, 26, 27, 31, 24, 43, 25, 33, 3, 37, 29, 8, 39, 38, 28, 27, 25, 3, 30, 36, 43, 10, 13, 26, 26, 6, 35, 8, 29, 33, 26, 39, 38, 25, 3, 29, 25, 24, 39, 40, 28, 3, 25, 21, 26, 6, 35, 11, 33, 38, 31, 26, 20, 13, 25, 11, 33, 38, 31, 27, 36, 24, 41, 31, 30, 27, 36, 15, 33, 37, 9, 6, 21, 28, 7, 29, 36, 25, 11, 29, 27, 30, 34, 25, 8, 34, 26, 29, 34, 25, 3, 34, 41, 26, 30, 24, 41, 31, 30, 7, 29, 36, 25, 3, 34, 35, 26, 25, 6, 35, 7, 39, 30, 39, 28, 14, 25, 3, 34, 39, 36, 33, 8, 37, 26, 33, 36, 3, 28, 39, 40, 28, 6, 21, 28, 7, 29, 36, 25, 3, 33, 37, 26, 11, 29, 27, 30, 34, 25, 4, 39, 42, 27, 32, 25, 8, 36, 17, 18, 40, 26, 30, 25, 4, 25, 27, 43, 12, 29, 3, 25, 39, 36, 40, 26, 6, 25, 33, 27, 39, 44, 24, 39, 27, 29, 26, 30, 14, 25, 3, 25, 14, 27, 26, 35, 6, 30, 29, 36, 25, 3, 37, 29, 27, 30, 34, 25, 24, 39, 42, 27, 32, 25, 13, 