<a href="https://colab.research.google.com/github/elias-ela/amharic-tokenizer/blob/main/Amharic_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece

In [38]:
import sentencepiece as spm
import time

In [39]:
# Path of the Amharic text corpus. It is the file whose statistics are
# reported by processFile and the `input` given to every
# SentencePieceTrainer.train call in this notebook.
dataPath = '/content/mono-am'

# **Function that counts the number of lines, blank lines, sentences, and words in a given file.**

In [40]:
def processFile(filename):
  """Print line, blank-line, sentence and word counts for a text file.

  Args:
    filename: Path of the UTF-8 encoded text file to analyse.

  Returns:
    None. The counts are written to stdout. If the file cannot be opened,
    an error message is printed instead and nothing is counted.

  Notes:
    * A line is "blank" only if it consists of the newline character alone.
    * Sentences are approximated by counting the Ethiopic full stop
      ('።') and the question mark ('?') on non-blank lines.
    * Words are whitespace-separated tokens.
  """
  lines, blanklines, sentences, words = 0, 0, 0, 0

  try:
    # Open the file that was actually requested (not a global path), and
    # decode as UTF-8 explicitly so Ethiopic text is read the same way on
    # every platform.
    file = open(filename, 'r', encoding='utf-8')
  except OSError:
    print("Unable to open file '%s'" % filename)
    # Nothing to count; returning here avoids using an unbound handle.
    return

  # The context manager closes the handle even if reading fails midway.
  with file:
    for line in file:
      lines += 1

      if line.startswith('\n'):
        blanklines += 1
      else:
        # Assumes every sentence ends with '።' or '?'.
        sentences += line.count('።') + line.count('?')

      words += len(line.split())

  print("-"*50)
  print("Lines         : ", lines)
  print("Blank lines   : ", blanklines)
  print("Sentences     : ", sentences)
  print("Words         : ", words)

In [41]:
# Print line, blank-line, sentence and word counts for the corpus.
processFile(dataPath)

--------------------------------------------------
Lines         :  19873
Blank lines   :  1474
Sentences     :  38298
Words         :  399339


# **Train sentencePiece for Amharic using BPE (Byte Pair Encoding)**

In [42]:
# Train a BPE model with a 16,000-entry vocabulary on the corpus; the result
# is saved under the 'am-bpe' prefix (am-bpe.model is loaded in the next cell).
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()
spm.SentencePieceTrainer.train(
    input=dataPath,
    model_prefix='am-bpe',
    vocab_size=16000,
    model_type="bpe",
    character_coverage=1.0,
)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 9.052777528762817 seconds ---


In [43]:
# Load the trained BPE model and display how it segments a sample sentence.
model = spm.SentencePieceProcessor(
    model_file='am-bpe.model',
)
model.encode(
    'ለአማርኛ ተናጋሪዎች የቀረበ መረጃ',
    out_type=str,  # return the pieces as strings
)

['▁ለአ', 'ማርኛ', '▁ተናጋሪ', 'ዎች', '▁የቀረ', 'በ', '▁መረጃ']

# **Train sentencePiece for Amharic using Unigram.**

In [44]:
# Train a unigram model with a 16,000-entry vocabulary on the corpus; the
# result is saved under the 'am-unigram' prefix (am-unigram.model is loaded
# in the next cell).
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()
spm.SentencePieceTrainer.train(
    input=dataPath,
    model_prefix='am-unigram',
    vocab_size=16000,
    model_type="unigram",
    character_coverage=1.0,
)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 20.374517917633057 seconds ---


In [45]:
# Load the trained unigram model and display how it segments a sample sentence.
model = spm.SentencePieceProcessor(
    model_file='am-unigram.model',
)
model.encode(
    'ለአማርኛ ተናጋሪዎች የቀረበ መረጃ',
    out_type=str,  # return the pieces as strings
)

['▁ለ', 'አማርኛ', '▁ተናጋሪዎች', '▁የቀረበ', '▁መረጃ']

# **Train sentencePiece for Amharic using characters**

In [46]:
# Train a character-level model on the corpus; the result is saved under the
# 'am-char' prefix (am-char.model is loaded in the next cell).
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()
spm.SentencePieceTrainer.train(
    input=dataPath,
    model_prefix='am-char',
    vocab_size=16000,
    model_type="char",
    character_coverage=1.0,
)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 0.26238393783569336 seconds ---


In [47]:
# Load the trained character model and display how it segments a sample sentence.
model = spm.SentencePieceProcessor(
    model_file='am-char.model',
)
model.encode(
    'ለአማርኛ ተናጋሪዎች የቀረበ መረጃ',
    out_type=str,  # return the pieces as strings
)

['▁',
 'ለ',
 'አ',
 'ማ',
 'ር',
 'ኛ',
 '▁',
 'ተ',
 'ና',
 'ጋ',
 'ሪ',
 'ዎ',
 'ች',
 '▁',
 'የ',
 'ቀ',
 'ረ',
 'በ',
 '▁',
 'መ',
 'ረ',
 'ጃ']

# **Train sentencePiece for Amharic using words**

In [48]:
# Train a word-level model with a 16,000-entry vocabulary on the corpus; the
# result is saved under the 'am-word' prefix (am-word.model is loaded in the
# next cell).
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()
spm.SentencePieceTrainer.train(
    input=dataPath,
    model_prefix='am-word',
    vocab_size=16000,
    model_type="word",
    character_coverage=1.0,
)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 0.4852330684661865 seconds ---


In [49]:
# Load the trained word model and display how it segments a sample sentence.
model = spm.SentencePieceProcessor(
    model_file='am-word.model',
)
model.encode(
    'ለአማርኛ ተናጋሪዎች የቀረበ መረጃ',
    out_type=str,  # return the pieces as strings
)

['▁ለአማርኛ', '▁ተናጋሪዎች', '▁የቀረበ', '▁መረጃ']