<a href="https://colab.research.google.com/github/elias-ela/amharic-tokenizer/blob/main/Amharic_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install sentencepiece



In [15]:
import sentencepiece as spm
import time

In [16]:
# Mount Google Drive at /content/drive/ so that files kept under MyDrive are
# reachable from this runtime. google.colab is only available inside Colab.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [20]:
# Corpus file used both for the statistics and as SentencePiece training input.
# NOTE(review): the path is relative, so it only resolves when the working
# directory is /content (the parent of the Drive mount point) — confirm, or
# make the path absolute.
dataPath = 'drive/MyDrive/Colab_Notebooks/am-mono'
print(dataPath)

drive/MyDrive/Colab_Notebooks/am-mono


# **Function that counts the number of lines, blank lines, sentences, and words in a given file.**

In [21]:
def processFile(filename):
  """Print line, blank-line, sentence and word counts for a text file.

  Args:
    filename: Path of the text file to scan. It is decoded as UTF-8.

  Returns:
    None. The statistics are printed to stdout. If the file cannot be
    opened, an error message is printed and no statistics are reported.
  """
  lines, blanklines, sentences, words = 0, 0, 0, 0

  try:
    # Open the file that was actually passed in, with an explicit encoding so
    # Ethiopic text does not depend on the platform's default codec.
    file = open(filename, 'r', encoding='utf-8')
  except OSError:
    print(f"Unable to open file '{filename}'")
    # Nothing to count; stop here instead of touching an unbound file handle.
    return

  # The context manager closes the file even if reading fails part-way.
  with file:
    for line in file:
      lines += 1

      if line == '\n':
        # Only a completely empty line counts as blank; whitespace-only lines
        # fall through to the sentence count, where they contribute nothing.
        blanklines += 1
      else:
        # A sentence is assumed to end with the Ethiopic full stop '።'
        # or with a question mark '?'.
        sentences += line.count('።') + line.count('?')

      # Words are whitespace-separated tokens (a blank line yields none).
      words += len(line.split())

  print("-" * 50)
  print(f"Lines         : {lines:,d}")
  print(f"Blank lines   : {blanklines:,d}")
  print(f"Sentences     : {sentences:,d}")
  print(f"Words         : {words:,d}")

In [22]:
# Print the line / blank-line / sentence / word statistics for the corpus file.
processFile(dataPath)

--------------------------------------------------
Lines         : 1,418,958
Blank lines   : 1,466
Sentences     : 1,354,112
Words         : 22,222,977


# **Train sentencePiece for Amharic using BPE (Byte Pair Encoding)**

In [23]:
# Train a BPE SentencePiece model on the corpus and report the elapsed time.
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# result cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
spm.SentencePieceTrainer.train(
    input=dataPath,
    model_prefix='am-bpe',      # 'am-bpe.model' is loaded by the following cell
    vocab_size=16000,
    model_type="bpe",
    character_coverage=1.0,
)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 76.94041609764099 seconds ---


In [24]:
# Load the BPE model file 'am-bpe.model' and tokenize a sample Amharic sentence.
# out_type=str requests the pieces as strings; the resulting list is the
# cell's displayed value, so this call must stay the last expression.
model = spm.SentencePieceProcessor(model_file='am-bpe.model')
model.encode('ለአማርኛ ተናጋሪዎች የቀረበ መረጃ።', out_type=str)

['▁ለአ', 'ማርኛ', '▁ተና', 'ጋ', 'ሪዎች', '▁የቀረበ', '▁መረጃ', '።']

# **Train sentencePiece for Amharic using Unigram.**

In [25]:
# Train a unigram SentencePiece model on the corpus and report the elapsed time.
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# result cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
spm.SentencePieceTrainer.train(
    input=dataPath,
    model_prefix='am-unigram',  # 'am-unigram.model' is loaded by the following cell
    vocab_size=16000,
    model_type="unigram",
    character_coverage=1.0,
)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 266.321985244751 seconds ---


In [26]:
# Load the unigram model file 'am-unigram.model' and tokenize the same sample
# sentence. out_type=str requests the pieces as strings; the resulting list is
# the cell's displayed value, so this call must stay the last expression.
model = spm.SentencePieceProcessor(model_file='am-unigram.model')
model.encode('ለአማርኛ ተናጋሪዎች የቀረበ መረጃ።', out_type=str)

['▁ለአ', 'ማር', 'ኛ', '▁ተናጋሪ', 'ዎች', '▁የቀረበ', '▁መረጃ', '።']

# **Train sentencePiece for Amharic using characters**

In [27]:
# Train a character-level SentencePiece model on the corpus and report the
# elapsed time. perf_counter() is a monotonic clock meant for measuring
# intervals, so the result cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
spm.SentencePieceTrainer.train(
    input=dataPath,
    model_prefix='am-char',     # 'am-char.model' is loaded by the following cell
    vocab_size=16000,
    model_type="char",
    character_coverage=1.0,
)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 16.90510058403015 seconds ---


In [28]:
# Load the character model file 'am-char.model' and tokenize the same sample
# sentence. out_type=str requests the pieces as strings; the resulting list is
# the cell's displayed value, so this call must stay the last expression.
model = spm.SentencePieceProcessor(model_file='am-char.model')
model.encode('ለአማርኛ ተናጋሪዎች የቀረበ መረጃ።', out_type=str)

['▁',
 'ለ',
 'አ',
 'ማ',
 'ር',
 'ኛ',
 '▁',
 'ተ',
 'ና',
 'ጋ',
 'ሪ',
 'ዎ',
 'ች',
 '▁',
 'የ',
 'ቀ',
 'ረ',
 'በ',
 '▁',
 'መ',
 'ረ',
 'ጃ',
 '።']

# **Train sentencePiece for Amharic using words**

In [29]:
# Train a word-level SentencePiece model on the corpus and report the elapsed
# time. perf_counter() is a monotonic clock meant for measuring intervals, so
# the result cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
spm.SentencePieceTrainer.train(
    input=dataPath,
    model_prefix='am-word',     # 'am-word.model' is loaded by the following cell
    vocab_size=16000,
    model_type="word",
    character_coverage=1.0,
)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 24.534483909606934 seconds ---


In [37]:
# Load the word model file 'am-word.model' and tokenize the same sample
# sentence. out_type=str requests the pieces as strings; the resulting list is
# the cell's displayed value, so this call must stay the last expression.
# NOTE(review): the recorded output fuses the first two words into a single
# piece ('▁ለአማርኛ▁ተናጋሪዎች') — presumably adjacent out-of-vocabulary words
# being merged; confirm before relying on word-level output.
model = spm.SentencePieceProcessor(model_file='am-word.model')
model.encode('ለአማርኛ ተናጋሪዎች የቀረበ መረጃ።', out_type=str)

['▁ለአማርኛ▁ተናጋሪዎች', '▁የቀረበ', '▁መረጃ።']