In [1]:
import numpy as np
import pandas as pd
import tqdm
import re

import nltk
from nltk import FreqDist
import itertools

from collections.abc import Sequence
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer

from nltk import word_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

from __future__ import unicode_literals
from hazm import *

In [2]:
# Directory that holds the two plain-text editions of the book.
file_dir = './data/'

# Decode both files as UTF-8 explicitly. Without `encoding=`, open() uses the
# platform's locale encoding, which can garble or reject the Persian text on
# machines whose default is not UTF-8.
with open(file_dir + 'hp_fa.txt', 'r', encoding='utf-8') as f:
    hp_fa = f.read()

with open(file_dir + 'hp_en.txt', 'r', encoding='utf-8') as f:
    hp_en = f.read()

# Part B

## White Space Tokenization

In [3]:
from nltk.tokenize import WhitespaceTokenizer

# Single whitespace tokenizer instance; the sentence-level cells further down
# reuse it through the name `tk`.
tk = WhitespaceTokenizer()

# Tokenize the full text of each book.
hp_en_tokenized = tk.tokenize(hp_en)
hp_fa_tokenized = tk.tokenize(hp_fa)

# Report the token counts (label typo "Persain" corrected to "Persian").
print("The number of extracted tokens(White Space Tokenization): English harry potter book", len(hp_en_tokenized))
print("------------------------------------------------------------------------------------------")
print("The number of extracted tokens(White Space Tokenization): Persian harry potter book", len(hp_fa_tokenized))

The number of extracted tokens(White Space Tokenization): English harry potter book 78443
------------------------------------------------------------------------------------------
The number of extracted tokens(White Space Tokenization): Persain harry potter book 96294


## Spacy Tokenizer

In [4]:
# NOTE: the former `from spacy.tokenizer import Tokenizer` was unused here and
# shadowed the `tokenizers.Tokenizer` imported in the first cell, so it has
# been dropped.
from spacy.lang.en import English
from spacy.lang.fa import Persian

# Language objects created without loading a trained model; calling one on a
# string returns a Doc whose items are the tokens.
nlp_English = English()
nlp_Persian = Persian()

# English book: materialise the Doc's tokens as a list.
tokens = nlp_English(hp_en)
hp_en_tokenized = list(tokens)

print("The number of extracted tokens(Spacy Tokenizer): English harry potter book", len(hp_en_tokenized))

print("----------------------------------------------------------------------------------")

# Persian book, same procedure. `tokens` is left bound to the Persian Doc,
# as it was before.
tokens = nlp_Persian(hp_fa)
hp_fa_tokenized = list(tokens)

print("The number of extracted tokens(Spacy Tokenizer): Persian harry potter book", len(hp_fa_tokenized))

The number of extracted tokens(Spacy Tokenizer): English harry potter book 102406
----------------------------------------------------------------------------------
The number of extracted tokens(Spacy Tokenizer): Persian harry potter book 125677


## (BPE) Subword Tokenization.

In [5]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

# Trainer configuration shared by every BPE model in this notebook. It stays
# bound to `trainer` because later cells refer to it by that name.
trainer = trainers.BpeTrainer(special_tokens=["<s>", "<pad>", "</s>", "<unk>"])


def train_bpe_tokenizer(corpus):
    """Build and train an independent BPE tokenizer on a single text.

    A new Tokenizer is created on every call instead of retraining one shared
    instance. Retraining the same object for a second language produced
    Persian encodings in which "." was emitted as "A", i.e. state from the
    English run leaked into the Persian one.
    NOTE(review): the leak is presumably the BPE model's encoding cache
    surviving the retrain - confirm against the installed `tokenizers` version.

    Parameters
    ----------
    corpus : str
        Full text to learn the merges from.

    Returns
    -------
    tokenizers.Tokenizer
        Tokenizer with a whitespace pre-tokenizer, trained on `corpus`.
    """
    bpe_tokenizer = Tokenizer(BPE(unk_token="<unk>"))
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    # NOTE(review): the decoder only matters for decode(), which this notebook
    # never calls; ByteLevel is normally paired with a ByteLevel pre-tokenizer,
    # so revisit this before relying on decode().
    bpe_tokenizer.decoder = decoders.ByteLevel()
    bpe_tokenizer.train_from_iterator([corpus], trainer)
    return bpe_tokenizer


# One tokenizer per language, each trained only on its own book.
tokenizer_en = train_bpe_tokenizer(hp_en)
hp_en_tokenized = tokenizer_en.encode(hp_en)

tokenizer_fa = train_bpe_tokenizer(hp_fa)
hp_fa_tokenized = tokenizer_fa.encode(hp_fa)

# Later cells expect a module-level `tokenizer`; as before, it ends this cell
# trained on the Persian book.
tokenizer = tokenizer_fa

print("The number of extracted tokens(BPE Tokenizer): English harry potter book", len(hp_en_tokenized.tokens))
print("--------------------------------------------------------------------------------")
print("The number of extracted tokens(BPE Tokenizer): Persian harry potter book", len(hp_fa_tokenized.tokens))







The number of extracted tokens(BPE Tokenizer): English harry potter book 100012
--------------------------------------------------------------------------------
The number of extracted tokens(BPE Tokenizer): Persian harry potter book 100932


# Part C

In [6]:
# Sample English text. The missing space after "algorithms." and the embedded
# newline are part of the input and are kept deliberately.
sen_en = (
    "This question is about tokenization and shows several tokenizer algorithms.Hopefully, you\n"
    "will be able to understand how they are trained and generate tokens."
)

# Sample Persian text (two sentences).
sen_fa = "این سوال در مورد قطعه بندی جملات است و چندین الگوریتم توکنایز کردن متن را نشان می دهد. امیدواریم بتوانید نحوه آموزش آنها و تولید توکن ها را درک کنید."
      

## White Space Tokenization

In [7]:
# Whitespace-tokenize the two sample sentences with the shared `tk` instance.
sen_en_tokenized = tk.tokenize(sen_en)
sen_fa_tokenized = tk.tokenize(sen_fa)

# Report the token counts (label typo "Persain" corrected to "Persian").
print("The number of extracted tokens(White Space Tokenization): English sentence", len(sen_en_tokenized))
print("------------------------------------------------------------------------------------------")
print("The number of extracted tokens(White Space Tokenization): Persian sentence", len(sen_fa_tokenized))

The number of extracted tokens(White Space Tokenization): English sentence 23
------------------------------------------------------------------------------------------
The number of extracted tokens(White Space Tokenization): Persain sentence 30


In [8]:
# Show the English whitespace tokens on one line, comma-separated.
print(", ".join(map(str, sen_en_tokenized)))

This, question, is, about, tokenization, and, shows, several, tokenizer, algorithms.Hopefully,, you, will, be, able, to, understand, how, they, are, trained, and, generate, tokens.


In [9]:
# Show the Persian whitespace tokens on one line, comma-separated.
print(", ".join(map(str, sen_fa_tokenized)))

این, سوال, در, مورد, قطعه, بندی, جملات, است, و, چندین, الگوریتم, توکنایز, کردن, متن, را, نشان, می, دهد., امیدواریم, بتوانید, نحوه, آموزش, آنها, و, تولید, توکن, ها, را, درک, کنید.


## Spacy Tokenizer

In [10]:
# Tokenize the sample sentences with the spaCy pipelines created earlier.
tokens = nlp_English(sen_en)
sen_en_tokenized = list(tokens)

# `tokens` is left bound to the Persian Doc, as it was before.
tokens = nlp_Persian(sen_fa)
sen_fa_tokenized = list(tokens)

# The labels previously read "White Space Tokenization" and "harry potter
# book"; this cell runs the spaCy tokenizer on the sample sentences.
print("The number of extracted tokens(Spacy Tokenizer): English sentence", len(sen_en_tokenized))
print("----------------------------------------------------------------------------------")
print("The number of extracted tokens(Spacy Tokenizer): Persian sentence", len(sen_fa_tokenized))

The number of extracted tokens(White Space Tokenization): English sentence 28
----------------------------------------------------------------------------------
The number of extracted tokens(Spacy Tokenizer): Persian harry potter book 33


In [11]:
# Show the English spaCy tokens on one line; each token is rendered via str().
print(", ".join(map(str, sen_en_tokenized)))

This, question, is, about, tokenization, and, shows, several, tokenizer, algorithms, ., Hopefully, ,, you, 
, will, be, able, to, understand, how, they, are, trained, and, generate, tokens, .


In [12]:
# Show the Persian spaCy tokens on one line; each token is rendered via str().
print(", ".join(map(str, sen_fa_tokenized)))

این, سوال, در, مورد, قطعه, بندی, جملات, است, و, چندین, الگوریتم, توکنایز, کردن, متن, را, نشان, می, دهد, ., امیدوار, یم, بتوانید, نحوه, آموزش, آنها, و, تولید, توکن, ها, را, درک, کنید, .


## BPE

In [13]:
def fresh_bpe_tokenizer(corpus):
    """Train a brand-new BPE tokenizer on `corpus` and return it.

    The sample sentences were previously encoded by retraining one shared
    Tokenizer, first on the English book and then on the Persian one. The
    Persian result then contained "A" where the sentence has ".", i.e. state
    from the English run leaked into the Persian encoding.
    NOTE(review): presumably the BPE model's encoding cache is not reset by
    retraining - confirm against the installed `tokenizers` version.

    Parameters
    ----------
    corpus : str
        Text used to learn the merges.

    Returns
    -------
    tokenizers.Tokenizer
        Newly created tokenizer trained with the notebook-wide `trainer`.
    """
    bpe = Tokenizer(BPE(unk_token="<unk>"))
    bpe.pre_tokenizer = pre_tokenizers.Whitespace()
    bpe.decoder = decoders.ByteLevel()
    bpe.train_from_iterator([corpus], trainer)
    return bpe


# English sentence, encoded by a tokenizer that has only ever seen English.
sen_en_tokenized = fresh_bpe_tokenizer(hp_en).encode(sen_en)

# Persian sentence, encoded by a tokenizer that has only ever seen Persian.
# `tokenizer` is rebound so that, as before, it ends this cell trained on the
# Persian book.
# NOTE(review): "<unk>" pieces in the Persian result suggest the sentence uses
# a code point for the letter yeh that the book does not contain; consider
# normalising both texts the same way - confirm.
tokenizer = fresh_bpe_tokenizer(hp_fa)
sen_fa_tokenized = tokenizer.encode(sen_fa)

print("The number of extracted tokens(BPE Tokenizer): English sentence", len(sen_en_tokenized.tokens))
print("--------------------------------------------------------------------------------")
print("The number of extracted tokens(BPE Tokenizer): Persian sentence", len(sen_fa_tokenized.tokens))







The number of extracted tokens(BPE Tokenizer): English sentence 42
--------------------------------------------------------------------------------
The number of extracted tokens(BPE Tokenizer): Persian sentence 63


In [14]:
# Copy the English BPE token strings into `l` and show them comma-separated.
l = [*sen_en_tokenized.tokens]
print(", ".join(map(str, l)))

This, question, is, about, to, ken, i, z, ation, and, shows, several, to, ken, i, z, er, al, gor, ith, ms, ., Hopefully, ,, you, will, be, able, to, understand, how, they, are, trained, and, g, en, er, ate, to, kens, .


In [15]:
# Copy the Persian BPE token strings into `l` and show them comma-separated.
l = [*sen_fa_tokenized.tokens]
print(", ".join(map(str, l)))

ا, <unk>, ن, سو, ال, در, مورد, قط, عه, بند, <unk>, جم, لات, است, و, چند, <unk>, ن, ال, گور, <unk>, تم, تو, کن, ا, <unk>, ز, کردن, متن, را, نشان, م, <unk>, دهد, A, ام, <unk>, دو, ار, <unk>, م, ب, توان, <unk>, د, ن, حو, ه, آموزش, آنها, و, تول, <unk>, د, تو, کن, ها, را, درک, کن, <unk>, د, A
