In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from pathlib import Path
from collections import Counter
from scripts.transliteration_tokenizers import create_source_target_tokenizers

In [2]:
cur_dir = Path.cwd()
data_dir = cur_dir / "data"
raw_data_dir = data_dir / "raw_data"
proc_data_dir = data_dir / "processed_data"

sample_file = raw_data_dir / "sample.tsv"

train_file = proc_data_dir / "train_clean.tsv"
dev_file = raw_data_dir / "te.translit.sampled.dev.tsv"
test_file = raw_data_dir / "te.translit.sampled.test.tsv"

weighted_sample_file = proc_data_dir / "weighted_sample.tsv"
max_sample_file = proc_data_dir / "max_sample.tsv"
repeat_sample_file = proc_data_dir / "repeat_sample.tsv"

weighted_dev_file = proc_data_dir / "weighted_dev.tsv"
max_dev_file = proc_data_dir / "max_dev.tsv"
repeat_dev_file = proc_data_dir / "repeat_dev.tsv"

weighted_train_file = proc_data_dir / "weighted_train.tsv"
max_train_file = proc_data_dir / "max_train.tsv"
repeat_train_file = proc_data_dir / "repeat_train.tsv"

weighted_test_file = proc_data_dir / "weighted_test.tsv"
max_test_file = proc_data_dir / "max_test.tsv"
repeat_test_file = proc_data_dir / "repeat_test.tsv"

tgt_corpus_file =  proc_data_dir / "target_corpus.txt"
src_corpus_file = proc_data_dir / "source_corpus.txt"

In [3]:
with open(src_corpus_file, 'r',encoding='utf-8') as file:
    src_corpus = file.read()
    
with open(tgt_corpus_file, 'r',encoding='utf-8') as file:
    tgt_corpus = file.read()    

In [4]:
src_corpus_keys =set(Counter(src_corpus.replace(" ",'')).keys())
print(sorted(src_corpus_keys))
len(src_corpus_keys)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


26

In [5]:
tgt_corpus_keys =set(Counter(tgt_corpus.replace(" ",'')).keys())
print(sorted(tgt_corpus_keys))
len(tgt_corpus_keys)

['ం', 'ః', 'అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ఱ', 'ల', 'ళ', 'వ', 'శ', 'ష', 'స', 'హ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్']


62

### The minimum number of tokens must be number of unique tokens + 4(one each for start, end, unk and pad tokens). Even if we use lesser number of tokens, it still defaults to that

In [6]:
src_vocab_size = 24
tgt_vocab_size = 24

src_tokenizer, tgt_tokenizer = create_source_target_tokenizers(src_corpus_file,tgt_corpus_file, src_vocab_size,tgt_vocab_size)
src_tokenizer.get_vocab_size(),tgt_tokenizer.get_vocab_size()

(30, 66)

In [7]:
src_vocab_size = 100
tgt_vocab_size = 100

src_tokenizer, tgt_tokenizer = create_source_target_tokenizers(src_corpus_file,tgt_corpus_file, src_vocab_size,tgt_vocab_size)
src_tokenizer.get_vocab_size(),tgt_tokenizer.get_vocab_size()

(100, 100)

In [8]:
print(src_tokenizer.get_vocab().keys())
print("\n")
print(tgt_tokenizer.get_vocab().keys())

dict_keys(['sh', 'ak', 'ina', 'ra', 'vi', 'ani', 'du', 'bh', 'ag', 'aku', 'sth', 'aan', 'aru', 'ram', 'ti', 'ar', '</s>', 'lo', 'el', 'adh', 'm', 'd', 'p', 'oo', 'v', 'dh', 'ay', 't', 'l', 's', 'aal', 'ae', '<unk>', 'b', 'as', 'uu', 'unn', 'o', 'sam', 'alu', 'pra', 'w', 'at', 'av', '<s>', 'di', 'c', 'gaa', 'un', 'st', 'ath', 'ah', 'che', 'ari', 'ik', 'x', 'th', 'inch', 'i', 'ri', 'pr', 'a', 'y', '<pad>', 'f', 'q', 'al', 'en', 'on', 'z', 'aay', 'g', 'ulu', 'ni', 'ul', 'in', 'aar', 'ap', 'ki', 'im', 'it', 'u', 'uk', 'ru', 'il', 'e', 'r', 'aa', 'anu', 'ad', 'ut', 'j', 'aaru', 'ee', 'h', 'k', 'an', 'am', 'ch', 'n'])


dict_keys(['ై', 'ఆ', 'థ', 'ఋ', 'కు', 'ల', 'పు', 'అ', 'తు', 'ె', 'ఢ', 'ా', 'ిం', 'న్న', 'లు', 'ది', '<pad>', 'ఞ', 'ు', 'ప', 'ార', 'ద', 'వ', 'డ', 'శ', 'ఐ', 'ో', 'ష', 'ప్ర', 'ల్', 'ఈ', 'ఏ', 'ఔ', 'ఝ', 'ర', 'డు', 'రా', 'క', '</s>', 'ఛ', 'ఠ', 'ి', 'న్', 'ఓ', 'ొ', 'రి', 'రు', '<s>', 'చ', 'మ', '్', 'ృ', 'ించ', 'గా', 'లో', 'ట్', 'ారు', 'ట', 'ఉ', 'ఒ', 'ను', 'ఫ', 'ర్', '్య', 'కి', 'వి',

In [9]:
src_vocab_size = 30
tgt_vocab_size = 66

src_tokenizer, tgt_tokenizer = create_source_target_tokenizers(src_corpus_file,tgt_corpus_file, src_vocab_size,tgt_vocab_size)
src_tokenizer.get_vocab_size(),tgt_tokenizer.get_vocab_size()

(30, 66)

In [10]:
src_tokenizer_keys = set(src_tokenizer.get_vocab().keys())
tgt_tokenizer_keys = set(tgt_tokenizer.get_vocab().keys())

In [11]:
src_tokenizer_keys^src_corpus_keys,src_corpus_keys-src_tokenizer_keys

({'</s>', '<pad>', '<s>', '<unk>'}, set())

In [12]:
tgt_tokenizer_keys^tgt_corpus_keys,tgt_corpus_keys-tgt_tokenizer_keys

({'</s>', '<pad>', '<s>', '<unk>'}, set())

# So, tokenizer keys are a superset of corpus keys at those given values and only additional elements in them are start,end,unk and pad tokens. So, at the given values it acts like a character level tokenizer and for values higher than that it acts like a byte pair tokenizer

In [13]:
df_sample= pd.read_csv(sample_file, sep='\t',header = None , names=["target","source", "frequency"])
df_sample.head()

Unnamed: 0,target,source,frequency
0,అంక,amka,1
1,అంక,anka,3
2,అంకం,amkam,1
3,అంకం,ankam,2
4,అంగీకరించ,amgiikarimcha,1


In [14]:
src_encoding = src_tokenizer.encode(df_sample.source[0])
src_encoding

Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
src_encoding.ids,src_encoding.tokens

([1, 4, 16, 14, 4, 2], ['<s>', 'a', 'm', 'k', 'a', '</s>'])

In [16]:
for encoding in src_tokenizer.encode_batch(df_sample.sample(5).source.tolist()):
    print(encoding.tokens)

['<s>', 'a', 'n', 'k', 'a', 'm', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'a', 'n', 'g', 'e', 'e', 'k', 'a', 'r', 'i', 'n', 'c', 'h', 'a', 'd', 'a', 'm', '</s>', '<pad>', '<pad>']
['<s>', 'a', 'n', 'g', 'i', 'k', 'a', 'r', 'i', 'n', 'c', 'h', 'a', 'n', 'i', '</s>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'a', 'n', 'g', 'i', 'k', 'a', 'r', 'i', 'n', 'c', 'h', 'a', 'd', 'a', 'n', 'i', 'k', 'i', '</s>']
['<s>', 'a', 'n', 't', 'h', 'a', 'm', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [17]:
tgt_encoding = tgt_tokenizer.encode(df_sample.target[0])
tgt_encoding

Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [18]:
tgt_encoding.ids,tgt_encoding.tokens

([1, 6, 4, 19, 2], ['<s>', 'అ', 'ం', 'క', '</s>'])

In [19]:
for encoding in tgt_tokenizer.encode_batch(df_sample.sample(5).target.tolist()):
    print(encoding.tokens)

['<s>', 'అ', 'ం', 'గ', 'ీ', 'క', 'ర', 'ి', 'ం', 'చ', 'న', 'ి', '</s>', '<pad>', '<pad>']
['<s>', 'అ', 'ం', 'త', 'మ', 'ు', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'అ', 'ం', 'గ', 'ీ', 'క', 'ర', 'ి', 'ం', 'చ', 'ా', 'ర', 'ు', '</s>', '<pad>']
['<s>', 'అ', 'ం', 'గ', 'ీ', 'క', 'ర', 'ి', 'ం', 'చ', '</s>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'అ', 'ం', 'గ', 'ీ', 'క', 'ర', 'ి', 'ం', 'చ', 'ి', 'ం', 'ద', 'ి', '</s>']
