In [1]:
from fastai.text import * 
from pathlib import *
from functools import partial
from tqdm import tqdm
import re
import string
import shutil
import hashlib

In [2]:
import sentencepiece as spm

In [3]:
!pwd

/data/home/ubuntu/gaurav/in/fire/code-mixed-enma/tokenizer


In [4]:
# base_dir = Path('./../dataset_preparation')
# src_dir = base_dir / 'valid_en_ma_enma'
# dest_dir = base_dir / 'valid_uncased'
# files = []
# for fold in src_dir.ls():
#     files.extend(fold.ls()) 
# for file in tqdm(files):
#     filename = str(file).split('/')[-2] + '_' + str(file).split('/')[-1]
#     with open(file, 'r') as f:
#         text = f.read()
#     text = text.lower()
#     with open(dest_dir/filename, 'w', encoding='utf-8') as f:
#         f.write(text)

In [5]:
# All the data for LM is in train_uncased and valid_uncased

In [6]:
base_dir = Path('./../dataset_preparation')
train_files = (base_dir / 'train_uncased').ls()
valid_files = (base_dir / 'valid_uncased').ls()
print(len(train_files), len(valid_files))

52009 22302


In [7]:
all_files = train_files + valid_files

In [8]:
all_files[0]

PosixPath('../dataset_preparation/train_uncased/en_ma_enma_7760.txt')

In [9]:
all_files = [str(file) for file in all_files]
print(len(set(all_files)))

74311


In [10]:
flist = ','.join(all_files)

In [11]:
custom_symbols = [text.transform.FLD, 
                text.transform.TK_MAJ,
                text.transform.TK_UP,
                text.transform.TK_REP,
                text.transform.TK_WREP]

In [12]:
str_specialcases = ",".join(custom_symbols)

In [13]:
str_specialcases

'xxfld,xxmaj,xxup,xxrep,xxwrep'

In [14]:
spm.SentencePieceTrainer.Train(f'--input={flist} --model_prefix=mlen_spm --vocab_size=25000 --input_sentence_size=22500000 --unk_id=0 --bos_id=1 --eos_id=2 --pad_id=3 --unk_piece={text.transform.UNK} --bos_piece={text.transform.BOS} --eos_piece={text.transform.EOS} --pad_piece={text.transform.PAD} --user_defined_symbols={str_specialcases}')

In [15]:
sp = spm.SentencePieceProcessor()

In [16]:
sp.Load("mlen_spm.model")

True

In [18]:
sp.EncodeAsPieces('how are you'.lower())

['▁how', '▁are', '▁you']

In [19]:
sp.EncodeAsPieces('I am not satisfied with you'.lower())

['▁i', '▁am', '▁not', '▁sati', 's', 'fi', 'ed', '▁with', '▁you']

In [21]:
sp.EncodeAsPieces('മൈ നെയിം ഈസ് ഗൗരവ് '.lower())

['▁മൈ', '▁നെ', 'യി', 'ം', '▁ഈ', 'സ്', '▁ഗൗരവ', '്']

In [22]:
itos = [sp.IdToPiece(int(i)) for i in range(25000)]

In [23]:
itos

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 ',',
 'ൽ',
 '▁the',
 'ർ',
 '▁',
 'ൻ',
 's',
 '▁•',
 '▁of',
 'ൾ',
 '▁in',
 '▁a',
 '▁·',
 'ം',
 '▁and',
 'a',
 'i',
 '▁to',
 '-',
 '▁is',
 'k',
 'e',
 'u',
 '▁i',
 '്',
 '▁was',
 'n',
 '▁ഈ',
 'd',
 '▁it',
 'ു',
 'm',
 '▁ഒരു',
 "'",
 '▁oru',
 't',
 'ing',
 'y',
 '▁as',
 '▁on',
 '▁are',
 'l',
 '▁എന്ന',
 '▁that',
 'ute',
 'ൺ',
 '▁this',
 '▁by',
 'ed',
 'ങ്ങൾ',
 '▁at',
 '▁he',
 '▁for',
 'ng',
 'tti',
 'ത്',
 '▁end',
 'യുടെ',
 '▁.',
 'ti',
 'യിൽ',
 'te',
 '▁ni',
 'ത്തിൽ',
 'an',
 ':',
 'ി',
 'യും',
 'സ്',
 '▁from',
 'ma',
 '▁with',
 'ാ',
 'yu',
 'വും',
 'vu',
 '▁e',
 'മ',
 'v',
 '▁be',
 'o',
 'r',
 '▁wa',
 'na',
 '▁na',
 'ും',
 'ത',
 '▁an',
 'ത്തിന്റെ',
 'മാണ്',
 '▁india',
 'വ',
 'കൾ',
 'p',
 'യ',
 'le',
 'nn',
 'li',
 'va',
 '▁his',
 'ന',
 'ya',
 're',
 'മായ',
 'es',
 '▁s',
 '▁ആ',
 'ക്ക',
 'ta',
 'ka',
 'ക',
 'c',
 '▁-',
 'ai',
 "▁'",
 'ly',
 '▁has',
 '▁"',
 'മായി',
 'ni',
 'may',
 'ra',
 '▁അ',
 

In [24]:
[tok for tok in itos if len(tok) < 2]

['.',
 ',',
 'ൽ',
 'ർ',
 '▁',
 'ൻ',
 's',
 'ൾ',
 'ം',
 'a',
 'i',
 '-',
 'k',
 'e',
 'u',
 '്',
 'n',
 'd',
 'ു',
 'm',
 "'",
 't',
 'y',
 'l',
 'ൺ',
 ':',
 'ി',
 'ാ',
 'മ',
 'v',
 'o',
 'r',
 'ത',
 'വ',
 'p',
 'യ',
 'ന',
 'ക',
 'c',
 'g',
 '"',
 'ല',
 'സ',
 'െ',
 'h',
 'ര',
 'b',
 'റ',
 ';',
 'ണ',
 'േ',
 'ോ',
 'പ',
 'ട',
 '•',
 'ള',
 'j',
 'f',
 'ഗ',
 'ീ',
 'ദ',
 'ഡ',
 '2',
 'ബ',
 'ൂ',
 'x',
 'ഹ',
 'അ',
 'ശ',
 '/',
 'ഷ',
 'ജ',
 '°',
 'ഴ',
 '3',
 'w',
 '4',
 'ൈ',
 'എ',
 'ഇ',
 '1',
 '0',
 'ൊ',
 'z',
 'ച',
 'ഫ',
 '%',
 'ഭ',
 'ൗ',
 '5',
 '{',
 'ഥ',
 '′',
 'ഐ',
 '6',
 'ധ',
 ')',
 '9',
 'ആ',
 'ഖ',
 '8',
 '–',
 'ൌ',
 '7',
 'ൃ',
 'q',
 '}',
 'ഉ',
 'ഘ',
 'ഒ',
 'ഈ',
 'ഓ',
 '=',
 'ഠ',
 'ഏ',
 'ങ',
 '‘',
 '|',
 'ഞ',
 'ഛ',
 '·',
 'ഊ',
 'ഔ']