In [1]:
from fastai.text import * 
from pathlib import *
from functools import partial
from tqdm import tqdm
import re
import string
import shutil
import hashlib

In [2]:
import sentencepiece as spm

In [3]:
# Show the working directory; the relative paths used below are resolved against it.
# NOTE(review): the saved output exposes an absolute home-directory path — consider clearing it before sharing.
!pwd

/data/home/ubuntu/gaurav/in/fire/code-mixed-enma-2/tokenizer


In [4]:
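# NOTE: disabled one-off preprocessing, kept for reference. It lower-cases every file under
# train_en_enma/<fold>/ and writes the result to train_uncased/ as "<fold>_<filename>".
# base_dir = Path('./../dataset_preparation')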
# src_dir = base_dir / 'train_en_enma'
# dest_dir = base_dir / 'train_uncased'
# files = []
# for fold in src_dir.ls():
#     files.extend(fold.ls()) 
# for file in tqdm(files):
#     filename = str(file).split('/')[-2] + '_' + str(file).split('/')[-1]
#     with open(file, 'r') as f:
#         text = f.read()
#     text = text.lower()
#     with open(dest_dir/filename, 'w', encoding='utf-8') as f:
#         f.write(text)

In [5]:
# All the data for the language model (LM) is in the train_uncased and valid_uncased directories.

In [6]:
# Gather the lower-cased corpus files for both splits.
# `.ls()` is not a stdlib Path method — presumably the directory-listing helper patched in by fastai.
base_dir = Path('./../dataset_preparation')
train_files, valid_files = (
    (base_dir / split_dir).ls() for split_dir in ('train_uncased', 'valid_uncased')
)
# Report how many files each split contributes.
print(len(train_files), len(valid_files))

17336 7434


In [7]:
# The tokenizer is trained on the union of the training and validation files.
all_files = train_files + valid_files

In [8]:
# Peek at one entry: at this point the elements are still Path objects (see output).
all_files[0]

PosixPath('../dataset_preparation/train_uncased/en_enma_10447.txt')

In [9]:
# Convert every Path to a plain string so the paths can be joined into the trainer's --input flag.
all_files = list(map(str, all_files))
# Sanity check: the number of distinct paths, to compare against the per-split counts printed earlier.
print(len(set(all_files)))

24770


In [10]:
# Comma-separated list of all corpus files; interpolated into the trainer's --input flag below.
flist = ','.join(all_files)

In [12]:
# fastai special tokens (field, capitalisation and repetition markers) that are handed to
# SentencePiece as user-defined symbols when the model is trained.
_special_token_attrs = ('FLD', 'TK_MAJ', 'TK_UP', 'TK_REP', 'TK_WREP')
custom_symbols = [getattr(text.transform, attr_name) for attr_name in _special_token_attrs]

In [13]:
# Comma-separated form of the special tokens, as used in the --user_defined_symbols flag below.
str_specialcases = ",".join(custom_symbols)

In [14]:
# Display the joined string to verify the special tokens.
str_specialcases

'xxfld,xxmaj,xxup,xxrep,xxwrep'

In [15]:
# Train a SentencePiece model on every corpus file.
# The flags are listed one per line for readability and joined with single spaces into the
# command string that the trainer expects.
spm_train_flags = [
    f'--input={flist}',
    '--model_prefix=mlen_spm',          # output prefix; the resulting mlen_spm.model is loaded below
    '--vocab_size=15000',
    '--input_sentence_size=22500000',
    # Ids 0-3 are pinned to the unk/bos/eos/pad pieces, which are named after fastai's tokens.
    '--unk_id=0',
    '--bos_id=1',
    '--eos_id=2',
    '--pad_id=3',
    f'--unk_piece={text.transform.UNK}',
    f'--bos_piece={text.transform.BOS}',
    f'--eos_piece={text.transform.EOS}',
    f'--pad_piece={text.transform.PAD}',
    # fastai's remaining special tokens are registered as user-defined symbols.
    f'--user_defined_symbols={str_specialcases}',
]
spm.SentencePieceTrainer.Train(' '.join(spm_train_flags))

In [16]:
# Create an empty processor; the trained model is loaded into it in the next cell.
sp = spm.SentencePieceProcessor()

In [17]:
# Load the model file written by the training step (prefix "mlen_spm").
# The True shown as output is the value returned by Load.
sp.Load("mlen_spm.model")

True

In [18]:
# Sanity check on a short sentence; input is lower-cased because the model was trained on the uncased corpus.
sp.EncodeAsPieces('how are you'.lower())

['▁how', '▁are', '▁you']

In [19]:
# Second sanity check: 'satisfied' is split into several sub-word pieces (see output).
sp.EncodeAsPieces('I am not satisfied with you'.lower())

['▁i', '▁am', '▁not', '▁sati', 's', 'fi', 'ed', '▁with', '▁you']

In [20]:
# Build the id -> piece lookup table ("itos") for the loaded model.
# The size is taken from the model itself rather than repeating the --vocab_size value used at
# training time, so this cell stays correct if the model is retrained with a different vocabulary size.
itos = [sp.IdToPiece(piece_id) for piece_id in range(sp.GetPieceSize())]

In [21]:
# Inspect the vocabulary: the reserved and user-defined special tokens occupy the first ids (see output).
# NOTE(review): this displays the entire list; consider slicing (e.g. itos[:50]) to keep the saved output small.
itos

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 '▁the',
 ',',
 'ർ',
 'ൽ',
 's',
 'ൻ',
 '▁of',
 '▁',
 'ൾ',
 '▁in',
 '▁a',
 '▁•',
 'a',
 'i',
 '▁and',
 'k',
 '▁to',
 '▁·',
 '▁is',
 'e',
 'u',
 '▁i',
 '-',
 'n',
 '▁was',
 't',
 'm',
 'd',
 '▁oru',
 '▁it',
 'y',
 'ng',
 'l',
 "'",
 'ute',
 '▁as',
 'tti',
 'ti',
 '▁on',
 '▁are',
 '▁end',
 'te',
 '▁that',
 '▁at',
 'ing',
 '▁by',
 '▁this',
 'ed',
 '▁he',
 '▁ni',
 '▁for',
 'an',
 'ma',
 'yu',
 '▁e',
 'r',
 'o',
 'ൺ',
 '▁wa',
 'vu',
 '▁from',
 'va',
 'na',
 '▁na',
 '▁with',
 'v',
 'le',
 'nn',
 '▁be',
 'li',
 'ka',
 '▁s',
 'ya',
 '▁an',
 '▁india',
 're',
 'ta',
 'p',
 'ra',
 'es',
 'c',
 '▁his',
 'ai',
 ':',
 'ni',
 'sh',
 'may',
 '▁u',
 'mai',
 '▁aa',
 'ku',
 'ly',
 '▁pa',
 'ch',
 'si',
 '▁has',
 'lu',
 'tu',
 '▁k',
 '▁also',
 'la',
 'ile',
 'th',
 'kka',
 'tine',
 'at',
 'vi',
 '▁were',
 'sha',
 'b',
 'ri',
 '▁mu',
 '▁enna',
 'g',
 '▁ko',
 'pi',
 'il',
 'ne',
 '▁pu',
 '▁or',
 'tta',
 'in',
 'ki

In [22]:
# List the pieces that are at most one character long (single letters, digits, punctuation, ...).
[piece for piece in itos if len(piece) <= 1]

['.',
 ',',
 'ർ',
 'ൽ',
 's',
 'ൻ',
 '▁',
 'ൾ',
 'a',
 'i',
 'k',
 'e',
 'u',
 '-',
 'n',
 't',
 'm',
 'd',
 'y',
 'l',
 "'",
 'r',
 'o',
 'ൺ',
 'v',
 'p',
 'c',
 ':',
 'b',
 'g',
 'h',
 '"',
 'j',
 '•',
 'f',
 ';',
 'x',
 '2',
 '3',
 '5',
 'z',
 'w',
 '%',
 '1',
 '/',
 '0',
 '4',
 '9',
 '്',
 '7',
 '°',
 '6',
 'ി',
 '8',
 ')',
 '’',
 '{',
 'q',
 '–',
 '}',
 '?',
 'ാ',
 '=',
 '+',
 '“',
 '‘',
 '\\',
 '|',
 'പ',
 '·',
 '′']