In [1]:
import os
import shutil
from nlp.vocab import get_unigram_from_tokenized

### print corpora stats

In [3]:
dirs = os.listdir("corpora")
for folder in dirs:
    files = os.listdir("corpora/{}".format(folder))
    for file in files:
        with open("corpora/{}/{}".format(folder, file), 'r', encoding='utf-8') as f:
            sents = f.read().split("\n")
            sents.pop(-1)
        words_per_line = [len(line.split(" ")) for line in sents]
        print("data: {} in folder: {}\n total words: {}, lines: {}, max length: {}, avg length: {}".format(
            file,
            folder,
            len(words_per_line),
            sum(words_per_line),
            max(words_per_line),
            sum(words_per_line) / len(words_per_line)
        ))

data: train.txt in folder: TED
 total words: 17000, lines: 270206, max length: 111, avg length: 15.894470588235293
data: valid.txt in folder: TED
 total words: 5000, lines: 78702, max length: 77, avg length: 15.7404
data: retrieval.txt in folder: 1K
 total words: 1000, lines: 12188, max length: 76, avg length: 12.188
data: train.txt in folder: OT
 total words: 17000, lines: 193308, max length: 56, avg length: 11.371058823529411
data: valid.txt in folder: OT
 total words: 5000, lines: 55766, max length: 53, avg length: 11.1532
data: train.txt in folder: WIKI
 total words: 17000, lines: 254247, max length: 67, avg length: 14.955705882352941
data: valid.txt in folder: WIKI
 total words: 5000, lines: 75102, max length: 71, avg length: 15.0204
data: train.txt in folder: NT
 total words: 17000, lines: 203600, max length: 62, avg length: 11.976470588235294
data: valid.txt in folder: NT
 total words: 5000, lines: 56289, max length: 49, avg length: 11.2578


### tokenizer
Generate tokenizer based on NT of vocab size = 500 1K 2K 4K

In [4]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers.processors import TemplateProcessing

files = ["corpora/NT/train.txt"]
for vocab_size in [102, 500, 1000, 2000, 4000]:
    # here we insert space to each character so that we can use WordLevel model
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                               vocab_size=vocab_size)
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
    )
    tokenizer.train(files, trainer)
    print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))
    save_path = "tokenizer/{}".format(vocab_size)
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    tokenizer.model.save(save_path)
    shutil.copy('configs/special_tokens_map.json', os.path.join(save_path, 'special_tokens_map.json'))
    shutil.copy('configs/config.json', os.path.join(save_path, 'config.json'))

Trained vocab size: 102
Trained vocab size: 500
Trained vocab size: 1000
Trained vocab size: 2000
Trained vocab size: 4000


### Fake English tokenizers

In [6]:
# combine NT 2000 with shifted NT 2k
save_path = 'tokenizer/2000_2000'
if not os.path.exists(save_path):
    os.mkdir(save_path)
with open("tokenizer/2000/vocab.txt", 'r') as f:
    vocab_list = f.read().split()
shifted_vocab = ["::" + k for k in vocab_list]
vocab_list.extend(shifted_vocab)
with open(os.path.join(save_path, "vocab.txt"), 'w') as f:
    f.writelines(i + '\n' for i in vocab_list)
shutil.copy('configs/special_tokens_map.json', os.path.join(save_path, 'special_tokens_map.json'))
shutil.copy('configs/config.json', os.path.join(save_path, 'config.json'))

# combine NT 2000 with shifted NT 500
save_path = 'tokenizer/2000_500'
if not os.path.exists(save_path):
    os.mkdir(save_path)
with open("tokenizer/500/vocab.txt", 'r') as f:
    vocab_list = f.read().split()
shifted_vocab = ["::" + k for k in vocab_list]
with open("tokenizer/2000/vocab.txt", 'r') as f:
    vocab_list = f.read().split()
vocab_list.extend(shifted_vocab)
with open(os.path.join(save_path, "vocab.txt"), 'w') as f:
    f.writelines(i + '\n' for i in vocab_list)
shutil.copy('configs/special_tokens_map.json', os.path.join(save_path, 'special_tokens_map.json'))
shutil.copy('configs/config.json', os.path.join(save_path, 'config.json'))

# combine NT 2000 with shifted NT 1000
save_path = 'tokenizer/2000_1000'
if not os.path.exists(save_path):
    os.mkdir(save_path)
with open("tokenizer/1000/vocab.txt", 'r') as f:
    vocab_list = f.read().split()
shifted_vocab = ["::" + k for k in vocab_list]
with open("tokenizer/2000/vocab.txt", 'r') as f:
    vocab_list = f.read().split()
vocab_list.extend(shifted_vocab)
with open(os.path.join(save_path, "vocab.txt"), 'w') as f:
    f.writelines(i + '\n' for i in vocab_list)
shutil.copy('configs/special_tokens_map.json', os.path.join(save_path, 'special_tokens_map.json'))
shutil.copy('configs/config.json', os.path.join(save_path, 'config.json'))

# combine NT 2000 with shifted NT 4000
save_path = 'tokenizer/2000_4000'
if not os.path.exists(save_path):
    os.mkdir(save_path)
with open("tokenizer/4000/vocab.txt", 'r') as f:
    vocab_list = f.read().split()
shifted_vocab = ["::" + k for k in vocab_list]
with open("tokenizer/2000/vocab.txt", 'r') as f:
    vocab_list = f.read().split()
vocab_list.extend(shifted_vocab)
with open(os.path.join(save_path, "vocab.txt"), 'w') as f:
    f.writelines(i + '\n' for i in vocab_list)
shutil.copy('configs/special_tokens_map.json', os.path.join(save_path, 'special_tokens_map.json'))
shutil.copy('configs/config.json', os.path.join(save_path, 'config.json'))

'tokenizer/2000_4000/config.json'

In [62]:
import torch
from transformers import AutoModelWithLMHead, AutoConfig
import pickle

with open("/home/grandee/projects/TACL_v1/data/cached_2000_NT_train.txt", 'rb') as f:
    data = pickle.load(f)
freq_list = get_unigram_from_tokenized(data)
align_list1 = [i for i, j in freq_list.items() if i not in [0, 1, 2, 3, 4]]
config = AutoConfig.from_pretrained("/home/grandee/projects/TACL_v1/configs/bert-tiny.json")
config.vocab_size = 4000
config.max_position_embeddings = 128
model = AutoModelWithLMHead.from_pretrained(
    "/home/grandee/projects/joint_align/models/cached_lm_ERV_NEW_2000_ERV_NEW_126_improve_nn_high_freq/pytorch_model.bin",
    config=config
)
with torch.no_grad():
    e1 = model.bert.embeddings.word_embeddings.weight[:2000]
    e2 = model.bert.embeddings.word_embeddings.weight[2000:]
    e1 /= torch.norm(e1, dim=-1, keepdim=True)
    e2 /= torch.norm(e2, dim=-1, keepdim=True)
    ssm = 1 - e1 @ e2.T
    nns = torch.argmin(ssm, dim=-1)
    dist = ssm[torch.arange(len(e1)), nns]
dist_list_1 = torch.argsort(dist).tolist()
dist_list_2 = nns[dist_list_1].tolist()
lists = [(i, j) for i, j in zip(dist_list_1, dist_list_2) if
         i not in [0, 1, 2, 3, 4] and j not in [0, 1, 2, 3, 4]]
list1, list2 = zip(*lists)

start = 0 // 500 * 50
list(list1[start:start + 50]), [i + 2000 for i in list2[start:start + 50]]

([191,
  624,
  582,
  38,
  717,
  737,
  722,
  373,
  455,
  674,
  807,
  503,
  753,
  664,
  703,
  708,
  733,
  728,
  625,
  735,
  803,
  618,
  690,
  603,
  472,
  592,
  578,
  758,
  495,
  704,
  422,
  658,
  617,
  614,
  620,
  615,
  685,
  652,
  638,
  594,
  461,
  414,
  621,
  458,
  441,
  539,
  453,
  537,
  456,
  613],
 [2191,
  2624,
  2582,
  2038,
  2717,
  2737,
  2722,
  2373,
  2455,
  2674,
  2807,
  2503,
  2753,
  2664,
  2703,
  2708,
  2733,
  2728,
  2625,
  2735,
  2803,
  2618,
  2690,
  2603,
  2472,
  2592,
  2578,
  2758,
  2495,
  2704,
  2422,
  2658,
  2617,
  2614,
  2620,
  2615,
  2685,
  2652,
  2638,
  2594,
  2461,
  2414,
  2621,
  2458,
  2441,
  2539,
  2453,
  2537,
  2456,
  2613])