In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import os
import json
from tqdm import tqdm
%cd /content/drive/MyDrive/MT/atmt_2022

/content/drive/MyDrive/MT/atmt_2022


# Setup

In [None]:
# copy data folder to somewhere else for testing
! cp -r /content/drive/MyDrive/MT/atmt_2022/data /content/drive/MyDrive/MT/data3

In [27]:
# vars to run the preprocess.py script
# we start from the already preprocessed data in $preprocessed
# we apply the BPE to all the files and save all of them to $BPE
data_folder = '/content/drive/MyDrive/MT/data3'

base = '/content/drive/MyDrive/MT/atmt_2022'
moses_scripts = os.path.join(base, 'scripts/../moses_scripts')
postprocess_script = os.path.join(base, 'scripts/postprocess.sh')
preprocess_script = os.path.join(base, 'preprocess.py')

preprocessed = os.path.join(data_folder, 'en-fr/preprocessed')
raw_data = os.path.join(data_folder, 'en-fr/raw')
prepared = os.path.join(data_folder, 'en-fr/prepared')
BPE = os.path.join(data_folder, 'en-fr/BPE')
test_data = os.path.join(data_folder, 'en-fr/raw/test.en')

src_lang = 'fr'
tgt_lang = 'en'

In [None]:
# $prepared need to be empty
# we have empty BPE and prepared now
! rm -rf $prepared
! mkdir $prepared
! mkdir $BPE

# Train and Evaluate Model
See BPE encoding and preprocessing below first.

In [24]:
out_file = os.path.join(base, 'assignments/03/BPE_2')
checkpoints = os.path.join(out_file, 'checkpoints')
saved_model = os.path.join(checkpoints, 'checkpoint_best.pt')
# for versioning

In [None]:
! python train.py \
--data $prepared \
--source-lang $src_lang \
--target-lang $tgt_lang \
--save-dir $checkpoints \
--cuda \
--batch-size 256

INFO: Commencing training!
INFO: COMMAND: train.py --data /content/drive/MyDrive/MT/data3/en-fr/prepared --source-lang fr --target-lang en --save-dir assignments/03/BPE_2/checkpoints --cuda --batch-size 256
INFO: Arguments: {'cuda': True, 'data': '/content/drive/MyDrive/MT/data3/en-fr/prepared', 'source_lang': 'fr', 'target_lang': 'en', 'max_tokens': None, 'batch_size': 256, 'train_on_tiny': False, 'arch': 'lstm', 'max_epoch': 10000, 'clip_norm': 4.0, 'lr': 0.0003, 'patience': 3, 'log_file': None, 'save_dir': 'assignments/03/BPE_2/checkpoints', 'restore_file': 'checkpoint_last.pt', 'save_interval': 1, 'no_save': False, 'epoch_checkpoints': False, 'encoder_embed_dim': 64, 'encoder_embed_path': None, 'encoder_hidden_size': 64, 'encoder_num_layers': 1, 'encoder_bidirectional': 'True', 'encoder_dropout_in': 0.25, 'encoder_dropout_out': 0.25, 'decoder_embed_dim': 64, 'decoder_embed_path': None, 'decoder_hidden_size': 128, 'decoder_num_layers': 1, 'decoder_dropout_in': 0.25, 'decoder_dropout

In [25]:
raw_translations = os.path.join(out_file, 'translations.txt')
bpe_processed_translations = os.path.join(out_file, "translations.bpe.txt")
final_translations = os.path.join(out_file, "translations.p.txt")

In [None]:
! python translate.py \
  --data $prepared \
  --dicts $prepared \
  --checkpoint-path $saved_model \
  --output $raw_translations \
  --cuda \
  --batch-size 1024

[2022-11-07 16:11:07] COMMAND: translate.py --data /content/drive/MyDrive/MT/data3/en-fr/prepared --dicts /content/drive/MyDrive/MT/data3/en-fr/prepared --checkpoint-path /content/drive/MyDrive/MT/atmt_2022/assignments/03/BPE_2/checkpoints/checkpoint_best.pt --output /content/drive/MyDrive/MT/atmt_2022/assignments/03/BPE_2/translations.txt --cuda --batch-size 1024
[2022-11-07 16:11:07] Arguments: {'cuda': True, 'data': '/content/drive/MyDrive/MT/data3/en-fr/prepared', 'source_lang': 'fr', 'target_lang': 'en', 'max_tokens': None, 'batch_size': 1024, 'train_on_tiny': False, 'arch': 'lstm', 'max_epoch': 10000, 'clip_norm': 4.0, 'lr': 0.0003, 'patience': 3, 'log_file': None, 'save_dir': 'assignments/03/BPE_2/checkpoints', 'restore_file': 'checkpoint_last.pt', 'save_interval': 1, 'no_save': False, 'epoch_checkpoints': False, 'encoder_embed_dim': 64, 'encoder_embed_path': None, 'encoder_hidden_size': 64, 'encoder_num_layers': 1, 'encoder_bidirectional': 'True', 'encoder_dropout_in': 0.25, 'e

In [None]:
# Now concatenate the subwords
with open(raw_translations) as f:
  lines = f.readlines()
with open(bpe_processed_translations, 'a') as f:
  for line in tqdm(lines):
    # decode line in BPE functions part
    f.write(decode_line(line))

100%|██████████| 500/500 [00:00<00:00, 120581.42it/s]


In [None]:
! bash $postprocess_script $bpe_processed_translations $final_translations en

In [None]:
! pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 14.1 MB/s 
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting portalocker
  Downloading portalocker-2.6.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.6.0 sacrebleu-2.3.1


In [None]:
! cat $final_translations | sacrebleu $test_data

{
 "name": "BLEU",
 "score": 11.0,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.3.1",
 "verbose_score": "44.5/15.6/7.0/3.1 (BP = 1.000 ratio = 1.047 hyp_len = 4073 ref_len = 3892)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.3.1"
}
[0m

# BPE Functions and Training
Add on the preprocessing.py maybe?

In [29]:
import re, collections
from tqdm import tqdm

def get_vocab(filenames):
    vocab = collections.defaultdict(int)
    for filename in filenames:
      with open(filename, 'r', encoding='utf-8') as fhand:
          for line in fhand:
              words = line.strip().split()
              for word in words:
                  vocab[' '.join(list(word)) + ' </w>'] += 1

    return vocab

def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

def get_tokens_from_vocab(vocab):
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization

def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)

def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token.replace('.', '[.]'))

        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]

        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
        break
    return string_tokens

def encode(word):
  if word in vocab_tokenization:
    return vocab_tokenization[word]
  else:
    return tokenize_word(string=word, sorted_tokens=sorted_tokens, unknown_token='</u>')

def decode_line(line):
  words = line.split('</w>')
  new_line = ''
  for word in words:
    new_line = new_line + ' ' + decode_word(word)
  return new_line

def decode_word(word):
  subwords = word.split(' ')
  return ''.join(subwords)

In [None]:
# learn BPE segmentation for both languages
vocab = get_vocab([os.path.join(preprocessed, 'train.en'), os.path.join(preprocessed, 'train.fr')])

tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)

num_merges = 10000
print("## Training BPE model...")
for i in tqdm(range(num_merges)):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)

sorted_tokens_tuple = sorted(tokens_frequencies.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]

print('Saving vocab_tokenization and sorted_tokens to $preprocessed...')
with open(os.path.join(preprocessed, 'vocab_tokenization.json'), 'w') as f: 
    json.dump(vocab_tokenization, f)
with open(os.path.join(preprocessed, 'sorted_tokens.txt'), "w") as f:
    f.write("\n".join(sorted_tokens))

## Training BPE model...


100%|██████████| 10000/10000 [15:59<00:00, 10.42it/s]


Saving vocab_tokenization and sorted_tokens to $preprocessed...


In [None]:
# now apply BPE on all files and save to $BPE
# first need to add </w> to the end of each word
# will be useful for separating the words from each other
langs = ['fr', 'en']
for lang in langs:
  for fname in ['train', 'test', 'valid', 'tiny_train']:
    with open(os.path.join(preprocessed, fname + '.' + lang)) as f:
      lines = f.readlines()
    with open(os.path.join(BPE, fname + '.' + lang), 'a') as f:
      for line in tqdm(lines):
        words = line[:-1].split(' ')
        new_words = []
        new_line = ''
        for word in words:
          new_line = new_line + ' ' + ''.join(encode(word + '</w>'))
        f.write(new_line + '\n')

100%|██████████| 10000/10000 [00:00<00:00, 88855.78it/s]
100%|██████████| 500/500 [01:30<00:00,  5.52it/s]
100%|██████████| 500/500 [01:25<00:00,  5.82it/s]
100%|██████████| 1000/1000 [02:48<00:00,  5.95it/s]
100%|██████████| 10000/10000 [00:00<00:00, 128758.37it/s]
100%|██████████| 500/500 [01:07<00:00,  7.44it/s]
100%|██████████| 500/500 [00:55<00:00,  9.09it/s]
100%|██████████| 1000/1000 [02:01<00:00,  8.26it/s]


# Now Rest of the Preprocessing

In [None]:
# train truecase model for languages in BPE.
! perl $moses_scripts/train-truecaser.perl --model $BPE/tm.$lang --corpus $BPE/train.$src_lang
! perl $moses_scripts/train-truecaser.perl --model $BPE/tm.$lang --corpus $BPE/train.$tgt_lang

In [None]:
! python $preprocess_script \
    --source-lang $src_lang \
    --target-lang $tgt_lang \
    --dest-dir $prepared \
    --train-prefix $BPE/train \
    --valid-prefix $BPE/valid \
    --test-prefix $BPE/test \
    --tiny-train-prefix $BPE/tiny_train \
    --threshold-src 1 \
    --threshold-tgt 1 \
    --num-words-src 4000 \
    --num-words-tgt 4000

[2022-11-07 16:03:23] COMMAND: /content/drive/MyDrive/MT/atmt_2022/preprocess.py --source-lang fr --target-lang en --dest-dir /content/drive/MyDrive/MT/data3/en-fr/prepared --train-prefix /content/drive/MyDrive/MT/data3/en-fr/BPE/train --valid-prefix /content/drive/MyDrive/MT/data3/en-fr/BPE/valid --test-prefix /content/drive/MyDrive/MT/data3/en-fr/BPE/test --tiny-train-prefix /content/drive/MyDrive/MT/data3/en-fr/BPE/tiny_train --threshold-src 1 --threshold-tgt 1 --num-words-src 4000 --num-words-tgt 4000
[2022-11-07 16:03:23] Arguments: {'source_lang': 'fr', 'target_lang': 'en', 'train_prefix': '/content/drive/MyDrive/MT/data3/en-fr/BPE/train', 'tiny_train_prefix': '/content/drive/MyDrive/MT/data3/en-fr/BPE/tiny_train', 'valid_prefix': '/content/drive/MyDrive/MT/data3/en-fr/BPE/valid', 'test_prefix': '/content/drive/MyDrive/MT/data3/en-fr/BPE/test', 'dest_dir': '/content/drive/MyDrive/MT/data3/en-fr/prepared', 'threshold_src': 1, 'num_words_src': 4000, 'threshold_tgt': 1, 'num_words_t

# BPE Usage

In [30]:
# to load the BPE model simply do:
with open(os.path.join(preprocessed, 'sorted_tokens.txt'), 'r') as f:
  sorted_tokens = f.read().split('\n')
with open(os.path.join(preprocessed, 'vocab_tokenization.json'), 'r') as f:
  vocab_tokenization = json.load(f)

In [31]:
line = 'hello how are you iloveapples'
words = line.split(' ')
new_words = []
new_line = ''
for word in words:
  new_line = new_line + ' ' + ' '.join(encode(word+'</w>'))

In [32]:
new_line

' hello</w> how</w> are</w> you</w> ilo ve apples</w>'

In [33]:
decode_line(new_line)

' hello how are you iloveapples '