<a href="https://colab.research.google.com/github/gyasifred/msc-thesis/blob/main/build_subword_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook builds subword (WordPiece) tokenizers for French, English and Akan-Twi using [text.BertTokenizer](https://www.tensorflow.org/text/guide/subwords_tokenizer), and saves all three together as a single TensorFlow SavedModel.

In [2]:
# Install TensorFlow Text and TensorFlow nightlies pinned to the same build (2.11.0.dev20220817).
!pip install -q -U tensorflow-text-nightly==2.11.0.dev20220817
!pip install -U -q tf-nightly==2.11.0.dev20220817

[K     |████████████████████████████████| 5.9 MB 4.1 MB/s 
[K     |████████████████████████████████| 582.0 MB 13 kB/s 
[K     |████████████████████████████████| 439 kB 70.4 MB/s 
[K     |████████████████████████████████| 1.7 MB 67.7 MB/s 
[K     |████████████████████████████████| 5.9 MB 38.0 MB/s 
[?25h

In [3]:
import tensorflow_text as text
import tensorflow as tf
import time
# import BERT tool for building tokenizer
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [4]:
# This code was adapted from https://github.com/GhanaNLP/kasa/blob/master/Kasa/Preprocessing.py
# A subclass of the kasafrench for preprocessing data
# import required library
import re
import unicodedata


class Preprocessing:
    """Read, normalize and write the sentence files used to train the tokenizers.

    All file access uses UTF-8 so that Twi characters such as ``ɔ`` and ``ɛ``
    survive a write/read round trip regardless of the platform's default
    encoding.
    """

    # dummy initialization method
    def __init__(self):
        # initialize with some default parameters here later
        pass

    def _read_lines(self, filepath):
        """Return every line of ``filepath`` with surrounding whitespace stripped."""
        with open(filepath, encoding='utf-8') as file:
            return [line.strip() for line in file]

    def read_parallel_dataset(self, filepath_1, filepath_2, filepath_3=None):
        """Read two or three parallel text files into lists of stripped lines.

        Args:
            filepath_1: Path to the first language file.
            filepath_2: Path to the second language file.
            filepath_3: Optional path to a third language file.

        Returns:
            ``(lang_1, lang_2)`` when ``filepath_3`` is ``None``, otherwise
            ``(lang_1, lang_2, lang_3)``. Each element is a list with one
            string per line of the corresponding file.
        """
        lang_1 = self._read_lines(filepath_1)
        lang_2 = self._read_lines(filepath_2)
        if filepath_3 is None:
            return lang_1, lang_2
        return lang_1, lang_2, self._read_lines(filepath_3)

    def removeStringAccent(self, s):
        """Return ``s`` with combining marks removed after NFD decomposition."""
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalize_twi(self, s):
        """Normalize a Twi sentence.

        Removes accents, lower-cases, inserts a space before ``!``, ``.`` and
        ``?``, replaces every run of characters other than ASCII letters,
        ``Ɔ ɔ ɛ Ɛ``, ``. ! ?`` and ``’`` with one space, then collapses
        whitespace runs to a single space.
        """
        s = self.removeStringAccent(s)
        s = s.lower()
        s = re.sub(r'([!.?])', r' \1', s)
        s = re.sub(r'[^a-zA-Z.ƆɔɛƐ!?’]+', r' ', s)
        s = re.sub(r'\s+', r' ', s)
        return s

    def normalize_FrEn(self, s):
        """Normalize a French or English sentence.

        Same steps as ``normalize_twi`` but only ASCII letters and ``. ! ?``
        are kept; everything else becomes a single space.
        """
        s = self.removeStringAccent(s)
        s = s.lower()
        s = re.sub(r'([!.?])', r' \1', s)
        s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
        s = re.sub(r'\s+', r' ', s)
        return s

    def writeTotxt(self, destination, data):
        """Write each item of ``data`` to ``destination`` on its own line.

        The file is written as UTF-8, matching the encoding used by
        ``read_parallel_dataset``.
        """
        with open(destination, 'w', encoding='utf-8') as f:
            for line in data:
                f.write(f"{line}\n")

In [5]:
# Instantiate the preprocessing helper defined in the cell above.

preprocessor = Preprocessing()

# Load the three parallel corpus files from Google Drive.
raw_data_twi, raw_data_fr, raw_eng_data = preprocessor.read_parallel_dataset(
    '/content/drive/MyDrive/verified_twi.txt',
    '/content/drive/MyDrive/verified_english_french.txt',
    '/content/drive/MyDrive/verified_english.txt',
)

# Normalize every sentence: Twi keeps its extra letters, French and English
# share the ASCII-only normalizer.
raw_data_twi = list(map(preprocessor.normalize_twi, raw_data_twi))
raw_data_fr = list(map(preprocessor.normalize_FrEn, raw_data_fr))
raw_data_eng = list(map(preprocessor.normalize_FrEn, raw_eng_data))

In [6]:
# Persist each normalized corpus, one sentence per line, in the working directory.
preprocessor.writeTotxt(destination='raw_data_twi.txt', data=raw_data_twi)
preprocessor.writeTotxt(destination='raw_data_fr.txt', data=raw_data_fr)
preprocessor.writeTotxt(destination='raw_data_eng.txt', data=raw_data_eng)

Build `tf.data` datasets from the normalized sentences of all three languages

In [7]:
# Read the normalized sentence files back as line datasets.
# The paths are relative, exactly as they were written by `writeTotxt`, so this
# cell finds the files whatever the working directory is (not only `/content`).
lines_dataset_fr = tf.data.TextLineDataset('raw_data_fr.txt')
lines_dataset_tw = tf.data.TextLineDataset('raw_data_twi.txt')
lines_dataset_eng = tf.data.TextLineDataset('raw_data_eng.txt')

In [8]:
# Zip the three per-language line datasets into a single dataset of (twi, fr, eng) elements.
# NOTE(review): `combined` is not referenced by any later cell visible here — confirm it is still needed.
combined = tf.data.Dataset.zip((lines_dataset_tw, lines_dataset_fr,lines_dataset_eng))

In [9]:
# Tokenizer options. The input files are already lower-cased, but lower_case=True
# is kept because that option also applies the NFD normalization that is needed.
bert_tokenizer_params = {"lower_case": True}

# Special tokens that must be present in every vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Arguments for the vocabulary builder; vocab_size is the main value worth tuning.
bert_vocab_args = {
    # Target vocabulary size.
    "vocab_size": 5000,
    # Reserved tokens that must be included in the vocabulary.
    "reserved_tokens": reserved_tokens,
    # Forwarded to `text.BertTokenizer`.
    "bert_tokenizer_params": bert_tokenizer_params,
    # Forwarded to `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`.
    "learn_params": {},
}

In [10]:
# build French vocab file (takes several mins)
# this is the bert_vocab module building its vocab file from the raw French sentences
%%time
fr_vocab = bert_vocab.bert_vocab_from_dataset(
    lines_dataset_fr,
    **bert_vocab_args
    )

CPU times: user 53.7 s, sys: 1.5 s, total: 55.2 s
Wall time: 45.9 s


In [12]:
# Spot-check the French vocabulary: first entries, a slice from the middle, the
# tail (which looks odd, as expected from bert_vocab), and the total size.
print(
    fr_vocab[:10],
    fr_vocab[1000:1010],
    fr_vocab[-10:],
    len(fr_vocab),
    sep='\n',
)

['[PAD]', '[UNK]', '[START]', '[END]', '!', '.', '?', 'a', 'b', 'c']
['##tion', '##tres', '##usement', 'amour', 'arrivera', 'avaient', 'batiment', 'blanc', 'canadien', 'completement']
['vouloir', 'voulu', 'vus', '##!', '##.', '##?', '##j', '##q', '##v', '##w']
2096


In [13]:
# function to write the build vocab to file
# this file will be used to build tokenizer
def write_vocab_file(filepath, vocab):
  """Write ``vocab`` to ``filepath``, one token per line, encoded as UTF-8.

  Args:
    filepath: Destination path of the vocabulary file.
    vocab: Iterable of token strings.
  """
  # Explicit UTF-8: the vocabularies contain non-ASCII tokens (e.g. Twi letters),
  # so the platform default encoding must not be relied on.
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

In [15]:
# Save the French vocabulary, one token per line, for the tokenizer to load.
write_vocab_file(filepath='fr_vocab.txt', vocab=fr_vocab)

In [16]:
# build Twi vocab file 
%%time
twi_vocab = bert_vocab.bert_vocab_from_dataset(
lines_dataset_tw,
**bert_vocab_args
)

CPU times: user 41.1 s, sys: 1.52 s, total: 42.7 s
Wall time: 33.9 s


In [17]:
# Spot-check the Twi vocabulary: first entries, two slices from the middle, the
# tail (which looks odd, as expected from bert_vocab), and the total size.
print(
    twi_vocab[:10],
    twi_vocab[100:110],
    twi_vocab[1000:1010],
    twi_vocab[-10:],
    len(twi_vocab),
    sep='\n',
)

['[PAD]', '[UNK]', '[START]', '[END]', '!', '.', '?', 'a', 'b', 'c']
['di', 'paa', 'sen', 'ɔyɛ', 'misusuw', 'bɛn', 'pɛ', '##m', 'kwan', 'ankasa']
['gyaade', 'kotoku', 'mabrɛ', 'mansusuw', 'menom', 'nnera', 'nokwasɛm', 'nɔma', 'pefee', 'tia']
['##.', '##?', '##b', '##j', '##p', '##q', '##v', '##x', '##z', '##’']
1993


In [18]:
# Save the Twi vocabulary, one token per line, for the tokenizer to load.
write_vocab_file(filepath='twi_vocab.txt', vocab=twi_vocab)

In [19]:
# build English vocab file
%%time
eng_vocab = bert_vocab.bert_vocab_from_dataset(
    lines_dataset_eng,
    **bert_vocab_args
)

CPU times: user 47.3 s, sys: 1.54 s, total: 48.9 s
Wall time: 36.5 s


In [21]:
# Spot-check the English vocabulary: first entries, two slices from the middle, the
# tail (which looks odd, as expected from bert_vocab), and the total size.
print(
    eng_vocab[:10],
    eng_vocab[100:110],
    eng_vocab[1000:1010],
    eng_vocab[-10:],
    len(eng_vocab),
    sep='\n',
)

['[PAD]', '[UNK]', '[START]', '[END]', '!', '.', '?', 'a', 'b', 'c']
['out', 'were', 'get', 'doesn', 'who', 'one', 'by', '##d', 'going', 'would']
['##te', '##um', 'accept', 'anybody', 'boyfriend', 'case', 'cats', 'chinese', 'concert', 'correct']
['workers', 'yellow', '##!', '##.', '##?', '##c', '##j', '##q', '##v', '##z']
1905


In [22]:
# Save the English vocabulary, one token per line, for the tokenizer to load.
write_vocab_file(filepath='eng_vocab.txt', vocab=eng_vocab)

In [23]:
# Build full language-agnostic tokenizer class; this is standard Google code
# import BERT tool for building tokenizer
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow as tf
import tensorflow_text as text
import pathlib
import numpy as np
import re


class CustomTokenizer(tf.Module):
    """Exportable wrapper around ``text.BertTokenizer`` for one vocabulary file.

    The module exposes ``tokenize``, ``detokenize``, ``lookup`` and three
    ``get_*`` accessors as ``tf.function``s, and traces them in ``__init__`` so
    that the signatures are available after ``tf.saved_model.save``.
    """

    def __init__(self, reserved_tokens, vocab_path):
        """Build the tokenizer and pre-trace the exported signatures.

        Args:
            reserved_tokens: List of special-token strings (e.g. ``"[START]"``).
            vocab_path: Path to a vocabulary text file with one token per line.
        """
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)
        # Read as UTF-8 explicitly: the vocabulary holds non-ASCII tokens, so the
        # platform default encoding must not be relied on.
        vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
        self.vocab = tf.Variable(vocab)
        # Create the signatures for export:
        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string))
        # Include `detokenize` and `lookup` signatures for:
        # * `Tensors` with shapes [tokens] and [batch, tokens]
        # * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def add_start_end(self,ragged):
        """Prepend the ``[START]`` id and append the ``[END]`` id to every row.

        The ids are the positions of the two markers inside the reserved-token
        list, so this assumes the reserved tokens occupy the first vocabulary
        slots in that same order — TODO confirm for custom vocab files.
        """
        START = tf.argmax(tf.constant(self._reserved_tokens) == "[START]")
        END = tf.argmax(tf.constant(self._reserved_tokens) == "[END]")
        count = ragged.bounding_shape()[0]
        starts = tf.fill([count, 1], START)
        ends = tf.fill([count, 1], END)
        return tf.concat([starts, ragged, ends], axis=1)

    # Function to remove reserved tokens after detokenization
    @tf.function
    def cleanup_text(self, reserved_tokens,token_txt):
        """Drop reserved tokens (except ``[UNK]``) and join the rest with spaces.

        Args:
            reserved_tokens: List of special-token strings to remove.
            token_txt: String tokens as produced by the tokenizer's ``detokenize``.

        Returns:
            The remaining tokens joined along the last axis with a single space.
        """
        # Drop the reserved tokens, except for "[UNK]".
        bad_tokens = [re.escape(tok)
                      for tok in reserved_tokens if tok != "[UNK]"]
        bad_token_re = "|".join(bad_tokens)
        bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
        result = tf.ragged.boolean_mask(token_txt, ~bad_cells)
        # Join them into strings.
        result = tf.strings.reduce_join(result, separator=' ', axis=-1)
        return result

    @tf.function
    def tokenize(self, strings):
        """Tokenize a batch of strings into token ids wrapped in START/END ids."""
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2, -1)
        enc = self.add_start_end(enc)
        return enc

    @tf.function
    def detokenize(self, tokenized):
        """Convert token ids back to text with reserved markers removed."""
        words = self.tokenizer.detokenize(tokenized)
        return self.cleanup_text(self._reserved_tokens, words)

    @tf.function
    def lookup(self, token_ids):
        """Return the vocabulary strings found at ``token_ids``."""
        return tf.gather(self.vocab, token_ids)

    @tf.function
    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return tf.shape(self.vocab)[0]

    @tf.function
    def get_vocab_path(self):
        """Return the vocabulary file tracked as a SavedModel asset."""
        return self._vocab_path

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens as a string tensor."""
        return tf.constant(self._reserved_tokens)

In [24]:
# Instantiate one tokenizer per language (French, Twi, English) and attach all
# three to a single tf.Module container.
tokenizers = tf.Module()
tokenizers.fr = CustomTokenizer(reserved_tokens, 'fr_vocab.txt')
tokenizers.twi = CustomTokenizer(reserved_tokens, 'twi_vocab.txt')
tokenizers.eng = CustomTokenizer(reserved_tokens, 'eng_vocab.txt')

In [25]:
# Save the container holding the three tokenizers as one SavedModel on Google Drive
model_name = '/content/drive/MyDrive/translate_frengtwi_converter'
tf.saved_model.save(tokenizers, model_name)

In [26]:
# Reload the SavedModel (rebinding `tokenizers`) and print the vocabulary size
# reported by each language's tokenizer.
tokenizers = tf.saved_model.load(model_name)
print(
    tokenizers.fr.get_vocab_size().numpy(),
    tokenizers.twi.get_vocab_size().numpy(),
    tokenizers.eng.get_vocab_size().numpy(),
    sep='\n',
)

2096
1993
1905


In [28]:
# Verify the French tokenizer on a test sentence: tokenize to ids, then map the
# ids back to their vocabulary strings for display.
tokens = tokenizers.fr.tokenize(['je suis étudiant'])
text_tokens = tokenizers.fr.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'je', b'suis', b'etudiant', b'[END]']]>

In [29]:
# Detokenize the French ids back to text; detokenize drops the [START]/[END] markers.
round_trip = tokenizers.fr.detokenize(tokens)
print(round_trip.numpy()[0].decode('utf-8'))

je suis etudiant


In [30]:
# Verify the Twi tokenizer on a test Twi sentence
tokens = tokenizers.twi.tokenize(['Obiara ani gyee na akokoduru no ho .'])
text_tokens = tokenizers.twi.lookup(tokens)
print(text_tokens)

<tf.RaggedTensor [[b'[START]', b'obiara', b'ani', b'gyee', b'na', b'akokoduru', b'no',
  b'ho', b'.', b'[END]']]>


In [31]:
# Detokenize the Twi ids back to text; detokenize drops the [START]/[END] markers.
round_trip = tokenizers.twi.detokenize(tokens)
print(round_trip.numpy()[0].decode('utf-8'))

obiara ani gyee na akokoduru no ho .


In [32]:
# Verify the English tokenizer on a test English sentence
tokens = tokenizers.eng.tokenize(['Patience is key to happiness'])
text_tokens = tokenizers.eng.lookup(tokens)
print(text_tokens)

<tf.RaggedTensor [[b'[START]', b'patience', b'is', b'key', b'to', b'happiness', b'[END]']]>


In [33]:
# Detokenize the English ids back to text; detokenize drops the [START]/[END] markers.
round_trip = tokenizers.eng.detokenize(tokens)
print(round_trip.numpy()[0].decode('utf-8'))

patience is key to happiness
