<a href="https://colab.research.google.com/github/gyasifred/msc-thesis/blob/main/tokenizer_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Tensorflow

In [2]:
!pip install -q -U tensorflow-text-nightly==2.11.0.dev20220817
!pip install -U -q tf-nightly==2.11.0.dev20220817

In [24]:
# import libraries
import tensorflow as tf

import tensorflow_text as tf_text
import pickle

## Preprocess data

In [14]:
# This code was adapted from https://github.com/GhanaNLP/kasa/blob/master/Kasa/Preprocessing.py
# import required library
import re
import unicodedata


class Preprocessing:
    # dummy initialization method
    def __init__(self):
        # initialize with some default parameters here later
        pass

    # read in parallel twi - french dataset
    def read_parallel_dataset(self, filepath_1, filepath_2):

        # read language 1
        lang_1 = []
        with open(filepath_1, encoding='utf-8') as file:
            line = file.readline()
            cnt = 1
            while line:
                lang_1.append(line.strip())
                line = file.readline()
                cnt += 1

        # read language 2
        lang_2 = []
        with open(filepath_2, encoding='utf-8') as file:

            # twi=file.read()
            line = file.readline()
            cnt = 1
            while line:
                lang_2.append(line.strip())
                line = file.readline()
                cnt += 1

      

        return lang_1, lang_2

    # Define a helper function to remove string accents

    def removeStringAccent(self, s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    # normalize input twi sentence
    def normalize_twi(self, s):
        s = self.removeStringAccent(s)
        s = re.sub(r'([!.?])', r' \1', s)
        s = re.sub(r'[^a-zA-Z.ƆɔɛƐ!?’]+', r' ', s)
        s = re.sub(r'\s+', r' ', s)
        return s

    # normalize input french and English sentence
    def normalize_fr(self, s):
        s = self.removeStringAccent(s)
        s = re.sub(r'([!.?])', r' \1', s)
        s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
        s = re.sub(r'\s+', r' ', s)
        return s

In [15]:
preprocessor = Preprocessing()

# Read raw parallel dataset
raw_data_twi, raw_data_fr = preprocessor.read_parallel_dataset(
    filepath_1='/content/verified_twi.txt',
    filepath_2='/content/verified_french.txt')

In [17]:
# Normalize the raw data
raw_data_fr = [preprocessor.normalize_fr(data) for data in raw_data_fr]
raw_data_twi = [preprocessor.normalize_twi(data) for data in raw_data_twi]

In [18]:
# define function to write text to txt file
def writeTotxt(destination,data):
  with open(destination, 'w') as f:
    for line in data:
        f.write(f"{line}\n")

In [19]:
# write the preprocess french and twi lines to a file
writeTotxt('twi_lines.txt',raw_data_twi)
writeTotxt('fr_lines.txt',raw_data_fr)

## Create Tokenizer

In [20]:
# buld tf dataset
full_dataset_fr = tf.data.TextLineDataset('/content/fr_lines.txt')
full_dataset_tw = tf.data.TextLineDataset('/content/twi_lines.txt')

Vecrorization

In [22]:
# add start and end tokens
def tf_start_and_end_tokens(text):
    # Split accented characters.
    #text = tf_text.normalize_utf8(text, 'NFKD')
    text = tf.strings.lower(text)
    # Strip whitespace.
    text = tf.strings.strip(text)

    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

# set maximum vocaburary size
max_vocab_size = 10000
# Process twi as input
twi_tokenizer = tf.keras.layers.TextVectorization(
    standardize=tf_start_and_end_tokens,
    max_tokens=max_vocab_size)
twi_tokenizer.adapt(full_dataset_tw)

# Process french as output
french_tokenizer = tf.keras.layers.TextVectorization(
    standardize=tf_start_and_end_tokens,
    max_tokens=max_vocab_size)

french_tokenizer.adapt(full_dataset_fr)

In [23]:
#verify tokenizer
# Print few lines of our tokenizers vocabulary and length
print(f'French Tokenizer:',french_tokenizer.get_vocabulary()[:10])
print(f'French Tokenizer size:',len(french_tokenizer.get_vocabulary()))

print()
print(f'TWI Tokenizer:',twi_tokenizer.get_vocabulary()[-10:])
print(f'TWI Tokenizer size:',len(twi_tokenizer.get_vocabulary()))


French Tokenizer: ['', '[UNK]', '[START]', '[END]', '.', 'a', 'de', 'je', 'est', 'il']
French Tokenizer size: 9570

TWI Tokenizer: ['abamu', 'abambu', 'abada', 'abaafo', 'aakwantuo', 'aa', '.r', '.meda', '.ma', '.abena']
TWI Tokenizer size: 7750


In [26]:
# test on simple sentenct
twi_tokenizer("Dɔ n nti na abofra no suɔ")

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([   2,  440,   33,   60,    8,  215,    5, 5491,    3])>

In [38]:
twi_tokenizer.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'no', 'sɛ', 'a', 'na', 'so']

## Save Tokenizer

In [32]:
#save Twi Tokenizer
# Pickle the config,vocabs and weights
pickle.dump({'config': twi_tokenizer.get_config(),
             'vocabulary':twi_tokenizer.get_vocabulary(),
             'weights': twi_tokenizer.get_weights()}
            , open("twiwords_tokenizer.pkl", "wb"))


In [33]:
# Save French Tokenizer
#save Twi Tokenizer
# Pickle the config,vocabs and weights
pickle.dump({'config': french_tokenizer.get_config(),
             'vocabulary':french_tokenizer.get_vocabulary(),
             'weights': french_tokenizer.get_weights()}
            , open("frenchwords_tokenizer.pkl", "wb"))


## Reload Tokenizers and test

In [34]:
#reload twi tokenizer
tw_t= pickle.load(open("twiwords_tokenizer.pkl", "rb"))
new_twi_tokenizer = tf.keras.layers.TextVectorization.from_config(tw_t['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
new_twi_tokenizer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_twi_tokenizer.set_weights(tw_t['weights'])
new_twi_tokenizer.set_vocabulary(tw_t['vocabulary'])


In [35]:
# test new twi_tokenizer
#test on simple sentenct
new_twi_tokenizer("Dɔ n nti na abofra no suɔ")

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([   2,  440,   33,   60,    8,  215,    5, 5491,    3])>

In [37]:
#get vocabulary
new_twi_tokenizer.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'no', 'sɛ', 'a', 'na', 'so']

In [40]:
# reload French Tokenizer
#reload twi tokenizer
tw_t= pickle.load(open("frenchwords_tokenizer.pkl", "rb"))
new_french_tokenizer = tf.keras.layers.TextVectorization.from_config(tw_t['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
new_french_tokenizer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_french_tokenizer.set_weights(tw_t['weights'])
new_french_tokenizer.set_vocabulary(tw_t['vocabulary'])


In [41]:
# test new french_tokenizer
#test on simple sentenct
new_twi_tokenizer("Elle a acheté deux douzaines d'œufs.")

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([2, 1, 7, 1, 1, 1, 1, 3])>

In [43]:
#get vocabulary
new_french_tokenizer.get_vocabulary()[-10:]

['abattre',
 'abasourdis',
 'abasourdi',
 'abandonnerait',
 'abandonnerais',
 'abandonnee',
 'abandonna',
 'abaisser',
 '.r',
 '.a']