<a href="https://colab.research.google.com/github/gyasifred/msc-thesis/blob/main/tokenizer_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install TensorFlow and TensorFlow Text (pinned nightly builds)

In [20]:
!pip install -q -U tensorflow-text-nightly==2.11.0.dev20220817
!pip install -U -q tf-nightly==2.11.0.dev20220817

In [21]:
# import libraries
import tensorflow as tf
import tensorflow_text as tf_text
import pickle

## Preprocess data

In [22]:
# This code was adapted from https://github.com/GhanaNLP/kasa/blob/master/Kasa/Preprocessing.py
# A subclass of the kasafranse for preprocessing data
# import required library
import re
import unicodedata


class Preprocessing:
    """Load, normalize and save the parallel Twi / French / English corpus.

    The class holds no state; it only groups the helper methods used by the
    notebook.
    """

    def __init__(self):
        # Nothing to configure yet; kept so default parameters can be added later.
        pass

    @staticmethod
    def _read_lines(filepath):
        """Return every line of a UTF-8 text file, stripped of surrounding whitespace."""
        with open(filepath, encoding='utf-8') as file:
            return [line.strip() for line in file]

    def read_parallel_dataset(self, filepath_1, filepath_2, filepath_3=None):
        """Read two or three line-aligned text files.

        Args:
            filepath_1: path of the first language file.
            filepath_2: path of the second language file.
            filepath_3: optional path of a third language file.

        Returns:
            ``(lang_1, lang_2)`` when ``filepath_3`` is None, otherwise
            ``(lang_1, lang_2, lang_3)``. Each element is a list holding the
            stripped lines of the corresponding file, in file order.

        Raises:
            OSError: if any of the files cannot be opened.
        """
        lang_1 = Preprocessing._read_lines(filepath_1)
        lang_2 = Preprocessing._read_lines(filepath_2)
        if filepath_3 is None:
            return lang_1, lang_2
        lang_3 = Preprocessing._read_lines(filepath_3)
        return lang_1, lang_2, lang_3

    def removeStringAccent(self, s):
        """Strip accents: decompose to NFD and drop the combining marks (category 'Mn')."""
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalize_twi(self, s):
        """Normalize a Twi sentence.

        Removes accents, lower-cases, puts a space before each of ``! . ?``,
        replaces every run of characters other than ASCII letters, ``Ɔ ɔ ɛ Ɛ``,
        ``. ! ?`` and ``’`` with one space, then collapses whitespace runs.
        Digits are therefore dropped, and leading/trailing spaces are not trimmed.
        """
        s = self.removeStringAccent(s)
        s = s.lower()
        s = re.sub(r'([!.?])', r' \1', s)
        s = re.sub(r'[^a-zA-Z.ƆɔɛƐ!?’]+', r' ', s)
        s = re.sub(r'\s+', r' ', s)
        return s

    def normalize_FrEn(self, s):
        """Normalize a French or English sentence.

        Same pipeline as ``normalize_twi`` but only ASCII letters and ``. ! ?``
        are kept; every other run of characters becomes a single space.
        """
        s = self.removeStringAccent(s)
        s = s.lower()
        s = re.sub(r'([!.?])', r' \1', s)
        s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
        s = re.sub(r'\s+', r' ', s)
        return s

    def writeTotxt(self, destination, data):
        """Write each item of ``data`` to ``destination``, one item per line.

        The file is written as UTF-8 so that characters such as ɔ and ɛ do not
        depend on the platform's default encoding and match the encoding used
        by ``read_parallel_dataset``.
        """
        with open(destination, 'w', encoding='utf-8') as f:
            for line in data:
                f.write(f"{line}\n")

In [23]:
# Instantiate the preprocessing helper class.
preprocessor = Preprocessing()

# Load the three line-aligned corpora from Google Drive.
# NOTE(review): the /content/drive paths presumably require Drive to be mounted
# first; no mount call is visible in this notebook.
raw_data_twi, raw_data_fr, raw_eng_data = preprocessor.read_parallel_dataset(
    '/content/drive/MyDrive/verified_twi.txt',
    '/content/drive/MyDrive/verified_english_french.txt',
    filepath_3='/content/drive/MyDrive/verified_english.txt',
)

# Normalize every sentence; French and English share the same normalizer.
raw_data_twi = list(map(preprocessor.normalize_twi, raw_data_twi))
raw_data_fr = list(map(preprocessor.normalize_FrEn, raw_data_fr))
raw_data_eng = list(map(preprocessor.normalize_FrEn, raw_eng_data))

In [24]:
# Persist each normalized corpus to its own text file, one entry per line.
for destination, sentences in (
    ('raw_data_twi.txt', raw_data_twi),
    ('raw_data_fr.txt', raw_data_fr),
    ('raw_data_eng.txt', raw_data_eng),
):
    preprocessor.writeTotxt(destination, sentences)

## Create Tokenizer

Build the `tf.data` datasets

In [25]:
# Stream the normalized corpora line by line.
# The paths are relative, matching the files written with `writeTotxt` above,
# so the cell no longer depends on the working directory being /content.
lines_dataset_fr = tf.data.TextLineDataset('raw_data_fr.txt')
lines_dataset_tw = tf.data.TextLineDataset('raw_data_twi.txt')
lines_dataset_eng = tf.data.TextLineDataset('raw_data_eng.txt')

Vectorization

In [26]:
def tf_start_and_end_tokens(text):
    """Standardize text and wrap it in [START] / [END] markers.

    The text is NFKD-normalized (which splits accented characters),
    lower-cased and stripped of surrounding whitespace; the markers are then
    joined on with single spaces.
    """
    cleaned = tf.strings.strip(
        tf.strings.lower(tf_text.normalize_utf8(text, 'NFKD'))
    )
    return tf.strings.join(['[START]', cleaned, '[END]'], separator=' ')

# Upper bound on the vocabulary size of every tokenizer.
max_vocab_size = 10000


def _build_tokenizer(lines_dataset):
    """Create a TextVectorization layer and fit its vocabulary on `lines_dataset`.

    Every tokenizer shares the same standardization (`tf_start_and_end_tokens`)
    and the same `max_vocab_size` cap.
    """
    tokenizer = tf.keras.layers.TextVectorization(
        standardize=tf_start_and_end_tokens,
        max_tokens=max_vocab_size)
    tokenizer.adapt(lines_dataset)
    return tokenizer


# Twi is processed as the input language.
twi_tokenizer = _build_tokenizer(lines_dataset_tw)

# French is processed as an output language.
french_tokenizer = _build_tokenizer(lines_dataset_fr)

# English tokenizer, built the same way from the English corpus.
english_tokenizer = _build_tokenizer(lines_dataset_eng)

In [27]:
# Verify the tokenizers: print a ten-entry slice of each vocabulary and its size.
# French shows the first ten entries; Twi and English show the last ten.
for position, (title, size_title, tokenizer, window) in enumerate((
    ('French Tokenizer:', 'French Tokenizer size:', french_tokenizer, slice(None, 10)),
    ('TWI Tokenizer:', 'TWI Tokenizer size:', twi_tokenizer, slice(-10, None)),
    ('English Tokenizer:', 'English Tokenizer size:', english_tokenizer, slice(-10, None)),
)):
    if position:
        # Blank line between consecutive reports.
        print()
    vocabulary = tokenizer.get_vocabulary()
    print(title, vocabulary[window])
    print(size_title, len(vocabulary))

French Tokenizer: ['', '[UNK]', '[START]', '[END]', '.', 'a', 'de', 'je', 'est', 'il']
French Tokenizer size: 9553

TWI Tokenizer: ['abamu', 'abambu', 'abada', 'abaafo', 'aakwantuo', 'aa', '.r', '.meda', '.ma', '.abena']
TWI Tokenizer size: 7551

English Tokenizer: ['abstract', 'abstained', 'absorbs', 'absences', 'abiding', 'abhor', 'abducted', 'abbreviation', '.r', '.a']
English Tokenizer size: 7479


In [28]:
# Test on a simple sentence: the standardizer wraps the text in [START] / [END],
# so the returned ids should begin and end with the ids of those two markers.
twi_tokenizer("Dɔ n nti na abofra no suɔ")

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([   2,  442,   35,   59,    8,  216,    5, 5293,    3])>

In [29]:
# Show the first ten entries of the Twi vocabulary.
twi_tokenizer.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'no', 'sɛ', 'a', 'na', 'so']

## Save Tokenizer

In [30]:
# Save the Twi tokenizer: pickle its config, vocabulary and weights.
# NOTE(review): the config presumably carries the custom `standardize` callable,
# which pickle stores by reference; confirm `tf_start_and_end_tokens` is defined
# in whatever session later loads this file.
twi_tokenizer_state = {'config': twi_tokenizer.get_config(),
                       'vocabulary': twi_tokenizer.get_vocabulary(),
                       'weights': twi_tokenizer.get_weights()}
# The context manager closes the file explicitly once the pickle is written.
with open("/content/drive/MyDrive/twi_tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(twi_tokenizer_state, tokenizer_file)


In [31]:
# Save the French tokenizer: pickle its config, vocabulary and weights.
# NOTE(review): the config presumably carries the custom `standardize` callable,
# which pickle stores by reference; confirm `tf_start_and_end_tokens` is defined
# in whatever session later loads this file.
french_tokenizer_state = {'config': french_tokenizer.get_config(),
                          'vocabulary': french_tokenizer.get_vocabulary(),
                          'weights': french_tokenizer.get_weights()}
# The context manager closes the file explicitly once the pickle is written.
with open("/content/drive/MyDrive/french_tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(french_tokenizer_state, tokenizer_file)


In [32]:
# Save the English tokenizer: pickle its config, vocabulary and weights.
# NOTE(review): the config presumably carries the custom `standardize` callable,
# which pickle stores by reference; confirm `tf_start_and_end_tokens` is defined
# in whatever session later loads this file.
english_tokenizer_state = {'config': english_tokenizer.get_config(),
                           'vocabulary': english_tokenizer.get_vocabulary(),
                           'weights': english_tokenizer.get_weights()}
# The context manager closes the file explicitly once the pickle is written.
with open("/content/drive/MyDrive/english_tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(english_tokenizer_state, tokenizer_file)


## Reload Tokenizers and test

In [33]:
def loadtokenizer(filepath):
    """Rebuild a TextVectorization layer from a pickled tokenizer state.

    Args:
        filepath: path to a pickle file holding a dict with the keys
            'config', 'weights' and 'vocabulary'.

    Returns:
        The `tf.keras.layers.TextVectorization` layer created from the saved
        config, with the saved weights and vocabulary applied.

    Warning:
        `pickle.load` can execute arbitrary code; only load files you trust.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(filepath, "rb") as tokenizer_file:
        saved_state = pickle.load(tokenizer_file)

    tokenizer = tf.keras.layers.TextVectorization.from_config(saved_state['config'])
    # Keras workaround: `adapt` must be called on some dummy data before the
    # saved weights can be set.
    tokenizer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
    tokenizer.set_weights(saved_state['weights'])
    tokenizer.set_vocabulary(saved_state['vocabulary'])
    return tokenizer
  

In [34]:
# Reload the French tokenizer from the pickle saved on Drive.
french_tokenizer1 = loadtokenizer('/content/drive/MyDrive/french_tokenizer.pkl')
# Run the restored layer on a simple sentence to confirm it returns token ids.
french_tokenizer1("Vous devriez parler au professeur vous meme .")


<tf.Tensor: shape=(10,), dtype=int64, numpy=array([  2,  13, 408, 102,  45, 315,  13, 116,   4,   3])>

In [35]:
# Reload the Twi tokenizer from the pickle saved on Drive.
twi_tokenizer1 = loadtokenizer('/content/drive/MyDrive/twi_tokenizer.pkl')
# Same sentence as the pre-save test of `twi_tokenizer`, so the two outputs
# can be compared directly.
twi_tokenizer1("Dɔ n nti na abofra no suɔ")


<tf.Tensor: shape=(9,), dtype=int64, numpy=array([   2,  442,   35,   59,    8,  216,    5, 5293,    3])>

In [37]:
# Reload the English tokenizer from the pickle saved on Drive.
english_tokenizer1 = loadtokenizer('/content/drive/MyDrive/english_tokenizer.pkl')
# Run the restored layer on a simple sentence to confirm it returns token ids.
english_tokenizer1("The true meaning of life")


<tf.Tensor: shape=(7,), dtype=int64, numpy=array([   2,    6,  259, 1481,   18,  180,    3])>