<a href="https://colab.research.google.com/github/gyasifred/msc-thesis/blob/main/text_vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install TensorFlow Text (pinned to the 2.8.x series)

In [1]:
!pip install "tensorflow-text==2.8.*"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text==2.8.*
  Downloading tensorflow_text-2.8.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 2.7 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.8.2


In [2]:
# import libraries
import tensorflow as tf
import tensorflow_text as tf_text
import pickle

## Preprocess data

In [3]:
# This code was adapted from https://github.com/GhanaNLP/kasa/blob/master/Kasa/Preprocessing.py
# Standalone preprocessing helper for the parallel corpora (it does not inherit from any other class)
# import required library
import re
import unicodedata


class Preprocessing:
    """Read, normalize and write line-aligned parallel text corpora.

    All file access uses UTF-8 explicitly so that non-ASCII letters such as
    ``ɔ`` and ``ɛ`` survive a write/read round trip regardless of the
    platform's default locale encoding.
    """

    # dummy initialization method
    def __init__(self):
        # initialize with some default parameters here later
        pass

    def read_parallel_dataset(self, filepath_1, filepath_2, filepath_3=None):
        """Read two or three parallel text files line by line.

        Args:
            filepath_1: Path of the first language file (UTF-8).
            filepath_2: Path of the second language file (UTF-8).
            filepath_3: Optional path of a third language file (UTF-8).

        Returns:
            A tuple ``(lang_1, lang_2)`` or, when ``filepath_3`` is given,
            ``(lang_1, lang_2, lang_3)``. Each element is a list holding every
            line of the corresponding file with surrounding whitespace
            stripped (blank lines are kept as empty strings).

        Raises:
            OSError: If any of the files cannot be opened.
        """
        paths = [filepath_1, filepath_2]
        if filepath_3 is not None:
            paths.append(filepath_3)

        corpora = []
        for path in paths:
            with open(path, encoding='utf-8') as file:
                # Iterating the handle yields the same lines as repeated
                # readline() calls until EOF.
                corpora.append([line.strip() for line in file])
        return tuple(corpora)

    def removeStringAccent(self, s):
        """Return ``s`` with all combining marks (Unicode category Mn) removed.

        The string is first decomposed with NFD so that accented letters are
        split into a base letter plus combining marks, which are then dropped.
        """
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalize_twi(self, s):
        """Normalize a Twi sentence.

        Steps: strip accents, lower-case, put a space before ``!``, ``.`` and
        ``?``, replace every run of characters outside the allowed set
        (ASCII letters, ``Ɔ ɔ ɛ Ɛ``, ``. ! ?`` and ``’``) with one space, and
        collapse repeated whitespace. Leading/trailing spaces are not trimmed.
        """
        s = self.removeStringAccent(s)
        s = s.lower()
        s = re.sub(r'([!.?])', r' \1', s)
        s = re.sub(r'[^a-zA-Z.ƆɔɛƐ!?’]+', r' ', s)
        s = re.sub(r'\s+', r' ', s)
        return s

    def normalize_FrEn(self, s):
        """Normalize a French or English sentence.

        Same pipeline as ``normalize_twi`` but only ASCII letters and
        ``. ! ?`` are kept; everything else becomes a single space.
        """
        s = self.removeStringAccent(s)
        s = s.lower()
        s = re.sub(r'([!.?])', r' \1', s)
        s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
        s = re.sub(r'\s+', r' ', s)
        return s

    def writeTotxt(self, destination, data):
        """Write each item of ``data`` to ``destination``, one item per line.

        The file is overwritten and encoded as UTF-8, matching the encoding
        used by ``read_parallel_dataset``.
        """
        with open(destination, 'w', encoding='utf-8') as f:
            for line in data:
                f.write(f"{line}\n")

In [4]:
# Build the preprocessing helper defined in the previous cell
preprocessor = Preprocessing()

# Load the three line-aligned corpora from Google Drive
raw_data_twi, raw_data_fr, raw_eng_data = preprocessor.read_parallel_dataset(
    '/content/drive/MyDrive/verified_twi.txt',
    '/content/drive/MyDrive/verified_english_french.txt',
    filepath_3='/content/drive/MyDrive/verified_english.txt',
)

# Normalize every sentence with the language-specific routine
raw_data_twi = list(map(preprocessor.normalize_twi, raw_data_twi))
raw_data_fr = list(map(preprocessor.normalize_FrEn, raw_data_fr))
raw_data_eng = list(map(preprocessor.normalize_FrEn, raw_eng_data))

In [5]:
# Persist the normalized sentences, one file per language (one sentence per line)
preprocessor.writeTotxt(destination='raw_data_twi.txt', data=raw_data_twi)
preprocessor.writeTotxt(destination='raw_data_fr.txt', data=raw_data_fr)
preprocessor.writeTotxt(destination='raw_data_eng.txt', data=raw_data_eng)

## Create Tokenizer

Build the `tf.data` datasets

In [6]:
# Read the normalized corpora back as line-oriented tf.data datasets.
# The relative names match the ones the files were written under, so this
# cell works from any working directory (not only Colab's /content).
lines_dataset_fr = tf.data.TextLineDataset('raw_data_fr.txt')
lines_dataset_tw = tf.data.TextLineDataset('raw_data_twi.txt')
lines_dataset_eng = tf.data.TextLineDataset('raw_data_eng.txt')

Vectorization

In [7]:
class ProcessTokenizer:
    """Build, save and reload Keras ``TextVectorization`` tokenizers."""

    def __init__(self) -> None:
        pass

    def tf_start_and_end_tokens(self, text):
        """Standardization callback: trim ``text`` and wrap it in markers.

        Returns the string tensor ``"[START] <text> [END]"`` (joined with
        single spaces). No lower-casing or punctuation handling happens here.
        """
        # Strip whitespace.
        text = tf.strings.strip(text)

        text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
        return text

    def build_tokenizer(self, ds, max_vocab_size):
        """Create a ``TextVectorization`` layer and adapt it to ``ds``.

        Args:
            ds: Data passed to ``TextVectorization.adapt``.
            max_vocab_size: Value used for the layer's ``max_tokens``.

        Returns:
            The adapted ``TextVectorization`` layer.
        """
        tmp = tf.keras.layers.TextVectorization(
            standardize=self.tf_start_and_end_tokens,
            max_tokens=max_vocab_size)
        tmp.adapt(ds)
        return tmp

    def savetokenizer(self, filepath, tokenizer):
        """Pickle the tokenizer's config, vocabulary and weights to ``filepath``.

        Returns ``None``. The file handle is closed deterministically.
        """
        # Collect the state first so a failure here never truncates the file.
        payload = {'config': tokenizer.get_config(),
                   'vocabulary': tokenizer.get_vocabulary(),
                   'weights': tokenizer.get_weights()}
        with open(filepath, "wb") as handle:
            pickle.dump(payload, handle)

    def loadtokenizer(self, filepath):
        """Rebuild a ``TextVectorization`` layer from a ``savetokenizer`` file.

        Returns:
            A new layer created from the stored config with the stored
            weights and vocabulary applied.
        """
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files that were produced by savetokenizer from a trusted source.
        with open(filepath, "rb") as handle:
            tmp = pickle.load(handle)
        temp = tf.keras.layers.TextVectorization.from_config(tmp['config'])
        # You have to call `adapt` with some dummy data (BUG in Keras)
        temp.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
        temp.set_weights(tmp['weights'])
        temp.set_vocabulary(tmp['vocabulary'])
        return temp

In [8]:
# Upper bound on the number of tokens kept in each vocabulary
max_vocab_size = 5_000

# Helper object used to build, save and reload the tokenizers
tokenizer_preprocess = ProcessTokenizer()

In [9]:
%%time
# Process twi as input
twi_tokenizer = tokenizer_preprocess.build_tokenizer(
    lines_dataset_tw, max_vocab_size)

CPU times: user 23.3 s, sys: 1.61 s, total: 24.9 s
Wall time: 24.2 s


In [10]:
%%time
# Process french as output
french_tokenizer = tokenizer_preprocess.build_tokenizer(
    lines_dataset_fr, max_vocab_size)


CPU times: user 21.7 s, sys: 1.62 s, total: 23.3 s
Wall time: 20.6 s


In [11]:
%%time
# Process english as output
english_tokenizer = tokenizer_preprocess.build_tokenizer(
    lines_dataset_eng, max_vocab_size)

CPU times: user 22 s, sys: 1.52 s, total: 23.5 s
Wall time: 20.3 s


In [12]:
# Sanity check: show a slice of each vocabulary together with its size
# (first ten entries for French, last ten for Twi and English)
print('French Tokenizer:', french_tokenizer.get_vocabulary()[:10])
print('French Tokenizer size:', len(french_tokenizer.get_vocabulary()))
print()

print('TWI Tokenizer:', twi_tokenizer.get_vocabulary()[-10:])
print('TWI Tokenizer size:', len(twi_tokenizer.get_vocabulary()))
print()

print('English Tokenizer:', english_tokenizer.get_vocabulary()[-10:])
print('English Tokenizer size:', len(english_tokenizer.get_vocabulary()))

French Tokenizer: ['', '[UNK]', '[START]', '[END]', '.', 'a', 'de', 'je', 'est', 'il']
French Tokenizer size: 5000

TWI Tokenizer: ['wonhwehwɛ', 'wonguan', 'wonfie', 'wonfata', 'wonantew', 'wommɔɔ', 'wommu', 'wommra', 'wommpɛ', 'wommisa']
TWI Tokenizer size: 5000

English Tokenizer: ['smashed', 'smartest', 'smallpox', 'smallest', 'slugged', 'sloshed', 'sloppy', 'slope', 'slogan', 'slit']
English Tokenizer size: 5000


In [13]:
# Smoke test: tokenize a single Twi sentence and display the token ids.
# NOTE(review): the sentence is passed as-is (not run through normalize_twi),
# and the tokenizer's standardize step only strips whitespace and adds
# [START]/[END]; the capitalised first word may therefore not match the
# lower-cased vocabulary - confirm whether that is intended.
twi_tokenizer("Dɔ n nti na abofra no suɔ")

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([  2,   1,  35,  59,   8, 216,   5,   1,   3])>

## Save Tokenizer

In [15]:
# Persist the French tokenizer to Google Drive
tokenizer_preprocess.savetokenizer(
    filepath='/content/drive/MyDrive/french_tokenizer.pkl',
    tokenizer=french_tokenizer)

# Persist the English tokenizer to Google Drive
tokenizer_preprocess.savetokenizer(
    filepath='/content/drive/MyDrive/english_tokenizer.pkl',
    tokenizer=english_tokenizer)

# Persist the Twi tokenizer to Google Drive
tokenizer_preprocess.savetokenizer(
    filepath='/content/drive/MyDrive/twi_tokenizer.pkl',
    tokenizer=twi_tokenizer)



## Reload Tokenizers and test

In [16]:
# Restore the French tokenizer from its pickle file
french_tokenizer1 = tokenizer_preprocess.loadtokenizer(
    filepath='/content/drive/MyDrive/french_tokenizer.pkl')
# Tokenize one sample sentence with the restored layer
french_tokenizer1("Vous devriez parler au professeur vous meme .")


<tf.Tensor: shape=(10,), dtype=int64, numpy=array([  2,   1, 408, 102,  45, 315,  13, 116,   4,   3])>

In [17]:
# Restore the English tokenizer from its pickle file
english_tokenizer1 = tokenizer_preprocess.loadtokenizer(
    filepath='/content/drive/MyDrive/english_tokenizer.pkl')
# Tokenize one sample sentence with the restored layer
english_tokenizer1("The true meaning of life")


<tf.Tensor: shape=(7,), dtype=int64, numpy=array([   2,    1,  259, 1481,   18,  180,    3])>

In [18]:
# Restore the Twi tokenizer from its pickle file
twi_tokenizer1 = tokenizer_preprocess.loadtokenizer(
    filepath='/content/drive/MyDrive/twi_tokenizer.pkl')
# Tokenize the same sample sentence used for the freshly built tokenizer
twi_tokenizer1("Dɔ n nti na abofra no suɔ")

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([  2,   1,  35,  59,   8, 216,   5,   1,   3])>