In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import os
%matplotlib inline
sns.set()
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import time 

In [None]:

from text import Alphabet, levenshtein, wer, cer
from transformer import Transformer, CustomSchedule, evaluate, accuracy_function, create_masks

In [None]:
# One Alphabet object per language, used to convert text to labels and back.
alphabet_ar, alphabet_en = map(
    Alphabet, ('alphabet_ar_ANETAC.txt', 'alphabet_en_ANETAC.txt'))

In [None]:
# df_ar=pd.read_csv('EN-AR Translit/train.ar',names=['arabic'],header=None)
# df_en=pd.read_csv('EN-AR Translit/train.en',names=['english'],header=None)

# df_ar_val=pd.read_csv('EN-AR Translit/dev.ar',names=['arabic'],header=None)
# df_en_val=pd.read_csv('EN-AR Translit/dev.en',names=['english'],header=None)

# df1=pd.concat([df_ar,df_en],axis=1)
# df2=pd.concat([df_ar_val,df_en_val],axis=1)
# df3=pd.read_excel('transliterations_ar2en.xlsx')
# df=pd.concat([df1,df2,df3],axis=0)

In [None]:
# Read SAS dataset
# The preview below shows `arabic` and `english` columns plus an `Unnamed: 0`
# column coming from the spreadsheet.
# NOTE(review): `Unnamed: 0` looks like a saved row index — confirm before relying on it.
df = pd.read_excel('transliterations_ar2en.xlsx')

In [None]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,arabic,english
0,آب,AB
1,آبي,ABE
2,آتي,ATI
3,آثار,AASER
4,آدم,ADAM


In [None]:
# Wrap every sample in '<' (start token) and '>' (end token).
# The English side is also converted to a string and lower-cased first.
df['arabic'] = df['arabic'].map(lambda word: ''.join(('<', word, '>')))
df['english'] = df['english'].map(lambda word: ''.join(('<', str(word).lower(), '>')))

In [None]:
#  remove the tashkeel in the arabic text
# The diacritics (and the tatweel elongation mark) are spelled as explicit
# code points inside a single character class, so the invisible combining
# marks cannot be silently altered by an editor or by Unicode normalisation.
noise = re.compile(
    '['
    '\u0651'  # Tashdid
    '\u064e'  # Fatha
    '\u064b'  # Tanwin Fath
    '\u064f'  # Damma
    '\u064c'  # Tanwin Damm
    '\u0650'  # Kasra
    '\u064d'  # Tanwin Kasr
    '\u0652'  # Sukun
    '\u0640'  # Tatwil/Kashida
    ']'
)
# Label-encode both columns; the Arabic text has its tashkeel stripped first.
df['arabic_encoded'] = df['arabic'].apply(
    lambda text: alphabet_ar.encode(noise.sub('', text)))
df['english_encoded'] = df['english'].apply(
    lambda text: alphabet_en.encode(text))

## Set hyperparameters

In [None]:
# --- Input pipeline ---
BATCH_SIZE = 16
BUFFER_SIZE = 20000   # shuffle buffer size

# --- Transformer ---
num_layers = 2
num_heads = 2
d_model = 64
dff = 64
dropout_rate = 0.3

# Vocabulary sizes: one more than the size reported by each alphabet.
vocab_inp_size = 1 + alphabet_ar.size()
vocab_tar_size = 1 + alphabet_en.size()

# --- Not referenced by the other cells visible in this notebook ---
embedding_dim = 32
units = 128
n = 64   # previously tried: n, d = 256, 128
d = 64

# Directory where training checkpoints are written / restored from.
checkpoint_path = "./checkpoints/train4"

In [None]:
#  Convert one (Arabic, English) pair of string tensors into label tensors.
def encode(txt_ar, txt_en):
  """Encode an Arabic/English text pair as int64 label tensors.

  Both arguments are eager string tensors holding UTF-8 bytes. The `noise`
  pattern (tashkeel) is removed from the Arabic text before label-encoding.

  Returns:
    A pair (arabic_labels, english_labels) of tf.int64 tensors.
  """
  arabic_text = noise.sub('', txt_ar.numpy().decode('utf8'))
  arabic_labels = tf.convert_to_tensor(alphabet_ar.encode(arabic_text), dtype=tf.int64)

  english_text = txt_en.numpy().decode('utf8')
  english_labels = tf.convert_to_tensor(alphabet_en.encode(english_text), dtype=tf.int64)
  return arabic_labels, english_labels
#  Runs the Python-level `encode` through tf.py_function so it can be used in Dataset.map.
def tf_encode(txt_ar, txt_en):
    """Wrap `encode` as a TensorFlow op returning two int64 tensors (arabic, english)."""
    encoded = tf.py_function(func=encode, inp=[txt_ar, txt_en], Tout=[tf.int64, tf.int64])
    labels_ar, labels_en = encoded
    return labels_ar, labels_en

# Build the training pipeline as one chain:
# string pairs -> label tensors -> shuffled, padded batches -> prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices(
        (np.array(df['arabic'].values), np.array(df['english'].values)))
    .map(tf_encode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    # Sequences in a batch are padded to the longest one in that batch.
    .padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
    .prefetch(tf.data.experimental.AUTOTUNE)
)



## Optimizer

In [None]:
# Learning-rate schedule parameterised by the model width.
learning_rate = CustomSchedule(d_model)

# Adam driven by the schedule above instead of a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

## Loss and metrics

In [None]:
# Cross-entropy on raw logits. Reduction is disabled so the loss comes back
# per element and can be masked before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)

In [None]:
def loss_function(real, pred):
  """Mean cross-entropy over the positions where `real` is non-zero.

  Positions whose target label is 0 (the value `padded_batch` pads with by
  default) get weight 0, so they contribute neither to the summed loss nor
  to the count it is divided by.
  """
  per_position = loss_object(real, pred)
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_position.dtype)
  return tf.reduce_sum(per_position * keep) / tf.reduce_sum(keep)




In [None]:
# Running means of the loss and accuracy values fed in by each training step.
train_loss, train_accuracy = (
    tf.keras.metrics.Mean(name=metric_name)
    for metric_name in ('train_loss', 'train_accuracy')
)

## Training and checkpointing

In [None]:
# Build the model from the hyperparameters defined above.
transformer = Transformer(
    num_layers=num_layers,
    num_heads=num_heads,
    d_model=d_model,
    dff=dff,
    rate=dropout_rate,
    input_vocab_size=vocab_inp_size,    # alphabet_ar.size() + 1
    target_vocab_size=vocab_tar_size,   # alphabet_en.size() + 1
    pe_input=50,
    pe_target=50,
)

### Checkpointing

Create the checkpoint path and the checkpoint manager. This will be used to save checkpoints every `n` epochs.

In [None]:
# Track the model and the optimizer together in one checkpoint object.
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

# Manage checkpoints under `checkpoint_path` (max_to_keep=5).
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# If a checkpoint exists, restore the latest one so training resumes from it.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

In [None]:
# Number of passes over `dataset` performed by the training loop.
EPOCHS = 30

In [None]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  """Run one optimisation step on a batch and update the running metrics.

  Args:
    inp: rank-2 int64 tensor of source (Arabic) labels.
    tar: rank-2 int64 tensor of target (English) labels.

  Side effects:
    Applies gradients to `transformer.trainable_variables` through `optimizer`
    and feeds the batch loss / accuracy into `train_loss` / `train_accuracy`.
    Nothing is returned.
  """
  # The decoder is fed the target without its last position and is scored
  # against the target without its first position (shifted by one step).
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  # create_masks is provided by the project's `transformer` module, imported
  # with the other local imports at the top of the notebook.
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  
  with tf.GradientTape() as tape:
    # The third positional argument (True) is presumably the training flag
    # that enables dropout — TODO confirm against Transformer.call.
    predictions, _ = transformer(inp, tar_inp, 
                                 True, 
                                 enc_padding_mask, 
                                 combined_mask, 
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(accuracy_function(tar_real, predictions))

### Training Loop

In [None]:

for epoch in range(EPOCHS):
  start = time.time()
  
  # Start every epoch with fresh running averages.
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  # inp -> arabic, tar -> english
  for (batch, (inp, tar)) in enumerate(dataset):
    train_step(inp, tar)
    
    # Report the running averages every 50 batches.
    if batch % 50 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
      
  # Save a checkpoint every 5 epochs.
  if (epoch + 1) % 5 == 0:
    ckpt_save_path = ckpt_manager.save()
    print (f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')
    
  print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1 Batch 0 Loss 4.2024 Accuracy 0.0388
Epoch 1 Batch 50 Loss 4.0718 Accuracy 0.0376
Epoch 1 Batch 100 Loss 3.8440 Accuracy 0.0772
Epoch 1 Batch 150 Loss 3.6582 Accuracy 0.1056
Epoch 1 Batch 200 Loss 3.5205 Accuracy 0.1208
Epoch 1 Batch 250 Loss 3.4173 Accuracy 0.1317
Epoch 1 Batch 300 Loss 3.3297 Accuracy 0.1431
Epoch 1 Batch 350 Loss 3.2594 Accuracy 0.1537
Epoch 1 Batch 400 Loss 3.1917 Accuracy 0.1659
Epoch 1 Batch 450 Loss 3.1284 Accuracy 0.1790
Epoch 1 Batch 500 Loss 3.0738 Accuracy 0.1908
Epoch 1 Batch 550 Loss 3.0196 Accuracy 0.2021
Epoch 1 Batch 600 Loss 2.9684 Accuracy 0.2132
Epoch 1 Batch 650 Loss 2.9229 Accuracy 0.2221
Epoch 1 Batch 700 Loss 2.8823 Accuracy 0.2298
Epoch 1 Batch 750 Loss 2.8440 Accuracy 0.2366
Epoch 1 Batch 800 Loss 2.8048 Accuracy 0.2439
Epoch 1 Batch 850 Loss 2.7716 Accuracy 0.2501
Epoch 1 Batch 900 Loss 2.7384 Accuracy 0.2560
Epoch 1 Batch 950 Loss 2.7080 Accuracy 0.2612
Epoch 1 Batch 1000 Loss 2.6801 Accuracy 0.2663
Epoch 1 Batch 1050 Loss 2.6521 Accur

## Evaluation

In [None]:
#  Transliterate a single word, then show the dataset rows holding the same
#  word (wrapped in the start/end tokens) for comparison.
word = 'ذهب'
print(evaluate(word, alphabet_ar, alphabet_en, transformer))
df[df['arabic'].eq('<' + word + '>')]


<sab>


Unnamed: 0,arabic,english,arabic_encoded,english_encoded
32610,<ذهب>,<saab>,"[1, 18, 36, 10, 2]","[1, 21, 3, 3, 4, 2]"


In [None]:
#  read the test data of ANETAC dataset
# The files are expected to hold one word per line, so every value is read as
# a plain string and NA detection is switched off. With the pandas defaults,
# entries such as "NA", "null" or "nan" would be loaded as float NaN instead
# of text and break the evaluation and error-rate cells further down.
df_ar = pd.read_csv('EN-AR Translit/test.ar', names=['arabic'], header=None,
                    dtype=str, keep_default_na=False)
df_en = pd.read_csv('EN-AR Translit/test.en', names=['english'], header=None,
                    dtype=str, keep_default_na=False)

# Put the two single-column frames side by side (aligned on the row index).
df_test = pd.concat([df_ar, df_en], axis=1)

In [None]:
# Transliterate every Arabic test word and drop the '<' / '>' markers from
# the model output in the same pass.
df_test['model'] = df_test['arabic'].apply(
    lambda word: evaluate(word, alphabet_ar, alphabet_en, transformer)
    .replace('<', '')
    .replace('>', '')
)

In [None]:
# Calculate word error rate (WER) and character error rate (CER) for each
# (reference, prediction) pair of the test set.
wers = [wer(reference, predicted)
        for reference, predicted in zip(df_test['english'], df_test['model'])]
cers = [cer(reference, predicted)
        for reference, predicted in zip(df_test['english'], df_test['model'])]

df_test['wer'] = wers
df_test['cer'] = cers

# Keep the per-word results for inspection.
df_test.to_excel('test3.xlsx')

# Mean WER on the first line, mean CER on the second.
print(df_test['wer'].mean(), df_test['cer'].mean(), sep='\n')

0.7770404777704047
0.22938223576909683


In [None]:
# TODO
# more data from Quran transliterations
# tri input
# hyperparameter tuning
# include the tashkeel in the input vocab
