<a href="https://colab.research.google.com/github/contentlab-io/ai_language_translation/blob/main/NMT7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


!pip install keras_self_attention

#from translate.storage.tmx import tmxfile
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
import array


import warnings
warnings.filterwarnings("ignore")
import numpy as np
import string
from numpy import array, argmax, random, take
#for processing imported data
import tensorflow as tf
import pandas as pd
#the RNN routines
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector

from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import optimizers

#optional imports if you want to generate statistical graphs of the DMT
#import matplotlib.pyplot as plt
#from keras.utils import plot_model
#import pydot


from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from keras_self_attention import SeqSelfAttention

# function to read raw text file
def read_text(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the entire file.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as handle:
        return handle.read()

		
# split a text into sentences
def to_lines(text):
    """Break tab-separated text into a list of per-line field lists.

    Leading and trailing whitespace is trimmed from the whole text first;
    every remaining line is then split on tab characters.
    """
    return [row.split('\t') for row in text.strip().split('\n')]

### tokenizer ###
def tokenization(lines):
    """Return a new ``Tokenizer`` fitted on the given texts."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

### encode ###
def encode_sequences(tokenizer, length, lines):
    """Convert texts to integer sequences of a fixed length.

    Each text in *lines* is mapped to word indices by *tokenizer*; the
    result is handed to ``pad_sequences`` with ``maxlen=length`` and
    padding applied at the end of each sequence.
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')
### custom split train/test ###


def split1(lines):
    """Deterministically partition *lines* into two lists by position.

    Items whose position is a multiple of 8 go to the first list ("train");
    every other item goes to the second list ("test").

    Args:
        lines: Iterable of items to partition.

    Returns:
        A two-element list ``[train, test]``.
    """
    # NOTE(review): this sends 1/8 of the data to train and 7/8 to test,
    # which looks inverted; the routing is kept as-is -- confirm intent.
    train = []
    test = []
    # enumerate() visits every item; the previous range(0, l-1) silently
    # dropped the last one.
    for position, line in enumerate(lines):
        if position % 8 != 0:
            test.append(line)
        else:
            train.append(line)
    return [train, test]

def split_file(fname1, fname2):
    """Write the first tab-separated column of *fname1* to *fname2*.

    Each input line contributes exactly one output line holding the text
    that precedes its first tab (or the whole line when it has no tab).

    Args:
        fname1: Path of the tab-separated input file.
        fname2: Path of the output file; it is overwritten.
    """
    # Read as UTF-8, the encoding read_text() uses for the same data file.
    with open(fname1, encoding='utf-8') as source:
        # Strip the line terminator before splitting: a line without a tab
        # would otherwise keep its "\n" and produce a spurious blank line.
        first_columns = [line.rstrip('\n').split('\t')[0] for line in source]

    with open(fname2, "w", encoding='utf-8') as target:
        target.writelines(column + "\n" for column in first_columns)

		

def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units,
                 use_attention=1, use_word2vec=1, corpus=None):
    """Assemble the (uncompiled) Sequential translation network.

    Layer stack, in order: input embedding, LSTM, RepeatVector, optional
    SeqSelfAttention, sequence-returning LSTM, softmax Dense.

    Args:
        in_vocab: Input vocabulary size; used only by the plain Embedding.
        out_vocab: Width of the final softmax layer.
        in_timesteps: Input length; used only by the plain Embedding.
        out_timesteps: Repeat count given to RepeatVector.
        units: Size of both LSTM layers and of the plain Embedding output.
        use_attention: 1 inserts a SeqSelfAttention layer; any other value
            leaves it out.
        use_word2vec: 0 selects a plain Embedding; any other value trains
            Word2Vec on *corpus* and uses the embedding layer it provides.
        corpus: Path passed to ``Word2Vec(corpus_file=...)``.

    Returns:
        The assembled ``Sequential`` model.
    """
    network = Sequential()

    if use_word2vec != 0:
        w2v = Word2Vec(corpus_file=corpus, size=50, window=5, min_count=1, workers=4)
        # NOTE(review): this only sets an attribute on the Word2Vec object;
        # nothing visible here shows the embedding layer reading it -- confirm.
        w2v.mask_zero = True
        network.add(w2v.wv.get_keras_embedding(train_embeddings=True))
    else:
        network.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))

    # Collapse the input to one vector, then repeat it per output timestep.
    network.add(LSTM(units))
    network.add(RepeatVector(out_timesteps))

    if use_attention == 1:
        network.add(SeqSelfAttention(attention_activation='sigmoid'))

    # Emit one distribution over the output vocabulary per timestep.
    network.add(LSTM(units, return_sequences=True))
    network.add(Dense(out_vocab, activation='softmax'))
    return network

def get_word(n, tokenizer):
    """Reverse lookup: return the word whose index in *tokenizer* equals *n*.

    ``tokenizer.word_index`` is scanned in order and the first matching
    word is returned; ``None`` is returned when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == n)
    return next(matches, None)

def train_model(path_to_data, path_to_model, use_attention=1, use_word2vec=1):
    """Train the translation model on a tab-separated parallel corpus.

    Column 0 of each line is the source sentence and column 1 the target
    sentence. Both tokenizers are fitted on the full file, 80% of the rows
    are selected with a fixed random seed, and the model is fitted on them
    with a further 20% validation split. The checkpoint callback writes the
    model with the lowest validation loss to *path_to_model*.

    Args:
        path_to_data: Path of the tab-separated corpus file.
        path_to_model: Destination passed to ``ModelCheckpoint``.
        use_attention: Forwarded to ``define_model``.
        use_word2vec: Forwarded to ``define_model``; any non-zero value also
            writes the source column to ``tmp.txt`` as the Word2Vec corpus.

    Returns:
        The history object returned by ``model.fit``.
    """
    # Kept as a function-scope import, as in the original.
    from sklearn.model_selection import train_test_split

    en_ru = array(to_lines(read_text(path_to_data)))

    # Source-side tokenizer (column 0); +1 leaves room for the padding index.
    en_tokenizer = tokenization(en_ru[:, 0])
    en_vocab_size = len(en_tokenizer.word_index) + 1
    en_length = 8

    # Target-side tokenizer (column 1).
    ru_tokenizer = tokenization(en_ru[:, 1])
    ru_vocab_size = len(ru_tokenizer.word_index) + 1
    ru_length = 8

    # The held-out 20% was encoded but never used, so only the training
    # part is encoded now; validation comes from validation_split below.
    train, _held_out = train_test_split(en_ru, test_size=0.2, random_state=12)
    trainX = encode_sequences(en_tokenizer, en_length, train[:, 0])
    trainY = encode_sequences(ru_tokenizer, ru_length, train[:, 1])

    corpus = None
    # Same condition define_model uses to choose the Word2Vec branch;
    # the previous "== 1" test left corpus=None for other non-zero values.
    if use_word2vec != 0:
        corpus = "tmp.txt"
        split_file(path_to_data, corpus)

    model = define_model(en_vocab_size, ru_vocab_size, en_length, ru_length,
                         512, use_attention, use_word2vec, corpus)
    # The learning rate is passed positionally because its keyword was
    # renamed from `lr` to `learning_rate` between Keras releases.
    rms = optimizers.RMSprop(0.001)
    model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

    checkpoint = ModelCheckpoint(path_to_model, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')

    with tf.device('/device:GPU:0'):
        # sparse_categorical_crossentropy takes integer targets; the
        # reshape gives them a trailing axis of size 1.
        history = model.fit(
            trainX,
            trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
            epochs=30,
            batch_size=512,
            validation_split=0.2,
            callbacks=[checkpoint],
            verbose=1,
        )

    return history
            
#end function

def _decode_sequences(sequences, tokenizer):
    """Turn rows of word indices back into space-joined text.

    An index with no word in *tokenizer*, and a word equal to the word at
    the previous position, are both emitted as empty strings, so each row
    keeps one slot per timestep.
    """
    # Build the reverse map once instead of scanning word_index per token.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    decoded = []
    for row in sequences:
        words = [index_to_word.get(int(index)) for index in row]
        kept = []
        for position, word in enumerate(words):
            repeated = position > 0 and word == words[position - 1]
            kept.append('' if word is None or repeated else word)
        decoded.append(' '.join(kept))
    return decoded


def translate(path_to_data, path_to_file, path_to_model):
    """Translate the sentences in *path_to_file* with a saved model.

    The tokenizers are re-fitted on *path_to_data* (column 0 source,
    column 1 target). Column 0 of *path_to_file* is then encoded and fed to
    the model loaded from *path_to_model*, and a table of decoded inputs
    and predictions is printed.

    Args:
        path_to_data: Path of the tab-separated corpus file.
        path_to_file: Path of the file whose first column is translated.
        path_to_model: Path of the saved model to load.
    """
    en_ru = array(to_lines(read_text(path_to_data)))

    # Source-side tokenizer (column 0).
    en_tokenizer = tokenization(en_ru[:, 0])
    en_vocab_size = len(en_tokenizer.word_index) + 1
    en_length = 8
    print('English Vocabulary Size: %d' % en_vocab_size)

    # Target-side tokenizer (column 1).
    ru_tokenizer = tokenization(en_ru[:, 1])
    ru_vocab_size = len(ru_tokenizer.word_index) + 1
    print('ru_vocab_size Vocabulary Size: %d' % ru_vocab_size)

    en_ru2 = array(to_lines(read_text(path_to_file)))
    testX1 = encode_sequences(en_tokenizer, en_length, en_ru2[:, 0])

    # A model saved with the attention layer needs the custom layer class
    # registered on load; the mapping is harmless when the layer is absent.
    model = load_model(path_to_model,
                       custom_objects=SeqSelfAttention.get_custom_objects())

    # Sequential.predict_classes was removed from TensorFlow; taking the
    # argmax over the softmax axis yields the same class indices.
    preds = argmax(model.predict(testX1), axis=-1)

    preds_text = _decode_sequences(preds, ru_tokenizer)
    inputs_text = _decode_sequences(testX1, en_tokenizer)

    pd.set_option("display.max_rows", None, "display.max_columns", None)

    pred_df = pd.DataFrame({'input': inputs_text, 'model translation': preds_text})

    print(pred_df)

#end function



# Train on fra.txt with attention and Word2Vec both switched on, then
# translate the sentences in test.txt using the checkpoint saved as model_fra2.
train_model("fra.txt", "model_fra2", use_attention=1, use_word2vec=1)
translate(path_to_data="fra.txt", path_to_file="test.txt", path_to_model="model_fra2")
