<a href="https://colab.research.google.com/github/elishatofunmi/Machine-Translation/blob/master/language_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
import keras

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os, sys
os.chdir('/content/drive/My Drive/machine translation/')

In [5]:
# load data

data = pd.read_csv('eng_-french.csv')
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [6]:
data.shape

(175621, 2)

In [7]:
data.columns

Index(['English words/sentences', 'French words/sentences'], dtype='object')

In [8]:
len(np.unique(data['English words/sentences']))

123100

In [9]:
'Run ?'.split(' ')

['Run', '?']

In [10]:
def uniqueTokens(data):
    dict_Tokens = {}
    length = []
    for k in data.columns:
        dataList = []
        for b in data[k]:
            for m in b.split(' '):
                length.append(len(b.split(' ')))
                if m.lower() not in dataList:
                    dataList.append(m.lower())
                else:
                    pass
        dict_Tokens[k+ ' Tokens'] = dataList
    return dict_Tokens, max(length)

DictTokens, max_length = uniqueTokens(data)

In [11]:
data.columns

Index(['English words/sentences', 'French words/sentences'], dtype='object')

In [12]:
DictTokens.keys()

dict_keys(['English words/sentences Tokens', 'French words/sentences Tokens'])

In [13]:
len(DictTokens['English words/sentences Tokens'])

25639

In [14]:
len(DictTokens['French words/sentences Tokens'])

46416

In [15]:
max_length

55

In [16]:
x,y = data['English words/sentences'], data['French words/sentences']

In [17]:
# lowercase all letters
x_lower = x.apply(lambda x:x.lower())
y_lower = y.apply(lambda x:x.lower())

In [18]:
import re
x_lower = x_lower.apply(lambda x:re.sub("'",'',x))
y_lower = y_lower.apply(lambda x:re.sub("'",'',x))

In [19]:
import string
exclude = set(string.punctuation) # set of all special characters
#remove all the special characters
x_lower = x_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))
y_lower = y_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))

In [20]:
# remove all numbers from text
digits = string.digits
remove_digits = str.maketrans('','',digits)
x_lower = x_lower.apply(lambda x:x.translate(remove_digits))
y_lower = y_lower.apply(lambda x:x.translate(remove_digits))

In [21]:
# remove extra spaces
x_lower = x_lower.apply(lambda x: x.strip())
y_lower = y_lower.apply(lambda x: x.strip())

In [22]:
# add start and end tokens to target sequences
y_lower = y_lower.apply(lambda x: 'START_ '+x +' _END')

In [23]:
y_lower[:2]

0    START_ salut _END
1    START_ cours _END
Name: French words/sentences, dtype: object

In [24]:
# get english and igbo vocabulary
all_eng_words = set()
for eng in x_lower:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
            
            
all_french_words = set()

for igb in y_lower:
    for word in igb.split():
        if word not in all_french_words:
            all_french_words.add(word)

In [25]:
print(len(all_eng_words))
print(len(all_french_words))

14409
29397


In [26]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_french_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_french_words)
num_encoder_tokens, num_decoder_tokens

(14409, 29397)

In [27]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

reverse_input_char_index = dict((i,word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i,word) for word, i in target_token_index.items())


In [28]:
x_lower[:5]

0     hi
1    run
2    run
3    who
4    wow
Name: English words/sentences, dtype: object

In [29]:
'  elaiel'.strip(' ')

'elaiel'

In [30]:
y_lower[0].split('_')[1].strip(' ')

'salut'

In [31]:
y_lower[:4]

0     START_ salut _END
1     START_ cours _END
2    START_ courez _END
3       START_ qui _END
Name: French words/sentences, dtype: object

In [32]:
def EncodeAndPadX(data, dictInfo, padsize):
    OutputData = []
    for k in data:
        dummyData = []
        for m in k.split(' '):
            try:
                dummyData.append(dictInfo[m])
            except KeyError:
                pass
        LenDummyData= len(dummyData)
        padLength = padsize - LenDummyData
        pad = [dummyData.append(0) for i in range(padLength)]
        OutputData.append(np.array(dummyData))
    return np.array(OutputData)

def EncodeAndPadY(data, dictInfo, padsize):
    OutputData = []
    for k in data:
        dummyData = []
        for m in k.split('_'):
            try:
                dummyData.append(dictInfo[m.strip(' ')])
            except KeyError:
                pass
        LenDummyData= len(dummyData)
        padLength = padsize - LenDummyData
        pad = [dummyData.append(0) for i in range(padLength)]
        OutputData.append(np.array(dummyData))
    return np.array(OutputData)

xLower = EncodeAndPadX(x_lower, input_token_index, 55)
yLower = EncodeAndPadY(y_lower, target_token_index, 55)

In [33]:
xLower.shape

(175621, 55)

In [34]:
[i for i in range(10,10)]

[]

In [52]:
def miniBatch(xData, batches):
  DataX = []
  remainder = (len(xData)%batches) * -1
  n_batches = int((len(xData)//batches))
  start = 0
  stop = batches 
  for i in range(n_batches):
    DataX.append(xData[start:stop])
    start = stop
    stop += batches

  DataX.append(xData[remainder:])
  return DataX

def batchData(xLower, yLower, batch = 10):
  xbatch = miniBatch(xLower, batch)
  ybatch = miniBatch(yLower, batch)
  return zip(np.array(xbatch), np.array(ybatch))


x_trial = [[1,2,3],
           [2,3,4],
           [3,4,5],
           [3,4,5],
           [3,4,5]]

y_trial = [[1,2,3],
           [2,3,4],
           [3,4,5],
           [3,4,5],
           [3,4,5]]

mx = batchData(x_trial, y_trial, batch = 2)

In [53]:
for i, j in mx:
  print(np.array(i), np.array(j))

[[1 2 3]
 [2 3 4]] [[1 2 3]
 [2 3 4]]
[[3 4 5]
 [3 4 5]] [[3 4 5]
 [3 4 5]]
[[3 4 5]] [[3 4 5]]


In [54]:
max_encoder_seq_length = 55
max_decoder_seq_length = 55
num_encoder_tokens = 14409

def enc_dec_input_target(x_input, max_encoder_seq_length, max_decoder_seq_length): #create a dummy array of zeros for both input and target
  encoder_input_data = np.zeros(
      (len(x_input), max_encoder_seq_length, num_encoder_tokens),
      dtype='float32')

  decoder_input_data = np.zeros(
      (len(x_input), max_decoder_seq_length, num_decoder_tokens),
      dtype='float32')
  decoder_target_data = np.zeros(
      (len(x_input), max_decoder_seq_length, num_decoder_tokens),
      dtype='float32')
  
  return encoder_input_data, decoder_input_data, decoder_target_data



def TransformEncDecInputTarget(input_texts, target_texts, max_encoder_seq_length, max_decoder_seq_length): # Encode non_zero x_input element with 1's.
  encoder_input_data, decoder_input_data, decoder_target_data = enc_dec_input_target(input_texts,max_encoder_seq_length, max_decoder_seq_length)
  for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
    decoder_target_data[i, t:, target_token_index[' ']] = 1.

  return encoder_input_data, decoder_input_data, decoder_target_data

In [55]:
#define model architecture
from keras.models import Input, Model
from keras.layers import LSTM, Dense

max_encoder_seq_length = 55
max_decoder_seq_length = 55
num_encoder_tokens = 14409
latent_dim = 256
num_decoder_tokens = 29397
batch_size = 200
n_epochs = 20


# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

os.mkdir('train_ckpt')
os.chdir('./train_ckpt')
checkpoint_path = "train_ckpt/cp.ckpt"

In [56]:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq=20)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                metrics=['accuracy'])

# Run training in batches
for x in next(batchData(xLower, yLower, batch = 1000)):
  xout, yout, zout = TransformEncDecInputTarget(x[0], x[1], max_encoder_seq_length, max_decoder_seq_length)
  model.fit([xout, yout], zout,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=0.2, callbacks=[cp_callback])
  model.load_weights(checkpoint_path)
# Save model
model.save('sdata.h5')

TypeError: ignored

In [None]:

encoder_input_data = np.zeros(
    (len(xLower), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')


decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')


In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(xLower, yLower, test_size = 0.2)
x_train.shape, x_test.shape

((140496, 55), (35125, 55))

In [None]:
xTrain = tf.convert_to_tensor(x_train)
xTest = tf.convert_to_tensor(x_test)
yTrain = tf.convert_to_tensor(y_train)
yTest = tf.convert_to_tensor(y_test)

In [None]:
xTrain

<tf.Tensor: shape=(140496, 55), dtype=int64, numpy=
array([[ 6330, 14179,  6330, ...,     0,     0,     0],
       [14360,  8322, 12964, ...,     0,     0,     0],
       [14360,  7089,  6330, ...,     0,     0,     0],
       ...,
       [14041, 14360,   356, ...,     0,     0,     0],
       [ 6330,  3841,  7104, ...,     0,     0,     0],
       [12780,  4917, 13010, ...,     0,     0,     0]])>

In [None]:
x_train.shape, y_train.shape, y_test.shape, x_test.shape

((140496, 55), (140496, 55), (35125, 55), (35125, 55))

In [None]:
def manEncode(f, vocab_size):
  data = []
  for seq in f:

    f_zero = np.zeros((len(seq),vocab_size))

    for i,seq2 in enumerate(seq):
      f_zero[i,seq2] = 1

    data.append(f_zero)
  return data

yTrainEncode = manEncode(y_train, 29397)
yTestEncode = manEncode(y_test, 29397)


In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

num_encoder_tokens = 14409
latent_dim = 224
num_decoder_tokens = 29397

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

In [None]:
np.array(data).shape

(2, 3, 20)

In [None]:
f[1,0]=2

In [None]:
f

array([[0., 0., 0.],
       [2., 0., 0.]])

In [None]:
# max_length_src = 55
# max_length_tar = 55
# def generate_batch(X = x_train, y = y_train, batch_size = 100):
#     while True:
#         for j in range(0, len(X), batch_size):
#             encoder_input_data = np.zeros((batch_size, max_length_src), dtype = 'float32')
#             decoder_input_data = np.zeros((batch_size, max_length_tar), dtype = 'float32')
#             decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype = 'float32')
            
#             for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
#                 for t, word in enumerate(input_text.split()):
#                     encoder_input_data[i,t] = input_token_index[word] # encoder input seq
#                 for t, word in enumerate(target_text.split()):
#                     if t<len(target_text.split())-1:
#                         decoder_input_data[i,t] = target_token_index[word] # decoder input seq
                        
#                     if t>0:
#                         # decoder target sequence (one hot decoder)
#                         # does not include the START_ token
#                         # Offset by one timestep
#                         decoder_target_data[i,t-1, target_token_index[word]] =1
#             yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [None]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.layers import Input, LSTM, Dense,TimeDistributed
# define the model



In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

latent_dim = 256

# Define an input sequence and process it.
encoder_inputs = Input(shape=(55,))
encoder = Embedding(14409, latent_dim, mask_zero=True,input_length =55)(encoder_inputs)
encoder_outputs, state_h, state_c= LSTM(latent_dim, return_sequences=True,return_state=True)(encoder)

print('encoder', encoder_outputs)
print('state_h',state_h)
print('state_c', state_c)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]


# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(55,))
decoder = Embedding(29397, latent_dim, mask_zero=True,input_length =55)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_state=True,return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder,initial_state=encoder_states)

print('decoder', decoder_outputs)

decoder_dense = TimeDistributed(Dense(num_decoder_tokens, activation='softmax'))(decoder_outputs)


print('decoder', decoder_dense)


# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model(inputs =[encoder_inputs, decoder_inputs], outputs = [decoder_dense])

encoder Tensor("lstm/PartitionedCall:1", shape=(None, 55, 256), dtype=float32)
state_h Tensor("lstm/PartitionedCall:2", shape=(None, 256), dtype=float32)
state_c Tensor("lstm/PartitionedCall:3", shape=(None, 256), dtype=float32)
decoder Tensor("lstm_1/PartitionedCall:1", shape=(None, 55, 256), dtype=float32)
decoder Tensor("time_distributed/Reshape_1:0", shape=(None, 55, 29397), dtype=float32)


In [None]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 55)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 55)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 55, 256)      3688704     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 55, 256)      7525632     input_2[0][0]                    
_______________________________________________________________________________________

In [None]:
yTrainTarget = np.eye(29397)[yTrain]

In [None]:
yTrainTarget.shape

In [None]:
model([xTrain,yTrain])

In [None]:
# Run training
batch_size = 100
epochs = 30
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit(x=[xTrain, yTrain],y=[yTrain],
          batch_size=batch_size,
          epochs=epochs)

Epoch 1/30


ValueError: ignored