In [2]:
import pandas as pd
import numpy as np

In [3]:
import tensorflow as tf
import keras

In [4]:
# load data

data = pd.read_csv('eng_-french.csv')
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [5]:
data.shape

(175621, 2)

In [6]:
data.columns

Index(['English words/sentences', 'French words/sentences'], dtype='object')

In [7]:
len(np.unique(data['English words/sentences']))

123100

In [8]:
'Run ?'.split(' ')

['Run', '?']

In [9]:
def uniqueTokens(data):
    dict_Tokens = {}
    length = []
    for k in data.columns:
        dataList = []
        for b in data[k]:
            for m in b.split(' '):
                length.append(len(b.split(' ')))
                if m.lower() not in dataList:
                    dataList.append(m.lower())
                else:
                    pass
        dict_Tokens[k+ ' Tokens'] = dataList
    return dict_Tokens, max(length)

DictTokens, max_length = uniqueTokens(data)

In [10]:
data.columns

Index(['English words/sentences', 'French words/sentences'], dtype='object')

In [11]:
DictTokens.keys()

dict_keys(['English words/sentences Tokens', 'French words/sentences Tokens'])

In [12]:
len(DictTokens['English words/sentences Tokens'])

25639

In [13]:
len(DictTokens['French words/sentences Tokens'])

46416

In [14]:
max_length

55

In [15]:
x,y = data['English words/sentences'], data['French words/sentences']

In [16]:
# lowercase all letters
x_lower = x.apply(lambda x:x.lower())
y_lower = y.apply(lambda x:x.lower())

In [17]:
import re
x_lower = x_lower.apply(lambda x:re.sub("'",'',x))
y_lower = y_lower.apply(lambda x:re.sub("'",'',x))

In [18]:
import string
exclude = set(string.punctuation) # set of all special characters
#remove all the special characters
x_lower = x_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))
y_lower = y_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))

In [19]:
# remove all numbers from text
digits = string.digits
remove_digits = str.maketrans('','',digits)
x_lower = x_lower.apply(lambda x:x.translate(remove_digits))
y_lower = y_lower.apply(lambda x:x.translate(remove_digits))

In [20]:
# remove extra spaces
x_lower = x_lower.apply(lambda x: x.strip())
y_lower = y_lower.apply(lambda x: x.strip())

In [21]:
# add start and end tokens to target sequences
y_lower = y_lower.apply(lambda x: 'START_ '+x +' _END')

In [22]:
y_lower[:2]

0    START_ salut _END
1    START_ cours _END
Name: French words/sentences, dtype: object

In [23]:
# get english and igbo vocabulary
all_eng_words = set()
for eng in x_lower:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
            
            
all_french_words = set()

for igb in y_lower:
    for word in igb.split():
        if word not in all_french_words:
            all_french_words.add(word)

In [24]:
print(len(all_eng_words))
print(len(all_french_words))

14409
29397


In [25]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_french_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_french_words)
num_encoder_tokens, num_decoder_tokens

(14409, 29397)

In [26]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

reverse_input_char_index = dict((i,word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i,word) for word, i in target_token_index.items())


In [27]:
x_lower[:5]

0     hi
1    run
2    run
3    who
4    wow
Name: English words/sentences, dtype: object

In [28]:
'  elaiel'.strip(' ')

'elaiel'

In [29]:
y_lower[0].split('_')[1].strip(' ')

'salut'

In [30]:
def EncodeAndPadX(data, dictInfo, padsize):
    OutputData = []
    for k in data:
        dummyData = []
        for m in k.split(' '):
            try:
                dummyData.append(dictInfo[m])
            except KeyError:
                pass
        LenDummyData= len(dummyData)
        padLength = padsize - LenDummyData
        pad = [dummyData.append(0) for i in range(padLength)]
        OutputData.append(np.array(dummyData))
    return np.array(OutputData)

def EncodeAndPadY(data, dictInfo, padsize):
    OutputData = []
    for k in data:
        dummyData = []
        for m in k.split('_'):
            try:
                dummyData.append(dictInfo[m.strip(' ')])
            except KeyError:
                pass
        LenDummyData= len(dummyData)
        padLength = padsize - LenDummyData
        pad = [dummyData.append(0) for i in range(padLength)]
        OutputData.append(np.array(dummyData))
    return np.array(OutputData)

xLower = EncodeAndPadX(x_lower, input_token_index, 55)
yLower = EncodeAndPadY(y_lower, target_token_index, 55)

In [31]:
yLower.shape

(175621, 55)

In [32]:
yLower[:3]

array([[23981,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [ 5403,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [ 5384,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   

In [33]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(xLower, yLower, test_size = 0.2)
x_train.shape, x_test.shape

((140496, 55), (35125, 55))

In [34]:
x_train[:3]

array([[12774,  6206, 13907, 13717,  8305, 12780,  7150,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [14105,   410,  6330,  8483, 12400,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [12774, 13907, 13045,  8249,  4752,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   

In [35]:
xTrain = tf.convert_to_tensor(x_train)
xTest = tf.convert_to_tensor(x_test)
yTrain = tf.convert_to_tensor(y_train)
yTest = tf.convert_to_tensor(y_test)

In [36]:
xTrain

<tf.Tensor: shape=(140496, 55), dtype=int64, numpy=
array([[12774,  6206, 13907, ...,     0,     0,     0],
       [14105,   410,  6330, ...,     0,     0,     0],
       [12774, 13907, 13045, ...,     0,     0,     0],
       ...,
       [11251,  1838, 12964, ...,     0,     0,     0],
       [ 6330,   610, 14360, ...,     0,     0,     0],
       [ 6330,  5905, 12780, ...,     0,     0,     0]])>

In [37]:
x_train[:1]

array([[12774,  6206, 13907, 13717,  8305, 12780,  7150,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]])

In [38]:
# max_length_src = 55
# max_length_tar = 55
# def generate_batch(X = x_train, y = y_train, batch_size = 100):
#     while True:
#         for j in range(0, len(X), batch_size):
#             encoder_input_data = np.zeros((batch_size, max_length_src), dtype = 'float32')
#             decoder_input_data = np.zeros((batch_size, max_length_tar), dtype = 'float32')
#             decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype = 'float32')
            
#             for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
#                 for t, word in enumerate(input_text.split()):
#                     encoder_input_data[i,t] = input_token_index[word] # encoder input seq
#                 for t, word in enumerate(target_text.split()):
#                     if t<len(target_text.split())-1:
#                         decoder_input_data[i,t] = target_token_index[word] # decoder input seq
                        
#                     if t>0:
#                         # decoder target sequence (one hot decoder)
#                         # does not include the START_ token
#                         # Offset by one timestep
#                         decoder_target_data[i,t-1, target_token_index[word]] =1
#             yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [39]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.layers import Input, LSTM, Dense,TimeDistributed
# define the model



In [40]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

latent_dim = 256

# Define an input sequence and process it.
encoder_inputs = Input(shape=(55,))
encoder = Embedding(14409, latent_dim, mask_zero=True,input_length =55)(encoder_inputs)
encoder_outputs, state_h, state_c= LSTM(latent_dim, return_sequences=True,return_state=True)(encoder)

print('encoder', encoder_outputs)
print('state_h',state_h)
print('state_c', state_c)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]


# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(55,))
decoder = Embedding(29397, latent_dim, mask_zero=True,input_length =55)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder,initial_state=encoder_states)

print('decoder', decoder_outputs)

decoder_dense = Dense(num_decoder_tokens, activation='softmax')(decoder_outputs)


print('decoder', decoder_outputs)


# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model(Input =[encoder_inputs, decoder_inputs], Output = [decoder_dense])

encoder Tensor("lstm/Identity:0", shape=(None, 55, 256), dtype=float32)
state_h Tensor("lstm/Identity_1:0", shape=(None, 256), dtype=float32)
state_c Tensor("lstm/Identity_2:0", shape=(None, 256), dtype=float32)
decoder Tensor("lstm_1/Identity:0", shape=(None, 256), dtype=float32)
decoder Tensor("lstm_1/Identity:0", shape=(None, 256), dtype=float32)


TypeError: ('Keyword argument not understood:', 'Input')

In [None]:
model.summary()

In [140]:
yTrain.shape

TensorShape([140496, 55])

In [137]:
# Run training
batch_size = 100
epochs = 30
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit(x[xTrain, yTrain],[yTrain],
          batch_size=batch_size,
          epochs=epochs)

Epoch 1/30


AssertionError: in user code:

    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/network.py:717 call
        return self._run_internal_graph(
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/network.py:899 _run_internal_graph
        assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x)

    AssertionError: Could not compute output Tensor("dense_14/Identity:0", shape=(None, 29397), dtype=float32)
