In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
import keras

In [3]:
# load data

data = pd.read_csv('eng_-french.csv')
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [4]:
data.shape

(175621, 2)

In [5]:
data.columns

Index(['English words/sentences', 'French words/sentences'], dtype='object')

In [6]:
len(np.unique(data['English words/sentences']))

123100

In [7]:
'Run ?'.split(' ')

['Run', '?']

In [8]:
def uniqueTokens(data):
    dict_Tokens = {}
    length = []
    for k in data.columns:
        dataList = []
        for b in data[k]:
            for m in b.split(' '):
                length.append(len(b.split(' ')))
                if m.lower() not in dataList:
                    dataList.append(m.lower())
                else:
                    pass
        dict_Tokens[k+ ' Tokens'] = dataList
    return dict_Tokens, max(length)

DictTokens, max_length = uniqueTokens(data)

In [9]:
data.columns

Index(['English words/sentences', 'French words/sentences'], dtype='object')

In [10]:
DictTokens.keys()

dict_keys(['English words/sentences Tokens', 'French words/sentences Tokens'])

In [11]:
len(DictTokens['English words/sentences Tokens'])

25639

In [12]:
len(DictTokens['French words/sentences Tokens'])

46416

In [13]:
max_length

55

In [14]:
x,y = data['English words/sentences'], data['French words/sentences']

In [15]:
# lowercase all letters
x_lower = x.apply(lambda x:x.lower())
y_lower = y.apply(lambda x:x.lower())

In [16]:
import re
x_lower = x_lower.apply(lambda x:re.sub("'",'',x))
y_lower = y_lower.apply(lambda x:re.sub("'",'',x))

In [17]:
import string
exclude = set(string.punctuation) # set of all special characters
#remove all the special characters
x_lower = x_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))
y_lower = y_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))

In [18]:
# remove all numbers from text
digits = string.digits
remove_digits = str.maketrans('','',digits)
x_lower = x_lower.apply(lambda x:x.translate(remove_digits))
y_lower = y_lower.apply(lambda x:x.translate(remove_digits))

In [19]:
# remove extra spaces
x_lower = x_lower.apply(lambda x: x.strip())
y_lower = y_lower.apply(lambda x: x.strip())

In [20]:
# add start and end tokens to target sequences
y_lower = y_lower.apply(lambda x: 'START_ '+x +' _END')

In [21]:
y_lower[:2]

0    START_ salut _END
1    START_ cours _END
Name: French words/sentences, dtype: object

In [24]:
# get english and igbo vocabulary
all_eng_words = set()
for eng in x_lower:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
            
            
all_french_words = set()

for igb in y_lower:
    for word in igb.split():
        if word not in all_french_words:
            all_french_words.add(word)

In [25]:
print(len(all_eng_words))
print(len(all_french_words))

14409
29397


In [27]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_french_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_french_words)
num_encoder_tokens, num_decoder_tokens

(14409, 29397)

In [28]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

reverse_input_char_index = dict((i,word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i,word) for word, i in target_token_index.items())


In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_lower, y_lower, test_size = 0.2)
x_train.shape, x_test.shape

((140496,), (35125,))

In [33]:
max_length_src = 55
max_length_tar = 55
def generate_batch(X = x_train, y = y_train, batch_size = 100):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype = 'float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype = 'float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype = 'float32')
            
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i,t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i,t] = target_token_index[word] # decoder input seq
                        
                    if t>0:
                        # decoder target sequence (one hot decoder)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i,t-1, target_token_index[word]] =1
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [32]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

latent_dim = 22

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [42]:
# Run training
batch_size = 100
epochs = 30
input_d= generate_batch(x_train, y_train)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit(input_d,
          batch_size=batch_size,
          epochs=epochs)

Epoch 1/30


ValueError: in user code:

    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/network.py:717 call
        return self._run_internal_graph(
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/network.py:888 _run_internal_graph
        output_tensors = layer(computed_tensors, **kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:654 __call__
        return super(RNN, self).__call__(inputs, **kwargs)
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:885 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    /home/odemakinde/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:176 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer lstm is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, None]
