In [1]:
import numpy

In [6]:
import collections
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences   
from keras.models import Sequential , Model
from keras.layers import Dense, Input,Embedding ,LSTM, Bidirectional, Dropout, TimeDistributed ,GRU , Activation , RepeatVector 
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy



In [7]:
# Diagnostic cell: print the compute devices TensorFlow can see in this kernel.
# NOTE(review): `tensorflow.python.*` does not appear to be part of TensorFlow's
# documented public API; `tf.config.list_physical_devices()` looks like the
# supported equivalent — confirm before relying on this import long-term.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4276047656908852225
xla_global_id: -1
]


In [13]:
def load_data(path):
    """Read a text file and return its content as a list of lines.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: The file content split on '\n'. A file that ends with a
        newline therefore yields a trailing empty string, exactly as before.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8. Without `encoding`, open() falls back to the
    # platform's locale encoding, which can garble or reject accented text
    # (e.g. the French corpus) on systems whose default is not UTF-8.
    # Assumes the corpus files are UTF-8 encoded — TODO confirm.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().split('\n')
# Load both corpora as lists of sentence strings (one entry per line of each file).
# Paths are relative to the notebook's working directory.
# presumably the two files are line-aligned (entry i of one translates entry i
# of the other) — verify against the data source.
english_sentences = load_data('data/english')
french_sentences = load_data('data/french')

In [14]:
# Preview the first five English sentences as a sanity check on the load.
english_sentences[:5]

['new jersey is sometimes quiet during autumn , and it is snowy in april .',
 'the united states is usually chilly during july , and it is usually freezing in november .',
 'california is usually quiet during march , and it is usually hot in june .',
 'the united states is sometimes mild during june , and it is cold in september .',
 'your least liked fruit is the grape , but my least liked is the apple .']

In [15]:
# Count whitespace-separated tokens per language. Punctuation marks that are
# surrounded by spaces (e.g. ',' and '.') are counted as words here.
english_word_count = collections.Counter(
    word
    for words in map(str.split, english_sentences)
    for word in words
)
french_word_count = collections.Counter(
    word
    for words in map(str.split, french_sentences)
    for word in words
)
print(f"Unique English words: {len(english_word_count)}")
print(f"Unique French words: {len(french_word_count)}")

Unique English words: 227
Unique French words: 355


In [16]:
def tokenizer(x):
    """Fit a fresh Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: The texts to fit on and convert (lists of sentence strings in
            this notebook).

    Returns:
        tuple: ``(sequences, fitted_tokenizer)`` where ``sequences`` is the
        result of ``texts_to_sequences(x)`` and ``fitted_tokenizer`` is the
        ``Tokenizer`` instance that produced it.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(x)
    sequences = fitted_tokenizer.texts_to_sequences(x)
    return sequences, fitted_tokenizer

# Small hand-written sample used to sanity-check tokenizer() before running
# it on the real corpora.
text_sentences = [
    'I love programming',
    'Python is great for data science',
    'Machine learning is fascinating',
    'Deep learning is a subset of machine learning',
    'Natural language processing is a complex field'
]

# text_tokenized is reused by the padding demo further down; text_tokenizer
# holds the Tokenizer fitted on the sample sentences.
text_tokenized, text_tokenizer = tokenizer(text_sentences)
print("Tokenized Sequences:", text_tokenized)


Tokenized Sequences: [[5, 6, 7], [8, 1, 9, 10, 11, 12], [3, 2, 1, 13], [14, 2, 1, 4, 15, 16, 3, 2], [17, 18, 19, 1, 4, 20, 21]]


In [18]:
def pad(x, length=None):
    """Pad every sequence in ``x`` to a common length via ``pad_sequences``.

    Args:
        x: A collection of sequences (lists of token ids in this notebook).
        length: Target length passed as ``maxlen``. When ``None``, the length
            of the longest sequence in ``x`` is used.

    Returns:
        The result of ``pad_sequences(x, maxlen=..., padding='post')``; with
        ``padding='post'`` the fill values are appended after each sequence.
    """
    target_length = max(map(len, x)) if length is None else length
    return pad_sequences(x, maxlen=target_length, padding='post')
# Pad the sample token sequences to the length of the longest one and show
# the resulting array.
text_pad = pad(text_tokenized)
print("Padded Sequences:\n", text_pad)

Padded Sequences:
 [[ 5  6  7  0  0  0  0  0]
 [ 8  1  9 10 11 12  0  0]
 [ 3  2  1 13  0  0  0  0]
 [14  2  1  4 15 16  3  2]
 [17 18 19  1  4 20 21  0]]


In [20]:
def preprocess(x,y):
    """Tokenize and pad the source texts ``x`` and the target texts ``y``.

    Each side gets its own tokenizer (via ``tokenizer``) and is padded
    independently (via ``pad``) to that side's longest sequence.

    Args:
        x: Source texts.
        y: Target texts.

    Returns:
        tuple: ``(padded_x, padded_y, x_tk, y_tk)`` where ``padded_y`` has an
        extra trailing axis of size 1 and ``x_tk`` / ``y_tk`` are the fitted
        tokenizers for each side.
    """
    x_ids, x_tk = tokenizer(x)
    y_ids, y_tk = tokenizer(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Append a trailing axis of size 1 to the labels.
    # presumably required by the sparse categorical cross-entropy loss imported
    # at the top of the notebook — confirm against the model cells.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk

# Run the full preprocessing pipeline on both corpora; the fitted tokenizers
# are kept so that ids can be mapped back to words later.
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)

# Axis 1 of the padded arrays is the (padded) sequence length.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
# NOTE(review): these sizes count only the entries of word_index. The demo
# output above shows token ids starting at 1 with 0 used as padding, so layers
# sized by vocabulary (Embedding input_dim, softmax units) presumably need
# `vocab_size + 1` — verify in the model cells.
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

# These vocabulary sizes can differ from the raw whitespace-token counts
# printed earlier — presumably because Tokenizer normalizes case and strips
# punctuation; confirm against the Tokenizer defaults.
print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)
    

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 345
