<a href="https://colab.research.google.com/github/deliciousushi/nullClass_training/blob/main/Untitled7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import collections
import numpy as np
import json
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [3]:
def load_data(path):
  input_file = path
  with open(input_file, "r") as f:
    data = f.read()
  return data.split('\n')

english_sentence = load_data('/content/small_vocab_en.csv')
french_sentence = load_data('/content/small_vocab_fr.csv')

In [4]:
english_sentence[1]
print(french_sentence[1])

les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [5]:
english_word_counter = collections.Counter([word for sentence in english_sentence for word in sentence.split()])
french_word_counter = collections.Counter([word for sentence in french_sentence for word in sentence.split()])

print('{} English_words.'.format(len([word for sentence in english_sentence for word in sentence.split()])))
print('{} unique english words '.format(len(english_word_counter)))
print('10 most commomn words:')
print('"' + '" "'.join(list(zip(*english_word_counter.most_common(10)))[0]) + '"')
print()

print('{} French_words.'.format(len([word for sentence in french_sentence for word in sentence.split()])))
print("{} unique French words.".format(len(french_word_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_word_counter.most_common (10)))[0]) + '"')


1823250 English_words.
227 unique english words 
10 most commomn words:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French_words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


In [6]:
english_word_counter = collections.Counter([word for sentence in english_sentence for word in sentence.split()])
french_word_counter = collections.Counter([word for sentence in french_sentence for word in sentence.split()])

# Corrected print statements
print('{} English words'.format(len([word for sentence in english_sentence for word in sentence.split()])))
print('{} Unique English words'.format(len(english_word_counter)))
print('10 most common English words:', english_word_counter.most_common(10))

print('{} French words'.format(len([word for sentence in french_sentence for word in sentence.split()])))
print('{} Unique French words'.format(len(french_word_counter)))
print('10 most common French words:', french_word_counter.most_common(10))

1823250 English words
227 Unique English words
10 most common English words: [('is', 205858), (',', 140897), ('.', 129039), ('in', 75525), ('it', 75137), ('during', 74933), ('the', 67628), ('but', 63987), ('and', 59850), ('sometimes', 37746)]
1961295 French words
355 Unique French words
10 most common French words: [('est', 196809), ('.', 135619), (',', 123135), ('en', 105768), ('il', 84079), ('les', 65255), ('mais', 63987), ('et', 59851), ('la', 49861), ('parfois', 37746)]


In [7]:
def tokenize(x):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(x)
  return tokenizer.texts_to_sequences(x), tokenizer

text_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "By Jove, my quick study of lexicography won a prize.",
    "This is a short sentence."]
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)

print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print(' Sequence {} in x'.format(sample_i + 1))
    print(' Input: {}'.format(sent))
    print(' Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

 Sequence 1 in x
 Input: The quick brown fox jumps over the lazy dog.
 Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
 Sequence 2 in x
 Input: By Jove, my quick study of lexicography won a prize.
 Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
 Sequence 3 in x
 Input: This is a short sentence.
 Output: [18, 19, 3, 20, 21]


In [8]:
def pad(x, length=None):
  if length is None:
    length = max([len(sentence) for sentence in x])
  return pad_sequences (x, maxlen=length, padding='post')

test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
  print('Sequence {} in x'.format(sample_i + 1))
  print(' Input: {}'.format(np.array(token_sent)))
  print(' Output: {}'.format(pad_sent))

Sequence 1 in x
 Input: [1 2 4 5 6 7 1 8 9]
 Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
 Input: [10 11 12  2 13 14 15 16  3 17]
 Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
 Input: [18 19  3 20 21]
 Output: [18 19  3 20 21  0  0  0  0  0]


In [9]:
def preprocess (x,y):
  preprocess_x, x_tk = tokenize(x)
  preprocess_y, y_tk = tokenize(y)
  preprocess_X = pad(preprocess_x)
  preprocess_y = pad(preprocess_y)
  preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

  return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentence, french_sentence)
#preproc_english_sentences = np.array(preproc_english_sentences)
#preproc_french_sentences = np.array(preproc_french_sentences)



In [10]:
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

In [11]:
max_english_sequence_length = max(len(seq) for seq in preproc_english_sentences)
max_french_sequence_length = max(len(seq) for seq in preproc_french_sentences)

# Pad sequences
preproc_english_sentences = pad(preproc_english_sentences, length=max_english_sequence_length)
preproc_french_sentences = pad(preproc_french_sentences, length=max_french_sequence_length)

# Convert lists to NumPy arrays
preproc_english_sentences = np.array(preproc_english_sentences)
preproc_french_sentences = np.array(preproc_french_sentences)

english_vocab_size = len(english_tokenizer.word_index) + 1  # Add 1 for padding token
french_vocab_size = len(french_tokenizer.word_index) + 1  # Add 1 for padding token

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 200
French vocabulary size: 345


In [12]:
def logits_to_text (logits, tokenizer):
  index_to_words = {id: word for word, id in tokenizer.word_index.items()}
  index_to_words[0] = '<PAD>'

  return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

**SIMPLE MODEL**

In [13]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  learning_rate = 0.005
  model = Sequential()

  model.add(GRU(256, input_shape = input_shape[1:], return_sequences = True))
  model.add(TimeDistributed(Dense(1024, activation='relu')))
  model.add(Dropout(0.5))
  model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

  model.compile(loss = sparse_categorical_crossentropy,
                optimizer = Adam(learning_rate),
                metrics = ['accuracy'])
  return model

tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1834, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f96ee778d30>

In [14]:
print("Prediciton:")
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentence[:1])

print("\n original text:")
print(english_sentence[:1])

Prediciton:
new jersey est parfois calme en mois de il et il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

 original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [17]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  learning_rate = 0.005
  model = Sequential()
  model.add(Bidirectional (GRU(128, return_sequences=True), input_shape=input_shape[1:]))
  model.add(TimeDistributed(Dense(1024, activation='relu')))
  model.add(Dropout(0.5))
  model.add(TimeDistributed(Dense(french_vocab_size, activation="softmax")))

  model.compile(loss = sparse_categorical_crossentropy,
                optimizer = Adam(learning_rate),
                metrics = ['accuracy'])
  return model

tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

bd_rnn_model = bd_model(
    tmp_x.shape, max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

print(bd_rnn_model.summary())
bd_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 21, 256)           100608    
 al)                                                             
                                                                 
 time_distributed_2 (TimeDi  (None, 21, 1024)          263168    
 stributed)                                                      
                                                                 
 dropout_1 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_3 (TimeDi  (None, 21, 345)           353625    
 stributed)                                                      
                                                                 
Total params: 717401 (2.74 MB)
Trainable params: 717401 (2.74 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

<keras.src.callbacks.History at 0x7f96e8981720>

In [18]:
print("Prediciton:")
print(logits_to_text(bd_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentence[:1])

print("\n original text:")
print(english_sentence[:1])

Prediciton:
new jersey est parfois calme en mois et il il il neigeux en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

 original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [19]:
def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  learning_rate = 0.005
  model = Sequential()
  model.add(Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
  model.add(Bidirectional (GRU(256, return_sequences=True)))
  model.add(TimeDistributed(Dense(1024, activation='relu')))
  model.add(Dropout(0.5))
  model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

  model.compile(loss=sparse_categorical_crossentropy,
                optimizer=Adam(learning_rate),
                metrics=['accuracy'])

  return model

tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))

embed_rnn_model = bidirectional_embed_model( tmp_x.shape, max_french_sequence_length, english_vocab_size, french_vocab_size)
print(embed_rnn_model.summary())
embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 21, 256)           51200     
                                                                 
 bidirectional_1 (Bidirecti  (None, 21, 512)           789504    
 onal)                                                           
                                                                 
 time_distributed_4 (TimeDi  (None, 21, 1024)          525312    
 stributed)                                                      
                                                                 
 dropout_2 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_5 (TimeDi  (None, 21, 345)           353625    
 stributed)                                                      
                                                      

<keras.src.callbacks.History at 0x7f96ea8f3a00>

In [20]:
print("Prediciton:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentence[:1])

print("\n original text:")
print(english_sentence[:1])

Prediciton:
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

 original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [21]:
embed_rnn_model.save('english_to_french_model')

#serialize English Tokenizer to JSON
with open('english_tokenizer.json', 'w', encoding='utf8') as f:
      f.write(json.dumps(english_tokenizer.to_json(), ensure_ascii=False))

#Serialize French Tokenizer to JSON
with open('french_tokenizer.json', 'w', encoding='utf8') as f:
      f.write(json.dumps(french_tokenizer.to_json(), ensure_ascii=False))

#Save max Lengths
max_french_sequence_length_json = max_french_sequence_length
with open('sequence_length.json', 'w', encoding='utf8') as f:
      f.write(json.dumps(max_french_sequence_length_json, ensure_ascii=False))