# Importing dependencies

## Module importing

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Reading dataset

In [39]:
#read text files
def read_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file]
    return sentences
tamil_sentences = read_sentences('train.txt')
english_sentences = read_sentences('trainen.txt')

# Pre-processing

In [40]:
# Tokenize Tamil sentences
tamil_tokenizer = Tokenizer()
tamil_tokenizer.fit_on_texts(tamil_sentences)
tamil_sequences = tamil_tokenizer.texts_to_sequences(tamil_sentences)

# Tokenize English sentences
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)

# Pad sequences to ensure uniform length
max_length = max(len(seq) for seq in tamil_sequences + english_sequences)
tamil_padded = pad_sequences(tamil_sequences, maxlen=max_length, padding='post')
english_padded = pad_sequences(english_sequences, maxlen=max_length, padding='post')

# Model definition

## Model architecture

In [46]:
# Define the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tamil_tokenizer.word_index)+1, output_dim=256, input_length=max_length),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.RepeatVector(max_length),
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(english_tokenizer.word_index)+1, activation='softmax'))
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

## Model training

In [None]:
model.fit(tamil_padded, english_padded, epochs=10, batch_size=64)

# Model evaluation

## Loss and accuracy

In [None]:
loss, accuracy = model.evaluate(tamil_padded, english_padded)

# Print the evaluation results
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

## Evaluation with example

In [22]:
def translate(sentence):
    sequence = tamil_tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    translation = model.predict(padded_sequence)
    print(translation)
    translation = tf.argmax(translation, axis=-1).numpy()[0]
    print(translation)
    translated_sentence = ' '.join([list(english_tokenizer.word_index.keys())[idx-1] for idx in translation if idx != 0])
    return translated_sentence

In [23]:
tamil_sentence = "vanakam"
translated_sentence = translate(tamil_sentence)
print(f"Tamil: {tamil_sentence}")
print(f"English: {translated_sentence}")

[[[1.38579565e-03 9.94906306e-01 9.57807487e-08 4.19133812e-07
   1.56862450e-06 6.87762986e-06 3.65471188e-03 4.43824392e-05]
  [9.98389482e-01 6.29042741e-04 1.56911684e-09 1.07814566e-08
   4.61561562e-08 3.89372588e-07 6.61383237e-05 9.14991717e-04]
  [9.99983311e-01 3.00964234e-06 1.67736484e-11 6.76802364e-11
   5.98433691e-10 1.05891971e-08 1.04758260e-06 1.26326495e-05]
  [9.99993086e-01 9.61565547e-07 7.37199624e-12 2.55637421e-11
   2.70523076e-10 4.41256010e-09 4.27448839e-07 5.57348176e-06]]]
[1 0 0 0]
Tamil: vanakam
English: hello
