<a href="https://colab.research.google.com/github/dixy52-beep/COLAB_Custom_Translator/blob/main/Colab_Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random

def read_data(file_path, separator='/'):
    """Load parallel sentence pairs from a UTF-8 text file.

    Every non-blank line is expected to hold exactly one ``separator``,
    with the source sentence on its left and the translation on its right.
    Surrounding whitespace is stripped from both sides.

    Args:
        file_path: Path of the text file to read.
        separator: String that splits a line into source and target.
            Defaults to ``'/'``.

    Returns:
        A tuple ``(input_texts, target_texts)`` of equally long lists.
        Each target is wrapped as ``'<start> ... <end>'`` so the decoder
        has explicit begin/end markers.

    Lines that do not split into exactly two parts are reported on stdout
    and skipped. Blank lines are skipped silently.
    """
    input_texts = []
    target_texts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # A blank line (typically the trailing newline) is not
                # malformed data, so do not report it.
                continue
            parts = line.split(separator)
            if len(parts) != 2:
                print("Skipping line with invalid format:", line)
                continue
            input_text, target_text = parts
            input_texts.append(input_text.strip())
            # Adding <start> and <end> to the target text
            target_texts.append('<start> ' + target_text.strip() + ' <end>')
    return input_texts, target_texts


# Define the file path
file_path = 'traduzioni.txt'

# Load the dataset
input_texts, target_texts = read_data(file_path)

# Fail early with a readable message: every later step (random sample,
# tokenization, training) needs at least one sentence pair.
if not input_texts:
    raise ValueError(f"No valid sentence pairs found in {file_path!r}")

# Print a random pair as a quick sanity check of the parsing
random_index = random.randrange(len(input_texts))
print("Input:", input_texts[random_index])
print("Target:", target_texts[random_index])

# Print lengths of input and target texts (they must match)
print("Number of input texts:", len(input_texts))
print("Number of target texts:", len(target_texts))



In [None]:
import tensorflow as tf
import numpy as np

# Tokenize the input and target texts.
# Each corpus gets its own tokenizer; sequences are right-padded
# (padding='post') so all rows in a corpus share one length.
input_tokenizer = tf.keras.preprocessing.text.Tokenizer()
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, padding='post')

# filters='' disables the tokenizer's punctuation filtering so that the
# '<start>' and '<end>' markers survive as single tokens.
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, padding='post')

# Define the model: an LSTM encoder-decoder.
embedding_dim = 16  # size of each word embedding vector
units = 32          # LSTM hidden/cell state size (shared by encoder and decoder)

# Encoder: embeds the source tokens and keeps only the final LSTM states.
# Vocabulary size is len(word_index) + 1 because index 0 is used for padding.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(len(input_tokenizer.word_index) + 1, embedding_dim)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(units, return_state=True)(encoder_embedding(encoder_inputs))
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder's final states and emits, for every
# time step, a softmax distribution over the target vocabulary.
# The layers are kept in variables because the inference model below reuses them.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(len(target_tokenizer.word_index) + 1, embedding_dim)
decoder_lstm = tf.keras.layers.LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(len(target_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile and train the model.
# The decoder is fed the target without its last column and must predict
# the target shifted one step left (i.e. the next token at each position).
# NOTE(review): the Embedding layers are created without mask_zero, so padded
# positions also contribute to the loss - confirm this is intended.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit([input_sequences, target_sequences[:, :-1]], target_sequences[:, 1:], epochs=3500)

# Define inference models.
# Encoder model: source sequence -> [state_h, state_c].
encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

# Decoder model: takes the previous token plus the previous LSTM states as
# explicit inputs, so decoding can proceed one step at a time.
decoder_state_input_h = tf.keras.layers.Input(shape=(units,))
decoder_state_input_c = tf.keras.layers.Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuses the trained decoder_embedding / decoder_lstm / decoder_dense layers.
# Note that decoder_outputs, state_h and state_c are rebound here; the
# training graph was already captured in `model` above.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Get the maximum target sequence length.
# target_sequences is already padded, so every row of the slice has the same
# length: the padded target length minus the leading '<start>' column.
max_target_length = max(len(sequence) for sequence in target_sequences[:, 1:])

# Define a function for inference
def translate_sentence(input_sentence):
    """Translate one sentence with greedy, token-by-token decoding.

    The sentence is tokenized with ``input_tokenizer``, padded to the
    training input length and encoded with ``encoder_model``. Starting from
    the ``'<start>'`` token, ``decoder_model`` is then called one step at a
    time, always feeding back the most probable token and the updated LSTM
    states.

    Decoding stops when ``'<end>'`` is produced, when more than
    ``max_target_length`` words have been generated, or when the model
    predicts index 0 (padding, which has no entry in the word index).

    Args:
        input_sentence: Source-language sentence as a plain string.

    Returns:
        The generated words, each followed by a single space. The string
        includes the ``'<end>'`` marker when the decoder emitted it.
    """
    input_sequence = input_tokenizer.texts_to_sequences([input_sentence])
    input_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        input_sequence, maxlen=input_sequences.shape[1], padding='post')
    # verbose=0: avoid printing a progress bar for every predict() call.
    states_value = list(encoder_model.predict(input_sequence, verbose=0))

    # Reverse word index for the target tokenizer; built once per call
    # instead of once per decoding step.
    reverse_target_word_index = {
        index: word for word, index in target_tokenizer.word_index.items()
    }

    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = target_tokenizer.word_index['<start>']

    translated_words = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_sequence] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        sampled_word = reverse_target_word_index.get(sampled_token_index)
        if sampled_word is None:
            # Index 0 is the padding value and maps to no word; treat it
            # as end of output rather than failing with a KeyError.
            break

        translated_words.append(sampled_word)
        if sampled_word == '<end>' or len(translated_words) > max_target_length:
            break

        # Feed the chosen token and the new states into the next step.
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = sampled_token_index
        states_value = [h, c]

    # Same format as before: every word is followed by one space.
    return ''.join(word + ' ' for word in translated_words)

# Translate every training input sentence as a sanity check: the model has
# seen these sentences during training, so this shows how well it fits them.
for input_sentence in input_texts:
    print('Input sentence:', input_sentence)
    print('Translated sentence:', translate_sentence(input_sentence))


Output streaming troncato alle ultime 5000 righe.
Epoch 20/3500
Epoch 21/3500
Epoch 22/3500
Epoch 23/3500
Epoch 24/3500
Epoch 25/3500
Epoch 26/3500
Epoch 27/3500
Epoch 28/3500
Epoch 29/3500
Epoch 30/3500
Epoch 31/3500
Epoch 32/3500
Epoch 33/3500
Epoch 34/3500
Epoch 35/3500
Epoch 36/3500
Epoch 37/3500
Epoch 38/3500
Epoch 39/3500
Epoch 40/3500
Epoch 41/3500
Epoch 42/3500
Epoch 43/3500
Epoch 44/3500
Epoch 45/3500
Epoch 46/3500
Epoch 47/3500
Epoch 48/3500
Epoch 49/3500
Epoch 50/3500
Epoch 51/3500
Epoch 52/3500
Epoch 53/3500
Epoch 54/3500
Epoch 55/3500
Epoch 56/3500
Epoch 57/3500
Epoch 58/3500
Epoch 59/3500
Epoch 60/3500
Epoch 61/3500
Epoch 62/3500
Epoch 63/3500
Epoch 64/3500
Epoch 65/3500
Epoch 66/3500
Epoch 67/3500
Epoch 68/3500
Epoch 69/3500
Epoch 70/3500
Epoch 71/3500
Epoch 72/3500
Epoch 73/3500
Epoch 74/3500
Epoch 75/3500
Epoch 76/3500
Epoch 77/3500
Epoch 78/3500
Epoch 79/3500
Epoch 80/3500
Epoch 81/3500
Epoch 82/3500
Epoch 83/3500
Epoch 84/3500
Epoch 85/3500
Epoch 86/3500

In [None]:
# Define your input text (a sentence in the source language of the dataset)
input_text = "amo la musica"

# Get the translation using the inference function defined in the previous
# cell (requires that cell to have been run so the models exist)
translation = translate_sentence(input_text)

# Print the input text and its translation
print('Input sentence:', input_text)
print('Translated sentence:', translation)
