Завдання щодо генерації текстів або машинного перекладу (на вибір) на базі рекурентних мереж або трансформерів (на вибір).  
Вирішіть завдання щодо генерації текстів або машинного перекладу. Особливо вітаються україномовні моделі.  

Було обрано завдання машинного перекладу на базі рекурентних мереж.  

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings

warnings.filterwarnings('ignore')

# 1. Dataset loading
# The corpus file is expected to hold one sentence pair per line, with the
# English sentence and the Ukrainian sentence in the first two tab-separated
# columns; any further columns are ignored.
file_path = './ukr-eng/ukr.txt'  # path to the parallel corpus
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Split every usable line into its English and Ukrainian part
english_sentences = []
ukrainian_sentences = []

for line in lines:
    parts = line.split('\t')
    if len(parts) >= 2:
        english_sentences.append(parts[0].strip())  # English sentence
        ukrainian_sentences.append(parts[1].strip())  # Ukrainian sentence

# Remove duplicate pairs. dict.fromkeys keeps first-occurrence order, unlike
# set(), whose iteration order for strings changes between interpreter runs
# and would make any later "first N pairs" slice irreproducible.
unique_pairs = list(dict.fromkeys(zip(english_sentences, ukrainian_sentences)))
if not unique_pairs:
    raise ValueError(f"No tab-separated sentence pairs found in {file_path!r}")

# Shuffle with a fixed seed so that a later prefix slice is still a random
# sample of the corpus, but the same sample on every run.
SHUFFLE_SEED = 42
shuffled_order = np.random.default_rng(SHUFFLE_SEED).permutation(len(unique_pairs))
unique_pairs = [unique_pairs[position] for position in shuffled_order]

english_sentences, ukrainian_sentences = zip(*unique_pairs)

# 2. Cap the amount of data
data_limit = 5000  # maximum number of sentence pairs to use
english_sentences = english_sentences[:data_limit]
ukrainian_sentences = ukrainian_sentences[:data_limit]

# Upper bound on the tokenizer vocabulary
MAX_NUM_WORDS = 20000

# Wrap every target sentence in explicit start/end markers
start_token = '<s>'
end_token = '<e>'
ukrainian_sentences = [
    f"{start_token} {sentence} {end_token}" for sentence in ukrainian_sentences
]

# filters='' disables character stripping, so punctuation stays attached to
# the words and the '<s>' / '<e>' markers survive; lower=True lowercases text.
tokenizer_eng = Tokenizer(num_words=MAX_NUM_WORDS, filters='', lower=True)
tokenizer_ukr = Tokenizer(num_words=MAX_NUM_WORDS, filters='', lower=True)

tokenizer_eng.fit_on_texts(english_sentences)
input_sequences = tokenizer_eng.texts_to_sequences(english_sentences)

tokenizer_ukr.fit_on_texts(ukrainian_sentences)
target_sequences = tokenizer_ukr.texts_to_sequences(ukrainian_sentences)

# Keep only pairs where both sides fit into MAX_SEQ_LENGTH tokens
MAX_SEQ_LENGTH = 30

kept_pairs = [
    (source_ids, target_ids)
    for source_ids, target_ids in zip(input_sequences, target_sequences)
    if max(len(source_ids), len(target_ids)) <= MAX_SEQ_LENGTH
]
filtered_input_sequences = [source_ids for source_ids, _ in kept_pairs]
filtered_target_sequences = [target_ids for _, target_ids in kept_pairs]

input_sequences = filtered_input_sequences
target_sequences = filtered_target_sequences

# Right-pad every sequence with zeros up to MAX_SEQ_LENGTH
encoder_input_data = pad_sequences(
    input_sequences, maxlen=MAX_SEQ_LENGTH, padding='post'
)
decoder_input_data = pad_sequences(
    target_sequences, maxlen=MAX_SEQ_LENGTH, padding='post'
)

# Vocabulary sizes: number of known words plus one for the padding index 0
vocab_size_eng = 1 + len(tokenizer_eng.word_index)
vocab_size_ukr = 1 + len(tokenizer_ukr.word_index)

# 3. Seq2Seq model definition
LATENT_DIM = 256
EMBEDDING_DIM = 128

# Encoder: embeds the source tokens and keeps only the final LSTM state
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH,), name="encoder_input")
encoder_embedding_layer = Embedding(vocab_size_eng, EMBEDDING_DIM, mask_zero=True)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm_layer = LSTM(LATENT_DIM, return_state=True)
encoder_lstm, state_h, state_c = encoder_lstm_layer(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder state and emits one vocabulary
# distribution per target position
decoder_inputs = Input(shape=(MAX_SEQ_LENGTH,), name="decoder_input")
decoder_embedding_layer = Embedding(vocab_size_ukr, EMBEDDING_DIM, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm_layer = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_lstm, _, _ = decoder_lstm_layer(
    decoder_embedding, initial_state=encoder_states
)
decoder_dense = Dense(vocab_size_ukr, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm)

# Full training model: (source tokens, target tokens) -> next-token distributions
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# 4. Data generator
# Element spec of one generated batch: a dict with the two integer token
# matrices, plus the one-hot float target tensor. The batch dimension is
# left open (None).
output_signature = (
    {
        input_name: tf.TensorSpec(shape=(None, MAX_SEQ_LENGTH), dtype=tf.int32)
        for input_name in ("encoder_input", "decoder_input")
    },
    tf.TensorSpec(
        shape=(None, MAX_SEQ_LENGTH, vocab_size_ukr),
        dtype=tf.float32,
    ),
)

def data_generator(input_data, target_data, batch_size, seq_length=None, vocab_size=None):
    """Yield full training batches with one-hot, shifted-by-one decoder targets.

    Args:
        input_data: Padded encoder token ids, one row per sample.
        target_data: Padded decoder token ids, one row per sample. Each
            batch slice must be convertible to a rectangular integer array.
        batch_size: Number of samples per batch. A trailing incomplete
            batch is not yielded.
        seq_length: Time dimension of the target tensor. Defaults to the
            module-level ``MAX_SEQ_LENGTH``.
        vocab_size: Size of the one-hot axis. Defaults to the module-level
            ``vocab_size_ukr``.

    Yields:
        ``({"encoder_input": ..., "decoder_input": ...}, target)`` where
        ``target`` is a float32 array of shape
        ``(batch_size, seq_length, vocab_size)``. ``target[j, t - 1]`` is the
        one-hot encoding of token ``t`` of decoder row ``j`` (padding ids
        included); the positions past the last shifted token stay all-zero.
    """
    if seq_length is None:
        seq_length = MAX_SEQ_LENGTH
    if vocab_size is None:
        vocab_size = vocab_size_ukr

    for start in range(0, len(input_data), batch_size):
        encoder_input_batch = input_data[start:start + batch_size]
        decoder_input_batch = target_data[start:start + batch_size]

        if len(encoder_input_batch) < batch_size:
            break  # skip the incomplete trailing batch

        # Teacher forcing: the target at step t - 1 is the decoder token at
        # step t, i.e. the decoder input shifted left by one position.
        next_tokens = np.asarray(decoder_input_batch)[:, 1:]
        decoder_target_batch = np.zeros(
            (len(decoder_input_batch), seq_length, vocab_size), dtype='float32'
        )
        # One indexed assignment replaces the per-sample / per-step loop.
        row_idx, step_idx = np.indices(next_tokens.shape)
        decoder_target_batch[row_idx, step_idx, next_tokens] = 1.0

        yield {
            "encoder_input": encoder_input_batch,
            "decoder_input": decoder_input_batch
        }, decoder_target_batch

BATCH_SIZE = 64


def _training_batches():
    """Return a fresh pass of the batch generator over the training arrays."""
    return data_generator(encoder_input_data, decoder_input_data, BATCH_SIZE)


# from_generator calls the factory again each time the dataset is iterated;
# repeat() keeps the stream going across epochs.
dataset = tf.data.Dataset.from_generator(
    _training_batches,
    output_signature=output_signature,
).repeat()

# 5. Model training
# Whole batches available per pass over the data, but never fewer than one step
STEPS_PER_EPOCH = max(1, len(encoder_input_data) // BATCH_SIZE)

model.fit(dataset, steps_per_epoch=STEPS_PER_EPOCH, epochs=10)

# 6. Inference (translation)
# Encoder model: source tokens -> final LSTM state [h, c]
encoder_model = Model(encoder_inputs, encoder_states)

# The inference decoder receives its recurrent state explicitly, so the state
# can be fed back in between decoding steps.
decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the decoder LSTM that was trained as part of `model`. Constructing a
# new LSTM(...) here would give the inference decoder freshly initialised,
# untrained weights. The decoder LSTM is the one built with
# return_sequences=True; the encoder LSTM was built without it.
trained_decoder_lstm = next(
    (
        layer for layer in model.layers
        if isinstance(layer, LSTM) and layer.return_sequences
    ),
    None,
)
if trained_decoder_lstm is None:
    raise RuntimeError("Trained decoder LSTM layer not found in the training model")

decoder_lstm_outputs, state_h, state_c = trained_decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# Decoder model: (target tokens, h, c) -> (token distributions, new h, new c)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

def translate_sentence(input_sentence):
    """Greedily decode a translation of ``input_sentence``.

    The sentence is tokenized with ``tokenizer_eng``, padded to
    ``MAX_SEQ_LENGTH`` and run through ``encoder_model``. ``decoder_model`` is
    then called one token at a time, starting from ``start_token`` and always
    feeding back the most probable token, until ``end_token`` is produced or
    ``MAX_SEQ_LENGTH`` decoding steps have been made.

    Args:
        input_sentence: Source sentence as a plain string.

    Returns:
        The decoded words joined by single spaces (may be an empty string).

    Raises:
        KeyError: If ``start_token`` is missing from ``tokenizer_ukr.word_index``.
    """
    input_seq = tokenizer_eng.texts_to_sequences([input_sentence])
    input_seq = pad_sequences(input_seq, maxlen=MAX_SEQ_LENGTH, padding='post')
    # verbose=0 keeps predict() from printing a progress bar on every call
    states_value = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_ukr.word_index[start_token]

    decoded_words = []
    # Bound the loop by decoding steps rather than by emitted words: an index
    # without a vocabulary entry (such as padding index 0) yields no word, so
    # a word-count limit alone might never be reached.
    for _ in range(MAX_SEQ_LENGTH):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, *states_value], verbose=0
        )

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer_ukr.index_word.get(sampled_token_index, '')
        if sampled_word == end_token:
            break
        if sampled_word:
            decoded_words.append(sampled_word)

        # Feed the chosen token and the updated state into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

# 7. Translation smoke test
test_sentence = "Hi."
print("Input:", test_sentence)
# The source sentence is printed before decoding starts
translation = translate_sentence(test_sentence)
print("Translated:", translation)

Epoch 1/10


2024-12-28 14:12:08.243637: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 430833664 exceeds 10% of free system memory.


[1m 1/78[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m18:18[0m 14s/step - accuracy: 0.0000e+00 - loss: 8.7908

2024-12-28 14:12:09.861335: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 430833664 exceeds 10% of free system memory.


[1m 2/78[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:33[0m 1s/step - accuracy: 0.0283 - loss: 8.7901      

2024-12-28 14:12:11.102209: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 430833664 exceeds 10% of free system memory.


[1m 3/78[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:28[0m 1s/step - accuracy: 0.0620 - loss: 8.7893

2024-12-28 14:12:12.208965: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 430833664 exceeds 10% of free system memory.


[1m 4/78[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:26[0m 1s/step - accuracy: 0.1127 - loss: 8.7885

2024-12-28 14:12:13.341561: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 430833664 exceeds 10% of free system memory.


[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 1s/step - accuracy: 0.4417 - loss: 7.3255
Epoch 2/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 2s/step - accuracy: 0.0762 - loss: 5.0536
Epoch 3/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 1s/step - accuracy: 0.0804 - loss: 4.7480
Epoch 4/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 1s/step - accuracy: 0.0821 - loss: 4.5357
Epoch 5/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 1s/step - accuracy: 0.0835 - loss: 4.3723
Epoch 6/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 1s/step - accuracy: 0.0886 - loss: 4.2154
Epoch 7/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 1s/step - accuracy: 0.0925 - loss: 4.0600
Epoch 8/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 1s/step - accuracy: 0.0949 - loss: 3.9052
Epoch 9/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m