In [None]:
# Load the tab-separated parallel corpus (keeping only the 'en' and 'ru' columns;
# the third column, 'attr', is discarded), shuffle every row with a fixed seed
# so the order is reproducible, and renumber the index from 0.
df = (
    pd.read_csv('en-ru.txt', sep='\t', names=['en', 'ru', 'attr'], usecols=['en', 'ru'])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
def clean_text_ru(text):
    """Normalise a Russian sentence for tokenization.

    The text is lower-cased, Unicode-normalised and stripped of every
    character that is not a Cyrillic letter or a space.

    Args:
        text: Raw sentence (``str``).

    Returns:
        The cleaned, lower-case sentence.
    """
    # NFC (composed form) is required here: under NFD the letters 'й' and 'ё'
    # are split into 'и'/'е' plus a combining mark, and the filter below would
    # then delete the mark and silently turn them into different letters.
    text = normalize('NFC', text.lower())
    # 'ё'/'Ё' sit outside the contiguous А-я range, so list them explicitly.
    text = re.sub('[^А-Яа-яЁё ]+', '', text)
    return text

def clean_text(text):
    """Normalise an English sentence for tokenization.

    Lower-cases ``text``, decomposes it with NFD and then removes every
    character that is not an ASCII letter or a space. Because NFD separates
    accents from their base letter, accented Latin letters lose the accent
    and keep the base letter.

    Args:
        text: Raw sentence (``str``).

    Returns:
        The cleaned, lower-case sentence.
    """
    lowered = text.lower()
    decomposed = normalize('NFD', lowered)
    return re.sub('[^A-Za-z ]+', '', decomposed)

def clean_and_prepare_text(text):
    """Clean an English sentence and wrap it in ``[start]`` / ``[end]`` markers.

    Args:
        text: Raw sentence (``str``).

    Returns:
        ``'[start] <cleaned sentence> [end]'`` where the middle part is the
        result of ``clean_text``.
    """
    return ''.join(['[start] ', clean_text(text), ' [end]'])

# Clean both columns element-wise. The English side additionally gets the
# [start]/[end] markers. The columns are overwritten in place, so re-running
# this cell without reloading `df` cleans already-cleaned text again.
df['ru'] = df['ru'].apply(clean_text_ru)
df['en'] = df['en'].apply(clean_and_prepare_text)

In [None]:
en, ru = df['en'], df['ru']

# Longest sentence on each side, measured in whitespace-separated words.
# Kept as plain Python ints (built-in max over built-in len).
en_max_len = max(map(len, map(str.split, en)))
ru_max_len = max(map(len, map(str.split, ru)))
# Larger of the two per-language maxima.
sequence_len = max(en_max_len, ru_max_len)

In [4]:
# One tokenizer per language, each fitted only on its own side of the corpus.
en_tokenizer, ru_tokenizer = Tokenizer(), Tokenizer()
en_tokenizer.fit_on_texts(en)
ru_tokenizer.fit_on_texts(ru)

# Sentences -> lists of integer word ids.
en_sequences = en_tokenizer.texts_to_sequences(en)
ru_sequences = ru_tokenizer.texts_to_sequences(ru)

# Pad at the end ('post') up to the longest sentence of the respective
# language: en_x is the model input, ru_y the target.
en_x = pad_sequences(en_sequences, padding='post', maxlen=en_max_len)
ru_y = pad_sequences(ru_sequences, padding='post', maxlen=ru_max_len)

In [None]:
# Vocabulary size = number of distinct words seen by the tokenizer, plus one
# extra slot for index 0 (the value used for padding above).
en_vocab_size = 1 + len(en_tokenizer.word_index)
ru_vocab_size = 1 + len(ru_tokenizer.word_index)

print(f'Vocabulary size (English): {en_vocab_size}')
print(f'Vocabulary size (Russian): {ru_vocab_size}')

In [None]:
# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)

# Layer stack, in order:
#   Embedding       - English word ids -> 256-d vectors, index 0 treated as mask
#   LSTM            - returns only its final output (no return_sequences)
#   RepeatVector    - repeats that output ru_max_len times
#   LSTM            - returns the full output sequence
#   Dropout         - rate 0.4
#   TimeDistributed - softmax over the Russian vocabulary at every step
model = Sequential([
    Embedding(en_vocab_size, 256, input_length=en_max_len, mask_zero=True),
    LSTM(256),
    RepeatVector(ru_max_len),
    LSTM(256, return_sequences=True),
    Dropout(0.4),
    TimeDistributed(Dense(ru_vocab_size, activation='softmax')),
])
# Targets are integer word ids, hence the sparse variant of the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary(line_length=100)

In [None]:
# Stop once validation accuracy has not improved for 3 epochs in a row and
# restore the weights of the best epoch.
callback = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)
# 20% of the samples are held out for validation; at most 50 epochs.
hist = model.fit(
    x=en_x,
    y=ru_y,
    batch_size=50,
    epochs=50,
    validation_split=0.2,
    callbacks=[callback],
)

In [None]:
def translate_text(text, model, en_tokenizer, ru_tokenizer, en_max_len):
    """Translate one English sentence into Russian with the trained model.

    The sentence is first passed through ``clean_and_prepare_text`` so that it
    is lower-cased, reduced to letters/spaces and wrapped in the
    ``[start]`` / ``[end]`` markers, i.e. it receives the same preprocessing
    as the English sentences the tokenizer and the model were fitted on.

    Args:
        text: Raw English sentence (``str``).
        model: Trained model exposing ``predict``.
        en_tokenizer: Tokenizer fitted on the English training sentences.
        ru_tokenizer: Tokenizer fitted on the Russian training sentences.
        en_max_len: Length the input id sequence is padded to.

    Returns:
        The predicted Russian sentence as a single string.
    """
    prepared = clean_and_prepare_text(text)
    sequence = en_tokenizer.texts_to_sequences([prepared])
    padded_sequence = pad_sequences(sequence, maxlen=en_max_len, padding='post')
    # predict() is called on a batch of one sentence; take its only result.
    prediction = model.predict(padded_sequence, verbose=0)[0]
    # Most probable vocabulary index at every output position
    # (assumes one score vector per position - matches the model built above).
    indexes = [int(np.argmax(step_scores)) for step_scores in prediction]
    return ru_tokenizer.sequences_to_texts([indexes])[0]

In [None]:
# Demo: translate a sample phrase. The closing quote used to sit after
# `en_max_len`, so the whole argument list was passed as a single string and
# the call failed for lack of the remaining required arguments.
translate_text('IT students', model, en_tokenizer, ru_tokenizer, en_max_len)