<a href="https://colab.research.google.com/github/chetanRaJ222/Brain_Tumor_Segmentation_BCP/blob/main/translation(hun_to_eng).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
with open('hun_eng_pairs_train.txt', encoding='utf-8') as f:
    train = [line.rstrip() for line in f]

SEPARATOR = '<sep>'
train_input, train_target = map(
    list, zip(*[pair.split(SEPARATOR) for pair in train])
)

In [39]:
import re

def preprocess_sentence(s):
    s = s.lower()
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

train_input = [preprocess_sentence(s) for s in train_input]
train_target = [preprocess_sentence(s) for s in train_target]

# ADD TOKENS HERE (AFTER preprocessing)
train_target = ['<sos> ' + s + ' <eos>' for s in train_target]

In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer

source_tokenizer = Tokenizer(oov_token='<unk>')
target_tokenizer = Tokenizer(oov_token='<unk>')

source_tokenizer.fit_on_texts(train_input)
target_tokenizer.fit_on_texts(train_target)

In [42]:
print('sos' in target_tokenizer.word_index)
print('eos' in target_tokenizer.word_index)
print(list(target_tokenizer.word_index.keys())[:20])

True
True
['<unk>', 'sos', 'eos', 'i', 'is', "don't", 'care', 'you', 'the', 'this', 'what', 'say', 'take', 'of', 'my', 'children', 'meet', 'real', 'me', 'love']


In [43]:
train_input_seq = source_tokenizer.texts_to_sequences(train_input)
train_target_seq = target_tokenizer.texts_to_sequences(train_target)

decoder_input_seq = [s[:-1] for s in train_target_seq]
decoder_target_seq = [s[1:] for s in train_target_seq]

In [44]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len_hun = max(len(s) for s in train_input_seq)
max_len_eng = max(len(s) for s in train_target_seq)

encoder_input = pad_sequences(train_input_seq, maxlen=max_len_hun, padding='post')
decoder_input = pad_sequences(decoder_input_seq, maxlen=max_len_eng, padding='post')
decoder_target = pad_sequences(decoder_target_seq, maxlen=max_len_eng, padding='post')

# Prevent <unk> appearing for padding
source_tokenizer.index_word[0] = ''
target_tokenizer.index_word[0] = ''

In [45]:
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

embedding_dim = 128
hidden_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_len_hun,))
enc_emb = Embedding(source_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
_, state_h, state_c = LSTM(hidden_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_len_eng,))
dec_emb = Embedding(target_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_outputs = Dense(target_vocab_size, activation='softmax')(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

model.summary()

In [46]:
model.fit(
    [encoder_input, decoder_input],
    decoder_target,
    batch_size=64,
    epochs=25
)

Epoch 1/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 3.4650
Epoch 2/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - loss: 3.4516
Epoch 3/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - loss: 3.4376
Epoch 4/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - loss: 3.4220
Epoch 5/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - loss: 3.4040
Epoch 6/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - loss: 3.3824
Epoch 7/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - loss: 3.3559
Epoch 8/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - loss: 3.3223
Epoch 9/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - loss: 3.2791
Epoch 10/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - loss: 3.2226
Epoch 11/25
[1m1/1[

<keras.src.callbacks.history.History at 0x7f816e116990>

In [47]:
encoder_model = Model(encoder_inputs, encoder_states)

In [50]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

# Decoder inputs (1 token at a time)
decoder_input_single = Input(shape=(1,), name='decoder_input_single')

state_h_input = Input(shape=(hidden_dim,), name='state_h_input')
state_c_input = Input(shape=(hidden_dim,), name='state_c_input')
decoder_states_inputs = [state_h_input, state_c_input]

# Get correct trained layers
decoder_embedding_layer = model.get_layer('embedding_5')   # decoder embedding
decoder_lstm_layer = model.get_layer('lstm_5')             # decoder LSTM
decoder_dense_layer = model.get_layer('dense_2')           # output layer

# Embedding → LSTM → Dense
decoder_embedded = decoder_embedding_layer(decoder_input_single)

decoder_outputs, h, c = decoder_lstm_layer(
    decoder_embedded,
    initial_state=decoder_states_inputs
)

decoder_outputs = decoder_dense_layer(decoder_outputs)

# Final decoder model
decoder_model = Model(
    [decoder_input_single] + decoder_states_inputs,
    [decoder_outputs, h, c]
)

In [54]:
def translate(sentence, max_len=30):
    sentence = preprocess_sentence(sentence)
    seq = source_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_len_hun, padding='post')

    h, c = encoder_model.predict(seq)
    current_token = target_tokenizer.word_index['sos']
    result = []

    for _ in range(max_len):
        output, h, c = decoder_model.predict(
            [np.array([[current_token]]), h, c]
        )

        current_token = np.argmax(output[0, 0])
        word = target_tokenizer.index_word.get(current_token, '')

        if word == 'eos':
            break

        result.append(word)

    return ' '.join(result)

In [56]:
print('sos' in target_tokenizer.word_index)
print('eos' in target_tokenizer.word_index)

True
True


In [58]:
print(translate("Találkozz az igazi énnel"))
print(translate("Vigyázz a gyerekeimre!"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
i


In [59]:
print(translate("Szeretlek"))
print(translate("Köszönöm"))
print(translate("Jó reggelt"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 245ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step

