setup

In [1]:
import numpy as np
import keras
import os
from pathlib import Path

KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("/Users/dana/Desktop/dl project/character_normalization_pairs_cleaned_output.csv")

# Split train (80%) and temp (20%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split temp into dev (50%) and test (50%) → each gets 10% of the original data
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Check sizes
print(f"Train: {len(train_df)}, Dev: {len(dev_df)}, Test: {len(test_df)}")

# Save if needed
train_df.to_csv("/Users/dana/Desktop/dl project/train.csv", index=False)
dev_df.to_csv("/Users/dana/Desktop/dl project/dev.csv", index=False)
test_df.to_csv("/Users/dana/Desktop/dl project/test.csv", index=False)


Train: 2400, Dev: 300, Test: 300


In [None]:
import pandas as pd
import tensorflow as tf
import random

# Path to your dataset
data_path = "/Users/dana/Desktop/dl project/train.csv"

# Load the dataset
df = pd.read_csv(data_path)

# Show first few rows
print(df.head())

# Define training parameters
batch_size = 64
epochs = 100
latent_dim = 256
num_samples = len(df)  # Use all available samples
seed = 42
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

print(f"Loaded dataset with {num_samples} samples")


           input             target
0        бастысы            бастысы
1          турып              тұрып
2     отыргызган         отырғызған
3  марапатталган  \tмарапатталған\t
4          кенес            кеңес  
Loaded dataset with 2400 samples


In [None]:
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().strip().split("\n")  # Remove extra spaces

# Skip the header row and process the data
for line in lines[1: min(num_samples, len(lines))]:  
    if "," not in line:
        continue  # Skip empty or malformed lines

    input_text, target_text = line.split(",")  # Use comma instead of tab
    target_text = "\t" + target_text + "\n"  # Add start and end markers

    input_texts.append(input_text)
    target_texts.append(target_text)

    for char in input_text:
        input_characters.add(char)
    for char in target_text:
        target_characters.add(char)

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype="float32",
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
    decoder_target_data[i, t:, target_token_index[" "]] = 1.0


Number of samples: 2399
Number of unique input tokens: 45
Number of unique output tokens: 48
Max sequence length for inputs: 20
Max sequence length for outputs: 28


build the model

In [None]:
# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=100,
    validation_split=0.1,
)
# Save model
model.save("/Users/dana/Desktop/dl project/100_epochs_s2s_model.keras")

Epoch 1/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 66ms/step - accuracy: 0.6237 - loss: 1.9649 - val_accuracy: 0.7065 - val_loss: 1.1860
Epoch 2/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step - accuracy: 0.7127 - loss: 1.1530 - val_accuracy: 0.7028 - val_loss: 1.1702
Epoch 3/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - accuracy: 0.7173 - loss: 1.1031 - val_accuracy: 0.7174 - val_loss: 1.0401
Epoch 4/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - accuracy: 0.7300 - loss: 1.0133 - val_accuracy: 0.7187 - val_loss: 1.0231
Epoch 5/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - accuracy: 0.7366 - loss: 0.9712 - val_accuracy: 0.7290 - val_loss: 0.9478
Epoch 6/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - accuracy: 0.7420 - loss: 0.9396 - val_accuracy: 0.7298 - val_loss: 0.9386
Epoch 7/100
[1m34/34[0m [

Run inference (sampling)

encode input and retrieve initial decoder state

run one step of decoder with this initial state and a "start of sequence" token as target. Output will be the next target token.

Repeat with the current target token and current states

In [None]:
# Define sampling models
# Restore the model and construct the encoder and decoder.
model = keras.models.load_model("/Users/dana/Desktop/dl project/100_epochs_s2s_model.keras")

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [None]:
for seq_index in range(50):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)

-
Input sentence: бастысы
Decoded sentence:  бастысы

-
Input sentence: турып
Decoded sentence:  тұрып

-
Input sentence: отыргызган
Decoded sentence:  отырғыздық

-
Input sentence: марапатталган
Decoded sentence:  марапаттарлың

-
Input sentence: кенес
Decoded sentence:  кеңес  

-
Input sentence: шыгармашылык
Decoded sentence:  шығаршашылық

-
Input sentence: оралган
Decoded sentence:  оралады	

-
Input sentence: улык
Decoded sentence: 	ұлық	

-
Input sentence: жарым
Decoded sentence:  жарым

-
Input sentence: диалогка
Decoded sentence:  дарлақтың

-
Input sentence: карттарымыздан
Decoded sentence:  қартатырымызда

-
Input sentence: дереу
Decoded sentence: дереу

-
Input sentence: инфракурылым
Decoded sentence:  нинұрақтылмес

-
Input sentence: барі
Decoded sentence:  бария

-
Input sentence: болганымен
Decoded sentence:  болғанымен  

-
Input sentence: башкуртша
Decoded sentence:  бақшартты

-
Input sentence: жак
Decoded sentence:  жақ	

-
Input sentence: онай
Decoded sentence:  оңа

In [None]:
import sacrebleu

def compute_bleu(reference, hypothesis):
    return sacrebleu.sentence_bleu(hypothesis, [reference]).score

total_bleu = 0.0
num_samples = 50  # Or use len(input_texts)

for seq_index in range(num_samples):
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

    reference = target_texts[seq_index].strip()
    hypothesis = decoded_sentence.strip()

    bleu_score = compute_bleu(reference, hypothesis)
    total_bleu += bleu_score

    print("-")
    print(f"Input: {input_texts[seq_index]}")
    print(f"Reference: {reference}")
    print(f"Prediction: {hypothesis}")
    print(f"BLEU Score: {bleu_score:.2f}")

# Compute average BLEU score
average_bleu = total_bleu / num_samples
print(f"\nAverage BLEU Score: {average_bleu:.2f}")



-
Input: бастысы
Reference: бастысы
Prediction: бастысы
BLEU Score: 100.00
-
Input: турып
Reference: тұрып
Prediction: тұрып
BLEU Score: 100.00
-
Input: отыргызган
Reference: отырғызған
Prediction: отырғыздық
BLEU Score: 0.00
-
Input: марапатталган
Reference: марапатталған
Prediction: марапаттарлың
BLEU Score: 0.00
-
Input: кенес
Reference: кеңес
Prediction: кеңес
BLEU Score: 100.00
-
Input: шыгармашылык
Reference: шығармашылық
Prediction: шығаршашылық
BLEU Score: 0.00
-
Input: оралган
Reference: оралған
Prediction: оралады
BLEU Score: 0.00
-
Input: улык
Reference: ұлық
Prediction: ұлық
BLEU Score: 100.00
-
Input: жарым
Reference: жарым
Prediction: жарым
BLEU Score: 100.00
-
Input: диалогка
Reference: диалогқа
Prediction: дарлақтың
BLEU Score: 0.00
-
Input: карттарымыздан
Reference: қарттарымыздан
Prediction: қартатырымызда
BLEU Score: 0.00
-
Input: дереу
Reference: дереу
Prediction: дереу
BLEU Score: 100.00
-
Input: инфракурылым
Reference: инфрақұрылым
Prediction: нинұрақтылмес
BLEU S

epochs 100 - Average BLEU Score: 32.00

epochs 100 - Average BLEU Score: 56.00 (dataset increased to 3000)

epochs 300 - Average BLEU Score: 98.00

epochs 150 w early stopping - Average BLEU Score: 94.00

In [None]:
from tensorflow.keras.models import load_model

model_100 = load_model("/Users/dana/Desktop/dl project/less_data/100_epochs_s2s_model.keras")
model_300 = load_model("/Users/dana/Desktop/dl project/less_data/300_epochs_s2s_model.keras")  # Adjust if needed
model_150_early_stopping = load_model("/Users/dana/Desktop/dl project/less_data/150_epochs_early_stopping_s2s_model.keras")
model_100_data_increased = load_model("/Users/dana/Desktop/dl project/100_epochs_s2s_model.keras")


In [None]:
# Initialize test data arrays
encoder_test_data = np.zeros(
    (len(test_df), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_test_data = np.zeros(
    (len(test_df), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_test = np.zeros(
    (len(test_df), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

# Process each test sample
for i, (input_text, target_text) in enumerate(zip(test_df["input"], test_df["target"])):
    input_text = str(input_text)  # Ensure input_text is a string
    target_text = str(target_text)  # Ensure target_text is a string

    for t, char in enumerate(input_text):
        if char in input_token_index:  # Ensure the character exists in training vocab
            encoder_test_data[i, t, input_token_index[char]] = 1.0
    encoder_test_data[i, t + 1 :, input_token_index[" "]] = 1.0  # Padding

    target_text = "\t" + target_text + "\n"  # Add start and end markers
    for t, char in enumerate(target_text):
        if char in target_token_index:
            decoder_test_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_test[i, t - 1, target_token_index[char]] = 1.0
    decoder_test_data[i, t + 1 :, target_token_index[" "]] = 1.0  # Padding
    decoder_target_test[i, t:, target_token_index[" "]] = 1.0



In [None]:
# Evaluate models properly
loss_100, acc_100 = model_100.evaluate(
    [encoder_test_data, decoder_test_data], decoder_target_test
)
loss_300, acc_300 = model_300.evaluate(
    [encoder_test_data, decoder_test_data], decoder_target_test
)

loss_150, acc_150 = model_150_early_stopping.evaluate(
    [encoder_test_data, decoder_test_data], decoder_target_test
)

loss_100_data_increased, acc_100_data_increased = model_100_data_increased.evaluate(
    [encoder_test_data, decoder_test_data], decoder_target_test
)

print(f"100 epochs - Loss: {loss_100}, Accuracy: {acc_100}")
print(f"300 epochs - Loss: {loss_300}, Accuracy: {acc_300}")
print(f"150 epochs - Loss: {loss_150}, Accuracy: {acc_150}")
print(f"100 epochs - Loss: {loss_100_data_increased}, Accuracy: {acc_100_data_increased}")


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7377 - loss: 1.8171
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7359 - loss: 2.9873
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.7344 - loss: 3.0047
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8711 - loss: 0.4789
100 epochs - Loss: 1.7866599559783936, Accuracy: 0.7389285564422607
300 epochs - Loss: 2.9398956298828125, Accuracy: 0.7383333444595337
150 epochs - Loss: 2.9501419067382812, Accuracy: 0.7372618913650513
100 epochs - Loss: 0.4822866916656494, Accuracy: 0.871666669845581


Accuracy is similar for both (small improvement at 300 epochs).

Loss is much higher at 300 epochs → This suggests overfitting.

The model has memorized training data instead of generalizing well.
It struggles with new (test) data.

In [None]:
preds_100 = model_100.predict([encoder_test_data, decoder_test_data])
preds_300 = model_300.predict([encoder_test_data, decoder_test_data])

# Example: Print first test sample
print("Model (100 epochs) Prediction:", preds_100[0])
print("Model (300 epochs) Prediction:", preds_300[0])


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Model (100 epochs) Prediction: [[3.2895666e-02 4.7676418e-05 4.0806106e-01 ... 6.6174733e-05
  2.0410349e-03 3.5746729e-03]
 [1.1848162e-03 2.5758054e-04 2.1448459e-03 ... 1.0539153e-04
  1.6127259e-03 5.0183279e-03]
 [2.6871778e-05 6.7502573e-05 6.0543658e-07 ... 1.3142207e-04
  1.2852052e-04 9.7041702e-05]
 ...
 [9.1742782e-09 8.9145203e-08 9.9999666e-01 ... 5.3400090e-10
  2.1502541e-10 2.8338000e-08]
 [1.0000387e-08 7.7693812e-08 9.9999702e-01 ... 4.0884443e-10
  1.7675846e-10 2.2647091e-08]
 [1.1179472e-08 7.9912589e-08 9.9999654e-01 ... 3.5991504e-10
  1.6550590e-10 1.9457390e-08]]
Model (300 epochs) Prediction: [[1.80835486e-04 6.58743948e-10 8.87625277e-01 ... 1.94641274e-07
  2.28985955e-05 1.64100318e-04]
 [9.13821282e-07 5.66816316e-10 2.03385844e-05 ... 4.74317119e-07
  2.00994091e-05 1.75508831e-04]
 [1.05572141e-07 5.34687850e-09 