In [29]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import numpy as np

In [30]:
def parse_conllu_grouped(file_path):
    """
    Parses a CoNLL-U file and groups forms and lemmas for each sentence.

    This function reads a CoNLL-U file and structures each sentence as a 
    single tuple. The tuple contains two lists: the first is a list of all 
    word forms in the sentence, and the second is a list of all their 
    corresponding lemmas.

    Args:
        file_path (str): The path to the .conllu file.

    Returns:
        list: A list of tuples. Each tuple represents a sentence and
              is structured as ([list_of_forms], [list_of_lemmas]).
              Returns an empty list if the file cannot be read.
    """
    all_sentences_grouped = []
    current_forms = []
    current_lemmas = []
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank line signifies the end of a sentence
                if line == '\n':
                    if current_forms:
                        # ONLY append the sentence if its length is 128 or less
                        if len(current_forms) <= 128:
                            sentence_tuple = (current_forms, current_lemmas)
                            all_sentences_grouped.append(sentence_tuple)
                        
                        # Always reset for the next sentence
                        current_forms = []
                        current_lemmas = []
                    continue

                # Ignore comment lines
                if line.startswith('#'):
                    continue

                columns = line.strip().split('\t')
                
                if len(columns) == 10:
                    token_id = columns[0]
                    # Process only single tokens, ignoring multiword/empty ones
                    if token_id.isdigit():
                        # Append form and lemma to their respective lists. can add more paramters from file with the desired index here
                        current_forms.append(columns[1])
                        current_lemmas.append(columns[3])

        # Add the very last sentence if it meets the length criteria
        if current_forms and len(current_forms) <= 128:
            sentence_tuple = (current_forms, current_lemmas)
            all_sentences_grouped.append(sentence_tuple)
            
    except FileNotFoundError:
        print(f"Error: The file at '{file_path}' was not found.")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
        
    return all_sentences_grouped


In [31]:
# --- Example Usage ---

file_to_read = './UD_English-EWT-master/en_ewt-ud-train.conllu'
parsed_data_grouped = parse_conllu_grouped(file_to_read)

# Print the first two processed sentences to see the new structure
if parsed_data_grouped:
    print("Successfully parsed the file with the new structure.")
    
    # The first sentence
    print("\n--- First Sentence ---")
    sentence_one = parsed_data_grouped[0]
    print(f"Forms: {sentence_one[0]}")
    print(f"Lemmas: {sentence_one[1]}")
    
    # The second sentence
    print("\n--- Second Sentence ---")
    sentence_two = parsed_data_grouped[1]
    print(f"Forms: {sentence_two[0]}")
    print(f"Lemmas: {sentence_two[1]}")

Successfully parsed the file with the new structure.

--- First Sentence ---
Forms: ['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.']
Lemmas: ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']

--- Second Sentence ---
Forms: ['[', 'This', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']']
Lemmas: ['PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'AUX', 'AUX', 'VERB', 'PRON', 'NOUN', 'ADP', 'NOUN', 'PART', 'VERB', 'PUNCT', 'PUNCT']


In [32]:
def build_model(source_vectorizer, target_vectorizer, embedding_dim=128, lstm_units=64):
    """
    Builds and compiles a sequence-to-sequence Keras model with variable parameters.
    
    Args:
        source_vectorizer (TextVectorization): The adapted vectorizer for input forms.
        target_vectorizer (TextVectorization): The adapted vectorizer for target lemmas.
        embedding_dim (int): The dimensionality of the token embeddings.
        lstm_units (int): The number of units in the LSTM layer.
        
    Returns:
        keras.Model: A compiled Keras model.
    """
    SOURCE_VOCAB_SIZE = len(source_vectorizer.get_vocabulary())
    TARGET_VOCAB_SIZE = len(target_vectorizer.get_vocabulary())
    MAX_SEQUENCE_LENGTH = source_vectorizer.get_config()['output_sequence_length']
    
    # 1. Input Layer
    inputs = Input(shape=(1,), dtype=tf.string, name="input_forms")
    
    # 2. Vectorization and Embedding
    x = source_vectorizer(inputs)
    x = Embedding(
        input_dim=SOURCE_VOCAB_SIZE, 
        output_dim=embedding_dim, # Use the function parameter
        mask_zero=True,
        name="embedding"
    )(x)
    
    # 3. Recurrent Layer
    x = Bidirectional(LSTM(lstm_units, return_sequences=True), name="bidirectional_lstm")(x) # Use the function parameter
    
    # 4. Output Layer
    outputs = TimeDistributed(
        Dense(TARGET_VOCAB_SIZE, activation="softmax"),
        name="output_lemmas"
    )(x)
    
    model = keras.Model(inputs, outputs)
    
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    
    return model

In [33]:
def train_model(model, x_train, y_train, x_val, y_val, epochs=10, batch_size=32, model_path='best_model.keras'):
    """
    Trains the model and saves the best version.
    
    Args:
        model (keras.Model): The model to train.
        x_train, y_train: Training data and labels.
        x_val, y_val: Validation data and labels.
        epochs (int): Number of training epochs.
        batch_size (int): Size of the training batches.
        model_path (str): Path to save the best model.
        
    Returns:
        History: The training history object.
    """
    print(f"\n--- Starting Training for {epochs} Epochs ---")
    
    # Save the model with the best validation loss
    checkpoint = ModelCheckpoint(
        filepath=model_path,
        save_best_only=True,
        monitor='val_loss',
        verbose=1
    )
    
    history = model.fit(
        x_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(x_val, y_val),
        callbacks=[checkpoint]
    )
    print("--- Training Finished ---")
    return history

def evaluate_model(model_path, x_test, y_test):
    """
    Loads the best saved model and evaluates it on the test set.
    """
    print(f"\n--- Evaluating Model: {model_path} ---")
    # Load the best model saved by the checkpoint
    best_model = keras.models.load_model(model_path)
    
    loss, accuracy = best_model.evaluate(x_test, y_test)
    print(f"Test Loss:     {loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")
    print("--- Evaluation Finished ---")
    return loss, accuracy

def plot_history(history):
    """
    Plots training & validation accuracy and loss from a history object.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    
    # Plot accuracy
    ax1.plot(history.history['accuracy'], label='Train Accuracy')
    ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax1.set_title('Model Accuracy')
    ax1.set_ylabel('Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.legend()
    
    # Plot loss
    ax2.plot(history.history['loss'], label='Train Loss')
    ax2.plot(history.history['val_loss'], label='Validation Loss')
    ax2.set_title('Model Loss')
    ax2.set_ylabel('Loss')
    ax2.set_xlabel('Epoch')
    ax2.legend()
    
    plt.tight_layout()
    plt.show()

In [34]:
# --- 1. Define Paths and Load Data 📂 ---
# Replace with your actual file names
train_file = './UD_English-EWT-master/en_ewt-ud-train.conllu'
val_file   = './UD_English-EWT-master/en_ewt-ud-dev.conllu'    
test_file  = './UD_English-EWT-master/en_ewt-ud-test.conllu'

print("Loading and parsing data from separate files...")
train_data_parsed = parse_conllu_grouped(train_file)
val_data_parsed   = parse_conllu_grouped(val_file)
test_data_parsed  = parse_conllu_grouped(test_file)
print("Data parsing complete.")


# --- 2. Prepare Text and Vectorizers ✍️ ---
# Prepare text from the TRAINING set to adapt the vectorizers
train_forms  = [" ".join(item[0]) for item in train_data_parsed]
train_lemmas = [" ".join(item[1]) for item in train_data_parsed]

# Define model constants
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 128

# Create vectorization layers
source_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LENGTH
)

target_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LENGTH
)

# IMPORTANT: Adapt the vectorizers ONLY on the training data.
# This ensures the model does not learn any vocabulary from the validation or test sets.
print("Adapting vectorizers on training data...")
source_vectorizer.adapt(train_forms)
target_vectorizer.adapt(train_lemmas)
print("Vectorizers adapted.")


# --- 3. Vectorize All Three Data Splits 🔢 ---
print("Vectorizing train, validation, and test sets...")

# Vectorize the training data
x_train = source_vectorizer(np.array(train_forms))
y_train = target_vectorizer(np.array(train_lemmas))

# Prepare and vectorize the validation data
val_forms  = [" ".join(item[0]) for item in val_data_parsed]
val_lemmas = [" ".join(item[1]) for item in val_data_parsed]
x_val = source_vectorizer(np.array(val_forms))
y_val = target_vectorizer(np.array(val_lemmas))

# Prepare and vectorize the test data
test_forms  = [" ".join(item[0]) for item in test_data_parsed]
test_lemmas = [" ".join(item[1]) for item in test_data_parsed]
x_test = source_vectorizer(np.array(test_forms))
y_test = target_vectorizer(np.array(test_lemmas))
print("Data vectorization complete.")


# --- 4. Verify the Shapes ---
print(f"\nTraining samples:   {x_train.shape[0]}")
print(f"Validation samples: {x_val.shape[0]}")
print(f"Test samples:       {x_test.shape[0]}")

Loading and parsing data from separate files...
Data parsing complete.
Adapting vectorizers on training data...
Vectorizers adapted.
Vectorizing train, validation, and test sets...
Data vectorization complete.

Training samples:   12542
Validation samples: 2001
Test samples:       2077


In [35]:

# Define different model configurations to test
model_variants = [
    {"name": "Small Model", "embedding_dim": 64, "lstm_units": 32},
    # {"name": "Base Model", "embedding_dim": 128, "lstm_units": 64},
    # {"name": "Large Model", "embedding_dim": 256, "lstm_units": 128},
]

for variant in model_variants:
    print(f"\n{'='*20} TESTING VARIANT: {variant['name']} {'='*20}")
    
    # 1. Build the model with the specified parameters
    model = build_model(
        source_vectorizer,
        target_vectorizer,
        embedding_dim=variant["embedding_dim"],
        lstm_units=variant["lstm_units"]
    )
    model.summary()
    
    # 2. Train the model
    model_path = f"best_model_{variant['name'].replace(' ', '_').lower()}.keras"
    history = train_model(model, x_train, y_train, x_val, y_val, epochs=5, model_path=model_path) # Using 5 epochs for a quick example
    
    # 3. Plot the results
    plot_history(history)
    
    # 4. Evaluate the best saved model on the test set
    evaluate_model(model_path, x_test, y_test)





--- Starting Training for 5 Epochs ---
Epoch 1/5


ValueError: Exception encountered when calling TextVectorization.call().

[1mWhen using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 128) with rank=2[0m

Arguments received by TextVectorization.call():
  • inputs=tf.Tensor(shape=(None, 128), dtype=string)