# LSTM Models with 3 different languages

### Create the configuration for the experiment

All the languages with use the same model configuration for comparing performances across them

In [1]:
%pip install conllu

Note: you may need to restart the kernel to use updated packages.


In [2]:
import keras

keras.utils.set_random_seed(50)



In [3]:
from data.preprocessor import DataPreprocessor, DataPreprocessorConfig
from trainer.trainer import TrainerConfig, Trainer
from models.base_model import ModelConfig

preprocessor_config = DataPreprocessorConfig(
    padding_type="post",
    truncation_type="post",
    remove_long_sentences=True,
    max_sequence_length=100,
)

training_config = TrainerConfig(
    epochs=20,
    batch_size=64,
    early_stopping_patience=3,
    learning_rate=1e-3,
    model_dir="saved_models",
    save_best_only=True,
)

model_config = ModelConfig(
    embedding_dim=128,
    lstm_units=128,
    bidirectional=False,
    dropout_rate=0.3,
    training_config=training_config,
)

## English

### Preprocesses the data

In [4]:
from utils import load_data

train_data_en, dev_data_en, test_data_en = load_data("english")

In [5]:
preprocessor_en = DataPreprocessor(preprocessor_config)

In [6]:
X_train_en, y_train_en = preprocessor_en.process_data_to_pad_sequences(
    train_data_en, is_train_dataset=True
)
X_dev_en, y_dev_en = preprocessor_en.process_data_to_pad_sequences(
    dev_data_en, is_train_dataset=False
)
X_test_en, y_test_en = preprocessor_en.process_data_to_pad_sequences(
    test_data_en, is_train_dataset=False
)

### Initilize the model with the config

In [7]:
from models.lstm_model import LSTMModel

lstm_en = LSTMModel(
    model_config,
    preprocessor_en.vocab_size,
    preprocessor_en.num_tags,
    preprocessor_config.max_sequence_length,
)
lstm_en.build_model()
lstm_en.compile_model()

print("Model summary:\n")
print(lstm_en.get_model().summary())

Model summary:



None


### Training the model

In [8]:
# Initialize trainer
trainer_en = Trainer(training_config, lstm_en, preprocessor_en)

In [9]:
# Train the model
print("Training model...\n")
trainer_en.train((X_train_en, y_train_en), (X_dev_en, y_dev_en), 'english')
print("Training completed.\n")

Training model...

Epoch 1/20
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - _masked_accuracy: 0.4118 - loss: 2.0879
Epoch 1: val_loss improved from None to 0.60873, saving model to saved_models/LSTM_Embed128_LSTM128_english.keras
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 116ms/step - _masked_accuracy: 0.6050 - loss: 1.3939 - val__masked_accuracy: 0.8253 - val_loss: 0.6087
Epoch 2/20
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - _masked_accuracy: 0.8778 - loss: 0.4401
Epoch 2: val_loss improved from 0.60873 to 0.41079, saving model to saved_models/LSTM_Embed128_LSTM128_english.keras
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 115ms/step - _masked_accuracy: 0.8979 - loss: 0.3679 - val__masked_accuracy: 0.8797 - val_loss: 0.4108
Epoch 3/20
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - _masked_accuracy: 0.9299 - loss: 0.2438
Epoch 3: val_loss im

### Testing the model

In [10]:
from evaluator.evaluator import Evaluator

evaluator_en = Evaluator(lstm_en, preprocessor_en)

In [11]:
test_metrics_en = evaluator_en.evaluate(X_test_en, y_test_en, "Test")


Test Set Evaluation:
Accuracy: 0.8935

Detailed Classification Report:
              precision    recall  f1-score   support

         ADJ       0.92      0.83      0.87      1788
         ADP       0.89      0.92      0.91      2029
         ADV       0.88      0.86      0.87      1191
         AUX       0.95      0.96      0.95      1543
       CCONJ       0.99      0.99      0.99       736
         DET       0.97      0.97      0.97      1897
        INTJ       0.98      0.66      0.79       121
        NOUN       0.76      0.94      0.84      4123
         NUM       0.91      0.68      0.78       542
        PART       0.85      0.95      0.90       649
        PRON       0.97      0.96      0.96      2165
       PROPN       0.87      0.61      0.72      2075
       PUNCT       0.99      0.99      0.99      3096
       SCONJ       0.79      0.59      0.67       384
         SYM       0.83      0.83      0.83       109
        VERB       0.89      0.90      0.90      2606
         

### Inference on new sentences

In [12]:
from inference.predictor import Predictor

# Create predictor for inference
predictor_en = Predictor(lstm_en.get_model(), preprocessor_en)

In [13]:
test_sentences = {
    "Simple Case": [
        "Today it is cloudy",
        "The quick brown fox jumps over the lazy dog ."
    ],
    "Ambiguity": [
        "The leaves are falling .", # leaves => NOUN
        "He leaves tomorrow .", # leaves => VERB
        "I bought an apple .", # apple => NOUN
        "I work at Apple ." # Apple => PROPN
    ],
    "OOV & Typos": [
        "I googled this supercalifragilisticexpialidocious wrd .",
        "This sentance has twoo mispellings ."
    ]
}

for category, sentences in test_sentences.items():
    print(f"\n--- {category} ---")
    for sentence in sentences:
        predicted_tags = predictor_en.predict_sentence(sentence)
        print(f"  Sentence: {sentence}")
        print(f"  Tags:     {' '.join(predicted_tags)}")


--- Simple Case ---
  Sentence: Today it is cloudy
  Tags:     NOUN PRON AUX ADJ
  Sentence: The quick brown fox jumps over the lazy dog .
  Tags:     DET ADJ ADJ NOUN VERB ADP DET ADJ NOUN PUNCT

--- Ambiguity ---
  Sentence: The leaves are falling .
  Tags:     DET VERB AUX VERB PUNCT
  Sentence: He leaves tomorrow .
  Tags:     PRON VERB NOUN PUNCT
  Sentence: I bought an apple .
  Tags:     PRON VERB DET NOUN PUNCT
  Sentence: I work at Apple .
  Tags:     PRON VERB ADP PROPN PUNCT

--- OOV & Typos ---
  Sentence: I googled this supercalifragilisticexpialidocious wrd .
  Tags:     PRON VERB DET NOUN NOUN PUNCT
  Sentence: This sentance has twoo mispellings .
  Tags:     PRON NOUN AUX NOUN NOUN PUNCT


## Spanish

In [14]:
from utils import load_data

train_data_es, dev_data_es, test_data_es = load_data("spanish")

In [15]:
preprocessor_es = DataPreprocessor(preprocessor_config)

In [16]:
X_train_es, y_train_es = preprocessor_es.process_data_to_pad_sequences(
    train_data_es, is_train_dataset=True
)
X_dev_es, y_dev_es = preprocessor_es.process_data_to_pad_sequences(
    dev_data_es, is_train_dataset=False
)
X_test_es, y_test_es = preprocessor_es.process_data_to_pad_sequences(
    test_data_es, is_train_dataset=False
)

### Initilize the model with the config

In [17]:
from models.lstm_model import LSTMModel

es_lstm = LSTMModel(
    model_config,
    preprocessor_es.vocab_size,
    preprocessor_es.num_tags,
    preprocessor_config.max_sequence_length,
)
es_lstm.build_model()
es_lstm.compile_model()

print("Model summary:\n")
print(es_lstm.get_model().summary())

Model summary:



None


In [18]:
trainer = Trainer(training_config, es_lstm, preprocessor_es)

In [19]:
# Train the model
print("Training model...\n")
trainer.train((X_train_es, y_train_es), (X_dev_es, y_dev_es), 'spanish')
print("Training completed.\n")

Training model...

Epoch 1/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step - _masked_accuracy: 0.4934 - loss: 1.7665
Epoch 1: val_loss improved from None to 0.39927, saving model to saved_models/LSTM_Embed128_LSTM128_spanish.keras
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 117ms/step - _masked_accuracy: 0.6827 - loss: 1.1090 - val__masked_accuracy: 0.8919 - val_loss: 0.3993
Epoch 2/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - _masked_accuracy: 0.9140 - loss: 0.3280
Epoch 2: val_loss improved from 0.39927 to 0.24841, saving model to saved_models/LSTM_Embed128_LSTM128_spanish.keras
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 115ms/step - _masked_accuracy: 0.9295 - loss: 0.2727 - val__masked_accuracy: 0.9274 - val_loss: 0.2484
Epoch 3/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - _masked_accuracy: 0.9551 - loss: 0.1727
Epoch 3: val_loss im

In [20]:
from evaluator.evaluator import Evaluator

evaluator_es = Evaluator(es_lstm, preprocessor_es)

In [21]:
test_metrics_es = evaluator_es.evaluate(X_test_es, y_test_es, "Test")


Test Set Evaluation:
Accuracy: 0.9284

Detailed Classification Report:
              precision    recall  f1-score   support

         ADJ       0.87      0.84      0.85       665
         ADP       0.99      1.00      0.99      1876
         ADV       0.97      0.90      0.94       424
         AUX       0.86      0.96      0.91       331
       CCONJ       0.98      0.90      0.94       395
         DET       0.96      0.99      0.97      1696
        INTJ       0.00      0.00      0.00         1
        NOUN       0.95      0.91      0.93      2225
         NUM       0.99      0.76      0.86       230
        PART       0.00      0.00      0.00         1
        PRON       0.90      0.83      0.86       445
       PROPN       0.71      0.93      0.80       818
       PUNCT       1.00      1.00      1.00      1260
       SCONJ       0.89      0.91      0.90       336
         SYM       1.00      0.76      0.86        25
        VERB       0.92      0.87      0.89      1167
         

### Inference on new sentences

In [22]:
from inference.predictor import Predictor

# Create predictor for inference
predictor_es = Predictor(es_lstm.get_model(), preprocessor_es)

In [23]:
test_sentences_spanish = [
    "El vino está bueno .",            # vino => NOUN
    "Él no vino a mi casa .",          # vino => VERB
    "Yo canto en la ducha .",             # canto => VERB
    "El canto del pájaro está curioso .",   # canto => NOUN
    "Compró una rosa rosa .",            # rosa => NOUN
    "Habló con Rosa esta mañana .",     # Rosa => PROPN
    "El marco de la foto es de madera .", # marco => NOUN
    "Vi a Marco en su casa ."         # Marco => PROPN
    "El hombre bajo se fue .",           # bajo => ADJ
    "Él toca el bajo y el piano .",                 # bajo => NOUN
    "Por favor habla más bajo que no escucho ."        # bajo => ADV
]

for sentence in test_sentences_spanish:
    predicted_tags = predictor_es.predict_sentence(sentence)
    print(f"  Sentence: {sentence}")
    print(f"  Tags:     {' '.join(predicted_tags)}")

  Sentence: El vino está bueno .
  Tags:     DET NOUN AUX ADJ PUNCT
  Sentence: Él no vino a mi casa .
  Tags:     PRON ADV VERB ADP DET NOUN PUNCT
  Sentence: Yo canto en la ducha .
  Tags:     PRON NOUN ADP DET NOUN PUNCT
  Sentence: El canto del pájaro está curioso .
  Tags:     DET NOUN ADP NOUN AUX ADJ PUNCT
  Sentence: Compró una rosa rosa .
  Tags:     PROPN DET ADJ ADJ PUNCT
  Sentence: Habló con Rosa esta mañana .
  Tags:     PROPN ADP PROPN DET NOUN PUNCT
  Sentence: El marco de la foto es de madera .
  Tags:     DET NOUN ADP DET NOUN AUX ADP NOUN PUNCT
  Sentence: Vi a Marco en su casa .El hombre bajo se fue .
  Tags:     PROPN ADP PROPN ADP DET NOUN PROPN NOUN ADP PRON AUX PUNCT
  Sentence: Él toca el bajo y el piano .
  Tags:     PRON VERB DET NOUN CCONJ DET NOUN PUNCT
  Sentence: Por favor habla más bajo que no escucho .
  Tags:     ADP NOUN VERB ADV ADJ SCONJ ADV VERB PUNCT


## German

### Preprocess the data

In [24]:
from utils import load_data

train_data_ge, dev_data_ge, test_data_ge = load_data("german")

In [25]:
preprocessor_ge = DataPreprocessor(preprocessor_config)

In [26]:
X_train_ge, y_train_ge = preprocessor_ge.process_data_to_pad_sequences(
    train_data_ge, is_train_dataset=True
)
X_dev_ge, y_dev_ge = preprocessor_ge.process_data_to_pad_sequences(
    dev_data_ge, is_train_dataset=False
)
X_test_ge, y_test_ge = preprocessor_ge.process_data_to_pad_sequences(
    test_data_ge, is_train_dataset=False
)

### Initilize the model with the config

In [27]:
from models.lstm_model import LSTMModel

ge_lstm = LSTMModel(
    model_config,
    preprocessor_ge.vocab_size,
    preprocessor_ge.num_tags,
    preprocessor_config.max_sequence_length,
)
ge_lstm.build_model()
ge_lstm.compile_model()

print("Model summary:\n")
print(ge_lstm.get_model().summary())

Model summary:



None


In [28]:
trainer_ge = Trainer(training_config, ge_lstm, preprocessor_ge)

In [29]:
# Train the model
print("Training model...\n")
trainer_ge.train((X_train_ge, y_train_ge), (X_dev_ge, y_dev_ge), 'german')
print("Training completed.\n")

Training model...

Epoch 1/20
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - _masked_accuracy: 0.4221 - loss: 1.9838
Epoch 1: val_loss improved from None to 0.61411, saving model to saved_models/LSTM_Embed128_LSTM128_german.keras
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 115ms/step - _masked_accuracy: 0.6008 - loss: 1.3388 - val__masked_accuracy: 0.8288 - val_loss: 0.6141
Epoch 2/20
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - _masked_accuracy: 0.8749 - loss: 0.4320
Epoch 2: val_loss improved from 0.61411 to 0.36783, saving model to saved_models/LSTM_Embed128_LSTM128_german.keras
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 119ms/step - _masked_accuracy: 0.9030 - loss: 0.3448 - val__masked_accuracy: 0.8892 - val_loss: 0.3678
Epoch 3/20
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - _masked_accuracy: 0.9525 - loss: 0.1851
Epoch 3: val_loss impr

In [30]:
from evaluator.evaluator import Evaluator

evaluator_ge = Evaluator(ge_lstm, preprocessor_ge)

In [31]:
test_metrics_ge = evaluator_ge.evaluate(X_test_ge, y_test_ge, "Test")


Test Set Evaluation:
Accuracy: 0.8950

Detailed Classification Report:
              precision    recall  f1-score   support

         ADJ       0.90      0.68      0.77      1249
         ADP       0.94      0.98      0.96      1603
         ADV       0.93      0.89      0.91      1058
         AUX       0.88      0.94      0.91       691
       CCONJ       0.98      0.93      0.95       460
         DET       0.97      0.97      0.97      2264
        INTJ       0.00      0.00      0.00         4
        NOUN       0.81      0.91      0.86      3111
         NUM       0.97      0.88      0.92       233
        PART       0.92      0.74      0.82       210
        PRON       0.94      0.91      0.92       705
       PROPN       0.63      0.82      0.71      1022
       PUNCT       1.00      1.00      1.00      2366
       SCONJ       0.90      0.72      0.80       168
         SYM       0.00      0.00      0.00         4
        VERB       0.91      0.74      0.82      1326
         

### Inference on new sentences

In [32]:
from inference.predictor import Predictor

# Create predictor for inference
predictor_ge = Predictor(ge_lstm.get_model(), preprocessor_ge)

In [33]:
test_sentences_german = [
    "Wir essen jetzt .",                 # essen => VERB
    "Das Essen ist fertig .",            # Essen => NOUN
    "Die Vögel fliegen .",               # fliegen => VERB
    "Die Fliegen sind nervig .",         # Fliegen => NOUN
    "Homograph Ambiguity (ADJ/NOUN/ADV)",
    "Der Junge ist arm .",               # arm => ADJ
    "Sein Arm ist gebrochen .",          # Arm => NOUN
    "Ich komme morgen .",                # morgen => ADV
    "Der Morgen ist kalt .",              # Morgen => NOUN
    "Der Wolf ist im Wald .",            # Wolf => NOUN
    "Das ist Herr Wolf .",               # Wolf => PROPN
    "Der Sommer ist heiß .",             # Sommer => NOUN
    "Ich habe Frau Sommer gesehen ."     # Sommer => PROPN
]

for sentence in test_sentences_german:
    predicted_tags = predictor_ge.predict_sentence(sentence)
    print(f"  Sentence: {sentence}")
    print(f"  Tags:     {' '.join(predicted_tags)}")

  Sentence: Wir essen jetzt .
  Tags:     PRON VERB ADV PUNCT
  Sentence: Das Essen ist fertig .
  Tags:     DET NOUN AUX ADJ PUNCT
  Sentence: Die Vögel fliegen .
  Tags:     DET NOUN VERB PUNCT
  Sentence: Die Fliegen sind nervig .
  Tags:     DET NOUN AUX NOUN PUNCT
  Sentence: Homograph Ambiguity (ADJ/NOUN/ADV)
  Tags:     NOUN NOUN NOUN
  Sentence: Der Junge ist arm .
  Tags:     DET NOUN AUX ADJ PUNCT
  Sentence: Sein Arm ist gebrochen .
  Tags:     DET NOUN AUX VERB PUNCT
  Sentence: Ich komme morgen .
  Tags:     PRON VERB ADV PUNCT
  Sentence: Der Morgen ist kalt .
  Tags:     DET NOUN AUX ADJ PUNCT
  Sentence: Der Wolf ist im Wald .
  Tags:     DET PROPN AUX NOUN PROPN PUNCT
  Sentence: Das ist Herr Wolf .
  Tags:     DET AUX NOUN PROPN PUNCT
  Sentence: Der Sommer ist heiß .
  Tags:     DET NOUN AUX ADJ PUNCT
  Sentence: Ich habe Frau Sommer gesehen .
  Tags:     PRON AUX NOUN NOUN VERB PUNCT
