In [None]:
!pip install conllu

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [40]:
from conllu import parse

filename = "data/english/en_ewt-ud-dev.conllu"


def get_fields(line):
    words = line.split("\t")
    if len(words) < 4:
        return None
    return words[1], words[3], line


with open(filename, "r", encoding="utf-8") as file:
    sentences = parse(file.read())

sentences[:2]

[TokenList<From, the, AP, comes, this, story, :, metadata={newdoc id: "weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713", sent_id: "weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713-0001", newpar id: "weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713-p0001", text: "From the AP comes this story :"}>, TokenList<President, Bush, on, Tuesday, nominated, two, individuals, to, replace, retiring, jurists, on, federal, courts, in, the, Washington, area, ., metadata={sent_id: "weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713-0002", newpar id: "weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713-p0002", text: "President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area."}>]

In [61]:
X_data = []
y_data = []

for sentence in sentences:
    parsed_sentence = []
    sentence_labels = []
    for token in sentence:
        if type(token["id"]) == int:
            parsed_sentence.append(token["form"])
            sentence_labels.append(token["upostag"])
    X_data.append(" ".join(parsed_sentence))
    y_data.append(sentence_labels)


print(f"Sentences: {X_data[:5]}")
print(f"Class labels: {y_data[:5]}")

Sentences: ['From the AP comes this story :', 'President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area .', 'Bush nominated Jennifer M. Anderson for a 15 - year term as associate judge of the Superior Court of the District of Columbia , replacing Steffen W. Graae .', '***', 'Bush also nominated A. Noel Anketell Kramer for a 15 - year term as associate judge of the District of Columbia Court of Appeals , replacing John Montague Steadman .']
Class labels: [['ADP', 'DET', 'PROPN', 'VERB', 'DET', 'NOUN', 'PUNCT'], ['PROPN', 'PROPN', 'ADP', 'PROPN', 'VERB', 'NUM', 'NOUN', 'PART', 'VERB', 'VERB', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'PROPN', 'NOUN', 'PUNCT'], ['PROPN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'DET', 'NUM', 'PUNCT', 'NOUN', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'PROPN', 'ADP', 'DET', 'PROPN', 'ADP', 'PROPN', 'PUNCT', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT'], ['PUNCT'], ['PROPN', 'ADV', 'VE

In [76]:
import tensorflow as tf
import numpy as np

# Map labels
unique_tags = sorted(set(tag for sublist in y_data for tag in sublist))
tag_to_index = {tag: idx for idx, tag in enumerate(unique_tags)}
num_tags = len(tag_to_index)

# Convert labels into numbers
y_data_indexed = [[tag_to_index[tag] for tag in sublist] for sublist in y_data]

# Padding de las etiquetas (y_train) para que tengan la misma longitud
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_data_indexed, padding='post', maxlen=128)

# TextVectorization
text_vectorizer = tf.keras.layers.TextVectorization(output_mode='int', max_tokens=10000, output_sequence_length=128)
text_vectorizer.adapt(X_data)
X_data_vectorized = text_vectorizer(np.array(X_data))

input_layer = tf.keras.layers.Input(shape=(128,), dtype=tf.int32)
x = tf.keras.layers.Embedding(input_dim=len(text_vectorizer.get_vocabulary()), output_dim=30)(input_layer)
x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x)
x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags, activation='softmax'))(x)

model = tf.keras.Model(inputs=input_layer, outputs=x)

# Compilar el modelo
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(X_data_vectorized, y_train, batch_size=512, epochs=10, verbose=1)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2s/step - accuracy: 0.4856 - loss: 2.4976
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.9100 - loss: 1.0651
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.9085 - loss: 0.4658
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.9102 - loss: 0.3448
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2s/step - accuracy: 0.9110 - loss: 0.3184
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.9116 - loss: 0.3140
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.9142 - loss: 0.3140
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.9187 - loss: 0.3122
Epoch 9/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s

<keras.src.callbacks.history.History at 0x780a60fb3ac0>

In [75]:
import tensorflow as tf
import numpy as np

# Map labels
unique_tags = sorted(set(tag for sublist in y_data for tag in sublist))
tag_to_index = {tag: idx for idx, tag in enumerate(unique_tags)}
num_tags = len(tag_to_index)

# Convert labels into numbers
y_data_indexed = [[tag_to_index[tag] for tag in sublist] for sublist in y_data]

y_train = tf.keras.preprocessing.sequence.pad_sequences(y_data_indexed, padding='post', maxlen=128)

text_vectorizer = tf.keras.layers.TextVectorization(output_mode='int', max_tokens=10000, output_sequence_length=128)
text_vectorizer.adapt(X_data)

input_layer = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(input_layer)
x = tf.keras.layers.Embedding(input_dim=len(text_vectorizer.get_vocabulary()), output_dim=30)(x)
x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x)
x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags, activation='softmax'))(x)
model = tf.keras.Model(inputs=input_layer, outputs=x)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Entrenar el modelo
model.fit(np.array(X_data), y_train, batch_size=512, epochs=10, verbose=1)


ValueError: Invalid dtype: str12704

In [71]:
import tensorflow as tf
import numpy as np

# Map labels
unique_tags = sorted(set(tag for sublist in y_data for tag in sublist))
tag_to_index = {tag: idx for idx, tag in enumerate(unique_tags)}
num_tags = len(tag_to_index)

# Convert labels into numbers
y_data_indexed = [[tag_to_index[tag] for tag in sublist] for sublist in y_data]

# Padding de las etiquetas (y_train) para que tengan la misma longitud
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_data_indexed, padding='post', maxlen=128)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_data)
X_data_sequences = tokenizer.texts_to_sequences(X_data)
X_data_padded = tf.keras.preprocessing.sequence.pad_sequences(X_data_sequences, padding='post', maxlen=128)

input_layer = tf.keras.layers.Input(shape=(128,), dtype=tf.int32)
x = tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=30)(input_layer)
x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x)
x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags, activation='softmax'))(x)

model = tf.keras.Model(inputs=input_layer, outputs=x)
model.summary()

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(X_data_padded, y_train, batch_size=512, epochs=10, verbose=1)


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2s/step - accuracy: 0.4859 - loss: 2.3551
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.9085 - loss: 0.8369
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.9096 - loss: 0.4070
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.9092 - loss: 0.3301
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.9088 - loss: 0.3145
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9085 - loss: 0.3107
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9108 - loss: 0.3030
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.9090 - loss: 0.3090
Epoch 9/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2s/

<keras.src.callbacks.history.History at 0x780a639408b0>