In [1]:
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

In [3]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313511 sha256=ad54d6fd41a64028e3b46136bcec49900fd06a94fa82fecd4330f830baaa064a
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [5]:
import re
import logging
import fasttext
import keras

import numpy as np

from keras import layers, models, preprocessing, optimizers
from sklearn.metrics import classification_report
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from typing import List, Tuple, Dict
from collections import Counter

In [14]:
class POSTagger:
    """BiLSTM part-of-speech tagger on top of frozen fastText word embeddings.

    Typical usage: ``load_data()`` -> ``build_model()`` -> ``train(...)`` ->
    ``evaluate(...)`` / ``predict_text(...)``.

    Attributes are documented inline in ``__init__``.
    """

    # Reserved vocabulary ids; they match the initial ``word2id`` entries below.
    PAD_ID = 0
    UNK_ID = 1

    def __init__(self, max_len: int = 64, embedding_dim: int = 300):
        """Create an untrained tagger.

        Args:
            max_len: Fixed sequence length every sentence is padded/truncated to.
            embedding_dim: Width of the embedding matrix; must equal the
                dimension of the fastText model loaded in
                ``_build_embedding_matrix``.
        """
        self.max_len = max_len
        self.embedding_dim = embedding_dim
        # Lower-cased word <-> integer id; ids 0 and 1 are reserved.
        self.word2id = {'PAD': 0, 'UNK': 1}
        self.id2word = {0: 'PAD', 1: 'UNK'}
        # Tag name <-> integer id, filled by ``load_data``.
        self.label2id = {}
        self.id2label = {}
        # Compiled Keras model, created by ``build_model``.
        self.model = None

    def load_data(self):
        """Load the ``udpos.English`` config of ``xtreme`` and vectorize it.

        Populates the label maps from the dataset's tag names and builds the
        vocabulary from the training tokens only.

        Returns:
            ``((X_train, y_train), (X_val, y_val), (X_test, y_test))`` where
            every array has ``max_len`` columns.
        """
        print("load_data in process...")
        dataset = load_dataset("xtreme", "udpos.English")

        tags = dataset['train'].features['pos_tags'].feature.names
        self.label2id = {tag: i for i, tag in enumerate(tags)}
        self.id2label = {i: tag for i, tag in enumerate(tags)}

        self._build_vocab(dataset['train']['tokens'])

        # The held-out 'test' split is used for the third pair; reusing
        # 'validation' there would make evaluate() repeat validation numbers.
        splits = []
        for split_name in ('train', 'validation', 'test'):
            split = dataset[split_name]
            splits.append((
                self._vectorize(split['tokens']),
                self._pad_tags(split['pos_tags']),
            ))
        return tuple(splits)

    def _build_vocab(self, sentences: List[List[str]]):
        """Add every lower-cased word in ``sentences`` to the vocabulary.

        Words that already have an id keep it, so calling this method more
        than once leaves ``word2id`` and ``id2word`` consistent.
        """
        vocab = Counter()
        for sent in sentences:
            vocab.update(word.lower() for word in sent)

        for word in vocab:
            if word in self.word2id:
                continue
            idx = len(self.word2id)
            self.word2id[word] = idx
            self.id2word[idx] = word

    def _pad(self, sequences, value: int = 0) -> np.ndarray:
        """Pad/truncate integer sequences to ``max_len`` columns.

        Padding is appended at the end; sequences longer than ``max_len`` are
        truncated at the front (``truncating='pre'``). Tokens and tags go
        through this same helper so they stay aligned.
        """
        return preprocessing.sequence.pad_sequences(
            sequences,
            maxlen=self.max_len,
            value=value,
            padding='post',
            truncating='pre',
        )

    def _vectorize(self, sentences: List[List[str]]) -> np.ndarray:
        """Map tokenized sentences to a padded matrix of word ids.

        Unknown words map to ``UNK_ID``; padding positions hold ``PAD_ID``.
        """
        vectorized = [
            [self.word2id.get(word.lower(), self.UNK_ID) for word in sent]
            for sent in sentences
        ]
        return self._pad(vectorized, value=self.PAD_ID)

    def _pad_tags(self, tags: List[List[int]]) -> np.ndarray:
        """Pad per-token tag ids to ``max_len`` columns.

        Padding positions are filled with 0, which is also a real tag id
        (``label2id`` enumerates from 0); ``evaluate`` therefore masks padded
        positions using the word-id matrix rather than the tag values.
        """
        return self._pad(tags, value=0)

    def _build_embedding_matrix(self):
        """Build the ``(vocab_size, embedding_dim)`` matrix from fastText.

        Rows of words missing from the fastText vocabulary, and the PAD row,
        stay all-zero.

        Raises:
            ValueError: If ``embedding_dim`` differs from the fastText model's
                vector dimension.
        """
        print("_build_embedding_matrix in process...")
        model_path = hf_hub_download("facebook/fasttext-en-vectors", "model.bin")
        ft_model = fasttext.load_model(model_path)

        ft_dim = ft_model.get_dimension()
        if ft_dim != self.embedding_dim:
            raise ValueError(
                f"embedding_dim={self.embedding_dim} does not match the "
                f"fastText vector dimension {ft_dim}"
            )

        # Build the vocabulary set once so each membership test below is a
        # hash lookup instead of a query against the model per word.
        known_words = set(ft_model.words)

        embedding_matrix = np.zeros((len(self.word2id), self.embedding_dim))
        for word, idx in self.word2id.items():
            if idx == self.PAD_ID:
                # Keep padding as the zero vector even if the literal string
                # 'PAD' happens to be a fastText vocabulary entry.
                continue
            if word in known_words:
                embedding_matrix[idx] = ft_model.get_word_vector(word)
        return embedding_matrix

    def build_model(self):
        """Create and compile the two-layer BiLSTM tagger into ``self.model``.

        The embedding layer is initialised from fastText and frozen. The
        output is a per-position softmax over ``len(label2id)`` tags.

        NOTE(review): padded positions are not masked during training, so they
        contribute to the loss/accuracy shown by ``fit`` with target tag 0.
        """
        embedding_matrix = self._build_embedding_matrix()

        inputs = layers.Input(shape=(self.max_len,))
        x = layers.Embedding(
            input_dim=len(self.word2id),
            output_dim=self.embedding_dim,
            weights=[embedding_matrix],
            trainable=False
        )(inputs)

        x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
        x = layers.Dropout(0.3)(x)
        x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
        outputs = layers.Dense(len(self.label2id), activation='softmax')(x)

        self.model = models.Model(inputs=inputs, outputs=outputs)
        self.model.compile(
            optimizer=optimizers.Adam(learning_rate=0.001),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        print("built successfully\n")

    def train(self, X_train, y_train, X_val, y_val, epochs: int = 5, batch_size: int = 128):
        """Fit ``self.model`` and return the Keras ``History`` object.

        Args:
            X_train, y_train: Padded word ids and tag ids for training.
            X_val, y_val: Padded word ids and tag ids for validation.
            epochs: Number of passes over the training data.
            batch_size: Mini-batch size.
        """
        print("Starting training...\n")
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            batch_size=batch_size,
            epochs=epochs,
            verbose=1
        )
        return history

    def evaluate(self, X_test, y_test):
        """Print a per-tag classification report over real (non-padding) tokens.

        Positions where ``X_test`` holds ``PAD_ID`` are excluded, so padding
        (whose tag value 0 collides with a real tag id) cannot inflate scores.
        """
        y_pred = self.model.predict(X_test).argmax(axis=-1)
        token_mask = np.asarray(X_test) != self.PAD_ID

        # Passing ``labels`` explicitly keeps ``target_names`` aligned even
        # when some tag never occurs in the evaluated data.
        label_ids = sorted(self.id2label)
        print(classification_report(
            np.asarray(y_test)[token_mask],
            y_pred[token_mask],
            labels=label_ids,
            target_names=[self.id2label[i] for i in label_ids],
            zero_division=0
        ))

    def predict_text(self, text: str) -> List[Tuple[str, str]]:
        """Tag raw text and return ``(token, tag)`` pairs for every token.

        The text is split into runs of word characters and runs of
        punctuation. Inputs longer than ``max_len`` tokens are processed in
        consecutive chunks of ``max_len`` so each token is paired with its own
        prediction instead of being truncated or misaligned.

        Returns:
            A list with one ``(token, tag_name)`` pair per token; empty when
            ``text`` contains no tokens.
        """
        tokens = re.findall(r'\w+|[^\w\s]+', text)
        if not tokens:
            return []

        token_ids = [self.word2id.get(token.lower(), self.UNK_ID) for token in tokens]
        chunks = [
            token_ids[start:start + self.max_len]
            for start in range(0, len(token_ids), self.max_len)
        ]
        padded_ids = self._pad(chunks, value=self.PAD_ID)

        # One row of per-position tag probabilities for each chunk.
        chunk_probs = self.model.predict(padded_ids)

        tags = []
        for chunk, probs in zip(chunks, chunk_probs):
            # Only the first len(chunk) positions correspond to real tokens.
            tag_ids = probs[:len(chunk)].argmax(axis=-1)
            tags.extend(self.id2label[int(tag_id)] for tag_id in tag_ids)

        return list(zip(tokens, tags))

In [15]:
def main():
    """Run the end-to-end demo: load data, train, evaluate, tag one sentence."""
    tagger = POSTagger(max_len=64)

    train_split, val_split, test_split = tagger.load_data()
    tagger.build_model()
    tagger.train(*train_split, *val_split, epochs=5)
    tagger.evaluate(*test_split)

    # Tag a sample sentence that uses "book" both as a noun and as a verb.
    predictions = tagger.predict_text("The Michael Book will book, a book, for us.")
    print("\nPredicted tags:")
    for token, tag in predictions:
        print(f"{token:15} -- {tag}")

In [16]:
# Entry-point guard: run the demo pipeline only when this code executes as the
# top-level module, not when it is imported.
if __name__ == "__main__":
    main()

load_data in process...
_build_embedding_matrix in process...
built successfully

Starting training...

Epoch 1/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 552ms/step - accuracy: 0.7418 - loss: 0.9014 - val_accuracy: 0.8798 - val_loss: 0.3957
Epoch 2/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 531ms/step - accuracy: 0.8892 - loss: 0.3678 - val_accuracy: 0.9467 - val_loss: 0.1778
Epoch 3/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 519ms/step - accuracy: 0.9444 - loss: 0.1843 - val_accuracy: 0.9613 - val_loss: 0.1285
Epoch 4/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 511ms/step - accuracy: 0.9606 - loss: 0.1293 - val_accuracy: 0.9660 - val_loss: 0.1147
Epoch 5/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 527ms/step - accuracy: 0.9679 - loss: 0.1050 - val_accuracy: 0.9674 - val_loss: 0.1106
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 68ms/