# Entrenar un modelo de NER usando Transfer-learning

In [1]:
import nltk
import tensorflow as tf
# Deshabilitar la GPU, necesaro para macs con M1
tf.config.set_visible_devices([], 'GPU')
# Cargar transformrers
from transformers import BertTokenizer, TFBertForTokenClassification
from sklearn.model_selection import train_test_split

### Descargar el corpus de CONLL 2002

In [2]:
nltk.download('conll2002')
from nltk.corpus import conll2002
# Load the corpus
corpus = conll2002.iob_sents()

[nltk_data] Downloading package conll2002 to /Users/salva/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


### Preparar la entrada al formato de Token Classification
El dataset está en un formato no útil para nosotros (Listas de tokens y listas de etiquetas IOB).
Preparemos la entrada.

In [3]:
def prepare_input(corpus):
    sentences, labels = [], []
    for tagged_sentence in corpus:
        sentence, _, tag = zip(*tagged_sentence)
        sentences.append(" ".join(sentence))
        labels.append(list(tag))
    return sentences, labels

sentences, labels = prepare_input(corpus)

# Split the data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

### Tokenizar la entrada usando el tokenizador de BERT

In [4]:
model_name = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# Tokenize and convert sentences to input format
train_encodings = tokenizer(train_sentences, truncation=True, padding=True)
val_encodings = tokenizer(val_sentences, truncation=True, padding=True)

### Convertir las etiquetas IOB a índices:

In [5]:
def convert_labels_to_ids(labels, label_map):
    label_ids = []
    for sent_labels in labels:
        label_ids.append([label_map[label] for label in sent_labels])
    return label_ids
# Define the label mapping. Sort it to have O before. And then B before I.
labels_set = set(sum(labels, []))
label_list = sorted(labels_set, key=lambda e: "0" if e == "O" else e[2]+e[0])
label_map = {label: i for i, label in enumerate(label_list)}
# Convert labels to label IDs
train_label_ids = convert_labels_to_ids(train_labels, label_map)
val_label_ids = convert_labels_to_ids(val_labels, label_map)

### Crear los datasets de entrenamiento y validación

In [6]:
train_label_ids = tf.keras.preprocessing.sequence.pad_sequences(
    train_label_ids, padding="post", truncating="post", maxlen=tokenizer.model_max_length)
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_label_ids
))
val_label_ids = tf.keras.preprocessing.sequence.pad_sequences(
    val_label_ids, padding="post", truncating="post", maxlen=tokenizer.model_max_length)
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_label_ids
))

### Definir y compilar el modelo:

In [10]:
model = TFBertForTokenClassification.from_pretrained(model_name, num_labels=len(label_map))
# Prepare optimizer
from transformers import create_optimizer
TOTAL_EPOCHS = 3
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=TOTAL_EPOCHS)
# Compile
model.compile(optimizer=optimizer, metrics=[tf.keras.metrics.sparse_categorical_accuracy, ] )

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


### Entrenar el modelo:

In [None]:
model.fit(train_dataset.shuffle(1000).batch(16),
          validation_data=val_dataset.batch(16),
          epochs=TOTAL_EPOCHS)

Epoch 1/3
  11/1783 [..............................] - ETA: 11:06:49 - loss: 1.7318 - sparse_categorical_accuracy: 0.8495