In [None]:
! pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
raw_datasets['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
! pip install git+https://www.github.com/keras-team/keras-contrib.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-rctkgbu1
  Running command git clone --filter=blob:none --quiet https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-rctkgbu1
  Resolved https://www.github.com/keras-team/keras-contrib.git to commit 3fc5ef709e061416f4bc8a92ca3750c824b5d2b0
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
! pip install tf2crf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
train_dataset = raw_datasets['train']
validation_dataset = raw_datasets['validation']
test_dataset = raw_datasets['test']

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Dropout
from tf2crf import CRF, ModelWithCRFLoss

raw_datasets = {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}

def preprocess_data(raw_data):
    tokenized_sentences = []
    label_ids = []

    for data in raw_data:
        tokens = data["tokens"]
        labels = data["ner_tags"]
        tokenized_sentences.append(tokens)
        label_ids.append(labels)

    return tokenized_sentences, label_ids

X = {}
y = {}
for dataset in raw_datasets:
    tokenized_sentences, label_ids = preprocess_data(raw_datasets[dataset])
    X[dataset] = tokenized_sentences
    y[dataset] = label_ids

word_tokenizer = Tokenizer(oov_token="<OOV>")
word_tokenizer.fit_on_texts(X["train"])
label_encoder = LabelEncoder()
label_encoder.fit([label for labels in y["train"] for label in labels])

max_len = max([len(sentence) for sentence in X["train"]])

def prepare_data(tokenized_sentences, label_ids):
    X_indices = word_tokenizer.texts_to_sequences(tokenized_sentences)
    y_indices = [label_encoder.transform(labels) for labels in label_ids]

    X_padded = pad_sequences(X_indices, maxlen=max_len, padding="post")
    y_padded = pad_sequences(y_indices, maxlen=max_len, padding="post")

    return np.array(X_padded), np.array(y_padded)


X_train, y_train = prepare_data(X["train"], y["train"])
X_val, y_val = prepare_data(X["validation"], y["validation"])

vocab_size = len(word_tokenizer.word_index) + 1
num_classes = len(label_encoder.classes_)

input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len, mask_zero=True)(input_layer)
dropout_layer = Dropout(0.5)(embedding_layer)
bilstm_layer = Bidirectional(LSTM(units=256, return_sequences=True))(embedding_layer)
bilstm_layer = Bidirectional(LSTM(units=256, return_sequences=True))(bilstm_layer)
time_distributed = TimeDistributed(Dense(num_classes))(bilstm_layer)
crf = CRF(num_classes)
output_layer = crf(time_distributed)

base_model = Model(inputs=input_layer, outputs=output_layer)
model = ModelWithCRFLoss(base_model)
model.compile(optimizer="adam")


from tensorflow.keras.callbacks import EarlyStopping


early_stopping = EarlyStopping(monitor='val_loss_val', patience=2, verbose=1, restore_best_weights=True)

model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=10, callbacks=[early_stopping])




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping


<keras.callbacks.History at 0x7f4291fbf220>

In [None]:
from sklearn.metrics import classification_report

def evaluate_model(model, test_data, ner_labels):
    X_test, y_test = preprocess_data(test_data)
    X_test_padded, y_test_padded = prepare_data(X_test, y_test)

    y_pred = model.predict(X_test_padded)
    
    y_pred_classes = y_pred

    y_test_flat = y_test_padded.flatten()
    y_pred_classes_flat = y_pred_classes.flatten()

    #non_padding_indices = np.where(y_test_flat != 0)
    y_test_no_padding = y_test_flat#[non_padding_indices]
    y_pred_classes_no_padding = y_pred_classes_flat#[non_padding_indices]

    y_test_labels = label_encoder.inverse_transform(y_test_no_padding)
    y_pred_labels = label_encoder.inverse_transform(y_pred_classes_no_padding)
    
    
    print(classification_report(y_test_labels, y_pred_labels, target_names=ner_labels))
    return y_test_labels, y_pred_labels


ner_labels = raw_datasets["test"].features["ner_tags"].feature.names
y_test_labels, y_pred_labels = evaluate_model(model, raw_datasets["test"], ner_labels)


              precision    recall  f1-score   support

           O       1.00      1.00      1.00    382081
       B-PER       0.93      0.72      0.81      1616
       I-PER       0.93      0.80      0.86      1155
       B-ORG       0.84      0.62      0.71      1660
       I-ORG       0.74      0.56      0.64       835
       B-LOC       0.81      0.83      0.82      1667
       I-LOC       0.48      0.67      0.56       257
      B-MISC       0.82      0.69      0.75       702
      I-MISC       0.51      0.55      0.53       216

    accuracy                           0.99    390189
   macro avg       0.78      0.71      0.74    390189
weighted avg       0.99      0.99      0.99    390189

