In [1]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense

In [2]:
# --- 1. Load Dataset (CoNLL-2003) ---
# We will use the 'conll2003' dataset, which is available in TensorFlow Datasets.
# It contains news wire text with four types of named entities: person, location, organization, and miscellaneous.


# Load the dataset
ds, info = tfds.load('conll2003', with_info=True)
train_ds = ds['train']

# Extract sentences (tokens) and NER tags from the dataset
sentences = []
tags = []

# The info object contains the names of the NER tags
# We create a mapping from integer ID to tag name (e.g., 0 -> 'O', 1 -> 'B-person')
id2tag_name = info.features['ner'].feature.names
for example in train_ds:
    sentences.append([t.decode('utf-8') for t in example['tokens'].numpy()])
    tags.append([id2tag_name[t] for t in example['ner'].numpy()])

print(f"Loaded {len(sentences)} sentences from the CoNLL-2003 training split.")
print("\nSample Sentence:", sentences[0])
print("Corresponding Tags:", tags[0])



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/conll2003/conll2003/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/conll2003/conll2003/incomplete.Y49YZ7_1.0.0/conll2003-train.tfrecord*...: …

Generating dev examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/conll2003/conll2003/incomplete.Y49YZ7_1.0.0/conll2003-dev.tfrecord*...:   …

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/conll2003/conll2003/incomplete.Y49YZ7_1.0.0/conll2003-test.tfrecord*...:  …

Dataset conll2003 downloaded and prepared to /root/tensorflow_datasets/conll2003/conll2003/1.0.0. Subsequent calls will reuse this data.
Loaded 14042 sentences from the CoNLL-2003 training split.

Sample Sentence: ['"', 'If', 'they', "'re", 'saying', 'at', 'least', '20', 'percent', ',', 'then', 'their', 'internal', 'forecasts', 'are', 'probably', 'saying', '25', 'or', '30', 'percent', ',', '"', 'said', 'one', 'Sydney', 'media', 'analyst', 'who', 'declined', 'to', 'be', 'named', '.']
Corresponding Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [3]:
print("\nSample Sentence:", sentences[100])
print("Corresponding Tags:", tags[100])


Sample Sentence: ['12', '-', 'Todd', 'Martin', '(', 'U.S.', ')', 'beat', 'Younes', 'El', 'Aynaoui', '(', 'Morocco', ')', '6-3', '6-2', '4-6', '6-4']
Corresponding Tags: ['O', 'O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O']


In [4]:
id2tag_name

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
# --- 2. Data Preprocessing ---

# Create word and tag mappings to integers
word_set = set(word for sentence in sentences for word in sentence)
tag_set = set(tag for tag_list in tags for tag in tag_list)

word2idx = {word: i + 2 for i, word in enumerate(word_set)}
word2idx["<PAD>"] = 0  # Padding token
word2idx["<UNK>"] = 1  # Unknown word token

tag2idx = {tag: i + 1 for i, tag in enumerate(tag_set)}
tag2idx["<PAD>"] = 0

# Convert sentences and tags to integer sequences
X = [[word2idx.get(word, word2idx["<UNK>"]) for word in sentence] for sentence in sentences]
y = [[tag2idx[tag] for tag in tag_list] for tag_list in tags]

# Pad sequences to have the same length
max_len = 70 # Set a reasonable max length for tweets
X_padded = pad_sequences(sequences=X, maxlen=max_len, padding="post", value=word2idx["<PAD>"])
y_padded = pad_sequences(sequences=y, maxlen=max_len, padding="post", value=tag2idx["<PAD>"])

# Convert to one-hot encoding for the labels
num_tags = len(tag2idx)
y_one_hot = tf.keras.utils.to_categorical(y_padded, num_classes=num_tags)

print(f"\nNumber of samples: {len(X_padded)}")
print(f"Max sequence length: {max_len}")
print(f"Number of unique words (vocab size): {len(word2idx)}")
print(f"Number of unique tags: {num_tags}")


Number of samples: 14042
Max sequence length: 70
Number of unique words (vocab size): 23625
Number of unique tags: 10


In [6]:
# --- 3. Build the Bidirectional LSTM Model ---

# Model Hyperparameters
VOCAB_SIZE = len(word2idx)
EMBEDDING_DIM = 50
LSTM_UNITS = 30
NUM_TAGS = num_tags

model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    # 1. Embedding Layer
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),

    # 2. Bidirectional LSTM Layer
    Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True)),

    # 3. TimeDistributed Dense Layer
    # TimeDistributed applies the exact same layer to every single timestep (or word) in a sequence.
    TimeDistributed(Dense(units=NUM_TAGS, activation='softmax'))
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

In [7]:
# --- 4. Train the Model ---

print("\n--- Training the model ---")
model.fit(X_padded, y_one_hot, batch_size=32, epochs=5, validation_split=0.1, verbose=1)






--- Training the model ---
Epoch 1/5
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 21ms/step - accuracy: 0.9016 - loss: 0.4123 - val_accuracy: 0.9659 - val_loss: 0.1073
Epoch 2/5
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9688 - loss: 0.0942 - val_accuracy: 0.9823 - val_loss: 0.0616
Epoch 3/5
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - accuracy: 0.9865 - loss: 0.0496 - val_accuracy: 0.9907 - val_loss: 0.0369
Epoch 4/5
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - accuracy: 0.9949 - loss: 0.0229 - val_accuracy: 0.9932 - val_loss: 0.0264
Epoch 5/5
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.9979 - loss: 0.0112 - val_accuracy: 0.9937 - val_loss: 0.0229


<keras.src.callbacks.history.History at 0x7b9ebc5955d0>

In [10]:
# --- 5. Make Predictions on a New Sentence ---
print("\n--- Making a prediction ---")

# Create an index-to-tag mapping for decoding the output
idx2tag = {i: t for t, i in tag2idx.items()}

# Test sentence
# test_sentence = "Paris is a capital of France".split()
# test_sentence = "Apple just released a new iPhone".split()
test_sentence = "Sundar Pichai is the CEO of Google".split()
# test_sentence = "100 people work at dlgjkhdlsfg".split()
# test_sentence = "Xiomi just released a new phone".split()

# Why model did not generalize?

# Convert sentence to padded integer sequence
test_sequence = [word2idx.get(w, word2idx["<UNK>"]) for w in test_sentence]
test_padded = pad_sequences([test_sequence], maxlen=max_len, padding="post", value=word2idx["<PAD>"])

# Get model prediction
prediction = model.predict(test_padded)
predicted_indices = np.argmax(prediction, axis=-1)

# Convert indices back to tags
predicted_tags = [idx2tag.get(i, '<PAD>') for i in predicted_indices[0] if i != tag2idx["<PAD>"]]

# Display the results
print(f"Input Sentence: {test_sentence}")
print(f"Predicted Tags: {predicted_tags[:len(test_sentence)]}") # Trim padding


print("\n--- Results ---")
for word, tag in zip(test_sentence, predicted_tags):
    print(f"{word:<15} -> {tag}")


--- Making a prediction ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Input Sentence: ['Sundar', 'Pichai', 'is', 'the', 'CEO', 'of', 'Google']
Predicted Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O']

--- Results ---
Sundar          -> O
Pichai          -> O
is              -> O
the             -> O
CEO             -> O
of              -> O
Google          -> O


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- 6. Evaluate on Validation Set and Display Confusion Matrix ---

print("\n--- Evaluating on Validation Set and displaying Confusion Matrix ---")

# Use the last 10% of the dataset for validation
split_index = int(len(X_padded) * 0.9)

X_train = X_padded[:split_index]
X_val = X_padded[split_index:]

y_one_hot_train = y_one_hot[:split_index]
y_one_hot_val = y_one_hot[split_index:]

y_padded_train = y_padded[:split_index]
y_padded_val = y_padded[split_index:]


# Get predictions on the validation set
y_pred_one_hot = model.predict(X_val)

# Convert one-hot encoded predictions and true labels back to integer labels
y_true_labels = np.argmax(y_one_hot_val, axis=-1)
y_pred_labels = np.argmax(y_pred_one_hot, axis=-1)

# Flatten the arrays to compute confusion matrix across all tokens
y_true_flat = y_true_labels.flatten()
y_pred_flat = y_pred_labels.flatten()

# Create a mask to exclude padding from the confusion matrix
non_pad_mask = y_true_flat != tag2idx["<PAD>"]

y_true_non_pad = y_true_flat[non_pad_mask]
y_pred_non_pad = y_pred_flat[non_pad_mask]


# Compute the confusion matrix
# Need to get the unique non-padding labels to define the range for the confusion matrix
unique_labels = np.unique(y_true_non_pad)
cm = confusion_matrix(y_true_non_pad, y_pred_non_pad, labels=unique_labels)

# Get tag names for the labels
# Filter out the PAD tag from the original idx2tag for display
display_labels = [idx2tag[label] for label in unique_labels if idx2tag[label] != '<PAD>']


# Display the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=display_labels, yticklabels=display_labels)
plt.xlabel('Predicted Tag')
plt.ylabel('True Tag')
plt.title('Confusion Matrix on Validation Set (excluding PAD)')
plt.show()