<a href="https://colab.research.google.com/github/github-ashwin/DeepLearning-Lab/blob/main/NER_BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [3]:
# Training sentences as strings
train_sentences = [
    "John lives in New York",
    "Mary works at Google",
    "Steve is from Paris",
    "Alice moved to San Francisco",
    "Bob joined Microsoft last year",
    "Eve visited Berlin and London",
    "Charlie is from Los Angeles",
    "Diana works for Amazon",
    "Frank went to Tokyo last month"
]

# Corresponding NER labels in IOB format for the tokens in the sentences
train_labels = [
    ["B-PER", "O", "O", "B-LOC", "I-LOC"],
    ["B-PER", "O", "O", "B-ORG"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-LOC", "I-LOC"],
    ["B-PER", "O", "B-ORG", "O", "O"],
    ["B-PER", "O", "B-LOC", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-LOC", "I-LOC"],
    ["B-PER", "O", "O", "B-ORG"],
    ["B-PER", "O", "O", "B-LOC", "O", "O"]
]

In [4]:
# Preprocessing: Tokenize the sentences by splitting on spaces
tokenized_sentences = [sent.split() for sent in train_sentences]

In [5]:
# Create word vocabulary from training data
unique_words = set(word for sent in tokenized_sentences for word in sent)

# Add special tokens for padding and unknown words
word2idx = {word: idx + 2 for idx, word in enumerate(unique_words)}
word2idx["PAD"] = 0  # Padding token
word2idx["UNK"] = 1  # Unknown token (for words not in vocabulary)

In [6]:
# Create tag vocabulary
unique_tags = set(tag for sent_tags in train_labels for tag in sent_tags)
tag2idx = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [7]:
# Parameters
max_seq_len = 10  # Maximum sentence length to pad/truncate to
num_tags = len(tag2idx)

In [9]:
# Convert words in sentences to their indices
X = []
for sent in tokenized_sentences:
    seq = [word2idx.get(word, word2idx["UNK"]) for word in sent]
    X.append(seq)

X_padded = pad_sequences(X, maxlen=max_seq_len, padding='post', value=word2idx["PAD"])

In [10]:
# Convert labels to indices and pad
y = []
for sent_tags in train_labels:
    label_seq = [tag2idx[tag] for tag in sent_tags]
    y.append(label_seq)

y_padded = pad_sequences(y, maxlen=max_seq_len, padding='post', value=tag2idx["O"])

# One-hot encode labels for model training
y_one_hot = np.array([np.eye(num_tags)[seq] for seq in y_padded])

In [11]:
# Build model
input_layer = Input(shape=(max_seq_len,))
embedding_layer = Embedding(input_dim=len(word2idx), output_dim=64, input_length=max_seq_len)(input_layer)
bilstm_layer = Bidirectional(LSTM(units=32, return_sequences=True))(embedding_layer)
output_layer = TimeDistributed(Dense(num_tags, activation='softmax'))(bilstm_layer)

model = Model(input_layer, output_layer)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()



In [12]:
# Train model
history = model.fit(
    X_padded, y_one_hot,
    batch_size=2,
    epochs=20,
    validation_split=0.2,
    verbose=1
)

Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 411ms/step - accuracy: 0.3500 - loss: 1.6026 - val_accuracy: 0.8000 - val_loss: 1.5535
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.7438 - loss: 1.5414 - val_accuracy: 0.8000 - val_loss: 1.4890
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.7571 - loss: 1.4699 - val_accuracy: 0.8000 - val_loss: 1.4067
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 150ms/step - accuracy: 0.7288 - loss: 1.3843 - val_accuracy: 0.8000 - val_loss: 1.2963
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.7421 - loss: 1.2591 - val_accuracy: 0.8000 - val_loss: 1.1479
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.7471 - loss: 1.1089 - val_accuracy: 0.8000 - val_loss: 0.9653
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━

In [13]:
def predict_entities(sentence):
    """
    Predict NER tags for a given input sentence.
    Arguments:
    - sentence: string (a sentence to predict NER tags on)

    Returns:
    - list of tuples: (word, predicted_tag)
    """
    words = sentence.split()
    # Convert words to indices, using UNK token for unknown words
    tokens = [word2idx.get(word, word2idx["UNK"]) for word in words]
    tokens_padded = pad_sequences([tokens], maxlen=max_seq_len, padding='post', value=word2idx["PAD"])

    pred_probs = model.predict(tokens_padded)[0]
    pred_indices = np.argmax(pred_probs, axis=-1)

    # Map predicted indices to tags, cut to original sentence length
    pred_tags = [idx2tag[idx] for idx in pred_indices[:len(words)]]

    return list(zip(words, pred_tags))

In [14]:
  # Example test
test_sentence = "Mary lives in Paris"
print(predict_entities(test_sentence))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 599ms/step
[('Mary', 'B-PER'), ('lives', 'O'), ('in', 'O'), ('Paris', 'O')]
