#### Introduction
This notebook demonstrates how to build and train a Bidirectional LSTM (BiLSTM) model for sequence labeling, specifically for a Named Entity Recognition (NER) task. It uses the Twitter NER dataset.

In [None]:
# Import necessary libraries.
# numpy is for numerical operations, especially with arrays.
import numpy as np
# Import various layers and model-building tools from Keras.
from keras.layers import Dense, Input, Embedding, TimeDistributed, Layer, Multiply, Concatenate, Dropout, LSTM, Bidirectional
from keras.models import Model, Sequential
# K is the Keras backend (in this case, TensorFlow), used for some low-level operations.
from keras import backend as K
# Import TensorFlow itself.
import tensorflow as tf
# Import Keras callbacks for saving the model, stopping training early, and custom actions.
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
# Import a utility to convert class vectors to binary class matrices (one-hot encoding).
from keras.utils.np_utils import to_categorical

--- 
#### Function: Load Word Embeddings
This function loads pre-trained word embeddings from a file (like GloVe). It creates an embedding matrix and a vocabulary dictionary that maps words to integer IDs. Two special tokens are reserved: `_0_` for padding (index 0) and `_UNK_` for unknown words (index 1).

In [None]:
def load_embeddings(filename, max_vocab_size):
    """Load pre-trained word embeddings from a word2vec-style text file.

    The first line of the file is a header whose second space-separated field
    is the embedding dimension. Every following line is a word followed by its
    vector components, separated by single spaces.

    Row 0 is reserved for the padding symbol ``_0_`` and row 1 for the
    unknown-word symbol ``_UNK_``; both are all-zero vectors.

    Args:
        filename: Path to the embeddings file.
        max_vocab_size: Upper bound on the number of rows to return, counting
            the two reserved rows (which are always present).

    Returns:
        A tuple ``(embeddings, vocab)`` where ``embeddings`` is a float matrix
        with one row per vocabulary entry and ``vocab`` maps each word to its
        row index.
    """
    vocab = {"_0_": 0, "_UNK_": 1}

    with open(filename, encoding="utf-8") as file:
        # Header line: "<num_words> <dimension>"; only the dimension is needed.
        header = file.readline().split(" ")
        size = int(header[1])

        # Zero vectors for the padding (0) and UNK (1) rows.
        embeddings = [np.zeros(size), np.zeros(size)]

        # Real words start at row 2, right after the reserved rows.
        for row, line in enumerate(file, start=2):
            if row >= max_vocab_size:
                break

            cols = line.rstrip().split(" ")
            # Parse the components as floats; leaving them as strings would
            # turn the whole returned matrix into a string array.
            embeddings.append(np.array(cols[1:], dtype=float))
            vocab[cols[0]] = row

    return np.array(embeddings), vocab

--- 
#### Function: Convert Sentences to Word IDs
This function takes sentences (lists of word-tag pairs) and converts them into numerical format for the model. It pads all sequences to the same length (the length of the longest sentence in the dataset) so they can be processed in batches.

In [None]:
def get_word_ids(sentences, word_vocab, label_vocab):
    """Convert tagged sentences into padded arrays of word ids and one-hot tags.

    Every sentence is padded to the length of the longest sentence in
    ``sentences``. Padding positions get word id 0 and a one-hot tag vector
    for class 0.

    Args:
        sentences: List of sentences, each a list of ``(word, tag)`` tuples.
        word_vocab: Mapping from lower-cased word to integer id. Words that
            are missing from it are encoded as id 1 (UNK).
        label_vocab: Mapping from tag string to integer class id. The number
            of output classes is ``len(label_vocab) + 1`` to leave room for
            the padding class 0.

    Returns:
        A tuple ``(words_ids, tags_ids, sent_lengths)``:
        -- ``words_ids``: integer array, one padded row of word ids per sentence
        -- ``tags_ids``: float32 array of one-hot tag vectors per position
        -- ``sent_lengths``: integer array with the unpadded token count of
           each sentence

    Raises:
        KeyError: If a tag is not present in ``label_vocab``.
    """
    # One class per known tag, plus class 0 for padding.
    output_dim = len(label_vocab) + 1
    num_sentences = len(sentences)
    max_length = max((len(sentence) for sentence in sentences), default=0)

    # Allocate the outputs once instead of building one small array per token.
    words_ids = np.zeros((num_sentences, max_length), dtype=int)
    tags_ids = np.zeros((num_sentences, max_length, output_dim), dtype="float32")
    sent_lengths = np.zeros(num_sentences, dtype=int)

    # Start with every position labelled as padding (class 0); real tokens
    # overwrite this below.
    tags_ids[:, :, 0] = 1.0

    for row, sentence in enumerate(sentences):
        sent_lengths[row] = len(sentence)
        for col, (word, tag) in enumerate(sentence):
            # Unknown words fall back to the UNK id (1).
            words_ids[row, col] = word_vocab.get(word.lower(), 1)
            tags_ids[row, col, 0] = 0.0
            tags_ids[row, col, label_vocab[tag]] = 1.0

    return words_ids, tags_ids, sent_lengths

--- 
#### Function: Read TSV Data
This function reads the dataset from a tab-separated value (TSV) file. The expected format is one token per line (`word\ttag`), with sentences separated by blank lines.

In [None]:
def read_tsv(filename):
    """Read a two-column TSV file of tagged tokens into a list of sentences.

    The file holds one ``word<TAB>tag`` pair per line. Any line with fewer
    than two tab-separated columns (for example a blank line) ends the current
    sentence. Empty sentences are never emitted, so consecutive separator
    lines are harmless.

    Args:
        filename: Path to the TSV file; it is decoded as UTF-8.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples.
    """
    sentences = []
    sentence = []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(filename, encoding="utf-8") as file:
        for line in file:
            cols = line.rstrip().split("\t")

            # Fewer than two columns marks a sentence boundary.
            if len(cols) < 2:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            # Column 0 is the word, column 1 is its NER tag.
            sentence.append((cols[0], cols[1]))

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)

    return sentences

--- 
#### Function: Create Tag Vocabulary
This function builds a vocabulary that maps each unique NER tag found in the dataset to an integer ID. The ID assignment starts at 1, reserving 0 for padding.

In [None]:
def get_tag_vocab(sentences):
    """Assign an integer id to every distinct tag, in order of first appearance.

    Ids start at 1; 0 is left unassigned so it can be used for padding.

    Args:
        sentences: List of sentences, each a list of ``(word, tag)`` tuples.

    Returns:
        A dict mapping each tag string to its integer id.
    """
    tag_to_id = {}
    for tagged_tokens in sentences:
        for _, label in tagged_tokens:
            # Only inserts when the tag is new; the next free id is always
            # one more than the number of tags seen so far.
            tag_to_id.setdefault(label, len(tag_to_id) + 1)
    return tag_to_id

--- 
#### Loading the Datasets
Here, we use the `read_tsv` function to load the training and development (validation) data from their respective files.

In [None]:
# Load the training split and the development (validation) split, in that order.
data, devData = [
    read_tsv(path)
    for path in (
        "../data/twitter-ner/ner.train.txt",
        "../data/twitter-ner/ner.dev.txt",
    )
]

--- 
#### Building the Tag Vocabulary
Now we create the tag-to-ID mapping from the training data. We also create a reverse mapping (ID-to-tag), which will be useful later for interpreting the model's predictions.

In [None]:
# Build the tag -> id mapping from the training data only.
tag_vocab = get_tag_vocab(data)
# Invert it (id -> tag) so predicted class ids can be decoded back into tags.
rev_tags = {tag_id: tag for tag, tag_id in tag_vocab.items()}

# Show the tag vocabulary for inspection.
print(tag_vocab)

--- 
#### Loading Word Embeddings
This cell loads the pre-trained GloVe word embeddings for Twitter data. We limit the vocabulary size to 100,000 words.

In [None]:
# Load the embedding matrix together with the word -> row-index vocabulary,
# keeping at most 100000 rows.
embeddings, word_vocab = load_embeddings(
    filename="../data/glove.twitter.27B.100d.50K.txt.w2v",
    max_vocab_size=100000,
)

--- 
#### Data Preprocessing
This cell converts the raw text data (for both training and validation sets) into numerical format using the `get_word_ids` function defined earlier. The result is padded sequences of word IDs and one-hot encoded tag IDs.

In [None]:
# Encode the training sentences as padded word ids, one-hot tags and lengths.
trainX, trainY, trainS = get_word_ids(
    sentences=data, word_vocab=word_vocab, label_vocab=tag_vocab
)
# Encode the development (validation) sentences the same way; each split is
# padded to its own longest sentence.
devX, devY, devS = get_word_ids(
    sentences=devData, word_vocab=word_vocab, label_vocab=tag_vocab
)

--- 
Let's train a bidirectional LSTM for sequence labeling to make predictions about the NER tag for each word in a sentence.  Explore the effect of the lstm size and dropout rate.

--- 
#### Model Architecture Definition
This function defines the Keras model architecture. It consists of:
1.  **Input Layer**: Takes sequences of word IDs.
2.  **Embedding Layer**: Converts word IDs into dense vectors using the pre-trained embeddings. `mask_zero=True` tells the model to ignore padded inputs.
3.  **Bidirectional LSTM Layer**: Processes the sequence of embeddings in both forward and backward directions to capture context from both sides of each word.
4.  **TimeDistributed Dense Layer**: Applies a fully connected layer to every timestep of the LSTM's output to produce a probability distribution over the possible NER tags for each word.

In [None]:
def create_bilstm(embeddings, output_dim, lstm_size=25, dropout_rate=0.25):
    """Build and compile a BiLSTM tagger on top of frozen pre-trained embeddings.

    Args:
        embeddings: 2-D embedding matrix; its shape supplies the vocabulary
            size and the embedding dimension.
        output_dim: Number of output classes predicted per time step.
        lstm_size: Number of units in each LSTM direction.
        dropout_rate: Dropout rate passed to the LSTM layer.

    Returns:
        A compiled Keras model taking ``[word_ids, sentence_lengths]`` and
        producing a softmax distribution over ``output_dim`` classes for every
        time step.
    """
    num_rows, embedding_width = embeddings.shape

    # Variable-length sequences of word ids.
    token_ids = Input(shape=(None,), dtype='int32')
    # Sentence lengths are accepted as a second model input but are not
    # connected to any layer.
    lengths = Input(shape=(None,), dtype='int32')

    # Frozen lookup table initialised from the pre-trained matrix; id 0 is
    # masked so padding is ignored downstream.
    frozen_embedding = Embedding(
        num_rows,
        embedding_width,
        weights=[embeddings],
        trainable=False,
        mask_zero=True,
    )
    token_vectors = frozen_embedding(token_ids)

    # One LSTM per direction; forward and backward outputs are concatenated at
    # every time step.
    recurrent_cell = LSTM(
        lstm_size,
        return_sequences=True,
        activation='relu',
        dropout=dropout_rate,
    )
    contextual = Bidirectional(recurrent_cell, merge_mode='concat')(token_vectors)

    # The same softmax classifier is applied independently at each time step.
    tag_classifier = Dense(output_dim, activation="softmax")
    tag_probs = TimeDistributed(tag_classifier)(contextual)

    network = Model(inputs=[token_ids, lengths], outputs=tag_probs)
    network.compile(
        loss="categorical_crossentropy",
        optimizer='adam',
        metrics=["acc"],
    )
    return network

--- 
#### Training Function
This function handles the model training process. It defines two callbacks:
1.  **EarlyStopping**: Stops training if the validation loss doesn't improve for a set number of epochs (`patience=10`), preventing overfitting.
2.  **ModelCheckpoint**: Saves the model weights only when the validation loss improves, ensuring we keep the best-performing model.

In [None]:
def train(model, modelName):
    """Fit ``model`` on the training data, keeping the best weights on disk.

    Reads the module-level arrays ``trainX``, ``trainS``, ``trainY`` (training)
    and ``devX``, ``devS``, ``devY`` (validation).

    Args:
        model: Compiled Keras model taking ``[word_ids, sentence_lengths]``.
        modelName: File path the best model (lowest validation loss) is
            saved to.
    """
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()

    # Stop once the validation loss has not improved for 10 epochs in a row.
    early_stopping = EarlyStopping(monitor='val_loss',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto')

    # Overwrite the file on disk only when the validation loss improves.
    checkpoint = ModelCheckpoint(modelName,
                                 monitor='val_loss',
                                 verbose=0,
                                 save_best_only=True,
                                 mode='min')

    model.fit([trainX, trainS], trainY,
              validation_data=([devX, devS], devY),
              epochs=30, batch_size=32,
              callbacks=[checkpoint, early_stopping])

--- 
Let's train a model on the data and save the one that performs best on the validation data in `bilstm_sequence_labeling.hdf5`.

--- 
#### Initial Model Training
This cell creates an instance of the BiLSTM model and trains it using the `train` function. The best model weights will be saved to a file.

In [None]:
# Build the BiLSTM with one output class per tag, plus one for padding.
model = create_bilstm(embeddings, output_dim=len(tag_vocab) + 1)
# Train it; the best version is written to 'bilstm_sequence_labeling.hdf5'.
train(model, modelName="bilstm_sequence_labeling.hdf5")

--- 
We can explore the performance of the model by predicting the NER tags for a new sequence.

--- 
#### Loading the Best Model
To make predictions, we first instantiate the model architecture again and then load the best weights that were saved during the training process.

In [None]:
# Build a fresh model with the same architecture as the one that was trained.
model = create_bilstm(embeddings, output_dim=len(tag_vocab) + 1)
# Restore the best weights saved during training.
model.load_weights("bilstm_sequence_labeling.hdf5")

--- 
#### Prediction Function
This function takes a raw text sentence, preprocesses it into the numerical format the model expects, runs the prediction, and then decodes the numerical output back into human-readable NER tags.

In [None]:
def predict(text, model, rev_tags):
    """Tag a whitespace-tokenised sentence and print one ``word<TAB>tag`` per line.

    Words are lower-cased and looked up in the module-level ``word_vocab``.

    Args:
        text: Raw sentence; tokens are separated by whitespace.
        model: Model taking ``[word_ids, sentence_lengths]`` and returning
            per-token class probabilities.
        rev_tags: Mapping from class id to tag string. Class ids missing from
            it (such as the padding class 0) are printed as ``"O"``.
    """
    # split() without an argument collapses repeated whitespace, so no empty
    # tokens are produced.
    tokens = text.split()
    if not tokens:
        return

    # Out-of-vocabulary words use the UNK id (1), the same id get_word_ids
    # uses when encoding the training data. Id 0 is the padding id, which the
    # embedding layer masks, so it must not stand in for a real token.
    wids = np.array([word_vocab.get(token.lower(), 1) for token in tokens])
    lengths = np.array([len(wids)])

    # Add a leading batch dimension of size 1 to both inputs.
    preds = model.predict([wids[np.newaxis, :], lengths[np.newaxis, :]])
    # Most probable class id for each token.
    y_classes = preds.argmax(axis=-1)

    # Decode class ids back to tag strings; fall back to "O" for ids that have
    # no tag, instead of raising KeyError.
    predicted = [rev_tags.get(int(t), "O") for t in y_classes[0]]
    for w, t in zip(tokens, predicted):
        print("%s\t%s" % (w, t))

--- 
#### Making a Prediction on New Text
Here we test the trained model on a new sentence to see how it performs.

In [None]:
# A sentence the model has not seen, used as a quick sanity check.
text = "Bill Gates is the founder of Microsoft"
# Print the predicted NER tag for each word.
predict(text=text, model=model, rev_tags=rev_tags)

---
Q1: You'll notice above that the model reaches a token-level validation accuracy of around 95% simply because of the high frequency of the majority class ("O"). That's not a very helpful metric in this case. Implement F-score for NER. Remember, the F-score for NER is based on *chunks*; for more, see section 11.3.2 of SLP3 [chapter 11](https://web.stanford.edu/~jurafsky/slp3/11.pdf).

--- 
#### F1 Score Calculation
This function implements precision, recall, and F1-score for the NER task. Unlike simple accuracy, this metric evaluates performance based on correctly identified "chunks" of named entities (e.g., correctly identifying "Bill Gates" as a single "PER" entity). This is a much more meaningful metric for NER.

In [None]:
def calculateF1(gold_sequences, predicted_sequences):
    """Compute chunk-level precision, recall and F1 for BIO-tagged sequences.

    An entity is a ``B-<cat>`` tag followed by zero or more ``I-<cat>`` tags of
    the same category. It is identified by the tuple
    ``(sentence index, start index, end index, category)``; a predicted entity
    is correct only if the identical tuple occurs in the gold data.

    Args:
        gold_sequences: List of sentences, each a list of gold tag strings.
        predicted_sequences: List of sentences, each a list of predicted tag
            strings, aligned with ``gold_sequences``.

    Returns:
        A tuple ``(precision, recall, F1)``. Each value is 0 when its
        denominator would be zero.
    """

    def get_entities(sequences):
        """Return every entity in ``sequences`` as (sent, start, end, category)."""
        ents = []

        for s_idx, sent in enumerate(sequences):
            # Start index and category of the entity currently open, if any.
            start = None
            start_cat = None

            for w_idx, tag in enumerate(sent):
                # Split on the FIRST hyphen only, so categories that contain
                # a hyphen themselves (e.g. "B-geo-loc") stay intact.
                prefix, sep, cat = tag.partition("-")
                bio = prefix if sep else "O"

                # The open entity ends at the previous word when a new entity
                # begins, when we leave entities altogether, or when an "I"
                # tag carries a different category than the open entity.
                closes_entity = bio in ("B", "O") or (bio == "I" and cat != start_cat)
                if start is not None and closes_entity:
                    ents.append((s_idx, start, w_idx - 1, start_cat))
                    start = None
                    start_cat = None

                # Only a "B" tag opens an entity; a stray "I" never does.
                if bio == "B":
                    start = w_idx
                    start_cat = cat

            # The sentence ended while an entity was still open.
            if start is not None:
                ents.append((s_idx, start, len(sent) - 1, start_cat))

        return ents

    g_set = set(get_entities(gold_sequences))
    p_set = set(get_entities(predicted_sequences))

    # Entities found with exactly the right span and category.
    correct = len(g_set & p_set)

    # Precision: correct entities / predicted entities.
    precision = float(correct) / len(p_set) if p_set else 0
    # Recall: correct entities / gold entities.
    recall = float(correct) / len(g_set) if g_set else 0

    # F1: harmonic mean of precision and recall.
    F1 = 0
    if precision + recall > 0:
        F1 = 2 * precision * recall / (precision + recall)

    return precision, recall, F1

--- 
#### F1 Score Example
This cell runs a small example to test the `calculateF1` function and verify its output.

In [None]:
# Example from class on 4/4, used to sanity-check calculateF1.
# Gold standard: two entities in the first sentence, (B-PER, I-PER) and
# (B-ORG); none in the second.
gold_sequences = [
    ["B-PER", "I-PER", "O", "O", "O", "O", "B-ORG"],
    ["O", "O", "O"],
]
# Prediction: the PER entity is cut short, a spurious PER is added, and the
# ORG entity is found correctly.
predicted_sequences = [
    ["B-PER", "O", "O", "O", "B-PER", "O", "B-ORG"],
    ["O", "O", "O"],
]

# Score the prediction against the gold standard and report the result.
scores = calculateF1(gold_sequences, predicted_sequences)
precision, recall, F1 = scores
print("P: %.3f, R: %.3f, F: %.3f" % (precision, recall, F1))

--- 
Keras by default calculates metrics like accuracy at the batch level (averaging the metric across batches).  F-score, however, is a metric properly calculated over an entire dataset; we can incorporate that into learning by defining a callback function that prints out the validation F-score at the end of each epoch.  Once you've implemented `calculateF1` above, execute the following cells to see the validation F-score while training.

--- 
#### Custom Keras Callback for F1 Score
This class defines a custom Keras callback. The `on_epoch_end` method is automatically called by Keras at the end of each training epoch. Inside this method, we run predictions on the entire validation set, calculate the chunk-based F1 score, and print it. This gives us a much more accurate view of the model's performance as it trains.

In [None]:
class F_score(Callback):
    """Keras callback that prints chunk-level P/R/F1 on the validation set after each epoch."""

    def __init__(self, reverse_tag_vocab, validation_data=None):
        """Store the decoding table and, optionally, explicit validation data.

        Args:
            reverse_tag_vocab: Mapping from class id to tag string.
            validation_data: Optional sequence
                ``[word_ids, sentence_lengths, one_hot_tags]``. When omitted,
                the callback falls back to ``self.validation_data``, which it
                expects Keras to have populated in that same order.
        """
        super().__init__()
        self.reverse_tag_vocab = reverse_tag_vocab
        self._explicit_validation_data = validation_data

    def _decode(self, class_id):
        """Map a class id to its tag; ids without a tag (e.g. padding class 0) become "O"."""
        return self.reverse_tag_vocab.get(int(class_id), "O")

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and print precision, recall and F1.

        Raises:
            ValueError: If no validation data was passed to the constructor
                and ``self.validation_data`` is not populated.
        """
        data = self._explicit_validation_data
        if data is None:
            data = getattr(self, "validation_data", None)
        if data is None:
            raise ValueError(
                "F_score has no validation data: pass validation_data=[X, S, Y] "
                "to the constructor."
            )

        valX = data[0]  # Word id sequences
        valS = data[1]  # Sentence lengths
        valY = data[2]  # True tag labels (one-hot)

        predictions = self.model.predict([valX, valS])
        # Collapse probabilities / one-hot vectors to class ids.
        y_classes = predictions.argmax(axis=-1)
        truth = valY.argmax(axis=-1)

        # Flatten the lengths so each entry is a plain scalar, whether they
        # arrive with one dimension or as a column.
        lengths = np.asarray(valS).reshape(-1)

        preds = []
        golds = []

        num_sentences, max_length = y_classes.shape
        for i in range(num_sentences):
            # Only the first `length` positions are real tokens; the rest is
            # padding and must not be scored.
            length = min(int(lengths[i]), max_length)
            golds.append([self._decode(truth[i, j]) for j in range(length)])
            preds.append([self._decode(y_classes[i, j]) for j in range(length)])

        # Score over the entire validation set at once.
        precision, recall, F1 = calculateF1(golds, preds)
        print("P: %.3f, R: %.3f, F: %.3f" % (precision, recall, F1))

        return

--- 
#### Updated Training Function with F1 Callback
This is a revised version of the `train` function. It now creates an instance of our custom `F_score` callback and includes it in the list of callbacks passed to `model.fit`.

In [None]:
def train(model, modelName):
    """Fit ``model`` while reporting validation chunk-level F1 after each epoch.

    Reads the module-level arrays ``trainX``, ``trainS``, ``trainY`` (training)
    and ``devX``, ``devS``, ``devY`` (validation), plus ``rev_tags`` for
    decoding class ids in the F-score callback.

    Args:
        model: Compiled Keras model taking ``[word_ids, sentence_lengths]``.
        modelName: File path the best model (lowest validation loss) is
            saved to.
    """
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()

    # Stop once the validation loss has not improved for 10 epochs in a row.
    early_stopping = EarlyStopping(monitor='val_loss',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto')

    # Prints validation precision/recall/F1 at the end of every epoch.
    f_score = F_score(rev_tags)
    # Overwrite the file on disk only when the validation loss improves.
    checkpoint = ModelCheckpoint(modelName, monitor='val_loss', verbose=0,
                                 save_best_only=True, mode='min')

    model.fit([trainX, trainS], trainY,
              validation_data=([devX, devS], devY),
              epochs=30, batch_size=32,
              callbacks=[f_score, checkpoint, early_stopping])
    

--- 
#### Training with F1 Score Monitoring
Finally, we create a new model and train it using the updated `train` function. Now, at the end of each epoch, we will see the precision, recall, and F1 score on the validation set, providing a much clearer picture of how well the model is learning the NER task.

In [None]:
# Build a fresh BiLSTM with one output class per tag, plus one for padding.
model = create_bilstm(embeddings, output_dim=len(tag_vocab) + 1)
# Train it with the `train` defined above, which reports validation F1 each epoch.
train(model, modelName="bilstm_sequence_labeling.hdf5")