#### Import Libraries

In [4]:
import os

import numpy as np
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

#### Load the GloVe Embeddings
- Purpose: <br>
  This function reads a GloVe file (a plain-text file) where each line contains a word followed by its vector (300 numbers in our case).
- How It Works: <br>
    - The file is opened, and for each line, the first element (the word) is separated from its numeric vector.
    - The vector components are converted into a NumPy array (for efficient numerical operations).
    - A dictionary (embeddings_index) is built where each word is a key, and its associated vector is the value.
- Outcome: <br>
  After running this cell, `glove_embeddings` is a dictionary that maps each word to its pre-trained 300-dimensional vector.

[Link to download GloVe](https://nlp.stanford.edu/projects/glove/)

In [5]:
def load_glove_embeddings(file_path, embedding_dim=None):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by whitespace.

    Parameters
    ----------
    file_path : str
        Path to the UTF-8 encoded GloVe text file.
    embedding_dim : int, optional
        Number of vector components per line. When given, each line is split
        from the right on single spaces so that a token which itself contains
        spaces is kept intact, and a line with the wrong number of components
        raises ``ValueError``. When omitted, lines are split on any whitespace
        and the first field is taken as the token.

    Returns
    -------
    dict
        Maps each token (str) to a float32 NumPy array of its components.
        If a token appears on several lines, the last one wins.

    Raises
    ------
    ValueError
        If ``embedding_dim`` is given and a line does not contain exactly that
        many components, or if a component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if embedding_dim is None:
                values = line.split()
            else:
                # Split from the right so spaces inside the token are preserved.
                values = line.rstrip().rsplit(' ', embedding_dim)

            # Blank lines (e.g. a trailing newline at end of file) carry no entry.
            if not values or values[0] == '':
                continue

            if embedding_dim is not None and len(values) != embedding_dim + 1:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected {embedding_dim} "
                    f"vector components, found {len(values) - 1}"
                )

            word = values[0]  # The first field is the token itself
            # The remaining fields are the vector components
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Path to the GloVe file (e.g., 'glove.6B.300d.txt').
# Set the GLOVE_PATH environment variable to point at your own copy; when it is
# not set, the original download location below is used as the default.
glove_file = os.environ.get(
    "GLOVE_PATH",
    "C:/Users/CommAdmin/Downloads/glove.6B/glove.6B.300d.txt",
)
glove_embeddings = load_glove_embeddings(glove_file)
print(f"Loaded {len(glove_embeddings)} word vectors from GloVe.")

Loaded 400000 word vectors from GloVe.


### Loading the POS Tagging Dataset
Load the POS tagging dataset from Hugging Face.
- The load_dataset function downloads and prepares the dataset.
- The dataset has a split called "train"; each example is a dictionary with keys like "words" (a list of tokens) and "labels" (the corresponding POS tags).

In [6]:
# Fetch the dataset and keep its "train" split; both the training subset and
# the evaluation set are taken from this one split further down.
dataset = load_dataset("batterydata/pos_tagging")
data = dataset["train"]

# Peek at a single record to confirm its structure
first_example = data[0]
print(first_example)
print(type(first_example))  # Should print <class 'dict'>

{'words': ['Confidence', 'in', 'the', 'pound', 'is', 'widely', 'expected', 'to', 'take', 'another', 'sharp', 'dive', 'if', 'trade', 'figures', 'for', 'September', ',', 'due', 'for', 'release', 'tomorrow', ',', 'fail', 'to', 'show', 'a', 'substantial', 'improvement', 'from', 'July', 'and', 'August', "'s", 'near-record', 'deficits', '.'], 'labels': ['NN', 'IN', 'DT', 'NN', 'VBZ', 'RB', 'VBN', 'TO', 'VB', 'DT', 'JJ', 'NN', 'IN', 'NN', 'NNS', 'IN', 'NNP', ',', 'JJ', 'IN', 'NN', 'NN', ',', 'VB', 'TO', 'VB', 'DT', 'JJ', 'NN', 'IN', 'NNP', 'CC', 'NNP', 'POS', 'JJ', 'NNS', '.']}
<class 'dict'>


#### Extracting All Tokens and Building an Embedding Cache Using GloVe
- Purpose: <br>
  Create a “cache” of embeddings for every unique token in the dataset.
- How It Works:
    - get_all_tokens function: <br>
      Iterates through every example (sentence) in the dataset and collects all words.
    - unique_tokens = set(all_tokens): <br>
      Converts the list of tokens into a set to remove duplicates (so each word is processed only once).
    - Building the Cache: <br>
      For each unique token, the code:
        - Checks if the token exists in the GloVe embeddings (case-sensitive).
        - If not, it falls back to the lowercased token (the glove.6B vocabulary is all lowercase, so a capitalized word such as "Confidence" can only be matched this way).
        - If still not found, it assigns a zero vector (an array of zeros).
- Outcome: <br>
  You have an embedding_cache dictionary that quickly gives you the vector for any token.

In [7]:
def get_all_tokens(examples):
    """Return one flat list holding every word of every example, in order.

    `examples` is an iterable of records that each expose their tokens under
    the 'words' key. Duplicates are kept.
    """
    return [word for example in examples for word in example['words']]

# Get all tokens from the entire dataset (for building the cache)
all_tokens = get_all_tokens(data)
unique_tokens = set(all_tokens)  # De-duplicate so each distinct token is looked up once
print(f"Number of unique tokens: {len(unique_tokens)}")

embedding_cache = {}
embedding_dim = 300  # Using the 300d GloVe embeddings

for token in unique_tokens:
    # Lookup order: exact match, then lowercased match, then a zero vector.
    vector = glove_embeddings.get(token)
    if vector is None:
        vector = glove_embeddings.get(token.lower())
    if vector is None:
        # Out-of-vocabulary token. The GloVe vectors are loaded as float32, so the
        # placeholder uses the same dtype to keep every cache entry consistent.
        vector = np.zeros(embedding_dim, dtype='float32')
    embedding_cache[token] = vector

Number of unique tokens: 24847


#### Define a Function to Create Context-Augmented Features
- Purpose: <br>
  Enhance each token’s feature vector with information about its neighboring words.
- How It Works:
    - Padding: <br>
      Creates zero vectors to pad the beginning and end of the sentence so that every token (including those at the boundaries) can have the same-sized context window.
    - Context Extraction: <br>
      For each token in the sentence, the function extracts the embeddings for the token itself and its neighbors (one token before and one after, if available).
    - Flattening: <br>
      The neighboring embeddings are concatenated (flattened) into a single vector.
- Outcome: <br>
  Each token is now represented by a feature vector that is longer (e.g., 900 dimensions when using a window of 1 and 300-dimensional embeddings) and contains context information.

In [8]:
def create_context_features(sentence_embeddings, window_size=1):
    """
    Build context-window features for every token of one sentence.

    Each output row is the concatenation of the embeddings of the
    `window_size` tokens before the current token, the token itself, and the
    `window_size` tokens after it. Positions that fall outside the sentence
    are filled with zeros.

    Parameters
    ----------
    sentence_embeddings : array-like of shape (n_tokens, embed_dim)
        One embedding per token, in sentence order.
    window_size : int, default 1
        Number of neighbours taken on each side of a token.

    Returns
    -------
    np.ndarray of shape (n_tokens, (2 * window_size + 1) * embed_dim)
        A sentence with zero tokens, passed as a (0, embed_dim) array, yields
        an array with zero rows and the full feature width, so the result can
        always be stacked with those of other sentences.
    """
    sentence_embeddings = np.asarray(sentence_embeddings)
    n_tokens, embed_dim = sentence_embeddings.shape

    # Zero rows before and after the sentence give boundary tokens a full window.
    pad = np.zeros((window_size, embed_dim))
    padded = np.vstack([pad, sentence_embeddings, pad])

    # Slice k holds, for every token, the embedding at relative position
    # (k - window_size). Joining the slices side by side produces the same rows
    # as flattening each token's window one at a time.
    shifted = [
        padded[offset: offset + n_tokens]
        for offset in range(2 * window_size + 1)
    ]
    return np.concatenate(shifted, axis=1)

#### Build Feature Matrices and Label Vectors
- Purpose:<br>
  Transform the dataset from a list of sentences (with each sentence as a list of tokens) into a flat feature matrix for training and testing.
- How It Works:<br>
    - For Training:<br>
      The code iterates over the first 1000 sentences. For each sentence:
        - It retrieves the word embeddings (using the cache).
        - It calls create_context_features to build a feature vector for each token.
        - It collects these feature vectors and the corresponding labels (POS tags) into lists.
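    - For Testing:<br>
      The same procedure is applied over all sentences in the dataset. Because this includes the 1000 training sentences, the test set overlaps the training set (roughly 7% of the test tokens were seen during training), so the reported accuracy is slightly optimistic compared with a fully held-out evaluation.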
    - Flattening:<br>
      Since each sentence produces an array of feature vectors, the code uses np.vstack to stack them into one large NumPy array.
- Outcome:<br>
X_train_context and X_test_context become matrices where each row corresponds to a token’s feature vector.<br>
y_train_context and y_test_context are arrays of labels for each token.

In [9]:
N_TRAIN_SENTENCES = 1000  # Number of leading sentences used to fit the classifier


def build_context_dataset(sentences, window_size=1):
    """Turn sentences into a token-level feature matrix and label vector.

    Parameters
    ----------
    sentences : iterable of dict
        Each item provides 'words' (list of tokens) and 'labels' (one POS tag
        per token).
    window_size : int, default 1
        Context window passed through to `create_context_features`.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The stacked per-token feature rows and the matching array of tags.

    Raises
    ------
    ValueError
        If a sentence has a different number of words and labels, which would
        otherwise leave features and labels misaligned.
    """
    feature_blocks = []
    tags = []
    for example in sentences:
        words = example['words']
        labels = example['labels']
        if len(words) != len(labels):
            raise ValueError(
                f"Sentence has {len(words)} words but {len(labels)} labels"
            )
        if not words:
            # An empty sentence contributes no tokens and cannot form a 2-D array.
            continue
        # Build an array of embeddings for the sentence using the embedding cache
        sent_embeddings = np.array([embedding_cache[word] for word in words])
        # For window_size=1 each feature vector has length 3 * 300 = 900
        feature_blocks.append(
            create_context_features(sent_embeddings, window_size=window_size)
        )
        tags.extend(labels)
    # Stack the per-sentence blocks into one matrix with a row per token
    return np.vstack(feature_blocks), np.array(tags)


# Select the first 1000 sentences for training
train_sentences = data.select(range(N_TRAIN_SENTENCES))
# Use all sentences for testing (note: this includes the training sentences)
test_sentences = data

X_train_context, y_train_context = build_context_dataset(train_sentences, window_size=1)
X_test_context, y_test_context = build_context_dataset(test_sentences, window_size=1)

print("Shape of training features:", X_train_context.shape)
print("Shape of testing features:", X_test_context.shape)

Shape of training features: (23969, 900)
Shape of testing features: (321815, 900)


#### Encode the POS Tag Labels
- Purpose:<br>
  Convert the POS tags into numeric labels that the classifier can work with.
- How It Works:
    - LabelEncoder:<br>
      A tool from scikit-learn that assigns each unique label a unique integer.
    - Fitting:<br>
      We fit the encoder on the union of training and test labels (so that all possible labels are recognized).
    - Transforming:<br>
      The textual labels are converted into integer arrays.
- Outcome:<br>
  You now have y_train_enc and y_test_enc as numeric representations of your POS tags.

In [10]:
# A single encoder serves both sets. It is fitted on every tag that occurs in
# either one, so transform() never encounters a label it has not seen.
all_tags = list(y_train_context) + list(y_test_context)
encoder = LabelEncoder().fit(all_tags)

# Map the string tags of each set to their integer codes
y_train_enc, y_test_enc = (
    encoder.transform(tags) for tags in (y_train_context, y_test_context)
)

print("Unique POS tags:", encoder.classes_)

Unique POS tags: ['#' '$' "''" '(' ')' ',' '-LRB-' '-NONE-' '-RRB-' '.' ':' 'CC' 'CD' 'DT'
 'EX' 'FW' 'IN' 'JJ' 'JJR' 'JJS' 'LS' 'MD' 'NN' 'NNP' 'NNPS' 'NNS' 'PDT'
 'POS' 'PRP' 'PRP$' 'RB' 'RBR' 'RBS' 'RP' 'SYM' 'TO' 'UH' 'VB' 'VBD' 'VBG'
 'VBN' 'VBP' 'VBZ' 'WDT' 'WP' 'WP$' 'WRB' '``']


#### Train the Classifier (Logistic Regression)
- Purpose:<br>
  Train a Logistic Regression model to learn the mapping from token features (with context) to POS tag labels.
- How It Works:
    - Model Initialization:<br>
      The classifier is set up with `max_iter=1000`, raising the solver's iteration limit well above scikit-learn's default of 100 so that it has enough iterations to converge.
    - Training:<br>
      The classifier’s .fit() method is called with the training features and labels.
- Outcome:<br>
  The classifier learns patterns from the data.

In [11]:
# Upper bound on solver iterations for the logistic regression fit
MAX_ITER = 1000

clf = LogisticRegression(max_iter=MAX_ITER)
# Fit on the context-augmented training features and their encoded POS tags
clf.fit(X=X_train_context, y=y_train_enc)

#### Evaluate the Classifier

In [12]:
# Predict a POS tag for every token in the evaluation set
y_pred = clf.predict(X_test_context)

accuracy = accuracy_score(y_test_enc, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
# Some tags are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0.00 (the same value as before)
# without emitting an UndefinedMetricWarning for each affected metric.
print(
    classification_report(
        y_test_enc,
        y_pred,
        target_names=encoder.classes_,
        zero_division=0,
    )
)

Test Accuracy: 0.8419

Classification Report:
              precision    recall  f1-score   support

           #       1.00      0.85      0.92        48
           $       1.00      0.99      0.99      2529
          ''       1.00      0.99      1.00      2295
           (       0.43      0.22      0.29       351
           )       0.23      0.16      0.19       358
           ,       1.00      1.00      1.00     16256
       -LRB-       0.00      0.00      0.00        67
      -NONE-       0.00      0.00      0.00      4106
       -RRB-       0.00      0.00      0.00        70
           .       1.00      0.99      1.00     13210
           :       1.00      0.92      0.96      1690
          CC       0.98      0.98      0.98      8022
          CD       0.86      0.89      0.88     12055
          DT       0.98      0.97      0.98     27541
          EX       0.92      0.97      0.94       313
          FW       0.82      0.20      0.32        45
          IN       0.93      0.96  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
