<p>
CAS on Advanced Machine Learning <br>
Data Science Lab, University of Bern, 2026 <br>
Prepared by Dr. Mykhailo Vladymyrov.

</p>

This work is licensed under a <a href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.

# Install libs

In [None]:
!pip install nltk spacy scikit-learn gensim matplotlib seaborn pandas tqdm flair

# Thu morning 1 (NN on tf-idf)

In this session we will continue with building simple neural networks.
We will use the more sophisticated features, and rely on previously established intuition about building the NNs.

## 1. Imports

In [None]:
# Import the necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
import numpy as np

## 2. Fetch dataset

In [None]:
fetch_20newsgroups().target_names

The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon messages posted before and after a specific date.

We strip the headers and footers, as those can make the task easier.

In [None]:
# Load the 20 Newsgroups dataset
categories = ['rec.sport.hockey', 'sci.electronics', 'comp.graphics']
SEED = 64
train_data = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=SEED)
test_data = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=SEED)

In [None]:
# inspect the data structure
train_data.keys()

In [None]:
# inspect the data input elements, target, target_names, number of elements... (5 min)


In [None]:
# afterwards, add the `remove=('headers', 'footers', 'quotes')` argument to the `fetch_20newsgroups` calls and repeat the inspection
# what changed?

## 3. Preprocess and inspect dataset

In [None]:
# Clean both splits in place: trim surrounding whitespace, then drop empty posts
for d in (train_data, test_data):

    # Strip spaces, newlines, tabs and carriage returns from both ends of each post
    d.data = [text.strip(' \n\t\r') for text in d.data]

    # Positions of the posts that still contain text after stripping
    ok_idx = [pos for pos, text in enumerate(d.data) if text]

    # Filter texts and labels with the same indices so they stay aligned
    d.data = [d.data[pos] for pos in ok_idx]
    d.target = [d.target[pos] for pos in ok_idx]

In [None]:
# inspect the data labels elements
#train_data.target

In [None]:
len(train_data.data), len(test_data.data)

In [None]:
sample = train_data.data[24]
sample_label = train_data.target[24]


In [None]:
print(sample)
print(f'label={sample_label}, ({categories[sample_label]})')

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
# Convert the text data to TF-IDF vectors, max 10 words
vectorizer = TfidfVectorizer(max_features=10)

# `object.fit` followed by `object.transform`, or the combined
# `object.fit_transform`, is the usual scikit-learn pattern
train_vectors = vectorizer.fit_transform(train_data.data[:3])

In [None]:
feature_names = vectorizer.get_feature_names_out()
print(len(feature_names))

In [None]:
feature_names

In [None]:
# the vectors are stored in sparse format:
train_vectors

In [None]:
# to get the dense representation we need to convert them to an array:
train_vectors.toarray()

In [None]:
plt.figure(figsize=(30, 3))
plt.plot(vectorizer.get_feature_names_out(), train_vectors.toarray()[0])
plt.plot(vectorizer.get_feature_names_out(), train_vectors.toarray()[1])
plt.plot(vectorizer.get_feature_names_out(), train_vectors.toarray()[2])
plt.xticks(rotation=45, horizontalalignment='right');
plt.ylim(0,1);

In [None]:
# Convert the text data to TF-IDF vectors, all words, all data
vectorizer = ?
train_vectors = ?
test_vectors = ?

In [None]:
# what's the difference in processing train_vectors and test vectors?

In [None]:
train_vectors.toarray().shape

In [None]:
af = train_vectors.toarray().flatten()
# plot only present words (tfidf>0)
aff = af[af>0]
plt.hist(aff, 100, log=True);

## 4. Create and train the model

In [None]:
# complete the code and train the model (20 min)

In [None]:
# Define the model
class Classifier(nn.Module):
    """Two-layer fully connected classifier for fixed-size feature vectors.

    Architecture: Linear -> ReLU -> Dropout -> Linear. The output is raw
    logits (no softmax is applied).

    Args:
        input_dim: number of input features per sample.
        hidden_dim: number of units in the hidden layer.
        output_dim: number of output classes (logits per sample).
        dropout: probability of zeroing a hidden activation during training.
            Defaults to 0.5, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch `x` whose last dimension is `input_dim`."""
        hidden = self.dropout(torch.relu(self.fc1(x)))
        return self.fc2(hidden)

# Define the model hyperparameters
input_dim = train_vectors.shape[1]
hidden_dim = 256
output_dim = len(categories)

# Create the model instance
model = ?

# Define the optimizer and loss function
optimizer = ?
criterion = ?

# Set the device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model and data to the device
model = model.to(device)
train_vectors_t = torch.tensor(train_vectors.toarray(), dtype=torch.float).to(device)
train_labels_t = torch.tensor(train_data.target, dtype=torch.long).to(device)
test_vectors_t = torch.tensor(test_vectors.toarray(), dtype=torch.float).to(device)
test_labels_t = torch.tensor(test_data.target, dtype=torch.long).to(device)

# Training loop
def train(model, optimizer, criterion):
    """Run one full-batch optimization step on the training set.

    Reads the module-level tensors `train_vectors_t` and `train_labels_t`.
    The `?` lines are exercise placeholders that still have to be filled in.

    Args:
        model: network that is called on `train_vectors_t`.
        optimizer: optimizer whose `zero_grad()` and `step()` are called.
        criterion: loss function called as `criterion(predictions, labels)`.
    """
    # set model to train mode
    ?

    # clear gradients left over from the previous step
    optimizer.zero_grad()
    predictions = model(train_vectors_t)
    loss = criterion(predictions, train_labels_t)

    # perform backpropagation
    ?
    optimizer.step()

# Evaluation loop
def evaluate(model):
    """Return the accuracy of `model` on the test set as a float in [0, 1].

    Reads the module-level tensors `test_vectors_t` and `test_labels_t`.
    The `?` line is an exercise placeholder that still has to be filled in.
    """
    # set model to evaluation mode
    ?
    # no gradients are needed for evaluation
    with torch.no_grad():
        predictions = model(test_vectors_t)
        # predicted class = index of the largest output along dim 1
        predicted_labels = torch.argmax(predictions, dim=1)
        # fraction of test samples whose predicted class matches the label
        accuracy = torch.sum(predicted_labels == test_labels_t).item() / len(test_labels_t)
    return accuracy

# Training and evaluation loop
NUM_EPOCHS = 100
accuracy_arr = []
for epoch in tqdm(range(NUM_EPOCHS)):
    train(model, optimizer, criterion)
    accuracy = evaluate(model)
    #print(f"Epoch {epoch+1}/{NUM_EPOCHS} - Accuracy: {accuracy:.4f}")

    accuracy_arr.append(accuracy)

# plot the accuracy evolution
?
print(max(accuracy_arr))

In [None]:
# What do you see?

## 5. Exercise 1h + 30min discussion

Work in 3 groups:
 - present the final group results at the end (10 min per group);
 - halfway through, share the code of your intermediate results (on Zoom) so that the other groups can use it for their final results.

1. Optimize the model architecture to improve performance.
2. Study dependence of performance & training time on number of tf-idf features
3. Study relevant performance metrics. Make evaluation code for both training and the test sets.

# Thu morning 2: RNN on embeddings

## 1. Imports/utils

In [None]:
from gensim.models import Word2Vec
import gensim.downloader as gd
from sklearn.datasets import fetch_20newsgroups

import re

In [None]:
import nltk
import tqdm
import pickle
import os
import numpy as np
import matplotlib.pyplot as plt

In [None]:
nltk.download('punkt')

In [None]:
def load_pckl(file_name, path=None):
    """Load and return the object pickled in `file_name`.

    Args:
        file_name: name (or full path) of the pickle file.
        path: optional directory that is joined in front of `file_name`.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code - only open trusted files
    full_name = file_name if path is None else os.path.join(path, file_name)
    with open(full_name, 'rb') as fh:
        return pickle.load(fh)


def save_pckl(d, file_name, pr=None, path=None):
    """Pickle the object `d` into `file_name`.

    Args:
        d: object to serialize.
        file_name: name (or full path) of the output file.
        pr: pickle protocol; `pickle.DEFAULT_PROTOCOL` is used when None.
        path: optional directory that is joined in front of `file_name`.
    """
    target = os.path.join(path, file_name) if path is not None else file_name
    protocol = pickle.DEFAULT_PROTOCOL if pr is None else pr

    with open(target, 'wb') as fh:
        pickle.dump(d, fh, protocol=protocol)

In [None]:
def remove_punctuation(input_string):
    """Return `input_string` without any character that is neither a word character nor whitespace."""
    # [^\w\s] matches everything that is not a word character (\w) or whitespace (\s)
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', input_string)

## 2. Load and preprocess the dataset

In [None]:
# Load the 20 Newsgroups dataset
categories = ['rec.sport.hockey', 'sci.electronics', 'comp.graphics']
SEED = 64
train_data = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=SEED, remove=('headers', 'footers', 'quotes'))
test_data = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=SEED, remove=('headers', 'footers', 'quotes'))

In [None]:
# Clean both splits in place: trim surrounding whitespace, then drop empty posts
for d in (train_data, test_data):

    # Strip spaces, newlines, tabs and carriage returns from both ends of each post
    d.data = [text.strip(' \n\t\r') for text in d.data]

    # Positions of the posts that still contain text after stripping
    ok_idx = [pos for pos, text in enumerate(d.data) if text]

    # Filter texts and labels with the same indices so they stay aligned
    d.data = [d.data[pos] for pos in ok_idx]
    d.target = [d.target[pos] for pos in ok_idx]

## 3. Explore Word2Vec (Homework)

Similar to the tiny dataset example, explore similar words and word arithmetic.

Use your imagination, e.g. snake-long+short, or brick-hard+soft, etc

### 1. Tiny dataset

In [None]:
# Preprocess the corpus (replace with your own preprocessing steps)
corpus = ["I love to eat pizza", "I hate Mondays", "Pizza is delicious", "I enjoy playing tennis"]

# Tokenize the corpus
tokenized_corpus = [sentence.lower().split() for sentence in corpus]

# Train Word2Vec model
model = Word2Vec(tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4, )

In [None]:
# Find similar words
similar_words = model.wv.most_similar("pizza")
print("Similar words to 'pizza':")
for word, similarity in similar_words:
    print(word, similarity)

In [None]:
# Perform word embeddings arithmetic
result = model.wv.most_similar(positive=["tennis", "hate"], negative=["love"])  # "tennis" - "love" + "hate"
print("Word embeddings arithmetic:")
for word, similarity in result:
    print(word, similarity)


As you can see, on a tiny dataset the vectors don't make much sense.
You are encouraged to explore the embedding vectors built from the bigger datasets in the next 2 sections.

### 2. Bigger dataset

In [None]:
# Preprocess the corpus (replace with your own preprocessing steps)
corpus = train_data.data

# Tokenize the corpus
tokenized_corpus = [remove_punctuation(sentence.lower()).split() for sentence in corpus]

# Train Word2Vec model
model = Word2Vec(tokenized_corpus, vector_size=300, window=5, min_count=1, workers=4, )

In [None]:
word = "network"
similar_words = model.wv.most_similar(word, topn=20)
print(f"Similar words to '{word}':")
for word, similarity in similar_words:
    print(word, similarity)

Try removing the stopwords and re-fitting the model (see the sections below for how to do it).



### 3. 3M words

Here a model pretrained on a large corpus of news articles is downloaded (downloading it can take a while!). Feature vectors are of length 300, and the vocabulary contains about 3 million words and phrases.

In [None]:
# Download the word2vec-google-news-300 model
model_gn300 = gd.load('word2vec-google-news-300')

In [None]:
model_gn300.vectors.shape

In [None]:
word = "car"
similar_words = model_gn300.most_similar(word)
print(f"Similar words to '{word}':")
for word, similarity in similar_words:
    print(word, similarity)

In [None]:
word = "god"
similar_words = model_gn300.most_similar(word)
print(f"Similar words to '{word}':")
for word, similarity in similar_words:
    print(word, similarity)

In [None]:


# Find similar words
# The header is built from `word`, as in the cells above, so it always names the queried word
word = "python"
similar_words = model_gn300.most_similar(word)
print(f"Similar words to '{word}':")
for word, similarity in similar_words:
    print(word, similarity)

# Perform word embeddings arithmetic: "python" - "old" + "young"
result = model_gn300.most_similar(positive=["python", "young"], negative=["old"])
print("Word embeddings arithmetic:")
for word, similarity in result:
    print(word, similarity)

# Visualize word embeddings using t-SNE (replace with your own visualization code)
# ...

# Additional tasks and experiments with word embeddings
# ...


## 4. Preparing embedding dataset (not run in the class - time-consuming)

### 0. Embedding utilities

First we create word embeddings with Word2Vec

In [None]:
# Download (takes a while!) the word2vec-google-news-300 model
model_gn300 = gd.load('word2vec-google-news-300')

In [None]:
def get_vec(model, word):
    """Return the embedding vector of `word`, or None if `model` has no entry for it.

    Args:
        model: object providing `has_index_for(word)` and `get_vector(word)`.
        word: token to look up.
    """
    # Out-of-vocabulary words are reported as None instead of raising
    if not model.has_index_for(word):
        return None
    return model.get_vector(word)

In [None]:
# Function to convert text to word vectors
# using a specified word embedding model
# (default - word2vec-google-news-300 model, Download takes a while!
# you can also try yours)
def convert_text_to_vecs(text, model=model_gn300):
    """Convert `text` into an array with one embedding vector per known word.

    Args:
        text: input string; it is split into words with NLTK's `word_tokenize`.
        model: embedding model passed to `get_vec` (defaults to `model_gn300`).

    Returns:
        NumPy array built from the vectors of all words found in `model`;
        words without a vector are skipped, so the array is empty when no
        word is found.
    """
    found_vectors = []
    for token in nltk.tokenize.word_tokenize(text):
        vec = get_vec(model, token)
        # get_vec returns None for words the model does not know
        if vec is not None:
            found_vectors.append(vec)

    return np.array(found_vectors)

A few helper functions to convert the input dataset and to save the whole dataset to a file

In [None]:
def convert_corpus_to_vecs(corpus, convert_corpus_to_vecs_fn):
    """Convert every text of `corpus`, showing a progress bar when tqdm is available.

    Args:
        corpus: iterable of texts.
        convert_corpus_to_vecs_fn: callable applied to each single text; its
            return value becomes the corresponding element of the result.

    Returns:
        List with one converted element per text, in corpus order.
    """
    # Import the progress bar locally: a bare `import tqdm` does not guarantee
    # that the `tqdm.auto` submodule is loaded, and this notebook also binds the
    # global name `tqdm` to the progress-bar function itself
    # (`from tqdm.auto import tqdm`), in which case `tqdm.auto.tqdm` fails.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        # The progress bar is purely cosmetic - fall back to plain iteration
        progress = iter

    return [convert_corpus_to_vecs_fn(text) for text in progress(corpus)]

In [None]:
def prepare_embedding_dataset(tra_input, tra_labels,
                              val_input, val_labels,
                              filename,
                              convert_text_to_vecs_fn):
    """Embed the training and validation texts and pickle the result.

    Texts whose embedding is empty are dropped together with their labels.
    The saved object is a dict with the keys 'tra_data', 'tra_labels',
    'val_data' and 'val_labels'; it is written to `filename` under './'.

    Args:
        tra_input, val_input: texts of the training / validation split.
        tra_labels, val_labels: labels aligned with the respective texts.
        filename: name of the pickle file to write.
        convert_text_to_vecs_fn: callable turning one text into its embedding.
    """
    def keep_non_empty(vecs, labels):
        # Keep only the samples for which at least one vector was produced
        kept = [pos for pos, vec in enumerate(vecs) if len(vec) > 0]
        return [vecs[pos] for pos in kept], [labels[pos] for pos in kept]

    print('embedding training data:')
    tra_vecs = convert_corpus_to_vecs(tra_input, convert_text_to_vecs_fn)

    print('embedding validation data:')
    val_vecs = convert_corpus_to_vecs(val_input, convert_text_to_vecs_fn)

    # remove empty elements
    tra_vecs, tra_labels = keep_non_empty(tra_vecs, tra_labels)
    val_vecs, val_labels = keep_non_empty(val_vecs, val_labels)

    print(f'preparing and saving dataset to: {filename}')

    # Bundle both splits into one dictionary and save it as a pickle file
    save_pckl(
        {
            'tra_data': tra_vecs,
            'tra_labels': tra_labels,
            'val_data': val_vecs,
            'val_labels': val_labels,
        },
        filename,
        path='./',
    )

GloVe and RoBERTa embeddings (with flair)

In [None]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, TransformerWordEmbeddings
from flair.data import Sentence

In [None]:
# Download models for word embeddings with GloVe and RoBERTa
glove_embedding = WordEmbeddings('glove')
roberta_embedding = TransformerWordEmbeddings('roberta-base')

# Create a DocumentPoolEmbeddings object (optional but helpful)
document_embeddings_glove = DocumentPoolEmbeddings([glove_embedding])
document_embeddings_roberta = DocumentPoolEmbeddings([roberta_embedding])


In [None]:
def convert_text_to_embedding(text, doc_embedding_model):
    """Embed `text` with a Flair embedding model and return the per-token vectors.

    Args:
        text: input string, wrapped into a Flair `Sentence`.
        doc_embedding_model: object whose `embed(sentence)` method fills in the
            token embeddings.

    Returns:
        NumPy array built from the embeddings of all tokens of the sentence.

    Raises:
        Exception: whatever `embed` raises; the offending text is printed first.
    """
    flair_sentence = Sentence(text)

    try:
        doc_embedding_model.embed(flair_sentence)
    except Exception as e:
        # Show which text could not be embedded before propagating the error
        print('failed text:', text)
        raise e

    # Move each token embedding to the CPU and convert it to NumPy
    return np.array([tok.embedding.cpu().numpy() for tok in flair_sentence])

In [None]:
def convert_text_to_glove(text):
    """Return the per-token embeddings of `text` computed with `document_embeddings_glove`."""
    return convert_text_to_embedding(text, doc_embedding_model=document_embeddings_glove)

def convert_text_to_roberta(text):
    """Return the per-token embeddings of `text` computed with `document_embeddings_roberta`."""
    return convert_text_to_embedding(text, doc_embedding_model=document_embeddings_roberta)


### 1. Create word2vec embeddings

In [None]:
prepare_embedding_dataset(train_data.data, train_data.target,
                          test_data.data, test_data.target,
                          filename='dataset_20newsgroups_3_cat.pckl',
                          convert_text_to_vecs_fn=convert_text_to_vecs
                          )

In [None]:
!mv dataset_20newsgroups_3_cat.pckl "/content/drive/MyDrive/Colab Data/NLP_data"

### 2. Create GloVe embeddings

In [None]:
prepare_embedding_dataset(train_data.data, train_data.target,
                          test_data.data, test_data.target,
                          filename='dataset_20newsgroups_3_cat_gl.pckl',
                          convert_text_to_vecs_fn=convert_text_to_glove
                          )

In [None]:
!mv dataset_20newsgroups_3_cat_gl.pckl "/content/drive/MyDrive/Colab Data/NLP_data"

### 3. Create RoBERTa embeddings (run with a GPU)

In [None]:
prepare_embedding_dataset(train_data.data, train_data.target,
                          test_data.data, test_data.target,
                          filename='dataset_20newsgroups_3_cat_rb.pckl',
                          convert_text_to_vecs_fn=convert_text_to_roberta
                          )

In [None]:
!mv dataset_20newsgroups_3_cat_rb.pckl "/content/drive/MyDrive/Colab Data/NLP_data"

### 4. Inspect prepared datasets:

In [None]:
# NOTE(review): the `!mv` cells in this section move the files to
# "/content/drive/MyDrive/Colab Data/NLP_data", while this cell loads from
# '/content/drive/MyDrive/NLP_data' - confirm which folder is intended.
dataset_20newsgroups_3_cat = load_pckl('dataset_20newsgroups_3_cat.pckl', '/content/drive/MyDrive/NLP_data')

In [None]:
tra_data = dataset_20newsgroups_3_cat['tra_data']
tra_labels = dataset_20newsgroups_3_cat['tra_labels']
val_data = dataset_20newsgroups_3_cat['val_data']
val_labels = dataset_20newsgroups_3_cat['val_labels']

In [None]:
len(tra_data), len(tra_labels), len(val_data), len(val_labels)

In [None]:
tra_data[200].shape

## 5. LSTM Training:

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pickle
import os
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR

from torch.utils.data import DataLoader

from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt

In [None]:
def load_pckl(file_name, path=None):
    """Read a pickle file and return the stored object.

    Args:
        file_name: name (or full path) of the pickle file.
        path: optional directory that is joined in front of `file_name`.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code - only open trusted files
    source = os.path.join(path, file_name) if path is not None else file_name
    with open(source, 'rb') as stream:
        loaded = pickle.load(stream)
    return loaded


def save_pckl(d, file_name, pr=None, path=None):
    """Write the object `d` to a pickle file.

    Args:
        d: object to serialize.
        file_name: name (or full path) of the output file.
        pr: pickle protocol; `pickle.DEFAULT_PROTOCOL` is used when None.
        path: optional directory that is joined in front of `file_name`.
    """
    if pr is None:
        pr = pickle.DEFAULT_PROTOCOL
    destination = file_name if path is None else os.path.join(path, file_name)

    with open(destination, 'wb') as stream:
        pickle.dump(d, stream, protocol=pr)

In [None]:
ds_name = 'dataset_20newsgroups_3_cat.pckl'


# Load the embedded 20 Newsgroups dataset
dataset = load_pckl(ds_name,
                    '/content/drive/MyDrive/NLP_data')
tra_data = dataset['tra_data']
tra_labels = dataset['tra_labels']
val_data = dataset['val_data']
val_labels = dataset['val_labels']

ok_idx = [i for i, s in enumerate(tra_data) if len(s)>0]
tra_data = [tra_data[i] for i in ok_idx]
tra_labels = [tra_labels[i] for i in ok_idx]

ok_idx = [i for i, s in enumerate(val_data) if len(s)>0]
val_data = [val_data[i] for i in ok_idx]
val_labels = [val_labels[i] for i in ok_idx]

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# inspect preprocessing function and ensure you understand each line (10 min)

In [None]:
batch_size = 50

# make preprocessing function converting data to torch tensors
def preprocess(data, labels, max_len=128, target_device=None):
    """Collate variable-length embedded sequences into one padded batch.

    Every sequence longer than `max_len` is cut to a random contiguous window
    of `max_len` steps; shorter sequences are kept whole. All sequences are
    then zero-padded at the end to the length of the longest one in the batch.

    Args:
        data: iterable of 2-D arrays, one per sample, indexed as
            (sequence step, feature); all must share the same feature size.
        labels: iterable of integer class labels aligned with `data`.
        max_len: maximum number of sequence steps kept per sample.
        target_device: device for the returned tensors; when None, the
            module-level `device` is used (the original behaviour).

    Returns:
        Tuple `(data_t, labels_t)`: a float32 tensor of shape
        (batch, steps, features) and an int64 tensor of shape (batch,).
    """
    if target_device is None:
        target_device = device

    # Make random crops of the sequences up to max_len length
    cropped = []
    for seq in data:
        # Largest start index for which the window still fits into the sequence
        max_start = max(0, len(seq) - max_len)
        # randint's upper bound is exclusive, hence the +1: without it the
        # window that ends at the last step could never be drawn
        start = np.random.randint(0, max_start + 1)
        cropped.append(seq[start:start + max_len])

    # Longest sequence in this batch after cropping (at most max_len)
    batch_len = max(len(seq) for seq in cropped)

    # Pad short sequences with zeros at the end of the sequence dimension;
    # pad_width holds the (left, right) padding size for each dimension
    data_padded = np.array([
        np.pad(seq, pad_width=((0, batch_len - len(seq)), (0, 0)))
        for seq in cropped
    ])
    labels = np.array(labels)

    # Convert NumPy arrays to PyTorch tensors on the requested device
    data_t = torch.tensor(data_padded, dtype=torch.float32).to(target_device)
    labels_t = torch.tensor(labels, dtype=torch.int64).to(target_device)

    return data_t, labels_t

# make data loader

# Training batches: reshuffled every epoch; an incomplete last batch is dropped
train_loader = DataLoader(list(zip(tra_data, tra_labels)),
                          batch_size=batch_size, shuffle=True,
                          collate_fn=lambda x: preprocess(*zip(*x)),
                          drop_last=True)

# Evaluation batches: fixed order. drop_last=False so that no validation sample
# is silently skipped when len(val_data) is not a multiple of the batch size
# (89*13==len(val_data) only holds for one specific dataset).
test_loader = DataLoader(list(zip(val_data, val_labels)),
                         batch_size=89, shuffle=False,
                         collate_fn=lambda x: preprocess(*zip(*x)),
                         drop_last=False)

In [None]:
class ClassificationRNN(nn.Module):
    """Sequence classifier: linear embedding -> stacked LSTM layers -> linear head.

    Exercise skeleton: the `?` lines in `forward` still have to be filled in.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Build the layers.

        Args:
            input_dim: size of each input feature vector.
            embedding_dim: output size of the initial linear layer.
            hidden_dim: int, or list of ints with one hidden size per LSTM layer.
            output_dim: number of outputs of the final linear layer.
        """
        super(ClassificationRNN, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)

        # accept a single int as shorthand for a one-layer configuration
        if type(hidden_dim) == int:
          hidden_dim = [hidden_dim]

        self.rnn = []
        for i, hidd_d in enumerate(hidden_dim):
          # input size of layer i is the output size of the previous layer
          prev_d = embedding_dim if i == 0 else hidden_dim[i-1]
          rnn = nn.LSTM(prev_d, hidd_d, batch_first=True)  # BSC

          # most important line:
          # modules kept in a plain Python list are not registered automatically,
          # so each LSTM is registered explicitly under its own name
          self.add_module(f'lstm_{i}', rnn)
          self.rnn.append(rnn)

        # classification head on top of the last LSTM layer's hidden size
        self.fc = nn.Linear(hidd_d, output_dim)

    def forward(self, x):
        # apply embedding layer and relu to the input
        embedded = ?

        # apply each rnn layer. tip: what does each layer return?
        for rnn in self.rnn:
          ?? = ?

        output = embedded
        # NOTE: `hidden_hn` has to be assigned by the placeholder line in the loop above
        rnn_out = hidden_hn[0]  # hn
        return self.fc(rnn_out)

In [None]:
# Initialize the model
input_dim = tra_data[0].shape[1]
embedding_dim = 256
hidden_dim = [128, 16]  # you can specify a list of number of units in sequential LSTM layers
output_dim = 3

model = ClassificationRNN(input_dim, embedding_dim, hidden_dim, output_dim).to(device)

# Define the sparse cross-entropy loss function
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.005)

# Train the model
n_epochs = 200

num_warmup = 30                          # warm-up epochs

# 1. linear warm-up from 0.0001 × base lr → base lr over `num_warmup` scheduler steps
warmup_sched = LinearLR(
    optimizer,
    start_factor=0.0001,    # start at 0.0001 × base_lr (tiny, but not exactly 0)
    end_factor=1.0,
    total_iters=num_warmup
)

# 2. cosine decay from base lr → 0 over the remaining `n_epochs - num_warmup` steps
cosine_sched = CosineAnnealingLR(
    optimizer,
    T_max=n_epochs - num_warmup,
    eta_min=0.0
)

# 3. chain them: warm-up first, switch to the cosine schedule at step `num_warmup`
scheduler = SequentialLR(
    optimizer,
    schedulers=[warmup_sched, cosine_sched],
    milestones=[num_warmup]
)


tra_loss_hist = []
val_loss_hist = []
val_acc_hist = []
lrs = []

for epoch in range(n_epochs):
    train_loss = 0.
    valid_loss = 0.

    model.train()
    for batch in train_loader:
        data, labels = batch
        optimizer.zero_grad()
        output = model(data)

        loss = criterion(output, labels)
        loss.backward()

        # perform a training update:
        ?

        train_loss += loss.item()
    train_loss /= len(train_loader)
    tra_loss_hist.append(train_loss)

    scheduler.step()
    lrs.append(scheduler.get_last_lr())

    model.eval()
    with torch.no_grad():
        correct = []
        for batch in test_loader:
            data, labels = batch
            output = model(data)
            loss = criterion(output, labels)

            valid_loss += loss.item()

            pred_class = torch.argmax(output, dim=1)
            corr = pred_class == labels
            correct.append(corr.detach().cpu().numpy())

        valid_loss /= len(test_loader)
        correct = np.concatenate(correct)
        accuracy = np.mean(correct)
        print(f"{epoch}:\t Test loss: {valid_loss}; accuracy: {accuracy}")

        val_loss_hist.append(valid_loss)
        val_acc_hist.append(accuracy)

vall_acc_max = np.max(val_acc_hist)
vall_acc_max_eopch = np.argmax(val_acc_hist)

print(f'Best val accuracy: {vall_acc_max:.3} @ epoch {vall_acc_max_eopch}')

In [None]:
plt.plot(lrs)

In [None]:
# plot loss and accuracy on 2 subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(tra_loss_hist, label='train')
ax1.plot(val_loss_hist, label='test')
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.legend()

ax2.plot(val_acc_hist)
ax2.set_xlabel('epoch')
ax2.set_ylabel('accuracy')
plt.show()

print(f'best validation accuracy: {np.max(val_acc_hist)} @ epoch {np.argmax(val_acc_hist)}')

## 6. Exercise 1.

Work in 3 groups, use the 3 datasets.
Optimize the model architecture: # layers, units per layer (hidden_dim) and the embedding_dim to improve validation accuracy.

## 7. Tip

The same Flair interface as for GloVe and RoBERTa can be used to concatenate embeddings from several models, including your own ones. E.g., given your model file 'my_model.pt', you can create a document embedder:

```
my_model = torch.load('my_model.pt')
document_embeddings_my_model = DocumentPoolEmbeddings([my_model])
```

Alternatively, you can combine the GloVe embeddings (or the RoBERTa ones) with your own embedding:
```
document_embeddings_stacked = DocumentPoolEmbeddings([my_model, glove_embedding])
```

# Thu evening-Friday morning (Transfer learning for PoS tagging/NER)

### 0. Libs and utils

In [None]:
!pip install transformers[torch] datasets evaluate seqeval

In [None]:
from datasets import load_dataset
import datasets

from transformers import AutoTokenizer

from transformers import AutoModelForTokenClassification, AutoModel
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import numpy as np
import matplotlib.pyplot as plt
import evaluate

import torch.cuda
import torch

In [None]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

### 1. load dataset

Dataset info: https://huggingface.co/datasets/conll2003

In [None]:
dataset = load_dataset("conll2003", num_proc=4)

In [None]:
# inspect the dataset object (10 min):
# 1. length of tra & test parts
# 2. column names
# 3. features
# ????

https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html

In [None]:
pos_tags_info = dataset['train'].features['pos_tags'].feature
class_names = pos_tags_info.names

class_idx_to_class_name = dict(enumerate(class_names))
class_name_to_class_idx = {v:k for k, v in class_idx_to_class_name.items()}
n_classes = len(class_names)

class_types = list(set(class_names))  # get unique

class_idx_to_class_name

### 2. Load tokenizer

In the transformer library for most models you have the associated tokenizer.

The methods AutoTokenizer and AutoModel instantiate a model of proper type.

In [None]:
mod_name = 'bert-base-uncased'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(mod_name)

In [None]:
tokens = tokenizer.tokenize("Decoding is going the other way around: from vocabulary indices, we want to get a string. This can be done with the decode() method as follows")
print(tokens)

In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

In [None]:
decoded = tokenizer.decode(token_ids)
print(decoded)

In [None]:
# play with tokenizer (10 min). How is the splitting performed?

In [None]:
# discuss

### 3. Load model

In [None]:
model = AutoModel.from_pretrained(mod_name)

In [None]:
example = dataset["test"][560]
example_txt = example['tokens']
print(example_txt)

In [None]:
example_tokens_ids = tokenizer(example_txt, is_split_into_words=True, return_tensors='pt')
example_tokens_ids

In [None]:
np_ids = example_tokens_ids['input_ids'].numpy()[0]
tokens_r = tokenizer.convert_ids_to_tokens(np_ids)
text_r = tokenizer.convert_tokens_to_string(tokens_r)
text_r
#tokens_r

In [None]:
res = model(**example_tokens_ids)

In [None]:
for k, v in res.items():
  print(k, v.shape)

So.. What kind of model is this 'bert-base-uncased'?

Let's use it for transfer learning - we use its features to do token classification. For this we create a model of type `AutoModelForTokenClassification`:

In [None]:
model = AutoModelForTokenClassification.from_pretrained(mod_name, num_labels=n_classes).to(device)

In [None]:
res = model(**example_tokens_ids.to(device))

In [None]:
for k, v in res.items():
  print(k, v.shape)

### 4. Prepare dataset

We can use the tokenizer to convert text into token ids, but there will be more of them than words. Our labels are word-based, so we need to generate a proper label for each token:

In [None]:
# study the function (10 min)
def tokenize_ner(examples, print_info=False):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Every token receives the label of the word it belongs to; tokens that are
    not related to a real word receive -100, the label for tokens to be
    ignored during training.

    Args:
        examples: batch with 'tokens' (one list of words per sample) and
            'pos_tags' (one list of per-word labels per sample).
        print_info: when True, print the word indices, the word labels and the
            tokens of every sample.

    Returns:
        The tokenizer output for the batch with an added 'labels' entry
        holding one list of per-token labels per sample.
    """
    ignore_label = -100

    word_batches = examples['tokens']
    tag_batches = examples['pos_tags']  # examples['ner_tags'] would select the NER labels

    encoded = tokenizer(word_batches, is_split_into_words=True, truncation=True)

    aligned_labels = []
    for idx, word_tags in enumerate(tag_batches):
        # For each token: index of the word it came from, or None
        word_ids = encoded.word_ids(batch_index=idx)

        if print_info:
            token_ids = encoded['input_ids'][idx]
            print(word_ids, word_tags, tokenizer.convert_ids_to_tokens(token_ids))

        # Every token inherits the label of its source word
        aligned_labels.append(
            [ignore_label if word_id is None else word_tags[word_id]
             for word_id in word_ids]
        )

    encoded['labels'] = aligned_labels
    return encoded

Let's inspect what it does and how:

In [None]:
res = tokenize_ner(dataset['train'][560:564], print_info=True)
print(res)

In [None]:
for token, token_id, label in zip(tokenizer.convert_ids_to_tokens(res['input_ids'][0]), res['input_ids'][0], res['labels'][0]):
  print(f'{token:<15} {token_id:<10} {label}')

In [None]:
tokenized_datasets = dataset.map(tokenize_ner, batched=True, num_proc=4)

In [None]:
tokenized_datasets['train'][560]

In [None]:
tokenized_datasets['train'][0].keys()

In [None]:
res = model(input_ids=torch.tensor(tokenized_datasets['train'][560:561]['input_ids']).to(device))

In [None]:
for k, v in res.items():
  print(k, v.shape)

Lastly we need a collator.

It ensures that tokenized inputs, labels, and other relevant data are properly formatted and batched together during training. This is particularly useful for tasks like Named Entity Recognition (NER) and part-of-speech tagging where token-level labels need to be aligned with input tokens.

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

### 5. Performance evaluation

In [None]:
metric = evaluate.load("poseval")  # seqeval for NER

In [None]:
# Evaluation hook: converts logits and reference labels into a flat dictionary of scores.
# `eval_pred` must unpack into (batch_logits, batch_labels); set `print_info` to dump intermediate values.
def compute_metrics(eval_pred, print_info=False):
    logits, references = eval_pred

    # Predicted class index per token = position of the largest logit on the last axis.
    predicted_ids = np.argmax(logits, axis=-1)

    # Class-name sequences (one inner list per sample), keeping only scored positions.
    filtered_hr_batch_predictions = []
    filtered_hr_batch_labels = []
    for predicted_row, reference_row in zip(predicted_ids, references):
        # Positions whose reference label is -100 are excluded from scoring.
        kept_pairs = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        filtered_hr_batch_predictions.append(
            [class_idx_to_class_name[predicted] for predicted, _ in kept_pairs])
        filtered_hr_batch_labels.append(
            [class_idx_to_class_name[reference] for _, reference in kept_pairs])

    # Score the name sequences with the globally loaded metric.
    metric_res = metric.compute(predictions=filtered_hr_batch_predictions,
                                references=filtered_hr_batch_labels)

    if print_info:
        print(filtered_hr_batch_predictions)
        print(filtered_hr_batch_labels)
        print(metric_res)

    # Keep the top-level scalar entries, then flatten the 'weighted avg' sub-dictionary on top.
    all_metrics = {}
    for name, value in metric_res.items():
        if type(value) is dict:
            continue
        all_metrics[name] = value
    all_metrics.update(metric_res['weighted avg'])

    # One '<class>_f1' entry per class of interest; 0. when the metric has no entry for that class.
    for ct in class_types:
        per_class = metric_res.get(ct, None)
        all_metrics[ct + '_f1'] = 0. if per_class is None else per_class['f1-score']

    return all_metrics

In [None]:
ds_test = tokenized_datasets['test']

# Select the row first and the column second: `ds_test['input_ids'][:1]` would read the
# whole column just to keep one example. This also matches the indexing used for the
# training sample earlier in the notebook.
first_test_sample = ds_test[:1]

# Forward pass on the first test example; the attention mask is passed explicitly.
res = model(input_ids=torch.tensor(first_test_sample['input_ids']).to(device),
            attention_mask=torch.tensor(first_test_sample['attention_mask']).to(device)
            )

In [None]:
# don't forget to copy data to cpu-accessible memory and convert to NumPy
pred = res.logits.detach().cpu().numpy()
# Row-first indexing: fetch only the first test example instead of the whole `labels` column.
lbl = tokenized_datasets['test'][:1]['labels']

In [None]:
# Score the single-example prediction; `print_info=True` also prints the class-name sequences and raw metric output.
metrics_res = compute_metrics((pred, lbl), print_info=True)

In [None]:
# Display the dictionary returned by `compute_metrics`.
metrics_res

Since the last model layer, which performs the 47-way classification, is not yet trained, the model outputs rubbish and the scores are 0.

### 6. Training

In [None]:
# rm -rf ner logs_ner

In [None]:
# Remove the output ("pos") and logging ("logs_pos") directories of a previous run before training again.
rm -rf pos logs_pos

In [None]:
training_args = TrainingArguments(output_dir="pos",
                                  num_train_epochs=10,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",

                                  per_device_train_batch_size=16,  # batch size per device during training
                                  per_device_eval_batch_size=16,   # batch size for evaluation
                                  warmup_steps=250,                # number of warmup steps for learning rate scheduler
                                  weight_decay=0.01,               # strength of weight decay
                                  logging_dir='./logs_pos',        # directory for storing logs
                                  logging_steps=10,
                                  # optim="adafactor"
                                  )

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    report_to='wandb',
)

In [None]:
# List the contents of the current working directory (shell escape).
!ls

In [None]:
# Start fine-tuning with the configuration given to the Trainer above.
trainer.train()

In [None]:
def plot_hist(log_hist):
  """
  Helper function to aggregate and visualize training history
  from trainers's logs

  `log_hist` is an iterable of dict log entries. An entry containing
  'eval_loss' is treated as an evaluation record: its 'step' and loss are
  stored, and every other key is collected as a metric series unless the
  key contains one of the skip markers (throughput, runtime, epoch, step,
  confusion matrix). Any other entry containing 'loss' is treated as a
  training record.

  Draws one row of panels (loss first, then one panel per evaluation
  metric), followed by one figure per collected confusion matrix.
  Returns None.
  """
  panel_size = 4
  eval_prefix = 'eval_'
  e_loss_name = eval_prefix + 'loss'
  cm_name = 'confusion_matrix'
  skip_markers = ['_per_second', 'runtime', 'epoch', 'step', cm_name]

  steps = []
  loss = []

  e_steps = []
  e_loss = []
  e_mtr = {}  # metric name -> (steps, values); steps are kept per metric so a
              # metric missing from some evaluation record still plots correctly

  e_cms = []  # (step, matrix) pairs, so each title matches its own matrix

  for el in log_hist:
    if e_loss_name in el:
      step = el['step']
      e_steps.append(step)

      for k, v in el.items():
        if any(marker in k for marker in skip_markers):
          continue

        if k == e_loss_name:
          e_loss.append(v)
        else:
          m_steps, m_vals = e_mtr.setdefault(k, ([], []))
          m_steps.append(step)
          m_vals.append(v)

      # Keys in evaluation records carry the 'eval_' prefix (that is how the
      # record was recognised), so look for the prefixed name as well as the bare one.
      cm = el.get(cm_name, el.get(eval_prefix + cm_name))
      if cm is not None:
        e_cms.append((step, cm))

    elif 'loss' in el:
      steps.append(el['step'])
      loss.append(el['loss'])

  n_fig = len(e_mtr) + 1

  fig, ax = plt.subplots(1, n_fig, figsize=(panel_size*n_fig, panel_size*1))
  if n_fig < 2:
    # With a single panel `plt.subplots` returns a bare Axes, not an array.
    ax = [ax]

  ax[0].plot(steps, loss, alpha=0.5, label='tra')
  ax[0].plot(e_steps, e_loss, alpha=0.5, label='val')
  ax[0].set_xlabel('steps')
  ax[0].set_title('loss')
  ax[0].legend()

  for i, (lbl, (m_steps, m_vals)) in enumerate(e_mtr.items(), start=1):
    ax[i].plot(m_steps, m_vals, alpha=0.5, label='val')
    ax[i].set_xlabel('steps')
    ax[i].set_title(lbl)

  plt.show()
  plt.close()

  for step, cm in e_cms:
    plt.matshow(cm, cmap=plt.cm.Blues, alpha=0.3)
    plt.title(step)
    plt.show()

In [None]:
# summarize the steps in a flowchart on a whiteboard (15 min)

In [None]:
# Visualize the loss and evaluation-metric curves recorded by the trainer during the run.
plot_hist(trainer.state.log_history)

### 7. Exercise

Try modifying the code to do NER tag identification