<a href="https://colab.research.google.com/github/fubotz/ICL_2024W/blob/main/FinalProject_Fabian_SCHAMBECK_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project: Finetuning a Pretrained Multilingual Model for Cognate Detection

Methods: Nearest Neighbor / [MASK]

Model: distilbert-base-multilingual-cased

Dataset: Helsinki-NLP / europarl (en-fr split)

In [35]:
!pip install bertviz
!pip install datasets
!pip install evaluate
!pip install optuna
!pip install scikit-learn
!pip install transformers
!pip install torch

Collecting optuna
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

# Load tokenizer and models
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Model for Nearest Neighbor Approach
embedding_model = AutoModel.from_pretrained(model_name)

# Model for [MASK] Approach
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)

print(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [7]:
import torch
import numpy as np

# Get the embeddings for the model's vocabulary
def get_vocab_embeddings(tokenizer, model):
    """
    Extract embeddings for all meaningful tokens in the model's vocabulary.
    Filters out subwords, special tokens, and non-alphabetical tokens.
    """
    vocab_size = len(tokenizer)
    embeddings = []
    words = []

    for token_id in range(vocab_size):
        token = tokenizer.convert_ids_to_tokens(token_id)
        # Filter: Exclude subwords (tokens starting with ##), special tokens, and non-alphabetical tokens
        if token.isalpha() and not token.startswith("##"):
            words.append(token)
            with torch.no_grad():
                # Create a tensor for the token and get its embedding
                inputs = torch.tensor([[token_id]])
                outputs = model.embeddings.word_embeddings(inputs)
                embeddings.append(outputs[0].squeeze(0).numpy())

    return words, np.array(embeddings)

In [8]:
# Retrieve embedding for a single word
def get_embedding(word, tokenizer, model):
    """
    Retrieve the embedding for a single word.
    """
    tokens = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
    return embedding.squeeze(0)

## Evaluate Pretrained Model ##

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# NN aproach
def find_nearest_neighbors_direct(word_en, tokenizer, model, vocab_embeddings, vocab_words, top_k=5):
    """
    Find the nearest neighbors for an input word directly from the model's filtered vocabulary.
    """
    # Get the embedding for the input word
    en_embedding = get_embedding(word_en, tokenizer, model).numpy()

    # Compute cosine similarities
    similarities = cosine_similarity([en_embedding], vocab_embeddings)[0]

    # Get the top_k most similar tokens
    top_indices = similarities.argsort()[-top_k:][::-1]
    top_tokens = [(vocab_words[idx], similarities[idx]) for idx in top_indices]

    return top_tokens

In [10]:
# [MASK] approach
def predict_with_mask(word_en, tokenizer, model, top_k=5):
    """
    Predict the most probable French words for the given English word using the [MASK] approach.
    """
    # Define the prompt with the [MASK] token
    sentence = f"The English word is '{word_en}'. Le mot français est [MASK]."

    # Tokenize the sentence
    tokens = tokenizer(sentence, return_tensors="pt")

    # Identify the [MASK] token index
    mask_token_index = torch.where(tokens["input_ids"] == tokenizer.mask_token_id)[1]

    # Perform inference
    with torch.no_grad():
        outputs = model(**tokens)
        logits = outputs.logits

    # Get the top_k predictions for the [MASK] token
    mask_token_logits = logits[0, mask_token_index, :]
    top_token_ids = torch.topk(mask_token_logits, top_k, dim=-1).indices[0].tolist()

    # Decode the predicted tokens into words
    predictions = [tokenizer.decode([token_id]).strip() for token_id in top_token_ids]

    return predictions

In [11]:
# Define hardcoded word pairs for evaluation (n=21) (taken from Frossard et al.)
word_pairs = [
    ("academic", "académique"),
    ("administrator", "administrateur"),
    ("algorithm", "algorithme"),
    ("chemical", "chimique"),
    ("delicious", "délicieux"),
    ("emotion", "émotion"),
    ("exercise", "exercice"),
    ("gender", "genre"),
    ("gorilla", "gorille"),
    ("loyalty", "loyauté"),
    ("notation", "notamment"),
    ("objective", "objectif"),
    ("oratory", "oratoire"),
    ("particle", "particule"),
    ("quarter", "quartier"),
    ("september", "septembre"),
    ("skeleton", "squelette"),
    ("traditionally", "traditionnellement"),
    ("voice", "voix"),
    ("west", "ouest"),
    ("wine", "vin"),
]

# Extract filtered vocabulary embeddings
vocab_words, vocab_embeddings = get_vocab_embeddings(tokenizer, embedding_model)

# Evaluate all words in the hardcoded list
for word_en, word_fr in word_pairs:
    print(f"\nProcessing '{word_en}' (expected: '{word_fr}')")

    # Nearest Neighbor Approach
    nearest_neighbors = find_nearest_neighbors_direct(word_en, tokenizer, embedding_model, vocab_embeddings, vocab_words)

    # [MASK] Approach
    mask_predictions = predict_with_mask(word_en, tokenizer, mlm_model)

    # Print results for Nearest Neighbors
    print(f"Nearest neighbors:")
    for token, similarity in nearest_neighbors:
        print(f"  {token}: {similarity:.4f}")

    # Print results for [MASK] Approach
    print(f"[MASK] predictions:")
    for prediction in mask_predictions:
        print(f"  {prediction}")


Processing 'academic' (expected: 'académique')
Nearest neighbors:
  only: 0.1267
  Mad: 0.1172
  Ancient: 0.1168
  達: 0.1097
  Flying: 0.1079
[MASK] predictions:
  inconnu
  latin
  scientifique
  anglais
  français

Processing 'administrator' (expected: 'administrateur')
Nearest neighbors:
  only: 0.1250
  شروع: 0.1145
  нем: 0.1139
  Bart: 0.1119
  կամ: 0.1057
[MASK] predictions:
  inconnu
  蹦
  present
  ancien
  simple

Processing 'algorithm' (expected: 'algorithme')
Nearest neighbors:
  schnell: 0.1322
  Ergebnis: 0.1231
  Flying: 0.1169
  شروع: 0.1145
  wahrscheinlich: 0.1110
[MASK] predictions:
  inconnu
  蹦
  proche
  present
  beteiligt

Processing 'chemical' (expected: 'chimique')
Nearest neighbors:
  Pflanzen: 0.1225
  ʸ: 0.1223
  浦: 0.1155
  only: 0.1125
  πριν: 0.1115
[MASK] predictions:
  inconnu
  simple
  latin
  proche
  courant

Processing 'delicious' (expected: 'délicieux')
Nearest neighbors:
  Ancient: 0.1184
  only: 0.1132
  flying: 0.1126
  達: 0.1115
  pur: 0.106

In [12]:
def evaluate(word_pairs, tokenizer, embedding_model, mlm_model, vocab_words, vocab_embeddings, top_k=5):
    """
    Evaluate the Nearest Neighbor and [MASK] approaches on the given word pairs.
    Returns the accuracy for each method.
    """
    nn_correct = 0
    mask_correct = 0

    for word_en, word_fr in word_pairs:
        print(f"\nProcessing '{word_en}' (expected: '{word_fr}')")

        # Nearest Neighbor Approach
        nearest_neighbors = find_nearest_neighbors_direct(word_en, tokenizer, embedding_model, vocab_embeddings, vocab_words, top_k)
        nn_predictions = [token for token, _ in nearest_neighbors]

        # [MASK] Approach
        mask_predictions = predict_with_mask(word_en, tokenizer, mlm_model, top_k)

        # Check if the expected word is in the top_k predictions
        if word_fr in nn_predictions:
            nn_correct += 1
        if word_fr in mask_predictions:
            mask_correct += 1

        # Print results for debugging
        print(f"Nearest neighbors: {nn_predictions}")
        print(f"[MASK] predictions: {mask_predictions}")

    # Calculate accuracy
    total = len(word_pairs)
    nn_accuracy = nn_correct / total * 100
    mask_accuracy = mask_correct / total * 100

    print("\nEvaluation Results:")
    print(f"Nearest Neighbor Accuracy: {nn_accuracy:.2f}%")
    print(f"[MASK] Accuracy: {mask_accuracy:.2f}%")

    return nn_accuracy, mask_accuracy


# Evaluate the hardcoded word pairs
nn_accuracy, mask_accuracy = evaluate(
    word_pairs,
    tokenizer,
    embedding_model,
    mlm_model,
    vocab_words,
    vocab_embeddings,
    top_k=5
)


Processing 'academic' (expected: 'académique')
Nearest neighbors: ['only', 'Mad', 'Ancient', '達', 'Flying']
[MASK] predictions: ['inconnu', 'latin', 'scientifique', 'anglais', 'français']

Processing 'administrator' (expected: 'administrateur')
Nearest neighbors: ['only', 'شروع', 'нем', 'Bart', 'կամ']
[MASK] predictions: ['inconnu', '蹦', 'present', 'ancien', 'simple']

Processing 'algorithm' (expected: 'algorithme')
Nearest neighbors: ['schnell', 'Ergebnis', 'Flying', 'شروع', 'wahrscheinlich']
[MASK] predictions: ['inconnu', '蹦', 'proche', 'present', 'beteiligt']

Processing 'chemical' (expected: 'chimique')
Nearest neighbors: ['Pflanzen', 'ʸ', '浦', 'only', 'πριν']
[MASK] predictions: ['inconnu', 'simple', 'latin', 'proche', 'courant']

Processing 'delicious' (expected: 'délicieux')
Nearest neighbors: ['Ancient', 'only', 'flying', '達', 'pur']
[MASK] predictions: ['蹦', 'inconnu', 'simple', 'present', 'vrai']

Processing 'emotion' (expected: 'émotion')
Nearest neighbors: ['Ancient', 'on

## Load and Preprocess Dataset ##

In [18]:
from datasets import load_dataset

# Load the English-French subset of the Helsinki-NLP/europarl dataset
dataset = load_dataset("Helsinki-NLP/europarl", "en-fr")

# Inspect the dataset
print(dataset)

# Print a few examples
print("\nSample from the training set:")
print(dataset["train"][0])

# Check the number of examples in each split
print("\nNumber of examples in each split:")
print({split: len(dataset[split]) for split in dataset})

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 2051014
    })
})

Sample from the training set:
{'translation': {'en': 'Resumption of the session', 'fr': 'Reprise de la session'}}

Number of examples in each split:
{'train': 2051014}


In [21]:
# Shuffle the dataset and select 10,000 examples
small_dataset = dataset["train"].shuffle(seed=42).select(range(10000))

# Inspect the smaller dataset
print(f"\nSmall dataset size: {len(small_dataset)}")
print("\nSample from the small dataset:")
print(small_dataset[0])


Small dataset size: 10000

Sample from the small dataset:
{'translation': {'en': 'As Europeans, with our experience, our culture of peace and our economic opportunities, we too are called upon to make our contribution towards a better future for Iraq.', 'fr': "Les Européens que nous sommes, avec leur expérience, leur culture de la paix et leurs moyens économiques, sont appelés à apporter leur contribution en faveur d'un avenir meilleur en Irak."}}


In [31]:
# Split the small dataset into 80% train and 20% temporary
train_val_test_split = small_dataset.train_test_split(test_size=0.2, seed=42)

# Further split the temporary set (20%) into 10% validation and 10% test
val_test_split = train_val_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Final splits
train_dataset = train_val_test_split["train"]
val_dataset = val_test_split["train"]
test_dataset = val_test_split["test"]

# Print sizes of the splits
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 8000
Validation set size: 1000
Test set size: 1000


In [32]:
def preprocess_function(examples):
    """
    Preprocess examples by tokenizing English and French text and preparing input and label tensors.
    """
    # Extract English and French sentences as lists
    inputs = [item["en"] for item in examples["translation"]]  # Extract "en" field from each item
    targets = [item["fr"] for item in examples["translation"]]  # Extract "fr" field from each item

    # Tokenize English and French sentences
    model_inputs = tokenizer(inputs, max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=64, truncation=True, padding="max_length")

    # MLM labels: Add tokenized French sentences to labels
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Apply preprocessing to each dataset split
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Remove unnecessary columns (translation field)
train_dataset = train_dataset.remove_columns(["translation"])
val_dataset = val_dataset.remove_columns(["translation"])
test_dataset = test_dataset.remove_columns(["translation"])

# Set the format to PyTorch for efficient data loading
train_dataset.set_format("torch")
val_dataset.set_format("torch")
test_dataset.set_format("torch")

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [33]:
# Inspect a preprocessed example from the training set
print("\nSample preprocessed training example:")
print(train_dataset[0])

# Verify dataset sizes after preprocessing
print(f"\nFinal dataset sizes:")
print(f"Training set: {len(train_dataset)}")
print(f"Validation set: {len(val_dataset)}")
print(f"Test set: {len(test_dataset)}")


Sample preprocessed training example:
{'input_ids': tensor([  101, 11723, 10301,   169, 14772, 11299, 11846, 13641, 24317, 10230,
        99402, 21422,   117, 10473,   146, 10392, 10108, 10105, 32282, 10189,
        10105, 63711, 10108, 13246, 18322, 21422, 14819, 10472, 10347, 32296,
          119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor([  101, 10282, 16507, 10119, 13023, 11491, 10104, 25005, 62651, 10107,
        21535, 24407,   117, 55956, 10144, 49301,   172,   112, 57822, 10608,
          112, 10154, 10554, 30441, 108

## Finetune Model ##

In [40]:
from transformers import TrainingArguments
import evaluate

# Load accuracy metric
accuracy = evaluate.load("accuracy")

# Define metric computation function
def compute_metrics(eval_pred):
    """
    Computes accuracy during validation.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

In [41]:
import optuna
from transformers import Trainer

def objective(trial):
    """
    Optuna objective function for hyperparameter tuning.
    """
    # Suggest hyperparameters to tune
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.3)  # Adjusted range for weight decay

    # Define training arguments
    arguments = TrainingArguments(
        output_dir="/content/drive/MyDrive/Colab Notebooks/cognate_trainer",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_steps=8,
        num_train_epochs=5,
        eval_strategy="epoch",  # Updated from evaluation_strategy
        save_strategy="epoch",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        report_to="none",
        seed=224,
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=mlm_model,  # Use the MLM model
        args=arguments,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate on the validation dataset and return accuracy
    eval_result = trainer.evaluate()
    return eval_result["eval_accuracy"]

In [42]:
# Create the Optuna study
study = optuna.create_study(direction="maximize")  # Maximize accuracy

# Optimize the study
study.optimize(objective, n_trials=20)

# Print the best hyperparameters
print("Best hyperparameters found:")
print(study.best_params)

[I 2025-01-27 09:24:43,352] A new study created in memory with name: no-name-c1358eaf-850e-4720-a2ed-cba2e8f7aa8d
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


[W 2025-01-27 09:27:17,180] Trial 0 failed with parameters: {'learning_rate': 0.00045164970741194357, 'batch_size': 16, 'weight_decay': 0.07767016420191199} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 3.19 GiB. GPU 0 has a total capacity of 14.75 GiB of which 331.06 MiB is free. Process 9482 has 14.42 GiB memory in use. Of the allocated memory 11.44 GiB is allocated by PyTorch, and 2.85 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)').
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-41-8ccfcb319a61>", line 40, in objective
    train

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.19 GiB. GPU 0 has a total capacity of 14.75 GiB of which 331.06 MiB is free. Process 9482 has 14.42 GiB memory in use. Of the allocated memory 11.44 GiB is allocated by PyTorch, and 2.85 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Extract best hyperparameters
best_params = study.best_params

# Define training arguments with best hyperparameters
best_arguments = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/cognate_trainer",
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    logging_steps=8,
    num_train_epochs=5,
    eval_strategy="epoch",  # Updated from evaluation_strategy
    save_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"],
    load_best_model_at_end=True,
    report_to="none",
    seed=224,
)

# Initialize the Trainer with the best configuration
best_trainer = Trainer(
    model=mlm_model,  # Use the correct model object
    args=best_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,  # Pass the tokenizer
    compute_metrics=compute_metrics,
)

In [None]:
import os

# Train the model with the best hyperparameters
best_trainer.train()

# Ensure the output directory exists before saving
output_dir = "/content/drive/MyDrive/Colab Notebooks/cognate_trainer_best_model"
os.makedirs(output_dir, exist_ok=True)

# Save the best model
best_trainer.save_model(output_dir)

In [None]:
# Evaluate the fine-tuned model on the test dataset
test_results = best_trainer.evaluate(test_dataset)

print("\nTest Results:")
print(test_results)