<a href="https://colab.research.google.com/github/fubotz/ICL_2024W/blob/main/FinalProject_Fabian_SCHAMBECK_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project: Finetuning a Pretrained Multilingual Model for Cognate Detection

Methods: Nearest Neighbor / [MASK]

Model: distilbert-base-multilingual-cased

Dataset: Helsinki-NLP / europarl (en-fr split)

In [47]:
!pip install bertviz
!pip install datasets
!pip install evaluate
!pip install optuna
!pip install scikit-learn
!pip install transformers
!pip install torch



In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Dataset ##

In [49]:
!wget https://raw.githubusercontent.com/fubotz/ICL_2024W/refs/heads/main/word_pairs.json        # dataset taken from Frossard et al.

--2025-01-27 19:37:04--  https://raw.githubusercontent.com/fubotz/ICL_2024W/refs/heads/main/word_pairs.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23242 (23K) [text/plain]
Saving to: ‘word_pairs.json.1’


2025-01-27 19:37:04 (31.0 MB/s) - ‘word_pairs.json.1’ saved [23242/23242]



In [50]:
import json
with open("word_pairs.json", "r") as f:
    dataset = json.load(f)
print(dataset)

[{'abandon': 'abandon'}, {'abbe': 'abbé'}, {'abdomen': 'abdomen'}, {'abdominal': 'abdominal'}, {'aberration': 'aberration'}, {'abolition': 'abolition'}, {'abominable': 'abominable'}, {'absence': 'absence'}, {'absolute': 'absolu'}, {'absolution': 'absolution'}, {'absorption': 'absorption'}, {'abstinence': 'abstinence'}, {'abstraction': 'abstraction'}, {'absurd': 'absurde'}, {'absurdity': 'absurdité'}, {'abundance': 'abondance'}, {'abundant': 'abondant'}, {'academic': 'académique'}, {'academy': 'académie'}, {'acceleration': 'accélération'}, {'accent': 'accent'}, {'acceptable': 'acceptable'}, {'access': 'accès'}, {'accessory': 'accessoire'}, {'accident': 'accident'}, {'accidental': 'accidentel'}, {'accidentally': 'accidentellement'}, {'acclamation': 'acclamation'}, {'accord': 'accord'}, {'acetone': 'acétone'}, {'acid': 'acide'}, {'acoustic': 'acoustique'}, {'activity': 'activité'}, {'actor': 'acteur'}, {'addition': 'addition'}, {'address': 'addresse'}, {'adherent': 'adhérent'}, {'administ

In [51]:
import random

# Shuffle the dataset
random.shuffle(dataset)

# Calculate the split indices
total_size = len(dataset)
train_size = int(0.7 * total_size)
val_size = int(0.2 * total_size)

# Create train, validation, and test splits (70:20:10)
train_data = dataset[:train_size]
val_data = dataset[train_size:train_size + val_size]
test_data = dataset[train_size + val_size:]

# Verify the split sizes
print(f"Total samples: {total_size}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

Total samples: 492
Training samples: 344
Validation samples: 98
Test samples: 50


## Load Model and Get Embeddings ##

In [70]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

# Load tokenizer and models
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Model for Nearest Neighbor Approach
embedding_model = AutoModel.from_pretrained(model_name)

# Model for [MASK] Approach
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)

print(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [53]:
import torch
import numpy as np

# Get the embeddings for the model's vocabulary
def get_vocab_embeddings(tokenizer, model):
    """
    Extract embeddings for all meaningful tokens in the model's vocabulary.
    Filters out subwords, special tokens, and non-alphabetical tokens.
    """
    vocab_size = len(tokenizer)
    embeddings = []
    words = []

    for token_id in range(vocab_size):
        token = tokenizer.convert_ids_to_tokens(token_id)
        # Filter: Exclude subwords (tokens starting with ##), special tokens, and non-alphabetical tokens
        if token.isalpha() and not token.startswith("##"):
            words.append(token)
            with torch.no_grad():
                # Create a tensor for the token and get its embedding
                inputs = torch.tensor([[token_id]])
                outputs = model.embeddings.word_embeddings(inputs)
                embeddings.append(outputs[0].squeeze(0).numpy())

    return words, np.array(embeddings)

In [54]:
# Retrieve embedding for a single word
def get_embedding(word, tokenizer, model):
    """
    Retrieve the embedding for a single word.
    """
    tokens = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        embedding = outputs.last_hidden_state[:, 0, :]      # [CLS] token embedding
    return embedding.squeeze(0)

## Evaluate Pretrained Model ##

In [55]:
from sklearn.metrics.pairwise import cosine_similarity

# NN aproach
def find_nearest_neighbors_direct(word_en, tokenizer, model, vocab_embeddings, vocab_words, top_k=5):
    """
    Find the nearest neighbors for an input word directly from the model's filtered vocabulary.
    """
    # Get the embedding for the input word
    en_embedding = get_embedding(word_en, tokenizer, model).numpy()

    # Compute cosine similarities
    similarities = cosine_similarity([en_embedding], vocab_embeddings)[0]

    # Get the top_k most similar tokens
    top_indices = similarities.argsort()[-top_k:][::-1]
    top_tokens = [(vocab_words[idx], similarities[idx]) for idx in top_indices]

    return top_tokens

In [56]:
# [MASK] approach
def predict_with_mask(word_en, tokenizer, model, top_k=5):
    """
    Predict the most probable French words for the given English word using the [MASK] approach.
    """
    # Define the prompt with the [MASK] token
    sentence = f"The English word is '{word_en}'. Le mot français est [MASK]."

    # Tokenize the sentence
    tokens = tokenizer(sentence, return_tensors="pt")

    # Identify the [MASK] token index
    mask_token_index = torch.where(tokens["input_ids"] == tokenizer.mask_token_id)[1]

    # Perform inference
    with torch.no_grad():
        outputs = model(**tokens)
        logits = outputs.logits

    # Get the top_k predictions for the [MASK] token
    mask_token_logits = logits[0, mask_token_index, :]
    top_token_ids = torch.topk(mask_token_logits, top_k, dim=-1).indices[0].tolist()

    # Decode the predicted tokens into words
    predictions = [tokenizer.decode([token_id]).strip() for token_id in top_token_ids]

    return predictions

In [57]:
# Extract filtered vocabulary embeddings
vocab_words, vocab_embeddings = get_vocab_embeddings(tokenizer, embedding_model)

# Use English-French pairs from test_data instead of hardcoded pairs
for word_pair in test_data:
    word_en, word_fr = list(word_pair.keys())[0], list(word_pair.values())[0]       # Extract English and French words
    print(f"\nProcessing '{word_en}' (expected: '{word_fr}')")

    # Nearest Neighbor Approach
    nearest_neighbors = find_nearest_neighbors_direct(word_en, tokenizer, embedding_model, vocab_embeddings, vocab_words)

    # [MASK] Approach
    mask_predictions = predict_with_mask(word_en, tokenizer, mlm_model)

    # Print results for Nearest Neighbors
    print(f"Nearest neighbors:")
    for token, similarity in nearest_neighbors:
        print(f"  {token}: {similarity:.4f}")

    # Print results for [MASK] Approach
    print(f"[MASK] predictions:")
    for prediction in mask_predictions:
        print(f"  {prediction}")


Processing 'accidentally' (expected: 'accidentellement')
Nearest neighbors:
  Ancient: 0.1203
  only: 0.1200
  elections: 0.1194
  吞: 0.1043
  Flying: 0.1027
[MASK] predictions:
  inconnu
  蹦
  present
  proche
  beteiligt

Processing 'acclamation' (expected: 'acclamation')
Nearest neighbors:
  Flying: 0.1275
  wider: 0.1203
  των: 0.1183
  растения: 0.1179
  flying: 0.1179
[MASK] predictions:
  蹦
  inconnu
  beteiligt
  present
  proche

Processing 'adventure' (expected: 'aventure')
Nearest neighbors:
  only: 0.1180
  Flying: 0.1173
  flying: 0.1148
  Mad: 0.1132
  сих: 0.1095
[MASK] predictions:
  inconnu
  蹦
  present
  traduit
  beteiligt

Processing 'music' (expected: 'musique')
Nearest neighbors:
  Ancient: 0.1245
  only: 0.1208
  flying: 0.1165
  吞: 0.1153
  Mad: 0.1132
[MASK] predictions:
  beteiligt
  inconnu
  latin
  present
  anglais

Processing 'longitude' (expected: 'longitude')
Nearest neighbors:
  Ancient: 0.1081
  cunoscute: 0.1043
  flying: 0.1039
  входит: 0.1034
  

In [58]:
def evaluate_model(word_pairs, tokenizer, embedding_model, mlm_model, vocab_words, vocab_embeddings, top_k=5):
    """
    Evaluate the Nearest Neighbor and [MASK] approaches on the given word pairs.
    Returns the accuracy for each method.
    """
    nn_correct = 0
    mask_correct = 0

    for word_pair in word_pairs:
        # Extract English and French words from dictionary
        word_en, word_fr = list(word_pair.keys())[0], list(word_pair.values())[0]
        print(f"\nProcessing '{word_en}' (expected: '{word_fr}')")

        # Nearest Neighbor Approach
        nearest_neighbors = find_nearest_neighbors_direct(word_en, tokenizer, embedding_model, vocab_embeddings, vocab_words, top_k)
        nn_predictions = [token for token, _ in nearest_neighbors]

        # [MASK] Approach
        mask_predictions = predict_with_mask(word_en, tokenizer, mlm_model, top_k)

        # Check if the expected word is in the top_k predictions
        if word_fr in nn_predictions:
            nn_correct += 1
        if word_fr in mask_predictions:
            mask_correct += 1

        # Print results for debugging
        print(f"Nearest neighbors: {nn_predictions}")
        print(f"[MASK] predictions: {mask_predictions}")

    # Calculate accuracy
    total = len(word_pairs)
    nn_accuracy = nn_correct / total * 100
    mask_accuracy = mask_correct / total * 100

    print("\nEvaluation Results:")
    print(f"Nearest Neighbor Accuracy: {nn_accuracy:.2f}%")
    print(f"[MASK] Accuracy: {mask_accuracy:.2f}%")

    return nn_accuracy, mask_accuracy


# Evaluate the dynamically loaded word pairs from `test_data`
nn_accuracy, mask_accuracy = evaluate(
    test_data,
    tokenizer,
    embedding_model,
    mlm_model,
    vocab_words,
    vocab_embeddings,
    top_k=5
)


Processing 'accidentally' (expected: 'accidentellement')
Nearest neighbors: ['Ancient', 'only', 'elections', '吞', 'Flying']
[MASK] predictions: ['inconnu', '蹦', 'present', 'proche', 'beteiligt']

Processing 'acclamation' (expected: 'acclamation')
Nearest neighbors: ['Flying', 'wider', 'των', 'растения', 'flying']
[MASK] predictions: ['蹦', 'inconnu', 'beteiligt', 'present', 'proche']

Processing 'adventure' (expected: 'aventure')
Nearest neighbors: ['only', 'Flying', 'flying', 'Mad', 'сих']
[MASK] predictions: ['inconnu', '蹦', 'present', 'traduit', 'beteiligt']

Processing 'music' (expected: 'musique')
Nearest neighbors: ['Ancient', 'only', 'flying', '吞', 'Mad']
[MASK] predictions: ['beteiligt', 'inconnu', 'latin', 'present', 'anglais']

Processing 'longitude' (expected: 'longitude')
Nearest neighbors: ['Ancient', 'cunoscute', 'flying', 'входит', '浦']
[MASK] predictions: ['inconnu', 'proche', 'courant', 'long', 'variable']

Processing 'circumstance' (expected: 'circonstance')
Nearest n

## Preprocess Dataset ##

In [59]:
from torch.utils.data import DataLoader

# Preprocessing function
def preprocess_function(examples):
    """
    Preprocess examples by tokenizing English and French text and preparing input and label tensors.
    """
    # Extract inputs and targets
    inputs = examples["word_en"]        # List of English words
    targets = examples["word_fr"]       # List of French words

    # Tokenize inputs and targets with a smaller max_length
    model_inputs = tokenizer(inputs, max_length=8, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=8, truncation=True, padding="max_length")["input_ids"]

    # Replace padding tokens in labels with -100
    model_inputs["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in seq] for seq in labels
    ]

    return model_inputs

In [60]:
from datasets import Dataset

# Convert raw data into the correct format
formatted_train_data = [{"word_en": k, "word_fr": v} for item in train_data for k, v in item.items()]
formatted_val_data = [{"word_en": k, "word_fr": v} for item in val_data for k, v in item.items()]
formatted_test_data = [{"word_en": k, "word_fr": v} for item in test_data for k, v in item.items()]

# Create Hugging Face datasets
train_dataset = Dataset.from_list(formatted_train_data)
val_dataset = Dataset.from_list(formatted_val_data)
test_dataset = Dataset.from_list(formatted_test_data)

In [61]:
# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/344 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [62]:
# Inspect a preprocessed example from the training set
print("\nSample preprocessed training example:")
sample_train_example = train_dataset[0]  # Access the first example in train_dataset
print(sample_train_example)

# Decode the input IDs and labels to verify correctness
decoded_input = tokenizer.decode(sample_train_example["input_ids"].tolist(), skip_special_tokens=True)
decoded_label = tokenizer.decode(
    [token for token in sample_train_example["labels"].tolist() if token != -100], skip_special_tokens=True
)

print("\nDecoded Training Example:")
print(f"Input (word_en): {decoded_input}")
print(f"Label (word_fr): {decoded_label}")

# Verify dataset sizes after preprocessing
print(f"\nFinal dataset sizes:")
print(f"Training set: {len(train_dataset)}")
print(f"Validation set: {len(val_dataset)}")
print(f"Test set: {len(test_dataset)}")


Sample preprocessed training example:
{'input_ids': tensor([  101, 29580,   102,     0,     0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 0, 0, 0, 0, 0]), 'labels': tensor([  101, 12604, 51584,   102,  -100,  -100,  -100,  -100])}

Decoded Training Example:
Input (word_en): ideal
Label (word_fr): idéal

Final dataset sizes:
Training set: 344
Validation set: 98
Test set: 50


## Finetune Model ##

In [63]:
from transformers import TrainingArguments
import evaluate

# Load accuracy metric
accuracy = evaluate.load("accuracy")

# Define metric computation function
def compute_metrics(eval_pred):
    """
    Computes accuracy during validation by ignoring padding tokens.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Flatten predictions and labels (remove -100 labels)
    flattened_predictions = []
    flattened_labels = []

    for pred, label in zip(predictions, labels):
        for p, l in zip(pred, label):
            if l != -100:  # Ignore padding token labels
                flattened_predictions.append(p)
                flattened_labels.append(l)

    return accuracy.compute(predictions=flattened_predictions, references=flattened_labels)

In [64]:
from transformers import Trainer

# Define training arguments
arguments = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/cognate_trainer",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=8,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to='none',
    seed=224
)

# Initialize the Trainer
trainer = Trainer(
    model=mlm_model,        # Use MLM model
    args=arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,       # NB: change for test
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [65]:
# Verify dataset format
from torch.utils.data import DataLoader

# Create a DataLoader for debugging
debug_loader = DataLoader(train_dataset, batch_size=8)

# Get a batch
batch = next(iter(debug_loader))
print(batch)

# Print shapes
print(f"Input IDs shape: {batch['input_ids'].shape}")
print(f"Attention Mask shape: {batch['attention_mask'].shape}")
print(f"Labels shape: {batch['labels'].shape}")

{'input_ids': tensor([[  101, 29580,   102,     0,     0,     0,     0,     0],
        [  101, 14379, 76299,   102,     0,     0,     0,     0],
        [  101, 46176,   102,     0,     0,     0,     0,     0],
        [  101, 12533,   102,     0,     0,     0,     0,     0],
        [  101, 49523, 14786, 13275,   102,     0,     0,     0],
        [  101, 38676, 12717, 48532, 10415,   102,     0,     0],
        [  101, 11487,   102,     0,     0,     0,     0,     0],
        [  101, 20455,   102,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0]]), 'labels': tensor([[  101, 12604, 51584,   102,  -100,  -100,  -100,  -100],
        [  101, 14379, 45375,   102,  -100,  -100,  -100,  -100],
        [  101, 13621,

In [66]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.7345,2.216182,0.674479
2,1.535,1.762113,0.742188
3,1.2008,1.78386,0.734375


Epoch,Training Loss,Validation Loss,Accuracy
1,1.7345,2.216182,0.674479
2,1.535,1.762113,0.742188
3,1.2008,1.78386,0.734375
4,0.898,1.757307,0.742188
5,0.5664,1.765558,0.744792


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


TrainOutput(global_step=215, training_loss=1.8576470463774926, metrics={'train_runtime': 253.4713, 'train_samples_per_second': 6.786, 'train_steps_per_second': 0.848, 'total_flos': 3569930974080.0, 'train_loss': 1.8576470463774926, 'epoch': 5.0})

In [67]:
# Save the trained model
output_dir = "/content/drive/MyDrive/Colab Notebooks/cognate_trainer_best_model"
trainer.save_model(output_dir)

# Evaluate the model on the test dataset
test_results = trainer.evaluate(test_dataset)

print("\nTest Results:")
print(test_results)


Test Results:
{'eval_loss': 1.0179818868637085, 'eval_accuracy': 0.8305084745762712, 'eval_runtime': 1.1917, 'eval_samples_per_second': 41.955, 'eval_steps_per_second': 5.874, 'epoch': 5.0}


In [68]:
finetuned_model = AutoModelForMaskedLM.from_pretrained(output_dir)

In [None]:
# testset

In [69]:
nn_accuracy, mask_accuracy = evaluate_model(
    test_data,          # Use your test data
    tokenizer,          # Updated tokenizer for the fine-tuned model
    embedding_model,    # Same embedding model (if unchanged)
    finetuned_model,    # Fine-tuned model for [MASK] predictions
    vocab_words,        # Updated vocabulary if necessary
    vocab_embeddings,   # Updated embeddings if necessary
    top_k=5             # Top-k predictions
)

# Output the results
print("\nEvaluation Results after Fine-Tuning:")
print(f"Nearest Neighbor Accuracy: {nn_accuracy:.2f}%")
print(f"[MASK] Accuracy: {mask_accuracy:.2f}%")

TypeError: 'module' object is not callable

In [None]:
# visualization