<a href="https://colab.research.google.com/github/fubotz/ICL_2024W/blob/main/FinalProject_Fabian_SCHAMBECK_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project: Finetuning a Pretrained Multilingual Model for Cognate Detection

Methods: Nearest Neighbor / [MASK]

Model: distilbert-base-multilingual-cased

Dataset: Helsinki-NLP/europarl (en-fr subset)

In [49]:
!pip install bertviz
!pip install datasets
!pip install evaluate
!pip install optuna
!pip install scikit-learn
!pip install transformers
!pip install torch



In [50]:
from google.colab import drive
# Mount Google Drive at /content/drive; the training checkpoints and the saved
# model are written below /content/drive/MyDrive later in this notebook.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

# Load tokenizer and models
# One checkpoint name is shared by the tokenizer and both model variants below.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Model for Nearest Neighbor Approach
# (passed as `embedding_model` to the embedding / nearest-neighbor helpers in later cells)
embedding_model = AutoModel.from_pretrained(model_name)

# Model for [MASK] Approach
# (loaded with the masked-LM class; this same object is later handed to the Trainer for fine-tuning)
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)

# Show the tokenizer configuration (vocabulary size, special tokens, ...)
print(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [52]:
import torch
import numpy as np

# Get the embeddings for the model's vocabulary
def get_vocab_embeddings(tokenizer, model):
    """
    Extract input embeddings for every purely alphabetic token in the vocabulary.

    Args:
        tokenizer: Tokenizer supporting ``len()`` and ``convert_ids_to_tokens(int)``.
        model: Model exposing its token embedding layer as
            ``model.embeddings.word_embeddings``.

    Returns:
        tuple: ``(words, embeddings)`` where ``words`` is the list of kept token
        strings and ``embeddings`` is a NumPy array with one row per kept token
        (shape ``(len(words), embedding_dim)``, also when no token is kept).
    """
    embedding_layer = model.embeddings.word_embeddings

    words = []
    kept_ids = []
    for token_id in range(len(tokenizer)):
        token = tokenizer.convert_ids_to_tokens(token_id)
        # str.isalpha() is False for any token containing '#', '[' , digits or
        # punctuation, so this single check drops "##" sub-word pieces, special
        # tokens such as "[CLS]", and all non-alphabetic tokens.
        if token.isalpha():
            words.append(token)
            kept_ids.append(token_id)

    # Look up all kept ids in one batched call instead of one call per token.
    with torch.no_grad():
        id_tensor = torch.tensor(
            kept_ids, dtype=torch.long, device=embedding_layer.weight.device
        )
        vectors = embedding_layer(id_tensor)

    return words, vectors.cpu().numpy()

In [53]:
# Retrieve embedding for a single word
def get_embedding(word, tokenizer, model, pooling="cls"):
    """
    Retrieve a single vector for ``word`` from the model's last hidden state.

    Args:
        word: Text to embed; it is passed to the tokenizer as-is.
        tokenizer: Callable returning PyTorch tensors (``return_tensors="pt"``)
            including ``attention_mask`` when ``pooling="mean"`` is used.
        model: Model whose output exposes ``last_hidden_state``.
        pooling: ``"cls"`` (default, original behaviour) returns the vector at
            position 0; ``"mean"`` averages the vectors of all positions whose
            attention mask is 1.

    Returns:
        torch.Tensor: The pooled vector with the leading batch axis squeezed out.

    Raises:
        ValueError: If ``pooling`` is not ``"cls"`` or ``"mean"``.
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"Unknown pooling mode {pooling!r}; expected 'cls' or 'mean'.")

    tokens = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**tokens).last_hidden_state

    if pooling == "cls":
        pooled = hidden[:, 0, :]  # vector at the first position ([CLS])
    else:
        # Zero out non-attended positions, then divide by the number of attended ones.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    return pooled.squeeze(0)

## Evaluate Pretrained Model ##

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

# NN approach
def find_nearest_neighbors_direct(word_en, tokenizer, model, vocab_embeddings, vocab_words, top_k=5):
    """
    Find the vocabulary tokens whose embeddings are most cosine-similar to ``word_en``.

    Args:
        word_en: Query word; embedded with ``get_embedding``.
        tokenizer: Tokenizer forwarded to ``get_embedding``.
        model: Model forwarded to ``get_embedding``.
        vocab_embeddings: 2-D array with one embedding row per entry of ``vocab_words``.
        vocab_words: Token strings aligned with the rows of ``vocab_embeddings``.
        top_k: Number of neighbours to return.

    Returns:
        list[tuple[str, float]]: Up to ``top_k`` ``(token, similarity)`` pairs,
        most similar first. Empty when ``top_k <= 0`` or the vocabulary is empty.

    NOTE(review): the query vector is the [CLS] position of the model's last
    hidden state, whereas the vocabulary vectors built elsewhere in this
    notebook are input-embedding rows. These may not be directly comparable;
    confirm that this pairing is intended.
    """
    # With top_k == 0 the slice `[-top_k:]` would be `[0:]` and return every
    # token, so non-positive values are handled explicitly.
    if top_k <= 0 or len(vocab_words) == 0:
        return []

    # Get the embedding for the input word
    en_embedding = get_embedding(word_en, tokenizer, model).numpy()

    # Compute cosine similarities between the query and every vocabulary row
    similarities = cosine_similarity([en_embedding], vocab_embeddings)[0]

    # Indices sorted by descending similarity, truncated to the top_k best
    top_indices = np.argsort(similarities)[::-1][:top_k]

    return [(vocab_words[idx], similarities[idx]) for idx in top_indices]

In [55]:
# [MASK] approach
def predict_with_mask(word_en, tokenizer, model, top_k=5):
    """
    Predict the most probable fillers for a [MASK] placed where the French word belongs.

    The English word is inserted into a fixed bilingual prompt and the
    masked-LM scores at the (first) mask position are ranked.

    Args:
        word_en: English word inserted into the prompt.
        tokenizer: Tokenizer providing ``mask_token_id`` and ``decode``; called
            with ``return_tensors="pt"``.
        model: Masked-LM model whose output exposes ``logits``. If it has a
            ``device`` attribute, the inputs are moved to that device first.
        top_k: Number of predictions to return (clamped to the vocabulary size).

    Returns:
        list[str]: Decoded, whitespace-stripped predictions, best first.

    Raises:
        ValueError: If the tokenized prompt contains no mask token.
    """
    # Define the prompt with the [MASK] token
    sentence = f"The English word is '{word_en}'. Le mot français est [MASK]."

    # Tokenize the sentence
    tokens = tokenizer(sentence, return_tensors="pt")

    # Identify the [MASK] token index (column positions within the single sequence)
    mask_positions = torch.where(tokens["input_ids"] == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        raise ValueError("No [MASK] token found in the tokenized prompt.")
    mask_position = int(mask_positions[0])

    # The same model object is later given to the Trainer, which may leave it on
    # an accelerator; keep the inputs on the model's device to avoid a mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

    # Perform inference
    with torch.no_grad():
        logits = model(**tokens).logits

    # Scores over the vocabulary at the mask position
    mask_token_logits = logits[0, mask_position]
    k = min(top_k, mask_token_logits.shape[-1])
    if k <= 0:
        return []
    top_token_ids = torch.topk(mask_token_logits, k).indices.tolist()

    # Decode the predicted tokens into words
    return [tokenizer.decode([token_id]).strip() for token_id in top_token_ids]

In [56]:
# Define hardcoded word pairs for evaluation (n=21) (taken from Frossard et al.)
# Each entry is (English word, expected French word). The English side is used
# as the query and the French side is compared by exact string match against
# the predictions.
# NOTE(review): ("notation", "notamment") does not look like a translation pair
# ("notamment" is an adverb) — verify against the cited source.
word_pairs = [
    ("academic", "académique"),
    ("administrator", "administrateur"),
    ("algorithm", "algorithme"),
    ("chemical", "chimique"),
    ("delicious", "délicieux"),
    ("emotion", "émotion"),
    ("exercise", "exercice"),
    ("gender", "genre"),
    ("gorilla", "gorille"),
    ("loyalty", "loyauté"),
    ("notation", "notamment"),
    ("objective", "objectif"),
    ("oratory", "oratoire"),
    ("particle", "particule"),
    ("quarter", "quartier"),
    ("september", "septembre"),
    ("skeleton", "squelette"),
    ("traditionally", "traditionnellement"),
    ("voice", "voix"),
    ("west", "ouest"),
    ("wine", "vin"),
]

# Build the filtered vocabulary and its embedding matrix once, up front
vocab_words, vocab_embeddings = get_vocab_embeddings(tokenizer, embedding_model)

# Run both approaches for every hardcoded pair and show their top candidates
for word_en, word_fr in word_pairs:
    print(f"\nProcessing '{word_en}' (expected: '{word_fr}')")

    # Query both approaches before printing anything for this word
    nearest_neighbors = find_nearest_neighbors_direct(
        word_en, tokenizer, embedding_model, vocab_embeddings, vocab_words
    )
    mask_predictions = predict_with_mask(word_en, tokenizer, mlm_model)

    # Nearest-neighbor report: header followed by one "token: similarity" line each
    nn_report = ["Nearest neighbors:"]
    nn_report.extend(f"  {token}: {similarity:.4f}" for token, similarity in nearest_neighbors)
    print("\n".join(nn_report))

    # [MASK] report: header followed by one indented prediction per line
    mask_report = ["[MASK] predictions:"]
    mask_report.extend(f"  {prediction}" for prediction in mask_predictions)
    print("\n".join(mask_report))


Processing 'academic' (expected: 'académique')
Nearest neighbors:
  only: 0.1267
  Mad: 0.1172
  Ancient: 0.1168
  達: 0.1097
  Flying: 0.1079
[MASK] predictions:
  inconnu
  latin
  scientifique
  anglais
  français

Processing 'administrator' (expected: 'administrateur')
Nearest neighbors:
  only: 0.1250
  شروع: 0.1145
  нем: 0.1140
  Bart: 0.1119
  կամ: 0.1057
[MASK] predictions:
  inconnu
  蹦
  present
  ancien
  simple

Processing 'algorithm' (expected: 'algorithme')
Nearest neighbors:
  schnell: 0.1322
  Ergebnis: 0.1231
  Flying: 0.1169
  شروع: 0.1145
  wahrscheinlich: 0.1110
[MASK] predictions:
  inconnu
  蹦
  proche
  present
  beteiligt

Processing 'chemical' (expected: 'chimique')
Nearest neighbors:
  Pflanzen: 0.1225
  ʸ: 0.1223
  浦: 0.1155
  only: 0.1125
  πριν: 0.1115
[MASK] predictions:
  inconnu
  simple
  latin
  proche
  courant

Processing 'delicious' (expected: 'délicieux')
Nearest neighbors:
  Ancient: 0.1184
  only: 0.1132
  flying: 0.1126
  達: 0.1115
  pur: 0.106

In [57]:
def evaluate(word_pairs, tokenizer, embedding_model, mlm_model, vocab_words, vocab_embeddings, top_k=5):
    """
    Evaluate the Nearest Neighbor and [MASK] approaches on the given word pairs.

    A pair counts as correct for an approach when the expected French word is
    an exact string match for one of that approach's ``top_k`` predictions.

    Args:
        word_pairs: Sequence of ``(english_word, expected_french_word)`` tuples.
        tokenizer: Tokenizer shared by both approaches.
        embedding_model: Model used by the nearest-neighbor approach.
        mlm_model: Masked-LM model used by the [MASK] approach.
        vocab_words: Vocabulary tokens aligned with ``vocab_embeddings``.
        vocab_embeddings: Embedding matrix searched by the nearest-neighbor approach.
        top_k: Number of candidates each approach may propose per word.

    Returns:
        tuple[float, float]: ``(nn_accuracy, mask_accuracy)`` as percentages.
        Both are ``0.0`` when ``word_pairs`` is empty.

    NOTE(review): a later cell in this notebook executes ``import evaluate``,
    which rebinds the module-level name ``evaluate`` to that package. Calls to
    this function made after that cell will fail unless the import is aliased.
    """
    nn_correct = 0
    mask_correct = 0

    for word_en, word_fr in word_pairs:
        print(f"\nProcessing '{word_en}' (expected: '{word_fr}')")

        # Nearest Neighbor Approach (keep only the token strings, drop the scores)
        nearest_neighbors = find_nearest_neighbors_direct(word_en, tokenizer, embedding_model, vocab_embeddings, vocab_words, top_k)
        nn_predictions = [token for token, _ in nearest_neighbors]

        # [MASK] Approach
        mask_predictions = predict_with_mask(word_en, tokenizer, mlm_model, top_k)

        # Check if the expected word is in the top_k predictions
        if word_fr in nn_predictions:
            nn_correct += 1
        if word_fr in mask_predictions:
            mask_correct += 1

        # Print results for debugging
        print(f"Nearest neighbors: {nn_predictions}")
        print(f"[MASK] predictions: {mask_predictions}")

    # Calculate accuracy; an empty evaluation set yields 0.0 instead of
    # raising ZeroDivisionError.
    total = len(word_pairs)
    nn_accuracy = nn_correct / total * 100 if total else 0.0
    mask_accuracy = mask_correct / total * 100 if total else 0.0

    print("\nEvaluation Results:")
    print(f"Nearest Neighbor Accuracy: {nn_accuracy:.2f}%")
    print(f"[MASK] Accuracy: {mask_accuracy:.2f}%")

    return nn_accuracy, mask_accuracy


# Score both approaches on the hardcoded list; a hit means the expected
# French word appears among the five best candidates.
nn_accuracy, mask_accuracy = evaluate(
    word_pairs=word_pairs,
    tokenizer=tokenizer,
    embedding_model=embedding_model,
    mlm_model=mlm_model,
    vocab_words=vocab_words,
    vocab_embeddings=vocab_embeddings,
    top_k=5,
)


Processing 'academic' (expected: 'académique')
Nearest neighbors: ['only', 'Mad', 'Ancient', '達', 'Flying']
[MASK] predictions: ['inconnu', 'latin', 'scientifique', 'anglais', 'français']

Processing 'administrator' (expected: 'administrateur')
Nearest neighbors: ['only', 'شروع', 'нем', 'Bart', 'կամ']
[MASK] predictions: ['inconnu', '蹦', 'present', 'ancien', 'simple']

Processing 'algorithm' (expected: 'algorithme')
Nearest neighbors: ['schnell', 'Ergebnis', 'Flying', 'شروع', 'wahrscheinlich']
[MASK] predictions: ['inconnu', '蹦', 'proche', 'present', 'beteiligt']

Processing 'chemical' (expected: 'chimique')
Nearest neighbors: ['Pflanzen', 'ʸ', '浦', 'only', 'πριν']
[MASK] predictions: ['inconnu', 'simple', 'latin', 'proche', 'courant']

Processing 'delicious' (expected: 'délicieux')
Nearest neighbors: ['Ancient', 'only', 'flying', '達', 'pur']
[MASK] predictions: ['蹦', 'inconnu', 'simple', 'present', 'vrai']

Processing 'emotion' (expected: 'émotion')
Nearest neighbors: ['Ancient', 'on

## Load and Preprocess Dataset ##

In [58]:
from datasets import load_dataset

# Fetch the "en-fr" configuration of Helsinki-NLP/europarl
dataset = load_dataset("Helsinki-NLP/europarl", "en-fr")

# Overview of the available splits and their features
print(dataset)

# First sentence pair of the training split
print("\nSample from the training set:", dataset["train"][0], sep="\n")

# Row count per split
print("\nNumber of examples in each split:")
print({name: len(split) for name, split in dataset.items()})

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 2051014
    })
})

Sample from the training set:
{'translation': {'en': 'Resumption of the session', 'fr': 'Reprise de la session'}}

Number of examples in each split:
{'train': 2051014}


In [59]:
# Shuffle the dataset (fixed seed for reproducibility) and keep the first
# SMALL_DATASET_SIZE examples of the shuffled order.
SMALL_DATASET_SIZE = 1000
small_dataset = dataset["train"].shuffle(seed=42).select(range(SMALL_DATASET_SIZE))

# Inspect the smaller dataset
print(f"\nSmall dataset size: {len(small_dataset)}")
print("\nSample from the small dataset:")
print(small_dataset[0])


Small dataset size: 1000

Sample from the small dataset:
{'translation': {'en': 'As Europeans, with our experience, our culture of peace and our economic opportunities, we too are called upon to make our contribution towards a better future for Iraq.', 'fr': "Les Européens que nous sommes, avec leur expérience, leur culture de la paix et leurs moyens économiques, sont appelés à apporter leur contribution en faveur d'un avenir meilleur en Irak."}}


In [60]:
# 80/10/10 split in two steps: hold out 20% first, then cut that held-out
# part in half to obtain the validation and test sets.
train_val_test_split = small_dataset.train_test_split(test_size=0.2, seed=42)
val_test_split = train_val_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Final splits
train_dataset, val_dataset, test_dataset = (
    train_val_test_split["train"],
    val_test_split["train"],
    val_test_split["test"],
)

# Report the size of each split, one line per split
print(
    "\n".join(
        f"{label} set size: {len(split)}"
        for label, split in (
            ("Training", train_dataset),
            ("Validation", val_dataset),
            ("Test", test_dataset),
        )
    )
)

Training set size: 800
Validation set size: 100
Test set size: 100


In [61]:
# Preprocessing function
def preprocess_function(examples):
    """
    Turn a batch of translation records into model inputs with labels.

    The English sentences become the model inputs and the French sentences
    become the labels. Both sides are truncated/padded to 64 tokens with the
    module-level ``tokenizer``; label positions holding the padding id are
    replaced by -100.

    Args:
        examples: Batch with a ``"translation"`` column whose items are
            mappings containing ``"en"`` and ``"fr"`` strings.

    Returns:
        The tokenizer output for the English sentences with an added
        ``"labels"`` entry (one list of ids per French sentence).

    NOTE(review): the two languages are tokenized independently, so label
    position i is not guaranteed to correspond to input position i — confirm
    this is the intended training signal.
    """
    pairs = examples["translation"]
    english_sentences = [pair["en"] for pair in pairs]
    french_sentences = [pair["fr"] for pair in pairs]

    # Identical tokenization settings for both sides
    tokenizer_options = dict(max_length=64, truncation=True, padding="max_length")
    model_inputs = tokenizer(english_sentences, **tokenizer_options)
    french_ids = tokenizer(french_sentences, **tokenizer_options)["input_ids"]

    # French ids become the labels, with padding positions set to -100
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in french_ids
    ]
    return model_inputs


# Tokenize each split in batches, then drop the raw "translation" column,
# which is no longer needed once input ids and labels exist.
train_dataset = train_dataset.map(preprocess_function, batched=True).remove_columns(["translation"])
val_dataset = val_dataset.map(preprocess_function, batched=True).remove_columns(["translation"])
test_dataset = test_dataset.map(preprocess_function, batched=True).remove_columns(["translation"])

# Have every split return PyTorch tensors
for _split in (train_dataset, val_dataset, test_dataset):
    _split.set_format("torch")

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [62]:
# Show one fully preprocessed training record (input ids, attention mask, labels)
print("\nSample preprocessed training example:", train_dataset[0], sep="\n")

# Confirm that preprocessing left the split sizes unchanged
print("\nFinal dataset sizes:")
print(
    "\n".join(
        f"{label}: {len(split)}"
        for label, split in (
            ("Training set", train_dataset),
            ("Validation set", val_dataset),
            ("Test set", test_dataset),
        )
    )
)


Sample preprocessed training example:
{'input_ids': tensor([  101, 14962, 18322, 10114, 11572, 10531, 88840, 10108, 18926,   117,
        12916, 12811,   117, 11084, 10155, 22807,   131,   112, 40512, 21092,
        10112,   118, 10549,   118, 10549, 10157, 40512, 21092, 10112,   118,
        10549,   118, 13672, 10133,   112,   136,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor([  101, 40898,   117, 49641, 10173, 78225, 10246, 11464, 44364, 10822,
        10104, 18926,   117, 44356, 10141, 53475,   117, 13781, 10426, 10608,
          112, 10110, 36474, 10368,   1

## Finetune Model ##

In [63]:
from transformers import TrainingArguments
import evaluate

# Load accuracy metric
accuracy = evaluate.load("accuracy")

# Define metric computation function
def compute_metrics(eval_pred):
    """
    Compute token-level accuracy, ignoring positions labelled -100.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` may either carry one
            extra trailing axis of per-token scores (reduced here with argmax)
            or already be predicted ids with the same number of axes as
            ``labels``.

    Returns:
        The result of ``accuracy.compute`` over all non-ignored positions.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Reduce raw scores to ids; pass through inputs that are already ids.
    if logits.ndim == labels.ndim + 1:
        predictions = np.argmax(logits, axis=-1)
    else:
        predictions = logits

    # Boolean mask selects every position that carries a real label; indexing
    # with it flattens both arrays in the same order.
    keep = labels != -100
    return accuracy.compute(predictions=predictions[keep], references=labels[keep])

In [64]:
from transformers import Trainer

# Free cached GPU memory before training starts
torch.cuda.empty_cache()

# Define training arguments
arguments = TrainingArguments(
    # Output location, reporting and reproducibility
    output_dir="/content/drive/MyDrive/Colab Notebooks/cognate_trainer",
    report_to='none',
    seed=224,
    # Optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging cadence
    logging_steps=8,
)

# Initialize the Trainer; the masked-LM model is the one being fine-tuned
trainer = Trainer(
    model=mlm_model,
    args=arguments,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [65]:
# Run fine-tuning with the configuration held by `trainer`. As the cell's last
# expression, the returned training summary is displayed below the cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,6.344,6.184851,0.084199


Epoch,Training Loss,Validation Loss,Accuracy
1,6.344,6.184851,0.084199
2,5.7307,5.801432,0.100719
3,5.5912,5.698327,0.107381
4,5.4405,5.657182,0.107647
5,5.3536,5.641131,0.107647


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


TrainOutput(global_step=250, training_loss=6.055001644134522, metrics={'train_runtime': 1283.17, 'train_samples_per_second': 3.117, 'train_steps_per_second': 0.195, 'total_flos': 66417320448000.0, 'train_loss': 6.055001644134522, 'epoch': 5.0})

In [66]:
# Persist the model currently held by the trainer to Drive
output_dir = "/content/drive/MyDrive/Colab Notebooks/cognate_trainer_best_model"
trainer.save_model(output_dir)

# Score the model on the held-out test split and print the metrics dictionary
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("\nTest Results:", test_results, sep="\n")


Test Results:
{'eval_loss': 5.649481296539307, 'eval_accuracy': 0.10710607621009269, 'eval_runtime': 16.49, 'eval_samples_per_second': 6.064, 'eval_steps_per_second': 0.425, 'epoch': 5.0}
