<a href="https://colab.research.google.com/github/fubotz/ICL_2024W/blob/main/FinalProject_Fabian_SCHAMBECK_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project: Finetuning a Multilingual Model
Description here...

Model: `distilbert-base-multilingual-cased` (loaded below with Hugging Face `transformers`)

Dataset:

In [56]:
!pip install bertviz
!pip install datasets
!pip install evaluate
!pip install scikit-learn
!pip install transformers
!pip install torch



In [57]:
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside a Colab runtime).
# NOTE(review): nothing in the cells visible here reads from the mount yet;
# presumably later cells load/save data or checkpoints there — confirm.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [58]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pretrained model and tokenizer.
# `tokenizer` and `model` are notebook-level globals: the helper functions below
# receive them as arguments and use the model's embedding layer and its
# `last_hidden_state` output.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Show the tokenizer configuration (vocabulary size, max length, special tokens)
print(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [59]:
import torch
import numpy as np

# Get the embeddings for the model's vocabulary
def get_vocab_embeddings(tokenizer, model):
    """
    Extract the input embeddings of all whole-word, alphabetic vocabulary tokens.

    A token is kept only if it consists purely of alphabetic characters and does
    not start with ``##``. This drops special tokens such as ``[CLS]``, subword
    continuation pieces, digits and punctuation.

    Args:
        tokenizer: Tokenizer supporting ``len()`` and
            ``convert_ids_to_tokens(token_id)`` for a single integer id.
        model: Model exposing ``get_input_embeddings()``, i.e. the layer that
            maps token ids to embedding vectors.

    Returns:
        tuple: ``(words, embeddings)``. ``words`` is the list of kept token
        strings; ``embeddings`` is a NumPy array of shape
        ``(len(words), embedding_dim)`` whose rows are aligned with ``words``.
    """
    embedding_layer = model.get_input_embeddings()
    weight = embedding_layer.weight
    # Never look up ids beyond the embedding matrix, even if the tokenizer
    # reports more entries than the model has rows for.
    vocab_size = min(len(tokenizer), weight.shape[0])

    words = []
    token_ids = []
    for token_id in range(vocab_size):
        token = tokenizer.convert_ids_to_tokens(token_id)
        # Filter: exclude subwords (##...), special tokens and non-alphabetical tokens
        if isinstance(token, str) and token.isalpha() and not token.startswith("##"):
            words.append(token)
            token_ids.append(token_id)

    # One batched lookup instead of one forward call per token
    with torch.no_grad():
        ids = torch.tensor(token_ids, dtype=torch.long, device=weight.device)
        vectors = embedding_layer(ids)

    # Move to CPU first so this also works when the model lives on an accelerator
    return words, vectors.detach().cpu().numpy()

In [60]:
# Retrieve embedding for a single word
def get_embedding(word, tokenizer, model):
    """
    Encode ``word`` with the model and return the hidden state at position 0.

    The vector is taken from ``last_hidden_state[:, 0, :]``, i.e. the position of
    the ``[CLS]`` token. It is therefore a contextual output of the full model
    for the tokenized input, not a row of the input-embedding matrix.

    Args:
        word: Text to encode (a single string).
        tokenizer: Callable returning a mapping of PyTorch tensors when invoked
            as ``tokenizer(word, return_tensors="pt")``.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        torch.Tensor: The position-0 hidden state with the batch dimension
        squeezed away, on the same device as the model.
    """
    tokens = tokenizer(word, return_tensors="pt")

    # The tokenizer creates CPU tensors; move them to wherever the model lives so
    # the forward pass also works after the model was moved to an accelerator.
    first_param = next(iter(model.parameters()), None)
    if first_param is not None:
        tokens = {name: tensor.to(first_param.device) for name, tensor in tokens.items()}

    with torch.no_grad():
        outputs = model(**tokens)
        embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
    return embedding.squeeze(0)

In [61]:
from sklearn.metrics.pairwise import cosine_similarity

# Find nearest neighbors for a single word
def find_nearest_neighbors_direct(word_en, tokenizer, model, vocab_embeddings, vocab_words, top_k=5):
    """
    Rank vocabulary tokens by cosine similarity to the embedding of ``word_en``.

    Args:
        word_en: Query word; embedded with ``get_embedding``.
        tokenizer: Tokenizer passed through to ``get_embedding``.
        model: Model passed through to ``get_embedding``.
        vocab_embeddings: 2-D array with one candidate vector per row.
        vocab_words: Token strings aligned row-by-row with ``vocab_embeddings``.
        top_k: Maximum number of neighbours to return.

    Returns:
        list: ``(token, similarity)`` tuples, most similar first. Empty when
        ``top_k`` is not positive or the vocabulary is empty.

    NOTE(review): ``get_embedding`` returns a last-layer hidden state, whereas
    ``get_vocab_embeddings`` produces input-embedding rows. If ``vocab_embeddings``
    comes from that function, query and candidates live in different vector
    spaces and the similarities may not be meaningful — confirm this is intended.
    """
    # Nothing to rank. Without this guard `[-0:]` would select the WHOLE
    # vocabulary for top_k=0, and a negative top_k would slice arbitrarily.
    if top_k <= 0 or len(vocab_words) == 0:
        return []

    # Get the embedding for the input word (CPU copy so NumPy can consume it)
    en_embedding = get_embedding(word_en, tokenizer, model).detach().cpu().numpy()

    # Compute cosine similarities between the query and every candidate row
    similarities = cosine_similarity([en_embedding], vocab_embeddings)[0]

    # Indices sorted by descending similarity, truncated to top_k
    top_indices = similarities.argsort()[::-1][:top_k]
    return [(vocab_words[idx], similarities[idx]) for idx in top_indices]

In [62]:
# Define hardcoded word pairs for evaluation (n=21).
# Each tuple is (English query word, expected French counterpart). The French
# word is only printed next to the results; it is not compared automatically.
word_pairs = [
    ("academic", "académique"),
    ("administrator", "administrateur"),
    ("algorithm", "algorithme"),
    ("chemical", "chimique"),
    ("delicious", "délicieux"),
    ("emotion", "émotion"),
    ("exercise", "exercice"),
    ("gender", "genre"),
    ("gorilla", "gorille"),
    ("loyalty", "loyauté"),
    ("notation", "notamment"),  # NOTE(review): "notamment" means "notably"; the French noun is "notation" — confirm this pair
    ("objective", "objectif"),
    ("oratory", "oratoire"),
    ("particle", "particule"),
    ("quarter", "quartier"),
    ("september", "septembre"),
    ("skeleton", "squelette"),
    ("traditionally", "traditionnellement"),
    ("voice", "voix"),
    ("west", "ouest"),
    ("wine", "vin"),
]

# Build the candidate set once; it is reused for every query below
vocab_words, vocab_embeddings = get_vocab_embeddings(tokenizer, model)

# Evaluate all words in the hardcoded list
for word_en, word_fr in word_pairs:
    # Find the nearest neighbors (default top_k of the helper)
    nearest_neighbors = find_nearest_neighbors_direct(word_en, tokenizer, model, vocab_embeddings, vocab_words)

    # Print results: one header line per query, then one "token: similarity" line per neighbour
    print(f"\nThe nearest neighbors for '{word_en}' (expected: '{word_fr}') are:")
    for token, similarity in nearest_neighbors:
        print(f"{token}: {similarity:.4f}")


The nearest neighbors for 'academic' (expected: 'académique') are:
only: 0.1267
Mad: 0.1172
Ancient: 0.1168
達: 0.1097
Flying: 0.1079

The nearest neighbors for 'administrator' (expected: 'administrateur') are:
only: 0.1250
شروع: 0.1145
нем: 0.1139
Bart: 0.1119
կամ: 0.1057

The nearest neighbors for 'algorithm' (expected: 'algorithme') are:
schnell: 0.1322
Ergebnis: 0.1231
Flying: 0.1169
شروع: 0.1145
wahrscheinlich: 0.1110

The nearest neighbors for 'chemical' (expected: 'chimique') are:
Pflanzen: 0.1225
ʸ: 0.1223
浦: 0.1155
only: 0.1125
πριν: 0.1115

The nearest neighbors for 'delicious' (expected: 'délicieux') are:
Ancient: 0.1184
only: 0.1132
flying: 0.1126
達: 0.1115
pur: 0.1069

The nearest neighbors for 'emotion' (expected: 'émotion') are:
Ancient: 0.1190
only: 0.1174
Mad: 0.1087
flying: 0.1071
растения: 0.1071

The nearest neighbors for 'exercise' (expected: 'exercice') are:
Flying: 0.1421
flying: 0.1317
exercise: 0.1310
сите: 0.1305
Ancient: 0.1275

The nearest neighbors for 'gen