<a href="https://colab.research.google.com/github/fubotz/ICL_2024W/blob/main/FinalProject_Fabian_SCHAMBECK_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project: Finetuning a Multilingual Model
Description here...

Model:

Dataset:

In [1]:
!pip install bertviz
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install torch

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting boto3 (from bertviz)
  Downloading boto3-1.36.6-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.37.0,>=1.36.6 (from boto3->bertviz)
  Downloading botocore-1.36.6-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->bertviz)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3->bertviz)
  Downloading s3transfer-0.11.2-py3-none-any.whl.metadata (1.7 kB)
Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.36.6-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.36.6-py3-none-any.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

# Load the multilingual DistilBERT model and tokenizer
model_name = "distilbert-base-multilingual-cased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [14]:
def filter_vocab_by_language(tokenizer, english_words, french_words):
    """
    Filters the tokenizer vocabulary to include only English and French words.

    Parameters:
        tokenizer (AutoTokenizer): The tokenizer with a large vocabulary.
        english_words (set): A set of known English words.
        french_words (set): A set of known French words.

    Returns:
        list of str: A filtered list of tokens in English and French.
    """
    vocab = tokenizer.vocab.keys()
    return [token for token in vocab if token in english_words or token in french_words]

# Example: Predefined English and French word lists
english_words = {"academic", "administrator", "algorithm"}  # Replace with full list
french_words = {"académique", "administrateur", "algorithme"}  # Replace with full list
filtered_vocab = filter_vocab_by_language(tokenizer, english_words, french_words)
print(f"Filtered vocabulary size (English + French): {len(filtered_vocab)}")

Filtered vocabulary size (English + French): 3


In [15]:
def get_embedding_for_word(word):
    """
    Computes the embedding for a given word dynamically.

    Parameters:
        word (str): The input word.

    Returns:
        np.ndarray: The embedding of the input word as a numpy array.
    """
    tokens = tokenizer(word, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    hidden_states = outputs.last_hidden_state
    embedding = hidden_states[0, 1:-1].mean(dim=0)  # Average token embeddings
    return embedding.numpy()

In [16]:
def precompute_embeddings(words, model, batch_size=32):
    """
    Precomputes embeddings for a list of words using batch processing.

    Parameters:
        words (list of str): The list of words to compute embeddings for.
        model (AutoModel): The pretrained model.
        batch_size (int): Number of words to process in a single batch.

    Returns:
        dict: A dictionary mapping words to their embeddings.
    """
    embeddings = {}
    for i in range(0, len(words), batch_size):
        batch = words[i:i + batch_size]
        tokens = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**tokens)
        hidden_states = outputs.last_hidden_state
        batch_embeddings = hidden_states.mean(dim=1).cpu().numpy()
        for word, embedding in zip(batch, batch_embeddings):
            embeddings[word] = embedding
    return embeddings

# Precompute embeddings for filtered_vocab
target_embeddings = precompute_embeddings(filtered_vocab, model, batch_size=64)
print(f"Precomputed embeddings for {len(target_embeddings)} words.")


Precomputed embeddings for 3 words.


In [17]:
def get_nn(word, target_embeddings, top_k=5):
    """
    Finds the nearest neighbors for a given word using cosine similarity.

    Parameters:
        word (str): The input word.
        target_embeddings (dict): Precomputed embeddings for the target words.
        top_k (int): Number of top neighbors to return.

    Returns:
        list of tuple: The top-k most similar words and their similarity scores.
    """
    input_embedding = get_embedding_for_word(word)
    input_embedding = input_embedding / np.linalg.norm(input_embedding)
    target_words = list(target_embeddings.keys())
    target_vecs = np.array(list(target_embeddings.values()))
    target_vecs_norm = target_vecs / np.linalg.norm(target_vecs, axis=1, keepdims=True)
    similarities = cosine_similarity([input_embedding], target_vecs_norm).flatten()
    nearest_idxs = similarities.argsort()[-top_k:][::-1]
    return [(target_words[i], similarities[i]) for i in nearest_idxs]

# Example: Find nearest neighbors for an input word
input_word = "academic"
nearest_neighbors = get_nn(input_word, target_embeddings, top_k=5)
print(f"Nearest neighbors for '{input_word}': {nearest_neighbors}")


Nearest neighbors for 'academic': [('academic', 0.89108837), ('administrator', 0.6060672), ('algorithm', 0.48515216)]


In [18]:
# Define hardcoded word pairs for evaluation
word_pairs = [
    ("academic", "académique"),
    ("administrator", "administrateur"),
    ("algorithm", "algorithme"),
    ("chemical", "chimique"),
    ("delicious", "délicieux"),
    ("emotion", "émotion"),
    ("exercise", "exercice"),
    ("gender", "genre"),
    ("gorilla", "gorille"),
    ("loyalty", "loyauté"),
    ("notation", "notamment"),
    ("objective", "objectif"),
    ("oratory", "oratoire"),
    ("particle", "particule"),
    ("quarter", "quartier"),
    ("september", "septembre"),
    ("skeleton", "squelette"),
    ("traditionally", "traditionnellement"),
    ("voice", "voix"),
    ("west", "ouest"),
    ("wine", "vin"),
]

def evaluate_cognate_detection(word_pairs, target_embeddings, top_k=5):
    results = []
    for input_word, expected_word in word_pairs:
        nearest_neighbors = get_nn(input_word, target_embeddings, top_k=top_k)
        found_word, similarity = nearest_neighbors[0]
        results.append({
            "input_word": input_word,
            "expected_word": expected_word,
            "found_word": found_word,
            "similarity": similarity,
            "top_k": nearest_neighbors
        })
    return results

# Evaluate the model on hardcoded word pairs
results = evaluate_cognate_detection(word_pairs, target_embeddings, top_k=5)

# Display the results
for result in results:
    print(f"Input: {result['input_word']}, Expected: {result['expected_word']}, Found: {result['found_word']}, Similarity: {result['similarity']:.4f}")


Input: academic, Expected: académique, Found: academic, Similarity: 0.8911
Input: administrator, Expected: administrateur, Found: administrator, Similarity: 0.8158
Input: algorithm, Expected: algorithme, Found: algorithm, Similarity: 0.8657
Input: chemical, Expected: chimique, Found: academic, Similarity: 0.5725
Input: delicious, Expected: délicieux, Found: academic, Similarity: 0.5127
Input: emotion, Expected: émotion, Found: academic, Similarity: 0.5216
Input: exercise, Expected: exercice, Found: administrator, Similarity: 0.6087
Input: gender, Expected: genre, Found: academic, Similarity: 0.5664
Input: gorilla, Expected: gorille, Found: administrator, Similarity: 0.3993
Input: loyalty, Expected: loyauté, Found: academic, Similarity: 0.4809
Input: notation, Expected: notamment, Found: algorithm, Similarity: 0.6812
Input: objective, Expected: objectif, Found: administrator, Similarity: 0.6143
Input: oratory, Expected: oratoire, Found: administrator, Similarity: 0.5928
Input: particle,