### Embedding contextuel : BERT

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pretrained BERT tokenizer and encoder from a single checkpoint name,
# so the vocabulary and the weights always come from the same model.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [9]:
# Two sentences that use the word "bank" in different contexts
sentence_1 = "The bank is near the river."
sentence_2 = "I deposited money at the bank yesterday."


def _embed_word_in_context(sentence, word, tokenizer, model):
    """Encode `sentence` and pick out the hidden-state vector for `word`.

    The tokenizer and encoder are passed in explicitly rather than read from
    globals, so the helper keeps working even if those names are rebound
    later in the notebook.

    Args:
        sentence: Raw text to encode.
        word: Token to look up. It must appear verbatim in the tokenized
            sentence; only the first occurrence is used.
        tokenizer: Tokenizer used to encode the sentence and to map ids
            back to tokens.
        model: Encoder whose output exposes `last_hidden_state`.

    Returns:
        A tuple `(inputs, outputs, tokens, index, embedding)`: the encoded
        inputs, the raw model outputs, the token strings, the position of
        `word` among those tokens, and the hidden-state vector at that
        position.

    Raises:
        ValueError: If `word` is not among the produced tokens.
    """
    inputs = tokenizer(sentence, return_tensors="pt")

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # The tokens are recovered from the exact ids fed to the model, so a
    # token's position in this list is also its position in the output.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    index = tokens.index(word)

    embedding = outputs.last_hidden_state[0, index, :]
    return inputs, outputs, tokens, index, embedding


# Extract the embedding of "bank" in each sentence
(inputs_1, outputs_1, tokens_1,
 index_bank_1, embedding_bank_1) = _embed_word_in_context(
    sentence_1, "bank", tokenizer, model
)
(inputs_2, outputs_2, tokens_2,
 index_bank_2, embedding_bank_2) = _embed_word_in_context(
    sentence_2, "bank", tokenizer, model
)


In [10]:
# Shape of the "bank" vector taken from the first sentence
embedding_bank_1.size()

torch.Size([768])

In [11]:
# Shape of the "bank" vector taken from the second sentence
embedding_bank_2.size()

torch.Size([768])

In [12]:
# Cosine similarity between the two contextual "bank" vectors
from torch.nn.functional import cosine_similarity

# Give each 1-D vector a leading axis so the pair is compared as two rows.
bank_1_row = embedding_bank_1.unsqueeze(0)
bank_2_row = embedding_bank_2.unsqueeze(0)
similarity = cosine_similarity(bank_1_row, bank_2_row)

print(f"Similarité entre 'bank' dans deux contextes : {similarity.item():.4f}")


Similarité entre 'bank' dans deux contextes : 0.4829


### Embedding statique : Word2Vec

In [13]:
from gensim.models import Word2Vec

# Toy corpus in which "bank" appears in several different contexts
corpus = [
    ["he", "sat", "by", "the", "river", "bank"],
    ["he", "went", "to", "the", "bank", "to", "deposit", "money"],
    ["the", "river", "bank", "is", "beautiful"],
    ["she", "opened", "an", "account", "at", "the", "bank"],
    ["the", "river", "flows", "near", "the", "bank"],
]

# Train a Word2Vec model.
# Training is stochastic: an explicit seed together with a single worker
# thread is what gensim documents as required for a repeatable run.
# NOTE(review): the vector printed in the next cell may differ from the
# previously saved output once these settings are applied.
# NOTE: this rebinds `model`, which held the BERT encoder in the cells above;
# re-running the BERT cells after this one requires reloading that encoder.
model = Word2Vec(
    sentences=corpus,
    vector_size=50,
    window=3,
    min_count=1,
    sg=0,
    seed=1,
    workers=1,
)

# Show which words ended up in the vocabulary
print("Vocabulaire :", model.wv.key_to_index.keys())


Vocabulaire : dict_keys(['the', 'bank', 'river', 'he', 'to', 'deposit', 'sat', 'by', 'went', 'near', 'flows', 'is', 'beautiful', 'she', 'opened', 'an', 'account', 'at', 'money'])


In [14]:
# Look up the vector Word2Vec stores for "bank" and measure its length
embedding_bank = model.wv["bank"]
embedding_dim = len(embedding_bank)

print("Embedding pour 'bank' :", embedding_bank)
print("Dimension de l'embedding :", embedding_dim)


Embedding pour 'bank' : [-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702 -0.00315531  0.00064274
 -0.00828126 -0.01536538 -0.00301602  0.00493959 -0.00177605  0.01106732
 -0.00548595  0.00452013  0.01091159  0.01669191 -0.00290748 -0.01841629
  0.0087411   0.00114357  0.01488382 -0.00162657 -0.00527683 -0.01750602
 -0.00171311  0.00565313  0.01080286  0.01410531 -0.01140624  0.00371764
  0.01217773 -0.0095961  -0.00621452  0.01359526  0.00326295  0.00037983
  0.00694727  0.00043555  0.01923765  0.01012121 -0.01783478 -0.01408312
  0.00180291  0.01278507]
Dimension de l'embedding : 50
