### Imports

In [1]:
import torch
from transformers import FlaubertModel, FlaubertTokenizer

In [2]:
modelname = 'flaubert/flaubert_large_cased' 

In [3]:
flaubert, log = FlaubertModel.from_pretrained(modelname, output_loading_info=True)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1496.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=553238687.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1561415.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=895731.0, style=ProgressStyle(descripti…




### Pure FlauBERT

In [4]:
sentence = "Le chat mange une pomme."
token_ids = torch.tensor([flaubert_tokenizer.encode(sentence)])

last_layer = flaubert(token_ids)[0]
print(last_layer.shape)
# torch.Size([1, 8, 768])  -> (batch size x number of tokens x embedding dimension)

# The BERT [CLS] token correspond to the first hidden state of the last layer
cls_embedding = last_layer[:, 0, :]

torch.Size([1, 8, 768])


In [5]:
last_layer

tensor([[[ 0.3711,  0.3744, -0.1044,  ...,  0.2788,  1.6171,  1.5743],
         [-0.3475, -0.4302, -1.2414,  ..., -0.7886, -0.2546,  0.2149],
         [-0.6474, -1.6147, -2.5066,  ...,  0.5554, -0.3403,  0.1027],
         ...,
         [-2.6465, -0.0552, -3.8672,  ..., -2.0638,  0.6116, -1.1269],
         [-1.1969, -1.1622,  1.4329,  ..., -2.3904, -0.9578,  1.3920],
         [-2.1017, -1.3640,  1.2869,  ..., -1.7465,  0.3996,  0.1256]]],
       grad_fn=<MulBackward0>)

### Custom UKP

In [8]:
from sentence_transformers import SentenceTransformer, models

In [12]:
pooling_model = models.Pooling(last_layer.shape[-1],
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

In [13]:
model = SentenceTransformer(modules=[flaubert, pooling_model])

In [14]:
corpus = ['Un homme mange un repas.',
          "Quelqu'un mange un moreceau de pain.",
          'La fille porte un bébé.',
          'Un homme est à cheval.',
          'Une femme joue du violon.',
          'Deux hommes poussent la remorque à travers les bois.',
          'Un homme est sur un cheval dans un champ cloturé.',
          'Un singe joue du tambour.',
          'Un léopard cours après sa proie.'
          ]

#### UKP way

In [22]:
word_embedding_model = models.Transformer('camembert/camembert-large')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1351074932.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=808767.0, style=ProgressStyle(descripti…




In [23]:
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [25]:
sentence_embeddings = model.encode(corpus)

# The result is a list of sentence embeddings as numpy arrays
for sentence, embedding in zip(corpus, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: Un homme mange un repas.
Embedding: [-0.09479044 -0.06176553 -0.05801992 ... -0.283704    0.01510611
 -0.0095601 ]

Sentence: Quelqu'un mange un moreceau de pain.
Embedding: [-0.01211299 -0.03671304 -0.04094827 ... -0.08826046  0.04286108
  0.14001717]

Sentence: La fille porte un bébé.
Embedding: [-0.04641659  0.13057843  0.07414299 ... -0.2190267   0.01468595
 -0.00437346]

Sentence: Un homme est à cheval.
Embedding: [-0.08473756 -0.07267409 -0.16023122 ... -0.32772416 -0.04395759
 -0.04428369]

Sentence: Une femme joue du violon.
Embedding: [-0.10331791  0.02757867  0.04416429 ... -0.04930815  0.05042475
 -0.06139763]

Sentence: Deux hommes poussent la remorque à travers les bois.
Embedding: [-0.01888581  0.08053998  0.05901992 ... -0.17994659 -0.00470037
 -0.06202056]

Sentence: Un homme est sur un cheval dans un champ cloturé.
Embedding: [ 0.09787734 -0.14155664 -0.08529281 ... -0.24030524 -0.06254562
 -0.01310084]

Sentence: Un singe joue du tambour.
Embedding: [-0.1317