# Multilingual-E5-large

* https://huggingface.co/intfloat/multilingual-e5-large

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/data-science-wiki/blob/main/nlp/text_embedding(japanese)/multilingual-e5-large.ipynb)

In [3]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

#### Model

In [7]:
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')

In [5]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

#### Data Set Preparation

In [10]:
texts = [
    '機械学習の勉強をする',
    'あいうえお',
    '量子コンピュータは量子力学の原理を応用したコンピュータ'
]

In [11]:
batch_dict = tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)

In [12]:
embeddings

tensor([[ 0.0120, -0.0002, -0.0287,  ..., -0.0018, -0.0049,  0.0056],
        [ 0.0434,  0.0439, -0.0173,  ...,  0.0083, -0.0311,  0.0187],
        [ 0.0409,  0.0239, -0.0122,  ...,  0.0203, -0.0263,  0.0144]],
       grad_fn=<DivBackward0>)