# Sentence-LUKE
* https://huggingface.co/sonoisa/sentence-luke-japanese-base-lite

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/data-science-wiki/blob/main/nlp/text_embedding(japanese)/sentenceluke.ipynb)

In [1]:
%%capture
!pip install transformers
!pip install sentencepiece

In [1]:
from transformers import MLukeTokenizer, LukeModel
import torch

#### Model

In [2]:
class SentenceLukeJapanese:
    """Sentence-embedding wrapper around a LUKE model and its MLuke tokenizer.

    Sentences are tokenized in batches, run through the model, and the
    resulting token embeddings are mean-pooled (weighted by the attention
    mask) into one vector per sentence.
    """

    def __init__(self, model_name_or_path, device=None):
        """Load the tokenizer and model, then move the model to ``device``.

        Args:
            model_name_or_path: Value handed to ``from_pretrained`` for both
                the tokenizer and the model.
            device: Anything accepted by ``torch.device``. When ``None``,
                "cuda" is chosen if available, otherwise "cpu".
        """
        self.tokenizer = MLukeTokenizer.from_pretrained(model_name_or_path)
        self.model = LukeModel.from_pretrained(model_name_or_path)
        self.model.eval()  # this wrapper is inference-only

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings along dim 1, weighted by ``attention_mask``.

        Args:
            model_output: Indexable model output whose first element holds
                the token embeddings (assumed (batch, seq_len, hidden) --
                confirm against the model in use).
            attention_mask: Mask with one entry per token; it is broadcast
                over the last dimension of the token embeddings.

        Returns:
            The masked sum over dim 1 divided by the mask sum over dim 1.
        """
        # First element of model_output contains all token embeddings.
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        masked_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a row whose mask is all zeros does not divide by zero.
        mask_total = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return masked_sum / mask_total

    @torch.no_grad()
    def encode(self, sentences, batch_size=8):
        """Embed ``sentences`` and return the embeddings as one CPU tensor.

        Args:
            sentences: Sliceable sequence of strings supporting ``len``.
            batch_size: Number of sentences per forward pass; must be >= 1.

        Returns:
            A tensor on the CPU with one row per input sentence. For empty
            input the result has shape ``(0, hidden_size)``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # len() rather than truthiness so array-like inputs are handled too.
        if len(sentences) == 0:
            # Concatenating zero batches would raise, so return an empty
            # result whose width still matches the model's hidden size.
            return torch.empty((0, self.model.config.hidden_size))

        pooled_batches = []
        for start in range(0, len(sentences), batch_size):
            # Materialize as a list so the tokenizer sees a batch of strings.
            batch = list(sentences[start:start + batch_size])

            # Calling the tokenizer directly supersedes the deprecated
            # batch_encode_plus; for a list of strings it batches the same way.
            encoded_input = self.tokenizer(
                batch, padding="longest", truncation=True, return_tensors="pt"
            ).to(self.device)
            model_output = self.model(**encoded_input)
            pooled = self._mean_pooling(
                model_output, encoded_input["attention_mask"]
            )
            # Move each batch off the device right away to free its memory.
            pooled_batches.append(pooled.to("cpu"))

        return torch.cat(pooled_batches, dim=0)

# Build the encoder from the pretrained Japanese Sentence-LUKE checkpoint.
model = SentenceLukeJapanese(
    model_name_or_path="sonoisa/sentence-luke-japanese-base-lite",
)

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading (…)in/entity_vocab.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/532M [00:00<?, ?B/s]

#### Data Set Preparation

In [6]:
# Sample Japanese sentences to embed.
text = [
    "機械学習の勉強をする",  # "Study machine learning"
    "あいうえお",  # the first five hiragana characters
    # "A quantum computer is a computer that applies the principles of
    # quantum mechanics"
    "量子コンピュータは量子力学の原理を応用したコンピュータ",
]

In [7]:
# Embed the sample sentences, processing up to 8 of them per forward pass.
embeddings = model.encode(sentences=text, batch_size=8)

In [8]:
# Bare expression: the notebook renders the embedding tensor as the cell output.
embeddings

tensor([[-0.3750, -0.2552, -0.0532,  ...,  0.2665,  0.0394, -0.0278],
        [-0.2394,  0.2256,  0.2355,  ..., -0.4577,  0.5773, -0.3476],
        [-0.7417, -0.5376,  0.0325,  ...,  0.0666, -0.4599, -0.1132]])