# BERT Tryout
* Reference: https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html

In [1]:
# import packages
from transformers import BertTokenizer
from IPython.display import clear_output
from sentence_transformers import SentenceTransformer, util
import numpy as np
import random
import torch

In [5]:
# Load the tokenizer that belongs to the chosen pretrained BERT checkpoint.
# (Only the tokenizer is loaded in this cell; no model weights are read here.)

# Checkpoint name: BERT-BASE pretrained on Traditional and Simplified Chinese.
#
# Other checkpoint names noted as available:
#   bert-base-chinese
#   bert-base-uncased
#   bert-base-cased
#   bert-base-german-cased
#   bert-base-multilingual-uncased
#   bert-base-multilingual-cased
#   bert-large-cased
#   bert-large-uncased
#   bert-large-uncased-whole-word-masking
#   bert-large-cased-whole-word-masking
PRETRAINED_MODEL_NAME = "bert-base-chinese"

# Get the tokenizer used by this pretrained model.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Clear whatever this cell has displayed so far (presumably download/progress
# messages from from_pretrained -- TODO confirm) so only the version line remains.
clear_output()

print("PyTorch Version：", torch.__version__)

PyTorch Version： 1.7.1+cu110


In [6]:
# Keep a handle on the tokenizer's vocabulary (looked up by token in a later
# cell) and report how many entries it holds.
vocab = tokenizer.vocab
vocab_size = len(vocab)
print("字典大小：", vocab_size)

字典大小： 21128


In [7]:
# Draw 10 distinct tokens at random from the vocabulary and list them with their ids.
# `random` comes from the notebook's import cell, so it is not re-imported here.
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[token] for token in random_tokens]

# Header: "token" is padded to 20 columns so that "index" ends at column 25, the
# same column where the right-aligned ids end in the rows below (15 + 10).
print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
# `token_id` rather than `id`: a top-level loop variable stays bound in the kernel
# after the cell finishes, and `id` would shadow the builtin for every later cell.
for token, token_id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(token, token_id))

token               index          
-------------------------
made                10123
仪                     811
骨                    7755
##橋                 16635
##穢                 18008
tai                 13242
郁                    6944
social              10832
##霉                 20507
乎                     725


# BERT Sentence Similarity
* Reference: https://towardsdatascience.com/semantic-similarity-using-transformers-8f3cb5bf66d6
* Reference: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [1]:
from sentence_transformers import SentenceTransformer, util

In [3]:
# Sentence-embedding model used for the similarity check below, placed on the CPU.
# Other checkpoint names that were tried in this cell (same constructor call):
#   stsb-roberta-large
#   distiluse-base-multilingual-cased-v1
#   paraphrase-xlm-r-multilingual-v1
#   stsb-xlm-r-multilingual
model = SentenceTransformer(
    "distiluse-base-multilingual-cased-v2",
    device="cpu",
)

In [6]:
# Sentence pair to compare. An English pair that was also tried in this cell:
#   "I like Python because I can build AI applications"
#   "I like Python because I can do data analytics"
sentence1 = "晶圓製造是台灣未來的希望"
sentence2 = "黃光機台一天到晚都有問題"

# Encode each sentence with its own call to get its embedding.
embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)

# Cosine similarity between the two embeddings; .item() is used below to print
# the score as a plain number.
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

for label, text in (("Sentence 1:", sentence1), ("Sentence 2:", sentence2)):
    print(label, text)
print("Similarity score:", cosine_scores.item())

Sentence 1: 晶圓製造是台灣未來的希望
Sentence 2: 黃光機台一天到晚都很棒
Similarity score: 0.18249928951263428
