<a href="https://colab.research.google.com/github/changedi/DPpro/blob/master/embeddingModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# gte-large-zh


In [None]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

# Score the first text (the query) against every remaining text using
# embeddings produced by the raw `transformers` model.
input_texts = [
    "中国的首都是哪里",
    "你喜欢去哪里旅游",
    "北京",
    "今天中午吃什么"
]

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-large-zh")
model = AutoModel.from_pretrained("thenlper/gte-large-zh")

# Tokenize the input texts (padded to the longest text, truncated at 512 tokens)
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Inference only: disable autograd so no computation graph is built or kept
# alive for the forward pass.
with torch.no_grad():
    outputs = model(**batch_dict)

# Keep the hidden state at sequence position 0 of every text
# (presumably the [CLS] token -- confirm against the model card).
embeddings = outputs.last_hidden_state[:, 0]

# L2-normalize so that the dot product below is a cosine similarity
embeddings = F.normalize(embeddings, p=2, dim=1)

# Row 0 is the query; rows 1.. are the candidates. Scores are scaled by 100.
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())


In [5]:
!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

import time

# Encode three sentences with gte-large-zh via sentence-transformers, print
# the embeddings, the three pairwise cosine similarities, and the encode time.
sentences = ['如何构建一个大模型业务应用', '开发大语言模型Application的方法有哪些', '如何使用ECS搭建运维助手']

model = SentenceTransformer('thenlper/gte-large-zh')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
T1 = time.perf_counter()
embeddings = model.encode(sentences)
T2 = time.perf_counter()

print(embeddings)
# Pairwise similarities: (0, 1), (0, 2), (1, 2)
print(cos_sim(embeddings[0], embeddings[1]))
print(cos_sim(embeddings[0], embeddings[2]))
print(cos_sim(embeddings[1], embeddings[2]))
# Elapsed encode time, converted from seconds to milliseconds
print('程序运行时间:%s毫秒' % ((T2 - T1)*1000))



[[-0.0268689  -0.01580627 -0.01476139 ... -0.01111243 -0.02570463
  -0.03370997]
 [ 0.02440324  0.0036326  -0.02813137 ... -0.06495292 -0.01951394
  -0.03629984]
 [ 0.03203386  0.01059715 -0.05863168 ... -0.02679722 -0.02649011
   0.00272422]]
tensor([[0.7547]])
tensor([[0.5318]])
tensor([[0.5093]])
程序运行时间:36.41772270202637毫秒


In [4]:
# Time the encoding of a single sentence.
# NOTE: this cell relies on `model` and `time` being defined by an earlier
# cell; it encodes with whichever model `model` currently refers to, so run
# the notebook top to bottom.
sentences = ['如何构建一个大模型业务应用']

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
T1 = time.perf_counter()
embeddings = model.encode(sentences)
T2 = time.perf_counter()

print(embeddings)
# Elapsed encode time, converted from seconds to milliseconds
print('程序运行时间:%s毫秒' % ((T2 - T1)*1000))

[[-0.02686889 -0.01580627 -0.01476139 ... -0.01111245 -0.02570462
  -0.03370999]]
程序运行时间:40.80557823181152毫秒


# BGE-large-zh

In [7]:
# Encode the same three sentences with BAAI/bge-large-zh-v1.5, print the
# embeddings, the three pairwise cosine similarities, and the encode time.
#
# `time` and `cos_sim` are imported here so that this cell does not depend
# on names leaked from an earlier cell.
import time

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

sentences = ['如何构建一个大模型业务应用', '开发大语言模型Application的方法有哪些', '如何使用ECS搭建运维助手']

model = SentenceTransformer('BAAI/bge-large-zh-v1.5')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
T1 = time.perf_counter()
embeddings = model.encode(sentences, normalize_embeddings=True)
T2 = time.perf_counter()

print(embeddings)
# Pairwise similarities: (0, 1), (0, 2), (1, 2)
print(cos_sim(embeddings[0], embeddings[1]))
print(cos_sim(embeddings[0], embeddings[2]))
print(cos_sim(embeddings[1], embeddings[2]))
# Elapsed encode time, converted from seconds to milliseconds
print('程序运行时间:%s毫秒' % ((T2 - T1)*1000))


[[-0.00188019  0.00298401 -0.04403339 ... -0.00515228 -0.00402004
  -0.0243677 ]
 [ 0.0108217   0.01520945 -0.04001054 ... -0.01379857  0.01848038
  -0.03649398]
 [ 0.01073227 -0.00615992 -0.00894783 ...  0.02641496  0.01565628
  -0.02235479]]
tensor([[0.7287]])
tensor([[0.4789]])
tensor([[0.4413]])
程序运行时间:32.276153564453125毫秒
