In [None]:
# 安装依赖
!pip install transformers
!pip install openai
!pip install jieba

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, http

In [None]:
import torch
import os
from transformers import AutoTokenizer, AutoModel
from argparse import Namespace
from scipy.spatial.distance import cosine

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result; element 0 holds the token embeddings.
        attention_mask: Mask tensor; it gains a trailing axis and is expanded to
            the size of the token embeddings before being used as weights.

    Returns:
        The mask-weighted sum of the embeddings over axis 1, divided by the
        mask total over axis 1 (clamped to at least 1e-9).
    """
    hidden = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    weighted_total = torch.sum(hidden * weights, 1)
    # Clamp so a fully masked row divides by 1e-9 rather than by zero.
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_total / weight_total


# Load model from HuggingFace Hub




In [None]:
# Hugging Face checkpoints whose sentence embeddings are compared below.
model_names = [
    'shibing624/text2vec-base-chinese',
    'silk-road/luotuo-bert',
    'GanymedeNil/text2vec-large-chinese',
]

# Probe sentences for the sentence-embedding test; the first one is the anchor
# that the other two are compared against.
texts = [
    "今天天气很好",
    "今天天气很差",
    "今天天气晴朗，万里无云",
]

# Extra constructor arguments that are passed only to silk-road/luotuo-bert.
model_args = Namespace(
    do_mlm=None,
    pooler_type="cls",
    temp=0.05,
    mlp_only_train=False,
    init_embeddings_model=None,
)

for name in model_names:
  print(name)
  is_luotuo = name == 'silk-road/luotuo-bert'

  load_kwargs = {'trust_remote_code': True}
  if is_luotuo:
    load_kwargs['model_args'] = model_args
  model = AutoModel.from_pretrained(name, **load_kwargs)
  tokenizer = AutoTokenizer.from_pretrained(name)

  encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

  with torch.no_grad():
    if is_luotuo:
      # luotuo-bert is asked for its pooled sentence embedding directly.
      sentence_embeddings = model(**encoded_input, output_hidden_states=True, return_dict=True, sent_emb=True).pooler_output
    else:
      model_output = model(**encoded_input)
      # Perform pooling: mean pooling weighted by the attention mask.
      sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

  # Cosine similarities are in [-1, 1]; higher means more similar.
  for idx in (1, 2):
    print(torch.nn.functional.cosine_similarity(sentence_embeddings[0], sentence_embeddings[idx], dim=0))


In [None]:
!pip install text2vec

In [None]:
import sys

sys.path.append('..')
from text2vec import SentenceModel
from text2vec import Word2Vec
import numpy as np

def cosine_similarity(x, y):
    """Return the cosine similarity of ``x`` and ``y``.

    Computed as ``np.dot(x, y)`` divided by the product of the two
    ``np.linalg.norm`` values.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return np.dot(x, y) / norm_product

def compute_emb(model,name):
  """Encode the module-level ``texts`` with ``model`` and print similarity scores.

  Prints ``name``, then the type and shape of the encoded result, then the
  cosine similarity of the first embedding against the second and the third.

  Args:
    model: Object exposing ``encode``; it is called with the ``texts`` list.
    name: Label printed before the results.
  """
  print(name)
  vectors = model.encode(texts)
  print(type(vectors), vectors.shape)

  anchor = vectors[0]
  for idx in (1, 2):
    print(cosine_similarity(anchor, vectors[idx]))


# Chinese sentence-embedding model (CoSENT): recommended for Chinese semantic
# matching tasks; supports continued fine-tuning.
T2V_NAME = 'shibing624/text2vec-base-chinese'
t2v_model = SentenceModel(T2V_NAME)
compute_emb(t2v_model, T2V_NAME)

# Multilingual sentence-embedding model (Sentence-BERT): recommended for English
# semantic matching tasks; supports continued fine-tuning.
SBERT_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
sbert_model = SentenceModel(SBERT_NAME)
compute_emb(sbert_model, SBERT_NAME)

# Chinese word-embedding model (word2vec): suited to literal (surface-form)
# matching tasks and cold-start scenarios.
W2V_NAME = 'w2v-light-tencent-chinese'
w2v_model = Word2Vec(W2V_NAME)
compute_emb(w2v_model, W2V_NAME)


In [None]:
import jieba

# Build each sentence vector by hand: segment the sentence with jieba, embed
# every word with the word2vec model, average the word vectors, then compare.
SKIPPED_TOKENS = {"，"}  # punctuation that should not contribute to the average

embedings = []
for text in texts:
  # lcut returns the token list directly (accurate mode). The earlier
  # " ".join(...).split(" ") round-trip produced empty-string tokens whenever
  # the sentence itself contained a space, diluting the average.
  words = [
      w for w in jieba.lcut(text, cut_all=False)
      if w.strip() and w not in SKIPPED_TOKENS
  ]
  if not words:
    # Averaging an empty matrix would silently yield NaN similarities.
    raise ValueError(f"no embeddable tokens in {text!r}")
  matrix = np.array([w2v_model.encode(w) for w in words])
  # Average along the word axis to get a single vector for the sentence.
  mean = np.mean(matrix, axis=0)
  embedings.append(mean)

print(cosine_similarity(embedings[0], embedings[1]))
print(cosine_similarity(embedings[0], embedings[2]))

0.9204265172093801
0.7687565139133027


In [13]:
# openai 计算句向量
!pip install openai[embeddings]


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai[embeddings]
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-2.0.0.230412-py3-none-any.whl (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting types-pytz>=2022.1.1
  Downloading types_pytz-2023.3.0.0-py3-none-any.whl (4.7 kB)
Collecting multidict<7.0,>=4.5
  Down

In [15]:
import openai
import getpass
import os

# Never store a real key in the notebook: read it from the OPENAI_API_KEY
# environment variable, and fall back to a hidden interactive prompt when the
# variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Embedding models to compare.
model_ids = [
    # 'gpt-3.5-turbo',  # this model does not support embeddings
    'text-embedding-ada-002',
    "text-similarity-davinci-001",
    "text-search-ada-doc-001"
]
for model_id in model_ids:
  embedings = []
  for text in texts:
    # The response carries the embedding as a list of floats.
    embedding = openai.Embedding.create(input=text, model=model_id)['data'][0]['embedding']
    print("len",len(embedding))
    embedings.append(np.array(embedding))

  # Similarity of the first sentence to the second, then to the third.
  print(model_id)
  print(cosine_similarity(embedings[0], embedings[1]))
  print(cosine_similarity(embedings[0], embedings[2]))

len 1536
len 1536
len 1536
text-embedding-ada-002
0.9297494324102838
0.9159156752383093
len 12288
len 12288
len 12288
text-similarity-davinci-001
0.8640324273741014
0.9050137753062327
len 1024
len 1024
len 1024
text-search-ada-doc-001
0.974462972994041
0.890246260231079


In [1]:
# 测试 llama
! wget https://huggingface.co/Mabbs/chinese-Alpaca-lora-7b-ggml/resolve/main/ggml-model-q4_0.bin

--2023-04-27 04:49:28--  https://huggingface.co/Mabbs/chinese-Alpaca-lora-7b-ggml/resolve/main/ggml-model-q4_0.bin
Resolving huggingface.co (huggingface.co)... 13.249.85.92, 13.249.85.127, 13.249.85.69, ...
Connecting to huggingface.co (huggingface.co)|13.249.85.92|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/ef/d3/efd3bb2456cfd758cef490dcd8acb14c5aa5c7a9cae386d3b7447f6b41b3a32a/399d858ec1e45f277c9a7c61a9cd7dbbed0aa2a357c92a6fd478b3c5bbf803e1?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27ggml-model-q4_0.bin%3B+filename%3D%22ggml-model-q4_0.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1682830169&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZG4tbGZzLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2VmL2QzL2VmZDNiYjI0NTZjZmQ3NThjZWY0OTBkY2Q4YWNiMTRjNWFhNWM3YTljYWUzODZkM2I3NDQ3ZjZiNDFiM2EzMmEvMzk5ZDg1OGVjMWU0NWYyNzdjOWE3YzYxYTljZDdkYmJlZDBhYTJhMzU3YzkyYTZmZDQ3OGIzYzViYmY4MDNlMT9yZXNwb25zZS

In [2]:
!pip install llama-cpp-python


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.1.38.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.1.38-cp39-cp39-linux_x86_64.whl size=164837 sha256=7b4c41d6ac0d49c66282f24660747630895157214d9696a3641415ba2dac52fc
  Stored in directory: /root/.cache/pip/wheels/2a/2d/67/6f8385807f0fe541d2fe6ce446c3d5e75984828a7f6f09c992
Successfully built llama-cpp-python
Installing collected packages: llama-cpp-python
Successfully inst

In [16]:
from llama_cpp import Llama
import numpy as np


def cosine_similarity(x, y):
    """Cosine similarity of ``x`` and ``y``: their dot product over the product of their norms."""
    dot = np.dot(x, y)
    return dot / (np.linalg.norm(x) * np.linalg.norm(y))

# Load the local model file with embedding output enabled.
llm = Llama(model_path='./ggml-model-q4_0.bin', embedding=True)

# One embedding per probe sentence, stored as a NumPy array.
embedings = []
for text in texts:
  response = llm.create_embedding(text)
  embedding = response['data'][0]['embedding']
  print("len",len(embedding))
  embedings.append(np.array(embedding))

# Similarity of the first sentence to the second, then to the third.
for idx in (1, 2):
  print(cosine_similarity(embedings[0], embedings[idx]))


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


len 4096
len 4096
len 4096
0.8367674190917038
0.583370587132951
