In [1]:
# English sample documents used as the corpus in this notebook.
docs_en = [
    "Machine learning is changing our way of life.",
    "Deep learning performs exceptionally well in image recognition.",
    "Natural language processing is an important field in computer science.",
    "Autonomous driving relies on advanced algorithms.",
    "AI can help doctors diagnose diseases.",
    "Data analysis technology is widely applied in the financial field.",
    "Production efficiency can be improved through automation technology.",
    "The future of machine intelligence is full of potential.",
    "Big data support is key to the development of machine intelligence.",
    # A deliberately long, off-topic sentence (physics rather than AI); split
    # across lines with implicit string concatenation, so it is one list item.
    (
        "The quantum tunneling effect allows electrons to pass through potential "
        "barriers that classical mechanics consider impassable, which has important "
        "applications in semiconductor devices."
    ),
]

使用 ModelScope 上的模型生成文档集合的稀疏向量。原文章中使用的是 Hugging Face 上的模型：
```python
model_name = "naver/splade-cocondenser-selfdistil"
```

由于国内无法直接访问 Hugging Face，这里改用了 "ibm-granite/granite-embedding-30m-sparse" 模型，两者的结果相差不大。


In [6]:
from modelscope import snapshot_download
from pymilvus.model.sparse import SpladeEmbeddingFunction

# Query text; it is tokenized and encoded in a later cell.
query_en = ["How does artificial intelligence affect the automotive industry?"]

# Model id passed to ModelScope's snapshot_download.
model_name = "ibm-granite/granite-embedding-30m-sparse"

# Step 1: download the model and keep the local directory it was stored in.
model_dir = snapshot_download(model_name)

# Step 2: initialise SpladeEmbeddingFunction from that local directory
# (passed as model_name), running on the CPU.
splade_ef = SpladeEmbeddingFunction(model_name=model_dir, device="cpu")

# Report where the model files were placed.
print(f"模型已下载至: {model_dir}")

# Encode the document collection into sparse vectors and display the result.
sparse_vectors_splade = splade_ef.encode_documents(docs_en)
print(sparse_vectors_splade)

Downloading Model from https://www.modelscope.cn to directory: /mnt/workspace/.cache/modelscope/models/ibm-granite/granite-embedding-30m-sparse


2025-09-08 11:00:59,167 - modelscope - INFO - Got 15 files, start to download ...


Processing 15 items:   0%|          | 0.00/15.0 [00:00<?, ?it/s]

Downloading [1_SpladePooling/config.json]:   0%|          | 0.00/106 [00:00<?, ?B/s]

Downloading [config_sentence_transformers.json]:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading [config.json]:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading [merges.txt]:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading [model.safetensors]:   0%|          | 0.00/57.9M [00:00<?, ?B/s]

Downloading [configuration.json]:   0%|          | 0.00/77.0 [00:00<?, ?B/s]

Downloading [modules.json]:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading [pytorch_model.bin]:   0%|          | 0.00/296M [00:00<?, ?B/s]

Downloading [README.md]:   0%|          | 0.00/12.4k [00:00<?, ?B/s]

Downloading [sentence_bert_config.json]:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

Downloading [special_tokens_map.json]:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading [tokenizer.json]:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading [tokenizer_config.json]:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading [vocab.json]:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading [zero_to_fp32.py]:   0%|          | 0.00/23.1k [00:00<?, ?B/s]

2025-09-08 11:01:03,034 - modelscope - INFO - Download model 'ibm-granite/granite-embedding-30m-sparse' successfully.


模型已下载至: /mnt/workspace/.cache/modelscope/models/ibm-granite/granite-embedding-30m-sparse
  (0, 4)	0.2877643406391144
  (0, 52)	0.4842591881752014
  (0, 55)	0.013229835778474808
  (0, 92)	0.2468601018190384
  (0, 122)	0.5920421481132507
  (0, 169)	0.03369762748479843
  (0, 197)	0.095177561044693
  (0, 210)	0.09415344893932343
  (0, 301)	1.1498292684555054
  (0, 448)	0.1370851695537567
  (0, 464)	1.1264784336090088
  (0, 474)	0.18824294209480286
  (0, 505)	0.03884601965546608
  (0, 532)	0.24697820842266083
  (0, 555)	0.026394851505756378
  (0, 596)	0.1440160721540451
  (0, 622)	0.37694254517555237
  (0, 666)	0.30827078223228455
  (0, 712)	0.1144619882106781
  (0, 806)	0.0590529665350914
  (0, 1022)	0.939770519733429
  (0, 1050)	0.22106267511844635
  (0, 1131)	0.3955904245376587
  (0, 1437)	0.5205164551734924
  (0, 1477)	0.45300784707069397
  :	:
  (9, 35233)	0.015529687516391277
  (9, 35235)	1.2405478954315186
  (9, 35644)	0.10154350847005844
  (9, 35772)	0.03416196629405022
  (9, 36713)

In [8]:
# Compare the tokenizer's vocabulary size with the dimension reported by the
# SPLADE embedding function.
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Load the tokenizer from the same local model directory used for splade_ef.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
splade_vocab_size = tokenizer.vocab_size
print(f"splade词汇表中的单词数量：{splade_vocab_size}")

splade_vector_dim = splade_ef.dim
print(f"splade稀疏向量维度：{splade_vector_dim}")

splade词汇表中的单词数量：50265
splade稀疏向量维度：50265


In [9]:
# Tokenize the query to see which tokens the tokenizer produces for it.
query_text = query_en[0]
tokens = tokenizer.tokenize(query_text)
print(f"“{query_text}” 的分词结果：\n{tokens}")
print(f"tokens数量：{len(tokens)}")

# Encode the query into a sparse vector and display it.
query_sparse_vectors_splade = splade_ef.encode_queries(query_en)
print(query_sparse_vectors_splade)

“How does artificial intelligence affect the automotive industry?” 的分词结果：
['How', 'Ġdoes', 'Ġartificial', 'Ġintelligence', 'Ġaffect', 'Ġthe', 'Ġautomotive', 'Ġindustry', '?']
tokens数量：9
  (0, 138)	0.7028489708900452
  (0, 210)	0.15836457908153534
  (0, 216)	0.02560880221426487
  (0, 250)	0.0022231058683246374
  (0, 505)	0.2783266603946686
  (0, 512)	1.5052143335342407
  (0, 539)	1.3155847787857056
  (0, 596)	0.06378108263015747
  (0, 806)	0.09540452063083649
  (0, 913)	0.12891486287117004
  (0, 921)	0.23910488188266754
  (0, 931)	0.11032351851463318
  (0, 1050)	0.09377873688936234
  (0, 1078)	0.24715512990951538
  (0, 1272)	0.14475324749946594
  (0, 1293)	0.12612946331501007
  (0, 1653)	0.5624660849571228
  (0, 1683)	0.9857549667358398
  (0, 2239)	0.7662491798400879
  (0, 2267)	0.24620868265628815
  (0, 2316)	1.452373743057251
  (0, 2655)	0.14290887117385864
  (0, 2900)	0.18022646009922028
  (0, 2903)	0.04654909670352936
  (0, 2991)	0.14969828724861145
  :	:
  (0, 6741)	0.4997083246707

观察结果可以发现，分词数量（9个）和稀疏向量中非零元素的数量并不一样。这就是 SPLADE 和 BM25 的重要区别：SPLADE 可以举一反三，在最初9个分词的基础上，又增加了其他语义相近的单词。

那么，查询现在一共有多少个单词呢？或者说，它的稀疏向量的非零元素有多少呢？



In [10]:
# Row 0 is the only query. The slice indptr[0]:indptr[1] delimits that row's
# stored entries; the same slice applies to `indices` (vocabulary ids) and to
# `data` (their weights).
# NOTE(review): assumes CSR storage without duplicate entries, as the original
# indptr/indices slicing already implied — confirm against encode_queries.
row_start = query_sparse_vectors_splade.indptr[0]
row_end = query_sparse_vectors_splade.indptr[1]
nonzero_indices = query_sparse_vectors_splade.indices[row_start:row_end]
nonzero_weights = query_sparse_vectors_splade.data[row_start:row_end]

# Pair each stored vocabulary id, decoded back to its token text, with its weight.
# Reading the weights from `data` avoids one sparse-matrix lookup per entry.
sparse_token_weights = [
    (splade_ef.model.tokenizer.decode(col), weight)
    for col, weight in zip(nonzero_indices, nonzero_weights)
]

# Sort by weight, highest first.
sparse_token_weights = sorted(sparse_token_weights, key=lambda item: item[1], reverse=True)

# The query has only 9 tokens, yet the sparse vector holds many more non-zero
# entries because SPLADE adds related terms. The count is printed rather than
# hard-coded in this comment, since it depends on the model in use.
print(f"splade 稀疏向量非零元素数量：{len(sparse_token_weights)}")

splade 稀疏向量非零元素数量：67


In [11]:
# Print every (token, weight) pair in the sorted list. For example, "AI" is
# close in meaning to "artificial intelligence", and "car" to "automotive".
for token_weight in sparse_token_weights:
    print(token_weight)

(' AI', 1.5240402)
(' car', 1.5052143)
(' intelligence', 1.4523737)
(' industry', 1.3155848)
(' affect', 1.3052505)
(' artificial', 1.2752196)
(' Motor', 1.0611075)
(' effect', 0.98575497)
(' auto', 0.94583946)
('otive', 0.78177077)
(' learning', 0.7662492)
(' company', 0.702849)
('econom', 0.64115614)
(' computer', 0.60182554)
(' bias', 0.57746017)
(' Car', 0.5624661)
(' advantages', 0.50595564)
(' Industrial', 0.49970832)
(' Analytics', 0.46311656)
(' engineering', 0.46053436)
(' manufacturing', 0.45494542)
(' advancements', 0.34386268)
(' machine', 0.33768395)
(' regression', 0.28803518)
(' important', 0.27832666)
(' philosophy', 0.26905033)
(' Data', 0.25442016)
(' safety', 0.24715513)
(' consumer', 0.24620868)
(' automatic', 0.24117644)
(' road', 0.23910488)
(' engine', 0.2355993)
(' robot', 0.23191127)
(' Autom', 0.23061593)
(' PhD', 0.2206182)
(' accuracy', 0.19876806)
(' factor', 0.19704793)
(' IoT', 0.19466947)
(' prediction', 0.18816991)
('EV', 0.18229432)
(' brain', 0.180226

SPLADE 增加了大量语义相近的单词，比如和“artificial intelligence”语义相近的“AI”，和“automotive”语义相近的“car”。