# SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking

This notebook gives a minimal example usage of SPLADE.

* We provide models via Hugging Face (https://huggingface.co/naver)
* See [Naver Labs Europe website](https://europe.naverlabs.com/research/machine-learning-and-optimization/splade-models/) for other intermediate models.

| model | MRR@10 (MS MARCO dev) | recall@1000 (MS MARCO dev) | expected FLOPS | ~ avg q length | ~ avg d length | 
| --- | --- | --- | --- | --- | --- |
| `naver/splade_v2_max` (**v2** [HF](https://huggingface.co/naver/splade_v2_max)) | 34.0 | 96.5 | 1.32 | 18 | 92 |
| `naver/splade_v2_distil` (**v2** [HF](https://huggingface.co/naver/splade_v2_distil)) | 36.8 | 97.9 | 3.82 | 25 | 232 |
| `naver/splade-cocondenser-selfdistil` (**v2bis**, [HF](https://huggingface.co/naver/splade-cocondenser-selfdistil))| 37.6 | 98.4 | 2.32 | 56 | 134 |
| `naver/splade-cocondenser-ensembledistil` (**v2bis**, [HF](https://huggingface.co/naver/splade-cocondenser-ensembledistil)) | 38.3 | 98.3  | 1.85 | 44 | 120 |

In [3]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from splade.models.transformer_rep import Splade

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# set the dir for trained weights

##### v2
# model_type_or_dir = "naver/splade_v2_max"
# model_type_or_dir = "naver/splade_v2_distil"

### v2bis, directly download from Hugging Face
# model_type_or_dir = "naver/splade-cocondenser-selfdistil"
model_type_or_dir = "naver/splade-cocondenser-ensembledistil"

In [3]:
# loading model and tokenizer

model = Splade(model_type_or_dir, agg="max")
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_type_or_dir)
reverse_voc = {v: k for k, v in tokenizer.vocab.items()}

Downloading: 100%|██████████| 670/670 [00:00<00:00, 1.62MB/s]
Downloading: 100%|██████████| 418M/418M [00:47<00:00, 9.25MB/s] 
  return torch.load(checkpoint_file, map_location="cpu")
Downloading: 100%|██████████| 466/466 [00:00<00:00, 1.30MB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 1.03MB/s]
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 3.50MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 322kB/s]


In [4]:
# example document from MS MARCO passage collection (doc_id = 8003157)

doc = "Glass and Thermal Stress. Thermal Stress is created when one area of a glass pane gets hotter than an adjacent area. If the stress is too great then the glass will crack. The stress level at which the glass will break is governed by several factors."

In [42]:
# now compute the document representation
with torch.no_grad():
    doc_rep = model(d_kwargs=tokenizer(doc, return_tensors="pt"))["d_rep"].squeeze()  # (sparse) doc rep in voc space, shape (30522,)

# get the number of non-zero dimensions in the rep:
col = torch.nonzero(doc_rep).squeeze().cpu().tolist()
print("number of actual dimensions: ", len(col))

# now let's inspect the bow representation:
weights = doc_rep[col].cpu().tolist()
print(weights)
d = {k: v for k, v in zip(col, weights)}
sorted_d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
bow_rep = []
for k, v in sorted_d.items():
    bow_rep.append((reverse_voc[k], round(v, 2)))
print("SPLADE BOW rep:\n", bow_rep)

number of actual dimensions:  12
[0.0828016921877861, 1.2767744064331055, 1.948005199432373, 1.853646159172058, 1.8524913787841797, 1.068393588066101, 1.1146529912948608, 0.7086443305015564, 0.9308910369873047, 1.2376736402511597, 1.111029863357544, 1.1254249811172485]
SPLADE BOW rep:
 [('ĠÐ²ÑĭÑĪÐµ', 1.95), ('Ð½Ð¸ÑĤÐµ', 1.85), ('ĠÑģÐ¾ÑģÑĤÐ¾ÑıÐ½Ð¸Ñı', 1.85), ('ill', 1.28), ('Ð»ÑĮÐ·ÑĥÐ¹ÑĤÐµÑģÑĮ', 1.24), ('ĠÑįÑĦÑĦÐµÐºÑĤÐ¸Ð²Ð½ÑĭÑħ', 1.13), ('ĠÐ»ÑĥÑĩÑĪÐ¸Ñħ', 1.11), ('ĠÑĢÐµÐºÐ»Ð°Ð¼Ð½ÑĭÑħ', 1.11), ('ĠÐ¿ÐµÑĢÐ²ÑĭÑħ', 1.07), ('ĠÑģÐ¿Ð¾ÑĢÑĤÐ¸Ð²Ð½ÑĭÑħ', 0.93), ('ĠÐ¿ÑĢÐ¾Ð¸Ð·Ð²Ð¾Ð´Ð¸ÑĤÐµÐ»ÐµÐ¹', 0.71), ('Ð»', 0.08)]


  with torch.cuda.amp.autocast() if self.fp16 else NullContextManager():
  with torch.cuda.amp.autocast() if self.fp16 else NullContextManager():


Inference from deepvk

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Prepare model
model_id = "deepvk/RuModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)



  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Prepare input
text = "Моя [MASK] громко лает."
inputs = tokenizer(text, return_tensors="pt")
masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)


In [7]:
inputs

{'input_ids': tensor([[50281,  8975,  3857,   390,  8272,   535, 16906,   335,   516, 50284,
            20, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
# Make prediction
outputs = model(**inputs)
outputs

MaskedLMOutput(loss=None, logits=tensor([[[-3.7083, -3.3538,  3.1179,  ..., -3.7091, -3.7051, -3.7077],
         [ 1.4417,  1.0466,  4.4271,  ...,  1.4445,  1.4185,  1.4389],
         [-1.1383, -2.6046, -0.6090,  ..., -1.1413, -1.1439, -1.1374],
         ...,
         [-3.2155, -5.7025,  3.1153,  ..., -3.2167, -3.2152, -3.2150],
         [-3.3494, -4.5575,  3.7914,  ..., -3.3513, -3.3539, -3.3494],
         [-2.4906, -0.2519,  0.6629,  ..., -2.4883, -2.5021, -2.4917]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [10]:
# Show prediction
predicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print("Predicted token:", predicted_token)

Predicted token:  собака


In [1]:
model_type_or_dir = "models/vk_ru-splade++_max/checkpoint/model"

In [3]:
from splade.models.transformer_rep import Splade
model = Splade(model_type_or_dir, agg="max")
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model.state_dict()

OrderedDict([('transformer_rep.transformer.model.embeddings.tok_embeddings.weight',
              tensor([[ 0.0005, -0.0036,  0.0050,  ...,  0.0053, -0.0022,  0.0077],
                      [ 0.0092, -0.0033, -0.0158,  ..., -0.0005,  0.0359,  0.0276],
                      [ 0.0387, -0.0449, -0.0224,  ...,  0.0220, -0.0061,  0.0033],
                      ...,
                      [ 0.0005, -0.0036,  0.0050,  ...,  0.0053, -0.0022,  0.0077],
                      [ 0.0005, -0.0035,  0.0050,  ...,  0.0052, -0.0021,  0.0077],
                      [ 0.0005, -0.0036,  0.0050,  ...,  0.0053, -0.0022,  0.0077]])),
             ('transformer_rep.transformer.model.embeddings.norm.weight',
              tensor([ 4.1816e-01,  6.2213e-01,  5.1890e-01,  5.8511e-01,  3.9909e-01,
                       1.3032e-01,  4.8813e-01,  5.1707e-01,  4.1634e-01,  5.4733e-01,
                       4.2233e-01,  6.6779e-01,  6.1531e-01,  4.3340e-01,  6.5799e-01,
                       6.1384e-01,  1.0677e+00,

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type_or_dir)
reverse_voc = {v: k for k, v in tokenizer.vocab.items()}
doc = "Хорошая собака"
import torch
with torch.no_grad():
    doc_rep = model(d_kwargs=tokenizer(doc, return_tensors="pt"))["d_rep"].squeeze()  # (sparse) doc rep in voc space, shape (30522,)

print(doc_rep)
# get the number of non-zero dimensions in the rep:
col = torch.nonzero(doc_rep).squeeze().cpu().tolist()
print("number of actual dimensions: ", len(col))

# now let's inspect the bow representation:
weights = doc_rep[col].cpu().tolist()
d = {k: v for k, v in zip(col, weights)}
sorted_d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
bow_rep = []
index_rep = [] 
for k, v in sorted_d.items():
    index_rep.append(k)
    bow_rep.append((reverse_voc[k], round(v, 2)))
print("SPLADE BOW rep:\n", bow_rep)
tokenizer.decode(index_rep)

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Prepare model
model_id = "deepvk/RuModernBERT-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
model = model.eval()

# Prepare input
text = "Мама мыла [MASK]."
inputs = tokenizer(text, return_tensors="pt")
masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)



  from .autonotebook import tqdm as notebook_tqdm


In [12]:
ids = tokenizer.encode("живетана")

In [14]:
ids

[50281, 595, 5120, 300, 50282]

In [16]:
for token in ids:
    print(tokenizer.decode(token))

[CLS]
жи
вета
на
[SEP]


In [13]:
tokenizer.decode(ids)

'[CLS]живетана[SEP]'

In [19]:
print(inputs)

{'input_ids': tensor([[50281, 49179, 42830, 50284,    20, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [None]:
# Make prediction
outputs = model(**inputs)

# Show prediction
predicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print("Predicted token:", predicted_token)

In [22]:
tokenizer.decode(1747)

' поя'

In [18]:
import os 
import torch
from splade.utils.utils import set_seed, restore_model, get_initialize_config, get_loss, set_seed_from_config
from splade.models.transformer_rep import Splade


In [19]:


ckpt_path = "models/vk_ru-splade++_max/checkpoint/model_ckpt/model_ckpt_40000.tar"

print(os.path.join(ckpt_path))

ckpt = torch.load(os.path.join(ckpt_path), map_location="cpu", weights_only=False)




models/vk_ru-splade++_max/checkpoint/model_ckpt/model_ckpt_40000.tar


In [23]:
model = Splade("deepvk/RuModernBERT-base", agg="max")
restore_model(model, ckpt["model_state_dict"])

restoring model: Splade


In [24]:
model.state_dict()

OrderedDict([('transformer_rep.transformer.model.embeddings.tok_embeddings.weight',
              tensor([[ 0.0005, -0.0036,  0.0050,  ...,  0.0053, -0.0022,  0.0077],
                      [ 0.0092, -0.0033, -0.0158,  ..., -0.0005,  0.0359,  0.0276],
                      [ 0.0387, -0.0449, -0.0224,  ...,  0.0220, -0.0061,  0.0033],
                      ...,
                      [ 0.0005, -0.0036,  0.0050,  ...,  0.0053, -0.0022,  0.0077],
                      [ 0.0005, -0.0035,  0.0050,  ...,  0.0052, -0.0021,  0.0077],
                      [ 0.0005, -0.0036,  0.0050,  ...,  0.0053, -0.0022,  0.0077]])),
             ('transformer_rep.transformer.model.embeddings.norm.weight',
              tensor([ 4.1816e-01,  6.2213e-01,  5.1890e-01,  5.8511e-01,  3.9909e-01,
                       1.3032e-01,  4.8813e-01,  5.1707e-01,  4.1634e-01,  5.4733e-01,
                       4.2233e-01,  6.6779e-01,  6.1531e-01,  4.3340e-01,  6.5799e-01,
                       6.1384e-01,  1.0677e+00,

In [26]:
reverse_voc = {v: k for k, v in tokenizer.vocab.items()}


doc = "что как где когда"
with torch.no_grad():
    doc_rep = model(d_kwargs=tokenizer(doc, return_tensors="pt"))["d_rep"].squeeze()  # (sparse) doc rep in voc space, shape (30522,)

print(doc_rep)
# get the number of non-zero dimensions in the rep:
col = torch.nonzero(doc_rep).squeeze().cpu().tolist()
print("number of actual dimensions: ", len(col))

# now let's inspect the bow representation:
weights = doc_rep[col].cpu().tolist()
d = {k: v for k, v in zip(col, weights)}
sorted_d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
bow_rep = []
index_rep = [] 
for k, v in sorted_d.items():
    index_rep.append(k)
    bow_rep.append((reverse_voc[k], round(v, 2)))
print("SPLADE BOW rep:\n", bow_rep)

tensor([0., 0., 0.,  ..., 0., 0., 0.])
number of actual dimensions:  4
SPLADE BOW rep:
 [('ĠÐ²ÑĭÑĪÐµ', 2.14), ('ill', 1.79), ('ĠÐ¿ÑĢÐ¾Ð¸Ð·Ð²Ð¾Ð´Ð¸ÑĤÐµÐ»ÐµÐ¹', 0.57), ('ĠÐ¿ÐµÑĢÐ²ÑĭÑħ', 0.38)]


In [28]:
for id in index_rep:
    print(tokenizer.decode(id))


 выше
ill
 производителей
 первых


In [None]:
tokenizer.encode("живетана")

In [29]:
tokenizer.encode("абвгдеёжзийклмнопрстуфхцчшщъыьэюя")

[50281,
 4468,
 278,
 118,
 284,
 3125,
 312,
 20969,
 12981,
 273,
 7045,
 264,
 933,
 855,
 305,
 364,
 291,
 357,
 385,
 1028,
 280,
 285,
 994,
 327,
 279,
 50282]

In [34]:
ids =tokenizer.encode("собака")

In [35]:
for id in ids:
    print(tokenizer.decode(id))

[CLS]
со
бака
[SEP]


In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Prepare model
model_id = "ai-forever/ruBert-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
model = model.eval()

# Prepare input
text = "Моя [MASK] громко лает."
inputs = tokenizer(text, return_tensors="pt")
masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)


  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of 

Error while downloading from https://cdn-lfs.hf.co/sberbank-ai/ruBert-base/096c3c1250be19873798e13717aea455ad6660a790c2c0e6a7233cbfc094ff04?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1745518195&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NTUxODE5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9zYmVyYmFuay1haS9ydUJlcnQtYmFzZS8wOTZjM2MxMjUwYmUxOTg3Mzc5OGUxMzcxN2FlYTQ1NWFkNjY2MGE3OTBjMmMwZTZhNzIzM2NiZmMwOTRmZjA0P3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=bzi5AEIYGC-xGNiThvlskseN8JwMezDbeB0X%7ECH4ijfX--d7B8rkZQQ6hZb73gAtwa0WiIjw-sJ7a6Cy7Bg%7EQwllEkrVAilIXolYN2Dj56Ex2NUAS4ewtRg3QHtr3XMVfrDDFtQGHlnkFMJqXJ-CRpBr8k5kxgfmF5Uei-2IFiXJURw054lSPLqiKlY3flbYw1qBh80ba8rfxc9R1mxD%7E0hKV%7EWXo%7E9Gb1yhCmJykf8d9JLfErPvNoOMmw%7EfEGqcbIXUNvSotebSGANUc1DIHBIOv-ogynO9TplBCw1zCdurM8m3h6dtrrKqkiMGBiNlkfg%7Ej5q6pA4fvPGGrp6YqA__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSCo

In [2]:
outputs = model(**inputs)

# Show prediction
predicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print("Predicted token:", predicted_token)

Predicted token: собака


In [3]:
model.state_dict()

OrderedDict([('bert.embeddings.word_embeddings.weight',
              tensor([[-0.0279, -0.0080, -0.0127,  ...,  0.0099,  0.0004, -0.0009],
                      [-0.0388,  0.0022,  0.0264,  ...,  0.0068,  0.0117, -0.0185],
                      [-0.0589, -0.0092,  0.0043,  ...,  0.0090,  0.0010,  0.0158],
                      ...,
                      [-0.0403,  0.0152, -0.0609,  ..., -0.0260,  0.0587, -0.0274],
                      [-0.0807,  0.0445,  0.0360,  ..., -0.0074, -0.0486, -0.0236],
                      [-0.0532,  0.0153,  0.0033,  ..., -0.0458, -0.0397, -0.0135]])),
             ('bert.embeddings.position_embeddings.weight',
              tensor([[-0.0134,  0.0084, -0.0098,  ...,  0.0564,  0.1333,  0.1280],
                      [ 0.0103, -0.0482,  0.0084,  ..., -0.0005,  0.1193,  0.0245],
                      [ 0.0048, -0.0409,  0.0131,  ...,  0.0045,  0.0651,  0.0386],
                      ...,
                      [ 0.0085,  0.0743,  0.0114,  ..., -0.0080, -0.098