In [1]:
import os
os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'http://127.0.0.1:7890'

### speculative decoding

- https://huggingface.co/blog/assisted-generation
- https://github.com/huggingface/transformers/blob/849367ccf741d8c58aa88ccfe1d52d8636eaf2b7/src/transformers/generation/utils.py#L4064
- the two models must share the same tokenizer

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

prompt = "Alice and Bob"
checkpoint = "EleutherAI/pythia-1.4b-deduped"
assistant_checkpoint = "EleutherAI/pythia-160m-deduped"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint).to(device)
outputs = model.generate(**inputs, assistant_model=assistant_model)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end genera

In [3]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)
# ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']

['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']

In [7]:
# bs * seq_len
inputs, inputs['input_ids'].shape

({'input_ids': tensor([[2422,  547,  285, 8679]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1]], device='cuda:0')},
 torch.Size([1, 4]))

#### assistant_model

In [18]:
# "vocab_size": 50304
# "hidden_size": 768,
# "num_attention_heads": 12,
# "num_hidden_layers": 12,
assistant_model.config

GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-160m-deduped",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.45.0.dev0",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

In [17]:
768 / 12

64.0

In [15]:
tokenizer.batch_decode(assistant_model.generate(input_ids=inputs['input_ids'], 
                                                min_new_tokens=0, 
                                                max_new_tokens=5))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


['Alice and Bob, who were both in']

### config & kwargs

- `LogitsProcessorList`
- cache_name: `past_key_values` (kv-cache)
    - `model_kwargs['past_key_values'] = DynamicCache()`
- model_input_name: `input_ids`
    - model_kwargs
        - attention_mask
        - use_cache=True
        - past_key_values: 通过 `DynamicCache()` 的实例来存放；
- `generation_mode == GenerationMode.ASSISTED_GENERATION`
    - assisted generate is only supported for batch_size = 1
    - assisted generate requires `use_cache=True`

- cache
    - `cache_position = torch.ones_like(input_ids[0, :], dtype=torch.int64).cumsum(0) - 1`
    - `DynamicCache()`
        - `past_length = cache.get_seq_length()`

In [11]:
torch.ones_like(inputs['input_ids'][0, :]).cumsum(0) - 1

tensor([0, 1, 2, 3], device='cuda:0')

### `AssistedCandidateGenerator`

`def get_candidates(self, input_ids: torch.LongTensor)` => `candidate_logits, candidate_ids`
- candidate_input_ids.shape: [1, 9]
- candidate_logits.shape: [1, 5, 50304]

candidate_length = candidate_input_ids.shape[1] - input_ids.shape[1]
- 9-4=5