In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

# Fix the RNG seeds up front so repeated runs of the notebook are comparable.
set_seed(42)

# Local checkpoint directories, relative to this notebook's location.
big_model_path = r"../../../models/SmolLM2_360m"
small_model_path = r"../../../models/SmolLM2-135M-Instruct"

# Prefer the GPU when one is visible to PyTorch; otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# `device_map` places the model weights on `device`. Tokenizers are not placed
# on a device, so they are loaded without that argument.
small_model = AutoModelForCausalLM.from_pretrained(small_model_path, device_map=device)
small_tokenizer = AutoTokenizer.from_pretrained(small_model_path)

big_model = AutoModelForCausalLM.from_pretrained(big_model_path, device_map=device)
big_tokenizer = AutoTokenizer.from_pretrained(big_model_path)

In [None]:
def normal_inference(big_model, big_tokenizer, prompt, max_new_tokens=500):
    """Generate a continuation of ``prompt`` with the large model.

    Args:
        big_model: Causal language model exposing ``generate`` and ``device``.
        big_tokenizer: Tokenizer matching ``big_model``.
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first generated sequence decoded to text with special tokens
        removed.
    """
    # Move the encoded prompt to wherever the model lives instead of relying
    # on a notebook-level ``device`` global.
    inputs = big_tokenizer(prompt, return_tensors='pt').to(big_model.device)

    # Fall back to EOS when the tokenizer defines no dedicated padding token.
    pad_token_id = big_tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = big_tokenizer.eos_token_id

    # Passing the attention mask and pad token id explicitly avoids the
    # "attention mask and the pad token id were not set" warning from generate.
    outputs = big_model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        pad_token_id=pad_token_id,
    )
    return big_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Quick check: run the big model on a short prompt with the default token budget.
normal_inference(
    big_model=big_model,
    big_tokenizer=big_tokenizer,
    prompt="hello my name is",
)

In [None]:
def speculative_decoding(small_model, big_model, small_tokenizer, big_tokenizer, prompt, max_new_tokens=500):
    """Draft a continuation with the small model and score it with the big model.

    Despite the name, this is a draft-then-score pass rather than token-level
    accept/reject speculative decoding: the small model's text is returned
    unchanged and the big model only supplies a likelihood score for it.

    Args:
        small_model: Causal LM used to generate the draft.
        big_model: Causal LM used to score the draft.
        small_tokenizer: Tokenizer matching ``small_model``.
        big_tokenizer: Tokenizer matching ``big_model``.
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of drafted tokens.

    Returns:
        A ``(draft, avg_log_likelihood)`` tuple. ``draft`` is the decoded
        small-model output. ``avg_log_likelihood`` is the mean log-probability
        the big model assigns to each draft token given the tokens before it
        (a Python float), computed over the whole re-tokenized draft. It is
        ``nan`` when the draft has fewer than two tokens, since no next-token
        prediction can be scored in that case.
    """
    # Step 1: Use the small model to generate the draft.
    draft_inputs = small_tokenizer(prompt, return_tensors='pt').to(small_model.device)

    # Fall back to EOS when the tokenizer defines no dedicated padding token.
    pad_token_id = small_tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = small_tokenizer.eos_token_id

    small_outputs = small_model.generate(
        input_ids=draft_inputs['input_ids'],
        attention_mask=draft_inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        pad_token_id=pad_token_id,
    )
    draft = small_tokenizer.decode(small_outputs[0], skip_special_tokens=True)

    # Step 2: Re-tokenize the draft text with the big model's tokenizer and
    # score it with a single forward pass.
    big_inputs = big_tokenizer(draft, return_tensors='pt').to(big_model.device)
    draft_token_ids = big_inputs['input_ids']

    # Position i predicts token i + 1, so a sequence of length n yields n - 1
    # scored tokens. Guard the degenerate case instead of dividing by zero.
    num_scored = draft_token_ids.size(1) - 1
    if num_scored < 1:
        return draft, float('nan')

    with torch.no_grad():
        logits = big_model(
            input_ids=draft_token_ids,
            attention_mask=big_inputs['attention_mask'],
        ).logits
        # Drop the last position: it predicts a token that is not in the draft.
        log_probs = torch.log_softmax(logits[:, :-1, :].float(), dim=-1)
        # Pick the log-probability of each actual next token in one gather
        # rather than looping in Python and synchronising per token.
        next_tokens = draft_token_ids[:, 1:].unsqueeze(-1)
        token_log_probs = log_probs.gather(-1, next_tokens).squeeze(-1)
        avg_log_likelihood = token_log_probs[0].mean().item()

    return draft, avg_log_likelihood

# Quick check: draft with the small model, score with the big one, default token budget.
speculative_decoding(
    small_model=small_model,
    big_model=big_model,
    small_tokenizer=small_tokenizer,
    big_tokenizer=big_tokenizer,
    prompt="hello my name is",
)

In [None]:
import time

def measure_latency(small_model, big_model, small_tokenizer, big_tokenizer, prompt, max_new_tokens=500):
    """Time big-model generation against the draft-and-score path on one prompt.

    Runs ``normal_inference`` and then ``speculative_decoding`` once each,
    printing the output and elapsed time of both, plus the score returned by
    ``speculative_decoding``.

    Args:
        small_model: Draft model forwarded to ``speculative_decoding``.
        big_model: Model used by both paths.
        small_tokenizer: Tokenizer matching ``small_model``.
        big_tokenizer: Tokenizer matching ``big_model``.
        prompt: Text to continue.
        max_new_tokens: Token budget forwarded to both paths.

    Returns:
        ``(normal_inference_latency, speculative_decoding_latency)`` in seconds.

    NOTE(review): each path is timed exactly once with no warm-up run, so the
    first timed call may include one-time initialisation costs. Treat the
    numbers as indicative only.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    normal_output = normal_inference(big_model, big_tokenizer, prompt, max_new_tokens)
    normal_inference_latency = time.perf_counter() - start_time
    print(f"Normal Inference Output: {normal_output}")
    print(f"Normal Inference Latency: {normal_inference_latency:.4f} seconds")
    print("\n\n")

    start_time = time.perf_counter()
    speculative_output, log_likelihood = speculative_decoding(
        small_model, big_model, small_tokenizer, big_tokenizer, prompt, max_new_tokens
    )
    speculative_decoding_latency = time.perf_counter() - start_time
    print(f"Speculative Decoding Output: {speculative_output}")
    print(f"Speculative Decoding Latency: {speculative_decoding_latency:.4f} seconds")
    print(f"Log Likelihood (Verification Score): {log_likelihood:.4f}")

    return normal_inference_latency, speculative_decoding_latency


# Compare both paths on one story prompt; the cell displays the (normal, speculative) latencies.
measure_latency(
    small_model=small_model,
    big_model=big_model,
    small_tokenizer=small_tokenizer,
    big_tokenizer=big_tokenizer,
    prompt="Write a short story about Daniel. Hello my name is Danieal and ",
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Normal Inference Output: Write a short story about Daniel. Hello my name is Danieal and  I am a 12 year old boy who loves to read and write. I am a very creative person and I love to draw and write. I am a very good student and I love to learn new things. I am very good at math and I love to learn new things. I am very good at sports and I love to play them. I am very good at music and I love to play it. I am very good at drawing and I love to draw. I am very good at writing and I love to write. I am very good at cooking and I love to cook. I am very good at math and I love to learn new things. I am very good at sports and I love to play them. I am very good at music and I love to play it. I am very good at drawing and I love to draw. I am very good at writing and I love to write. I am very good at cooking and I love to cook. I am very good at math and I love to learn new things. I am very good at sports and I love to play them. I am very good at music and I love to play it. I am very 

(13.676013946533203, 14.032550811767578)