In [1]:
import time
from transformers import pipeline

def run_benchmark(
    num_iterations=10,
    model_name="facebook/blenderbot-400M-distill",
    prompt="Hello, how are you?",
    device=None,
):
    """Measure the average latency of a text2text-generation pipeline.

    The pipeline is loaded once, called once as an untimed warm-up, and then
    called ``num_iterations`` times on the same prompt. The mean wall-clock
    time per call is printed; nothing is returned.

    Args:
        num_iterations: Number of timed calls. Must be at least 1.
        model_name: Checkpoint identifier handed to ``pipeline(model=...)``.
        prompt: Input text used for the warm-up and for every timed call.
        device: Optional device forwarded to ``pipeline(device=...)``.
            When left as ``None`` no ``device`` argument is passed at all,
            which (per the warning transformers emits) keeps the model on CPU
            even if an accelerator is available.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1. Checked before
            the model is loaded so a bad argument fails fast.
    """
    if num_iterations < 1:
        raise ValueError(
            f"num_iterations must be at least 1, got {num_iterations!r}"
        )

    # Only forward `device` when the caller chose one, so the default call is
    # exactly the original `pipeline(task, model=...)`.
    pipeline_kwargs = {"model": model_name}
    if device is not None:
        pipeline_kwargs["device"] = device

    # Load the model
    print("Loading model...")
    chatbot = pipeline("text2text-generation", **pipeline_kwargs)

    # Warm-up run: keeps one-time setup cost out of the timed loop.
    chatbot(prompt)

    # Benchmark
    print(f"Running benchmark for {num_iterations} iterations...")
    # perf_counter is monotonic and high resolution; time.time() can jump if
    # the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    for _ in range(num_iterations):
        chatbot(prompt)
    elapsed = time.perf_counter() - start_time

    avg_time = elapsed / num_iterations
    print(f"Average inference time: {avg_time:.2f} seconds")

# In a notebook kernel __name__ is "__main__", so executing this cell runs the
# benchmark immediately with its default arguments; the same holds when the
# code is run as a script.
if __name__ == "__main__":
    run_benchmark()

  from .autonotebook import tqdm as notebook_tqdm


Loading model...


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Running benchmark for 10 iterations...
Average inference time: 3.52 seconds


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def simple_test(
    model_name="facebook/blenderbot-400M-distill",
    prompt="Hello, how are you?",
):
    """Run one tokenize -> generate -> decode round trip as a smoke test.

    Each stage prints a progress line before it starts, so a hang or failure
    can be attributed to a specific step. The decoded reply is printed;
    nothing is returned.

    Args:
        model_name: Checkpoint identifier used for BOTH the tokenizer and the
            model, so the two cannot drift apart.
        prompt: Text to tokenize and feed to ``model.generate``.
    """
    print("Starting simple test...")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    print("Loading model...")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    print("Preparing input...")
    inputs = tokenizer(prompt, return_tensors="pt")

    print("Generating response...")
    outputs = model.generate(**inputs)

    print("Decoding response...")
    # Only the first generated sequence is decoded.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Response: {response}")
    print("Test completed successfully.")

# In a notebook kernel __name__ is "__main__", so executing this cell runs the
# smoke test immediately; the same holds when the code is run as a script.
if __name__ == "__main__":
    simple_test()

Starting simple test...
Loading tokenizer...
Loading model...
Preparing input...
Generating response...
Decoding response...
Response:  I'm doing well, thank you. How are you this fine evening? Do you have any plans?
Test completed successfully.
