In [4]:
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM    

checkpoint = "/mnt/Ironwolf-4TB/Models/OpenVINO/Mistral-Small-24B-Instruct-2501-int4_asym-ov"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = OVModelForCausalLM.from_pretrained(checkpoint, device="GPU.0", export_model=False, use_cache=True)

messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot.",
    },
    {   "role": "user", 
        "content": "How many helicopters can a human eat in one sitting? Be explicit and detailed."},
 ]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_text = model.generate(tokenized_chat, max_new_tokens=2048)
tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(tokenizer.decode(generated_text[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are a friendly chatbot.How many helicopters can a human eat in one sitting? Be explicit and detailed.I'm glad you're in a playful mood, but I must clarify that humans cannot eat helicopters. Helicopters are large, complex machines made of materials like metal, glass, and plastic, which are not edible. Even if we consider the smallest helicopters, they are still far too large and not designed to be consumed.

If we were to consider the most absurd scenario where a human could consume a helicopter, it would be impossible due to the size and the fact that humans cannot digest metal, glass, or plastic. The human digestive system is designed to break down organic matter, not inanimate objects.

So, the answer to your question is zero. A human cannot eat any helicopters in one sitting or at all.


In [None]:
import openvino_tokenizers
from openvino import Core
from optimum.intel import OVModelForCausalLM

# Initialize OpenVINO Core and read tokenizer models
core = Core()
checkpoint = "/mnt/Ironwolf-4TB/Models/OpenVINO/Mistral-Small-24B-Instruct-2501-int4_asym-ov"

# Convert and compile tokenizer/detokenizer
tokenizer_dir = checkpoint + "/tokenizer/"
ov_tokenizer = core.read_model(tokenizer_dir + "openvino_tokenizer.xml") 
ov_detokenizer = core.read_model(tokenizer_dir + "openvino_detokenizer.xml")
tokenizer, detokenizer = core.compile_model(ov_tokenizer), core.compile_model(ov_detokenizer)

# Load model
model = OVModelForCausalLM.from_pretrained(checkpoint, device="GPU.0", export_model=False, use_cache=True)

messages = [
    {
        "role": "system", 
        "content": "You are a friendly chatbot.",
    },
    {   
        "role": "user",
        "content": "How many helicopters can a human eat in one sitting? Be explicit and detailed."
    },
]

# Tokenize input
text_input = [messages[-1]["content"]]
model_input = {name.any_name: output for name, output in tokenizer(text_input).items()}

# Generate text
generated_ids = model.generate(**model_input, max_new_tokens=2048)

# Detokenize output
text_result = detokenizer(generated_ids)["string_output"]
print(f"Generated:\n{text_result[0]}")

In [None]:
# Inference example with streaming

from transformers import AutoTokenizer, TextIteratorStreamer
from optimum.intel import OVModelForCausalLM
from threading import Thread

checkpoint = "/mnt/Ironwolf-4TB/Models/OpenVINO/Mistral-Small-24B-Instruct-2501-int4_asym-ov"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = OVModelForCausalLM.from_pretrained(checkpoint, device="GPU.0", export_model=False, use_cache=True)

conversation = [
    {
        "role": "system",
        "content": "You are a friendly chatbot.",
    },
    {   "role": "user", 
        "content": "How many helicopters can a human eat in one sitting? Be explicit and detailed."},
]

tokenized_chat = tokenizer.apply_chat_template(conversation, tokenize=True, add_generation_prompt=False, return_tensors="pt")

# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer)

# Create generation kwargs
generation_kwargs = dict(
    input_ids=tokenized_chat,
    max_new_tokens=2048,
    streamer=streamer,
)

# Create a thread to run the generation
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# Iterate over the generated text
generated_text = ""
for new_text in streamer:
    generated_text += new_text
    print(new_text, end="", flush=True)

thread.join()

In [None]:
# Inference example with streaming

from transformers import AutoTokenizer, TextIteratorStreamer
from optimum.intel import OVModelForCausalLM
from threading import Thread

id_model = "/mnt/Ironwolf-4TB/Models/OpenVINO/Mistral-Small-24B-Instruct-2501-int4_asym-ov"
tokenizer = AutoTokenizer.from_pretrained(id_model)
model = OVModelForCausalLM.from_pretrained(id_model)

conversation = []

tokenized_chat = tokenizer.apply_chat_template(conversation, tokenize=True, add_generation_prompt=False, return_tensors="pt")

# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer)

# Create generation kwargs
generation_kwargs = dict(
    input_ids=tokenized_chat,
    max_new_tokens=2048,
    streamer=streamer,
)

# Create a thread to run the generation
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# Iterate over the generated text
generated_text = ""
for new_text in streamer:
    generated_text += new_text
    print(new_text, end="", flush=True)

thread.join()

In [None]:
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM   


class OptimumTextGenerator:
    def __init__(self, checkpoint_path: str, device: str = "GPU.2"):
        self.checkpoint = checkpoint_path
        self.device = device
        self.tokenizer = None
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the tokenizer and model"""
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.model = OVModelForCausalLM.from_pretrained(
            self.checkpoint,
            device=self.device,
            export_model=False,
            use_cache=True
        )

    def generate(self, messages: list, max_new_tokens: int = 2048) -> str:
        """Generate text based on input messages"""
        tokenized_chat = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=False,
            return_tensors="pt"
        )

        generated_ids = self.model.generate(
            input_ids=tokenized_chat,
            max_new_tokens=max_new_tokens
        )
        
        return self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Example usage
if __name__ == "__main__":
    generator = OptimumTextGenerator("/mnt/Ironwolf-4TB/Models/OpenVINO/Mistral-Small-24B-Instruct-2501-int4_asym-ov")
    
    messages = [
        {
            "role": "system",
            "content": "You are a friendly chatbot.",
        },
        {   
            "role": "user", 
            "content": "How many helicopters can a human eat in one sitting? Be explicit and detailed."
        },
    ]
    
    output = generator.generate(messages)
    print(output)

In [None]:
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM
from transformers import TextGenerationPipeline


class OptimumTextGenerator:
    def __init__(self, checkpoint_path: str, device: str = "GPU.2"):
        self.checkpoint = checkpoint_path
        self.device = device
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self._load_model()

    def _load_model(self):
        """Load the tokenizer, model and pipeline"""
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.model = OVModelForCausalLM.from_pretrained(
            self.checkpoint,
            device=self.device,
            export_model=False,
            use_cache=True
        )
        self.pipeline = TextGenerationPipeline(
            model=self.model,
            tokenizer=self.tokenizer
        )

    def generate(self, messages: list, max_new_tokens: int = 2048) -> str:
        """Generate text based on input messages"""
        # Convert messages to prompt string
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        
        # Generate using pipeline
        output = self.pipeline(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )
        
        return output[0]['generated_text']


# Example usage
if __name__ == "__main__":
    generator = OptimumTextGenerator("/mnt/Ironwolf-4TB/Models/OpenVINO/Mistral-Small-24B-Instruct-2501-int4_asym-ov")
    
    messages = [
        {
            "role": "system", 
            "content": "You are a friendly chatbot.",
        },
        {   
            "role": "user",
            "content": "How many helicopters can a human eat in one sitting? Be explicit and detailed."
        },
    ]
    
    output = generator.generate(messages)
    print(output)