<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Nemotron_Nano_9B_v2_AI_Agent_Reasoning_Demo_(Python).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install mamba-ssm into the kernel's environment (-q keeps pip output quiet).
# NOTE(review): presumably needed by the model's hybrid architecture; version is unpinned — confirm against the model card.
%pip install mamba-ssm -q

In [None]:
# Pin transformers to exactly 4.47.0 so the notebook runs against a known library version.
%pip install transformers==4.47.0

In [None]:
# Install causal-conv1d into the kernel's environment (version unpinned).
# NOTE(review): presumably a companion kernel package for mamba-ssm — confirm it is required.
%pip install causal-conv1d

In [1]:
from huggingface_hub import login
from google.colab import userdata
import os

# Fetch the Hugging Face access token stored under the name 'HF_TOKEN' via
# google.colab.userdata, so the credential is never written into the notebook.
token=userdata.get('HF_TOKEN')
# Authenticate this session with the Hugging Face Hub using that token.
login(token=token)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

print("--- Loading Nemotron-Nano-9B-v2 model and tokenizer... ---")

# Hugging Face repository id of the checkpoint used by every later cell.
model_name = "nvidia/NVIDIA-Nemotron-Nano-9B-v2"

# Pick the inference device up front: GPU when CUDA is visible, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Model weights are loaded in bfloat16; trust_remote_code=True allows the
# repository's own modelling code to run, which custom architectures such as
# Nemotron-Nano's hybrid design require.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Place the weights on the selected device before any generation call.
model.to(device)
print(f"--- Model loaded successfully on {device}! ---")

In [2]:
def generate_text_with_nemotron(prompt: str, max_new_tokens: int = 100):
    """Sample a plain-text continuation of `prompt` (no chat template is applied).

    Uses the notebook-level `tokenizer`, `model` and `device` objects, so the
    model-loading cell must have been run first.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of tokens generated after the prompt.

    Returns:
        The decoded output sequence with special tokens stripped. The whole
        sequence is decoded, so the returned text begins with the prompt itself.
    """
    # Encode the prompt into token ids and place them on the model's device.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # The prompt is a single unpadded sequence, so every position is a real
    # token. The mask is passed explicitly because pad_token_id is set to the
    # EOS id below, which prevents generate() from inferring the mask itself
    # (it would otherwise emit a warning). This mirrors the chat-based helper.
    attention_mask = torch.ones_like(input_ids)

    print(f"\n--- Generating text for prompt: '{prompt}' ---")

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,  # cap on the length of the continuation
        do_sample=True,                 # sample instead of greedy decoding
        temperature=0.7,                # lower values make sampling more deterministic
        top_k=50,                       # sample only from the 50 most likely tokens
        # Reuse EOS as the padding id. This only tells generate() which id to
        # pad with; stopping is governed by the EOS token and max_new_tokens.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the full sequence (prompt + continuation) back into text.
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

In [2]:
def generate_text_with_nemotron_agent(prompt: str, system_prompt: str = "/think", max_new_tokens: int = 200):
    """Generate a chat-formatted reply from the already-loaded Nemotron model.

    The function does not load anything itself: it uses the notebook-level
    `tokenizer`, `model` and `device` created by the model-loading cell.

    Args:
        prompt: The user message.
        system_prompt: System message sent ahead of the user message; used to
            control the model's 'thinking' (agentic) behavior. Defaults to "/think".
        max_new_tokens: Upper bound on the number of tokens generated for the reply.

    Returns:
        Only the newly generated text (the prompt tokens are sliced off before
        decoding), with special tokens stripped.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # add_generation_prompt=True makes the rendered chat end with the header
    # that opens the assistant turn, so the model answers as the assistant
    # instead of first having to invent that header itself.
    # (add_special_tokens is not an apply_chat_template option; the template
    # is responsible for emitting the special tokens it needs.)
    encoded_input = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    input_ids = encoded_input.to(device)
    # Single unpadded conversation: every position is a real token.
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)

    # Number of prompt tokens; used below to drop the prompt from the output.
    input_length = input_ids.shape[1]

    print(f"\n--- Generating text for prompt: '{prompt}' (System prompt: '{system_prompt}') ---")

    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        # Padding id only; stopping is governed by EOS and max_new_tokens.
        pad_token_id=tokenizer.eos_token_id
    )

    # Keep only the tokens produced after the prompt.
    generated_ids = output_ids[0][input_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return generated_text

In [17]:
# NOTE(review): this function is also defined in an earlier cell; being the
# later definition, this is the one in effect after a top-to-bottom run.
def generate_text_with_nemotron_agent(prompt: str, system_prompt: str = "/think", max_new_tokens: int = 200):
    """Generate a chat-formatted reply from the already-loaded Nemotron model.

    The function does not load anything itself: it uses the notebook-level
    `tokenizer`, `model` and `device` created by the model-loading cell.

    Args:
        prompt: The user message.
        system_prompt: System message sent ahead of the user message; used to
            control the model's 'thinking' (agentic) behavior. Defaults to "/think".
        max_new_tokens: Upper bound on the number of tokens generated for the reply.

    Returns:
        Only the newly generated text (the prompt tokens are sliced off before
        decoding), with special tokens stripped.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # add_generation_prompt=True makes the rendered chat end with the header
    # that opens the assistant turn, so the model answers as the assistant
    # instead of first having to invent that header itself.
    # (add_special_tokens is not an apply_chat_template option; the template
    # is responsible for emitting the special tokens it needs.)
    encoded_input = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    input_ids = encoded_input.to(device)
    # Single unpadded conversation: every position is a real token.
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)

    # Number of prompt tokens; used below to drop the prompt from the output.
    input_length = input_ids.shape[1]

    print(f"\n--- Generating text for prompt: '{prompt}' (System prompt: '{system_prompt}') ---")

    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        # Padding id only; stopping is governed by EOS and max_new_tokens.
        pad_token_id=tokenizer.eos_token_id
    )

    # Keep only the tokens produced after the prompt.
    generated_ids = output_ids[0][input_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return generated_text

In [12]:
if __name__ == "__main__":
    # Banner: blank line, rule of '#', title, rule of '#'.
    print(
        "\n" + "#"*70,
        "DEMONSTRATING CODE GENERATION (WITH /think SYSTEM PROMPT)",
        "#"*70,
        sep="\n",
    )

    # Ask for an outline first, then the code, with the '/think' system prompt.
    code_prompt = "Write a Python function to check if a number is prime. First, outline the steps."
    generated_code_with_thinking = generate_text_with_nemotron_agent(
        code_prompt,
        system_prompt="/think",
        max_new_tokens=1024,
    )

    # Show the reply framed by rules of '='.
    print(
        "\n" + "="*70,
        "Generated Code with Reasoning:",
        generated_code_with_thinking,
        "="*70 + "\n",
        sep="\n",
    )


######################################################################
DEMONSTRATING CODE GENERATION (WITH /think SYSTEM PROMPT)
######################################################################

--- Generating text for prompt: 'Write a Python function to check if a number is prime. First, outline the steps.' (System prompt: '/think') ---

Generated Code with Reasoning:
 output: Sure, I'd be happy to help! Here are the steps to write a Python function to check if a number is prime:

1. Define a function named `is_prime` that takes an integer `n` as its argument.
2. Check if `n` is less than 2, as 0, 1, and negative numbers are not prime. If so, return `False`.
3. Check if `n` is 2, as it is the only even prime number. If so, return `True`.
4. Check if `n` is an even number (other than 2), as even numbers greater than 2 are not prime. If so, return `False`.
5. Iterate from 3 to the square root of `n` (inclusive), checking if `n` is divisible by any of these numbers. If `n` is divi

In [12]:
if __name__ == "__main__":
    # First raw-completion example (no chat template, no system prompt).
    example_prompt = "Suggest 3 dinner steps."
    generated_steps = generate_text_with_nemotron(example_prompt, max_new_tokens=512)
    print(
        "\n" + "="*70,
        "Generated Response:",
        generated_steps,
        "="*70 + "\n",
        sep="\n",
    )

    # Second raw-completion example, reported in the same framed layout.
    direct_prompt = "What are the benefits of exercise?"
    generated_answer = generate_text_with_nemotron(direct_prompt, max_new_tokens=512)
    print(
        "\n" + "="*70,
        "Generated Response:",
        generated_answer,
        "="*70 + "\n",
        sep="\n",
    )


--- Generating text for prompt: 'Suggest 3 dinner steps.' ---

Generated Response:
Suggest 3 dinner steps. 

1. Prepare a spicy dish
2. Prepare a non-spicy dish
3. Serve both dishes
</think>

1. **Prepare a spicy dish**: Cook a dish like spicy stir-fried vegetables or chili-lime grilled chicken.  
2. **Prepare a non-spicy dish**: Make a mild option such as a vegetarian pasta or baked fish with lemon.  
3. **Serve both dishes**: Present them together with condiments (e.g., extra sauce) to let guests adjust the spice level.



--- Generating text for prompt: 'What are the benefits of exercise?' ---

Generated Response:
What are the benefits of exercise? The benefit of exercise is really multifaceted. It can help with weight control and can help with strength, flexibility, and balance. It can also reduce the risk of heart disease, stroke, diabetes, and cancer. Regular exercise can also improve mood and mental health. It can also help with sleep and energy levels.
</think>

Exercise offer