In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
import sys
import os

# Load quantized lmstudio-community/Qwen3-1.7B-GGUF runs with 8G NVIDIA GeForce RTX 4070 Laptop GPU
model_name = "Qwen/Qwen3-1.7B"
max_new_tokens = 10000

# system prompt
system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should be engaging and fun.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

# load cuda device
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading {model_name}...")
start_time = time.time()

# Loading the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    local_files_only=False
)

# Trying to load the model with 4-bit quantization for efficiency
try:
    print("Attempting to load model with 4-bit quantization...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        local_files_only=False,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,  # Use bfloat16 for better performance
        device_map="auto",
        quantization_config={"load_in_4bit": True}  # 4-bit quantization for memory efficiency
    )
except Exception as e:
    print(f"Load failed failed with error: {str(e)}")
    os._exit(1)

load_time = time.time() - start_time
print(f"Model loaded in {load_time:.2f} seconds")

def generate_response(user_input, chat_history=None):
    if chat_history is None:
        chat_history = []
    
    # Formatting the conversation for the model
    messages = [{"role": "system", "content": system_prompt}]
    
    # Adding chat history for a full context of the conversation
    for message in chat_history:
        messages.append(message)
    
    # Adding the current user input
    messages.append({"role": "user", "content": user_input})
    
    # Tokenization: converting messages to model input format
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # Generating response: this part may take some time to execute at first
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id
        )
    
    # Decoding the generated response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extracting only the assistant's response, instead of the full raw output
    assistant_response = full_response.split(user_input)[-1].strip()
    
    # Further cleaning up the response if it contains role markers or other artifacts
    if "assistant" in assistant_response.lower()[:20]:
        assistant_response = assistant_response.split(":", 1)[-1].strip()
    
    return assistant_response

# Create a simple UI for the personal assistant
def create_assistant_ui():
    output = widgets.Output()
    input_box = widgets.Text(
        value='',
        placeholder='Ask me anything...',
        description='Question:',
        layout=widgets.Layout(width='80%'),
        continuous_update=False
    )
    send_button = widgets.Button(description="Send")
    clear_button = widgets.Button(description="Clear Chat")
    
    chat_history = []
    
    def on_send_button_clicked(b):
        user_input = input_box.value
        if not user_input.strip():
            return
        
        with output:
            print(f"You: {user_input}")
            
            # Show thinking indicator
            print("Assistant: Thinking...", end="\r")
            
            # Generate response
            start_time = time.time()
            try:
                response = generate_response(user_input, chat_history)
                end_time = time.time()
                
                # Clear the "thinking" message
                clear_output(wait=True)
                
                # Display the exchange
                print(f"You: {user_input}")
                print(f"Assistant: {response}")
                print(f"\n(Response generated in {end_time - start_time:.2f} seconds)")
                
                # Update chat history
                chat_history.append({"role": "user", "content": user_input})
                chat_history.append({"role": "assistant", "content": response})
            except Exception as e:
                clear_output(wait=True)
                print(f"You: {user_input}")
                print(f"Error generating response: {str(e)}")
                import traceback
                traceback.print_exc()
        
        # Clear input box
        input_box.value = ''
    
    def on_clear_button_clicked(b):
        with output:
            clear_output()
            print("Chat cleared!")
        chat_history.clear()
    
    # Connect button clicks to functions
    send_button.on_click(on_send_button_clicked)
    clear_button.on_click(on_clear_button_clicked)
    
    # Handle Enter key in input box
    def on_enter(sender):
        on_send_button_clicked(None)
    input_box.observe(on_enter)
    
    # Arrange UI components
    input_row = widgets.HBox([input_box, send_button, clear_button])
    ui = widgets.VBox([output, input_row])
    
    return ui

# Example of a simpler approach to utilize the model (command line interface)
def cli_chat():
    print("\n=== Starting CLI Chat (type 'exit' to quit) ===")
    chat_history = []
    
    while True:
        user_input = input("\nYou: ")
        if user_input.lower() in ['exit', 'quit', 'q']:
            print("Goodbye!")
            break
        
        print("Assistant: ", end="")
        try:
            start_time = time.time()
            response = generate_response(user_input, chat_history)
            end_time = time.time()
            
            print(f"{response}")
            print(f"(Generated in {end_time - start_time:.2f} seconds)")
            
            # Update chat history
            chat_history.append({"role": "user", "content": user_input})
            chat_history.append({"role": "assistant", "content": response})
        except Exception as e:
            print(f"Error: {str(e)}")
            import traceback
            traceback.print_exc()

# Trying a simple test query to ensure everything is working
def quick_test():
    test_question = "What can you help me with?"
    print(f"\nTest Question: {test_question}")
    
    start_time = time.time()
    try:
        response = generate_response(test_question)
        end_time = time.time()
        
        print(f"Response: {response}")
        print(f"Generation time: {end_time - start_time:.2f} seconds")
        return True
    except Exception as e:
        print(f"Test failed with error: {str(e)}")
        import traceback
        traceback.print_exc()  # Print the full stack trace for debugging
        return False

# Overarching function for our application: we can choose here which interface to use
def run_assistant():
    print("\nRunning quick test...")
    test_success = quick_test()
    
    if test_success:
        # Ask user which interface they prefer
        interface_choice = input("\nChoose interface (1 for UI, 2 for CLI): ")
        
        if interface_choice == "2":
            cli_chat()
        else:
            print("\nStarting the personal assistant UI...")
            assistant_ui = create_assistant_ui()
            display(assistant_ui)
            
            # Usage instructions
            print("\n--- Instructions ---")
            print("1. Type your question in the text box")
            print("2. Press Enter or click 'Send'")
            print("3. Wait for the assistant's response")
            print("4. Click 'Clear Chat' to start a new conversation")
            print("----------------------")
    else:
        print("\nSkipping UI launch due to test failure.")
        print("You may want to try the CLI interface by calling cli_chat() directly")

# Running the conversational assistant
run_assistant()



Loading Qwen/Qwen3-1.7B...
Attempting to load model with 4-bit quantization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded in 3.07 seconds

Running quick test...

Test Question: What can you help me with?
Response: " I need to respond appropriately. Let me think about how to approach this.

First, the user is probably looking for assistance with a specific problem, but they haven't mentioned it yet. As an AI, I should be helpful and engaging. I should ask them what they need help with, but in a way that's friendly and not too pushy.

I should make sure to keep the response positive and encouraging. Maybe offer some examples of what I can help with, like writing, coding, or general advice. But I need to be careful not to give false information. If I don't know the answer, I should just say I don't know and offer to help later.

Also, the user might be testing if I can help with something specific. I should respond in a way that invites them to ask a question, while also providing some value. Let me structure the response to be helpful and not too direct.
</think>

I'm here to help with any ques


Choose interface (1 for UI, 2 for CLI):  1



Starting the personal assistant UI...


VBox(children=(Output(), HBox(children=(Text(value='', continuous_update=False, description='Question:', layou…


--- Instructions ---
1. Type your question in the text box
2. Press Enter or click 'Send'
3. Wait for the assistant's response
4. Click 'Clear Chat' to start a new conversation
----------------------
