In [9]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import warnings

In [10]:
path = '/workspace/data/axolotl-outputs/qwen1/checkpoint-262'

In [20]:
def load_model(model_path):
    """Load the fine-tuned Qwen model and tokenizer"""
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,  # Use float16 for memory efficiency
        device_map="auto",          # Automatically distribute across available GPUs
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )
    
    # Set padding token if not present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    print("Model loaded successfully!")
    return model, tokenizer

def generate_response(model, tokenizer, prompt, max_length=512, temperature=0.7, top_p=0.9):
    """Generate a response from the model"""
    # Tokenize the input
    inputs = tokenizer(
        prompt, return_tensors="pt", padding=True, truncation=True)
    
    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            do_sample = False
            #temperature=temperature,
            #top_p=top_p,
            #do_sample=True,
            #pad_token_id=tokenizer.pad_token_id,
            #eos_token_id=tokenizer.eos_token_id,
            #repetition_penalty=1.1
        )
    
    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Remove the original prompt from the response
    response = response[len(prompt):].strip()
    
    return response

In [17]:
model, tokenizer = load_model("/workspace/data/axolotl-outputs/qwen1/merged")

'<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n'

In [21]:
# Example single prompt
prompt = "What is the capital of France?"
formatted = tokenizer.apply_chat_template(
    [{'role': 'user', 'content': prompt}],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False
)
print(f"Prompt: {prompt}")
response = generate_response(model, tokenizer, prompt)

print(f"Response: {response}")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Prompt: What is the capital of France?
Response: What is the currency of Japan? What is the largest desert in the world? What is the highest mountain in the world? What is the longest river in the world? What is the largest country in the world by area? What is the smallest country in the world? What is the most populous country in the world? What is the largest ocean in the world? What is the largest lake in the world? What is the highest waterfall in the world? What is the largest island in the world? What is the longest mountain range in the world? What is the largest delta in the world? What is the largest peninsula in the world? What is the largest archipelago in the world? What is the largest bay in the world? What is the largest gulf in the world? What is the largest strait in the world? What is the largest lake in Africa? What is the largest lake in Asia? What is the largest lake in Europe? What is the largest lake in North America? What is the largest lake in South America? Wh