In [5]:
import os
import torch
import gc

# --- 1. CRITICAL CONFIGURATION ---
# Both switches are set before `rwkv.model` is imported further down, so the
# library sees them when it is loaded.
os.environ.update(
    {
        # Select the v7 architecture code path in the rwkv library.
        "RWKV_V7_ON": "1",
        # Keep the custom CUDA kernel disabled. With '0' RWKV runs in pure
        # PyTorch mode: slightly slower, but it does not need `nvcc` (the
        # CUDA compiler), which was failing in this build environment.
        "RWKV_CUDA_ON": "0",
    }
)

from rwkv.model import RWKV
from rwkv.utils import PIPELINE, PIPELINE_ARGS

# --- 2. LOAD MODEL ---
# Location of the downloaded checkpoint; edit it to match the exact filename
# on disk (an example release name: "rwkv7-g1-2.9b-20250411-ctx4096.pth").
# NOTE(review): the path below carries no ".pth" suffix, unlike the example --
# presumably the loader appends it; confirm against the rwkv package.
MODEL_FILE = "/home/ebrahim/brainaudio/llms/rwkv7-g1b-2.9b"

print(f"Loading {MODEL_FILE}...")

# Strategy 'cuda fp16' is the usual choice on a GPU. When VRAM runs out,
# switch the strategy string to 'cuda fp16i8' (quantized).
model = RWKV(
    model=MODEL_FILE,
    strategy='cuda fp16',
)

# Pair the model with the "rwkv_vocab_v20230424" tokenizer vocabulary. It is
# referenced by name only; no local vocabulary path is supplied here.
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")

# --- 3. TEST GENERATION ---
# Smoke test: feed a short prompt and greedily pick the single next token.
prompt = "The quick brown fox jumps over the"
print(f"\nPrompt: '{prompt}'")

# Turn the prompt text into token ids with the pipeline's tokenizer.
tokens = pipeline.encode(prompt)

# One forward pass over the whole prompt. Passing None as the state means
# "no previous state" (the initial state for RWKV); the call hands back the
# logits together with the resulting state.
logits, state = model.forward(tokens, None)

# Greedy decoding: the highest-scoring index along the last axis is taken as
# the next token id, which is then mapped back to text.
next_token_id = logits.argmax(dim=-1).item()
next_word = pipeline.decode([next_token_id])

print(f"Output: '{next_word}'")
# len(state) reports how many entries the returned state holds, not a tensor
# shape, despite the "State Shape" label.
print("State Shape:", len(state), "tensors (This is your fixed state!)")

Loading /home/ebrahim/brainaudio/llms/rwkv7-g1b-2.9b...
Loading /home/ebrahim/brainaudio/llms/rwkv7-g1b-2.9b (cuda fp16)


Prompt: 'The quick brown fox jumps over the'
Output: ' lazy'
State Shape: 96 tensors (This is your fixed state!)
