In [None]:
# Create GRPO configuration (see https://huggingface.co/docs/trl/main/en/grpo_trainer)
# MODEL_NAME, model and tokenizer are not defined in this section; they must
# already exist when this cell runs.
# NOTE(review): `model_name`, `batch_size` and `mini_batch_size` look like
# keyword arguments of TRL's legacy PPOConfig -- confirm that the installed
# GRPOConfig accepts them, otherwise this call raises TypeError.
# NOTE(review): the training loop further down collects BATCH_SIZE (5) samples
# per update while batch_size here is 1 -- confirm the two are meant to differ.
grpo_config = GRPOConfig(
    model_name=MODEL_NAME,
    learning_rate=1e-5,
    batch_size=1,
    mini_batch_size=1,
    num_train_epochs=1,
    max_grad_norm=1.0,
)

# Initialize the GRPOTrainer.
# NOTE(review): `config=` / `tokenizer=` match the legacy PPOTrainer call shape;
# the linked GRPOTrainer docs describe `args=`, `processing_class=`,
# `reward_funcs=` and `train_dataset=` instead -- verify against the installed
# TRL version before relying on this constructor call.
grpo_trainer = GRPOTrainer(
    config=grpo_config,
    model=model,
    tokenizer=tokenizer,
)

# =============================================================================
# 3. Utility Functions: Prompt Generation and Parsing
# =============================================================================

def obs_and_news_to_prompt(obs: np.ndarray, news: str) -> str:
    """
    Build the text prompt sent to the language model for one decision step.

    Each value in `obs` is rendered with two decimals and the values are
    joined with ", ". The news text is inserted verbatim. The prompt ends with
    a fixed instruction asking for three comma-separated numbers: a trading
    action, a recommendation score and a risk score.

    Args:
        obs: Iterable of numeric observation values (iterated element-wise).
        news: News text to place in the prompt.

    Returns:
        The assembled prompt string.
    """
    formatted_values = ", ".join(format(value, ".2f") for value in obs)
    # Fixed instruction block; the final line intentionally has no newline.
    instruction_parts = (
        "As a financial expert, please output three comma-separated numbers:\n",
        "1) Trading action (integer number of shares to hold),\n",
        "2) Recommendation score (1-5),\n",
        "3) Risk score (1-5).",
    )
    context = f"Observation: {formatted_values}\n" + f"News: {news}\n\n"
    return context + "".join(instruction_parts)

def parse_response(response_text: str):
    """
    Parse the model's output, expecting three comma-separated numbers.

    The first three comma-separated fields are read as
    (action, recommendation score, risk score). Any further fields are
    ignored. The action is truncated toward zero to an integer; the two
    scores are returned as floats and are not range-checked.

    Args:
        response_text: Raw text produced by the model.

    Returns:
        A tuple (action, rec, risk). The defaults (0, 3.0, 3.0) are returned
        when the input is not a string, has fewer than three fields, contains
        a field that is not a number, or contains a non-finite value
        (nan / inf).
    """
    # Local import keeps this function self-contained within the notebook cell.
    import math

    defaults = (0, 3.0, 3.0)

    # Best-effort contract: never raise on unexpected input, fall back instead.
    if not isinstance(response_text, str):
        return defaults

    fields = response_text.strip().split(",")
    if len(fields) < 3:
        return defaults

    try:
        # float() tolerates surrounding whitespace, so no per-field strip needed.
        action_value, rec, risk = (float(field) for field in fields[:3])
    except ValueError:
        return defaults

    # "nan" and "inf" parse as floats but are meaningless as an action or a
    # 1-5 score; treat them as a parse failure instead of passing them on.
    if not all(math.isfinite(value) for value in (action_value, rec, risk)):
        return defaults

    return int(action_value), rec, risk

def map_rec_risk_to_action_mod(action, rec_score, risk_score):
    """
    Scale the raw action using the recommendation and risk scores.

    Heuristic (tune as needed):
      * recommendation >= 4.5 scales by 1.2, >= 4.0 scales by 1.1;
      * risk >= 4.5 scales by 0.8, >= 4.0 scales by 0.9;
      * otherwise the respective multiplier is 1.0.

    The two multipliers are combined and the scaled action is rounded to the
    nearest integer.

    Args:
        action: Raw action proposed by the model.
        rec_score: Recommendation score.
        risk_score: Risk score.

    Returns:
        The adjusted action as an int.
    """
    # Higher recommendation increases the position size.
    rec_multiplier = 1.2 if rec_score >= 4.5 else (1.1 if rec_score >= 4.0 else 1.0)
    # Higher risk reduces the position size.
    risk_multiplier = 0.8 if risk_score >= 4.5 else (0.9 if risk_score >= 4.0 else 1.0)

    scaled_action = action * (rec_multiplier * risk_multiplier)
    return int(round(scaled_action))

# =============================================================================
# 4. Training Loop using GRPO (No CVaR)
# =============================================================================

# Create a dummy DataFrame for illustration (replace with your actual data).
df = pd.DataFrame({
    'close': np.random.uniform(90, 110, size=100),
    'news': [f"Macro event {i}" for i in range(100)],
})
env = WeeklyTradingEnvNoGym(df, initial_amount=1e6, episode_length=5)

NUM_EPISODES = 20
BATCH_SIZE = 5  # Update GRPO every BATCH_SIZE episodes

# Samples collected since the last policy update.
prompts_batch = []
responses_batch = []
rewards_batch = []

episode_count = 0


def _run_grpo_update(queries, responses, rewards):
    """Send one collected batch of (query, response, reward) samples to the trainer."""
    # NOTE(review): this call shape (queries / responses / rewards) matches the
    # legacy PPOTrainer.step API -- confirm the installed GRPOTrainer exposes
    # an equivalent `step` method.
    rewards_tensor = torch.tensor(rewards, dtype=torch.float, device=model.device)
    grpo_trainer.step(
        queries=queries,
        responses=responses,
        rewards=rewards_tensor,
    )


while episode_count < NUM_EPISODES:
    try:
        obs, news = env.reset()
    except StopIteration:
        print("No more data available.")
        break

    # Create prompt from observation and news.
    prompt = obs_and_news_to_prompt(obs, news)

    # Tokenize and sample a response from the model.
    query_tensors = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    response_tensors = grpo_trainer.model.generate(
        input_ids=query_tensors,
        # `max_length` would cap prompt + completion together, which can leave
        # no room for a reply once the prompt is long; bound only the new tokens.
        max_new_tokens=80,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )

    # Assumes a decoder-only causal LM (Qwen-2.5 per the comments in this
    # file): generate() then returns the prompt followed by the continuation,
    # so drop the prompt tokens. Otherwise the parser sees the echoed
    # "Observation: ..." text and always falls back to its defaults.
    prompt_length = query_tensors.shape[1]
    generated_tokens = response_tensors[0][prompt_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # Parse the model's output and adjust the action.
    raw_action, rec_score, risk_score = parse_response(response_text)
    final_action = map_rec_risk_to_action_mod(raw_action, rec_score, risk_score)

    # Step the environment using the chosen action.
    # NOTE(review): only a single step is taken per reset and `done` is not
    # consulted, although the environment is built with episode_length=5 --
    # confirm that one step per "episode" is intended.
    _, _, reward, done, _ = env.step(final_action)

    # Store the query, the generated continuation, and the reward for the
    # update. The query is passed separately, so the response holds only the
    # newly generated tokens.
    prompts_batch.append(query_tensors[0])
    responses_batch.append(generated_tokens)
    rewards_batch.append(reward)

    episode_count += 1

    # When a full batch is collected, update the policy via GRPO.
    if episode_count % BATCH_SIZE == 0:
        _run_grpo_update(prompts_batch, responses_batch, rewards_batch)
        # Rebind (rather than clear) so lists already handed to the trainer
        # are not mutated behind its back.
        prompts_batch = []
        responses_batch = []
        rewards_batch = []

# Flush a trailing partial batch: this covers NUM_EPISODES not being a multiple
# of BATCH_SIZE as well as an early exit when the environment runs out of data.
if rewards_batch:
    _run_grpo_update(prompts_batch, responses_batch, rewards_batch)
    prompts_batch = []
    responses_batch = []
    rewards_batch = []

print("GRPO training complete.")