# In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig
from typing import Dict, List, Tuple
import json
from collections import deque
from datasets import Dataset

# =============================================================================
# 1. GRPO-Compatible Trading Environment
# =============================================================================

class GRPOTradingEnv:
    """Minimal single-asset environment that walks a price frame row by row.

    The environment reads the ``'close'`` column of ``data`` and tracks a
    position expressed as a portfolio weight in ``[-1.0, 1.0]`` (negative
    means short). ``cash`` is initialised from ``initial_cash`` and is not
    modified by :meth:`step`; the evolving portfolio value is recorded in
    ``portfolio_values``.

    Args:
        data: Frame with at least a ``'close'`` column, one row per step.
        initial_cash: Starting portfolio value.
    """

    def __init__(self, data: pd.DataFrame, initial_cash: float = 1e6):
        self.data = data
        self.initial_cash = initial_cash
        self._reset_state()

    def _reset_state(self):
        """Put the environment back at the first row with a flat position."""
        self.current_step = 0
        self.cash = self.initial_cash
        self.position = 0.0
        # One entry per completed step, seeded with the starting value.
        self.portfolio_values = [self.initial_cash]
        # Bounded buffer: holds at most the 7 most recently appended items.
        self.news_window = deque(maxlen=7)

    def step(self, action: float) -> Tuple[float, Dict]:
        """Adopt ``action`` as the target position and advance one row.

        The position is set first and then held over the move from the
        current row's close to the next row's close.

        Args:
            action: Target position weight; values outside ``[-1.0, 1.0]``
                are clipped to that range.

        Returns:
            ``(daily_return, info)`` where ``daily_return`` is the close-to-close
            percentage change scaled by the position, and ``info`` carries
            ``'new_value'``, ``'return'``, ``'position'`` and ``'step'``.

        Raises:
            IndexError: If there is no next row to step to. The environment
                state is left untouched in that case.
            ValueError: If ``action`` is NaN.
        """
        # Validate everything before mutating so a failed call cannot leave
        # the environment half-advanced.
        next_step = self.current_step + 1
        if next_step >= len(self.data):
            raise IndexError(
                f"cannot step past the last row "
                f"(current_step={self.current_step}, rows={len(self.data)})"
            )

        target = float(action)
        if np.isnan(target):
            # min()/max() would silently turn NaN into a full position.
            raise ValueError("action must be a number, got NaN")
        target = max(-1.0, min(1.0, target))

        prev_price = self.data.iloc[self.current_step]['close']
        current_price = self.data.iloc[next_step]['close']

        self.position = target
        self.current_step = next_step

        # Position is a weight, so the portfolio return is the asset return
        # scaled by it, and the portfolio value compounds by that return.
        daily_return = (current_price / prev_price - 1) * self.position
        new_value = self.portfolio_values[-1] * (1 + daily_return)
        self.portfolio_values.append(new_value)

        return daily_return, {
            'new_value': new_value,
            'return': daily_return,
            'position': self.position,
            'step': self.current_step
        }

# =============================================================================
# 2. Reward Functions for GRPO
# =============================================================================

def risk_adjusted_reward(example):
    """Score one example by volatility-scaled return plus a persistence bonus.

    Reads ``example['return']``, ``example['volatility']`` and
    ``example['position_persistence']`` and returns ``{'rewards': [score]}``
    with a single-element list.

    NOTE(review): TRL's GRPOTrainer appears to call reward functions with
    ``prompts``/``completions`` keyword arguments and expect a list of floats;
    confirm this per-example signature against the installed TRL version.
    """
    # The small epsilon keeps the division defined when volatility is zero.
    vol_scaled_return = example['return'] / (example['volatility'] + 1e-6)
    persistence_bonus = 0.1 * example['position_persistence']
    score = vol_scaled_return + persistence_bonus
    return {'rewards': [score]}

def position_stability_reward(example):
    """Score one example higher the smaller its absolute position change.

    Reads ``example['position_change']`` and returns ``{'rewards': [score]}``
    where ``score`` is ``0.5 * (1 - |position_change|)``.

    NOTE(review): TRL's GRPOTrainer appears to call reward functions with
    ``prompts``/``completions`` keyword arguments and expect a list of floats;
    confirm this per-example signature against the installed TRL version.
    """
    change_magnitude = abs(example['position_change'])
    stability = 1 - change_magnitude
    return {'rewards': [0.5 * stability]}

# =============================================================================
# 3. GRPO Training Configuration
# =============================================================================

# Checkpoint identifier passed to from_pretrained() for both model and tokenizer in main().
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# Used as GRPOConfig.output_dir and as the save_model() target in main().
output_dir = "outputs/Qwen-0.5B-GRPO"
run_name = "Qwen-0.5B-GRPO-trading"

# Module-level training configuration consumed by GRPOTrainer in main().
grpo_config = GRPOConfig(
    output_dir=output_dir,
    run_name=run_name,
    # --- Optimizer and learning-rate schedule ---
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    # --- Batching ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # NOTE(review): some TRL releases require the effective generation batch
    # to be divisible by num_generations; with a per-device batch of 1 this
    # combination may be rejected — confirm against the installed version.
    num_generations=16,
    # --- Sequence length limits ---
    max_prompt_length=256,
    max_completion_length=200,
    num_train_epochs=1,
    max_grad_norm=0.1,
    # --- Generation backend ---
    use_vllm=True,
    vllm_gpu_memory_utilization=0.3,
    # Matches the torch.bfloat16 dtype the model is loaded with in main().
    bf16=True,
    report_to="none"
)

# =============================================================================
# 4. Model & Dataset Preparation
# =============================================================================

def create_prompt(row: pd.Series, news_context: List[str]) -> str:
    """Render the trading-decision prompt for one market row.

    Args:
        row: Mapping-like row providing ``'date'``, ``'close'``, ``'vol_20d'``,
            ``'rsi_14'`` and ``'position'``. The last four are formatted as
            numbers (``vol_20d`` and ``position`` as percentages).
        news_context: News strings to list under "Recent News", one per line.
            May be empty.

    Returns:
        The prompt text, ending with the JSON output-format template.
    """
    # Joined outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    news_block = "\n".join(news_context)

    return f"""Trading Decision for {row['date']}
Market Context:
- Price: {row['close']:.2f}
- 20D Volatility: {row['vol_20d']:.2%}
- RSI(14): {row['rsi_14']:.1f}
- Position: {row['position']:.2%}

Recent News:
{news_block}

Output Format (JSON):
{{
  "analysis": "...",
  "position_target": [-1.0 to 1.0],
  "confidence": [0.0-1.0]
}}"""

def process_dataset(data: pd.DataFrame):
    """Build the prompt dataset, one record per row that has a following row.

    For each such row the record holds the rendered prompt plus the fields
    stored for reward calculation: ``date``, ``current_price``, ``next_price``
    (the next row's close), ``position`` and ``volatility``. The news shown in
    a prompt is what was appended for earlier rows only; the current row's
    ``'news'`` is added to the window after its prompt is built.

    Args:
        data: Frame with ``'date'``, ``'close'``, ``'vol_20d'``, ``'rsi_14'``
            and ``'news'`` columns. A ``'position'`` column is optional.

    Returns:
        A ``datasets.Dataset`` built from the collected records.
    """
    env = GRPOTradingEnv(data)
    records = []

    # create_prompt() reads row['position']; when the frame has no such
    # column, supply the environment's position (the same value recorded
    # below) instead of failing with a KeyError.
    has_position_column = 'position' in data.columns

    for i in range(len(data) - 1):
        row = data.iloc[i]
        next_row = data.iloc[i + 1]

        prompt_row = row
        if not has_position_column:
            prompt_row = row.copy()
            prompt_row['position'] = env.position

        prompt = create_prompt(prompt_row, list(env.news_window))

        # Store step data for reward calculation.
        # env.step() is never called here, so env.position keeps its reset value.
        records.append({
            'prompt': prompt,
            'date': row['date'],
            'current_price': row['close'],
            'next_price': next_row['close'],
            'position': env.position,
            'volatility': row['vol_20d']
        })

        # Make this row's news visible to later prompts only.
        env.news_window.append(row['news'])

    return Dataset.from_pandas(pd.DataFrame(records))

# =============================================================================
# 5. GRPO Training Setup
# =============================================================================

def main():
    """Load the price data, build the prompt dataset, and run GRPO training.

    Reads ``sp500_daily.csv`` from the working directory, loads the model and
    tokenizer named by ``model_name``, trains with ``grpo_config``, and saves
    the result to ``output_dir``.
    """
    # Load and prepare data
    data = pd.read_csv('sp500_daily.csv', parse_dates=['date'])
    dataset = process_dataset(data)

    # Initialize model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Initialize GRPO Trainer.
    # GRPOTrainer takes the tokenizer through `processing_class`; it has no
    # `tokenizer` parameter, so passing one fails at construction time.
    # NOTE(review): the reward functions passed here take a single example
    # dict; confirm they match the reward-function signature the installed
    # TRL version expects.
    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        args=grpo_config,
        reward_funcs=[risk_adjusted_reward, position_stability_reward],
        train_dataset=dataset,
    )

    # Start training
    trainer.train()

    # Save final model
    trainer.save_model(output_dir)

if __name__ == "__main__":
    main()