# In [None]:
import json
from collections import deque
from typing import Dict, List, Deque, Tuple

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import PPOTrainer, PPOConfig
from trl import AutoModelForCausalLMWithValueHead

# =============================================================================
# 1. Stateful Trading Environment (No Changes)
# =============================================================================

class DailyTradingEnv:
    """Stateful daily trading environment (implementation elided in this listing).

    NOTE(review): the body below is only a placeholder. Judging from the call
    sites in this file, the real class is constructed from a pandas DataFrame
    and exposes ``_reset_state()`` plus ``step(position)`` returning a
    two-element tuple whose first item is the reward -- confirm against the
    original implementation before relying on this.
    """
    # Keep the original implementation unchanged
    ...

# =============================================================================
# 2. PPO-Compatible Trader Class
# =============================================================================

class PPOTrader:
    """Language-model trading policy wrapped for PPO fine-tuning with TRL.

    Owns the tokenizer, the policy model (with a value head so PPO has a
    critic), the PPO hyper-parameter config and the ``PPOTrainer`` instance.

    Attributes:
        device: ``"cuda"`` when a GPU is available, otherwise ``"cpu"``;
            callers use it to place tokenized prompts.
        tokenizer: tokenizer matching ``model_name``; its pad token is set to
            the EOS token.
        model: causal LM with a scalar value head.
        config: the ``PPOConfig`` handed to the trainer.
        trainer: the ``PPOTrainer`` that performs the policy updates.
    """

    def __init__(self, model_name: str = "Qwen/Qwen1.5-1.8B"):
        """Load tokenizer and model, then build the PPO config and trainer.

        Args:
            model_name: Hugging Face model id (or local path) used for both
                the tokenizer and the policy. Defaults to the checkpoint this
                file was written for.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # PPOTrainer needs a model that also predicts state values; a plain
        # AutoModelForCausalLM has no value head, so use TRL's wrapper.
        self.model = AutoModelForCausalLMWithValueHead.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # PPO configuration
        self.config = PPOConfig(
            learning_rate=1.2e-5,
            batch_size=32,
            mini_batch_size=8,
            ppo_epochs=3,
            max_grad_norm=0.5,
            # "adaptive" is not a valid kl_penalty mode; the adaptive KL
            # coefficient is what adap_kl_ctrl=True below enables.
            kl_penalty="kl",
            target_kl=0.01,
            seed=42,
            adap_kl_ctrl=True,
            init_kl_coef=0.2,
            cliprange=0.2,
            cliprange_value=0.2,
            vf_coef=0.5,
            gamma=0.99,
            lam=0.95,
            log_with="wandb"  # or "tensorboard"
        )

        self.trainer = PPOTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            config=self.config
        )

    def format_state(self, env: DailyTradingEnv) -> str:
        """Return the text prompt describing ``env``'s current state.

        Body elided in this listing; the result is fed to the tokenizer.
        """
        # Keep the original prompt formatting
        ...

    def parse_response(self, response: str) -> Dict:
        """Parse decoded model output into an action dictionary.

        Body elided in this listing; callers in this file read the
        ``'position'`` key of the result.
        """
        # Keep the original parsing logic
        ...

# =============================================================================
# 3. Modified Training Loop for PPO
# =============================================================================

class PPOTrainingOrchestrator:
    """Roll out trading episodes and feed the experience to the PPO trainer.

    Each episode steps the environment ``episode_length`` times, letting the
    trader's model generate one response per step. The resulting
    (query, response, reward) transitions are queued in ``buffer`` and handed
    to ``trader.trainer.step`` in chunks of exactly ``trader.config.batch_size``.
    """

    def __init__(self, env: "DailyTradingEnv", trader: "PPOTrader"):
        """Store collaborators and rollout hyper-parameters.

        Args:
            env: environment providing ``_reset_state()`` and ``step(position)``.
            trader: object providing ``tokenizer``, ``model``, ``device``,
                ``config``, ``trainer``, ``format_state`` and ``parse_response``.
        """
        self.env = env
        self.trader = trader
        # Transitions waiting for a PPO update (bounded queue).
        self.buffer = deque(maxlen=252*5)
        self.episode_length = 21  # environment steps per episode
        self.gamma = 0.99         # discount factor
        self.lam = 0.95           # GAE lambda

    def calculate_advantages(self, rewards: List[float], values: List[float]) -> List[float]:
        """Compute Generalized Advantage Estimation (GAE) advantages.

        Args:
            rewards: per-step rewards, length ``n``.
            values: per-step value estimates. Either length ``n`` (the value
                after the final step is taken to be 0, i.e. the episode ends
                there) or length ``n + 1`` (the extra entry bootstraps the
                final step).

        Returns:
            A list of ``n`` advantages aligned with ``rewards``.

        Raises:
            ValueError: if ``values`` is shorter than ``rewards``.
        """
        n_steps = len(rewards)
        if len(values) < n_steps:
            raise ValueError(
                f"need at least {n_steps} value estimates, got {len(values)}"
            )

        advantages = [0.0] * n_steps
        last_advantage = 0.0
        for t in reversed(range(n_steps)):
            # No estimate beyond the end of the rollout -> bootstrap with 0.
            next_value = values[t + 1] if t + 1 < len(values) else 0.0
            delta = rewards[t] + self.gamma * next_value - values[t]
            # Carry the running estimate backwards; without this update the
            # result degenerates to plain one-step TD errors.
            last_advantage = delta + self.gamma * self.lam * last_advantage
            advantages[t] = last_advantage
        return advantages

    @staticmethod
    def _state_value(outputs) -> float:
        """Extract the value estimate for the last prompt token.

        Handles tuple outputs whose third item is the value tensor
        (presumably TRL's value-head wrappers: ``(logits, loss, value)`` --
        confirm against the installed TRL version) as well as outputs that
        expose a ``.value`` attribute.
        """
        value = outputs[2] if isinstance(outputs, tuple) else outputs.value
        return float(value.reshape(-1)[-1])

    def run_episode(self):
        """Collect experience for one episode.

        Returns:
            Dict with equal-length lists under ``'states'`` (1-D prompt token
            tensors), ``'actions'`` (1-D tensors holding only the newly
            generated tokens), ``'rewards'``, ``'returns'`` and
            ``'advantages'``.
        """
        self.env._reset_state()
        states, actions, rewards, values = [], [], [], []
        tokenizer = self.trader.tokenizer
        model = self.trader.model

        for _ in range(self.episode_length):
            # Generate response
            prompt = self.trader.format_state(self.env)
            # The device lives on the trader, not on the orchestrator.
            inputs = tokenizer(prompt, return_tensors="pt").to(self.trader.device)

            with torch.no_grad():
                response = model.generate(**inputs, max_new_tokens=256)
                value = self._state_value(model(**inputs))

            # Decoded text is the full generated sequence, as before.
            action = self.trader.parse_response(tokenizer.decode(response[0]))

            # Execute action
            reward, _ = self.env.step(action['position'])

            # Store experience. The generated sequence starts with the prompt
            # tokens, so strip them: the response must hold only new tokens.
            prompt_len = inputs.input_ids.shape[1]
            states.append(inputs.input_ids[0])
            actions.append(response[0, prompt_len:])
            rewards.append(reward)
            values.append(value)

        # Calculate advantages and returns (return = advantage + value), one
        # entry per step so every list has the same length.
        advantages = self.calculate_advantages(rewards, values)
        returns = [adv + val for adv, val in zip(advantages, values)]

        return {
            'states': states,
            'actions': actions,
            'rewards': rewards,
            'returns': returns,
            'advantages': advantages
        }

    def train(self, num_episodes: int = 1000):
        """Main training loop.

        Runs ``num_episodes`` rollouts. Transitions are queued in
        ``self.buffer``; whenever at least ``trader.config.batch_size`` are
        available, one PPO update is performed on exactly that many samples,
        since an episode's length need not match the trainer's batch size.

        Args:
            num_episodes: number of episodes to roll out.
        """
        batch_size = self.trader.config.batch_size

        for episode in range(num_episodes):
            # Collect experience
            batch = self.run_episode()
            self.buffer.extend(
                zip(batch['states'], batch['actions'], batch['rewards'])
            )

            # Perform PPO updates on full batches only. The trainer takes
            # lists of per-sample tensors: queries, responses, scalar scores.
            while len(self.buffer) >= batch_size:
                samples = [self.buffer.popleft() for _ in range(batch_size)]
                queries, responses, scores = zip(*samples)
                self.trader.trainer.step(
                    list(queries),
                    list(responses),
                    [torch.tensor(float(score)) for score in scores],
                )

            # Logging
            if (episode+1) % 10 == 0:
                print(f"Episode {episode+1}")
                print(f"Mean Reward: {np.mean(batch['rewards']):.2f}")
                print(f"Max Advantage: {np.max(batch['advantages']):.2f}")
                print("="*50)

# =============================================================================
# 4. Execution Workflow
# =============================================================================

if __name__ == "__main__":
    # Load the daily price history; the 'date' column is parsed as datetimes.
    data = pd.read_csv('sp500_daily.csv', parse_dates=['date'])

    # Build the environment first, then the policy, and wire them together.
    env, trader = DailyTradingEnv(data), PPOTrader()
    orchestrator = PPOTrainingOrchestrator(env=env, trader=trader)

    # Run PPO training for 500 episodes.
    orchestrator.train(500)