In [None]:
def _split_into_sequences(query, response):
    """Return the 1-D (query, response) pairs contained in one input entry.

    A tensor with more than one dimension is treated as a batch along its
    first axis -- including a batch of exactly one row, which must not be
    mistaken for a single sequence (its shape[0] is the batch size, not the
    sequence length). Anything else is already a single sequence.
    """
    if len(query.shape) > 1:
        return [(query[row], response[row]) for row in range(query.shape[0])]
    return [(query, response)]


def ppo_step(self, query_tensors, response_tensors, rewards):
    """Run PPO optimization step

    Decodes every query/response pair to text, aligns the rewards with the
    decoded pairs and hands both to ``self.trainer.step``.

    Parameters:
    - query_tensors: List of tensors for queries; each entry is either a
      single 1-D sequence or a batch of sequences along the first axis
    - response_tensors: List of tensors for responses, parallel to
      query_tensors; the first ``len(query)`` tokens of each response are
      skipped when decoding (the response is assumed to start with the query)
    - rewards: List of reward values (a non-list value is passed through
      unchanged)

    Returns:
    - Statistics from the PPO update, or None if the trainer raised
    """
    if self.trainer is None:
        self.initialize_trainer()

    # Format inputs for PPO trainer: one {"prompt", "response"} dict per sequence
    texts = []
    for i, query in enumerate(query_tensors):
        # Index (rather than zip) so a too-short response list still fails loudly
        response = response_tensors[i]
        for query_ids, response_ids in _split_into_sequences(query, response):
            # The response starts right after the query tokens
            query_length = query_ids.shape[0]
            texts.append({
                "prompt": self.tokenizer.decode(query_ids, skip_special_tokens=True),
                "response": self.tokenizer.decode(
                    response_ids[query_length:],
                    skip_special_tokens=True
                ),
            })

    # Ensure rewards match the number of texts (batched entries expand to
    # several texts, so the caller's reward list can be shorter)
    if isinstance(rewards, list) and len(rewards) != len(texts):
        if len(rewards) == 1:
            # A single reward applies to every text
            rewards = [rewards[0]] * len(texts)
        else:
            print(f"Warning: Number of rewards ({len(rewards)}) doesn't match number of texts ({len(texts)})")
            # Best-effort alignment: pad with the last value or truncate
            if len(rewards) < len(texts):
                rewards = rewards + [rewards[-1]] * (len(texts) - len(rewards))
            else:
                rewards = rewards[:len(texts)]

    # Log what we're passing to the PPO trainer
    print(f"Running PPO step with {len(texts)} text pairs and {len(rewards) if isinstance(rewards, list) else 'tensor'} rewards")

    # Run PPO step with formatted texts and rewards; failures are reported
    # and signalled with None so the caller's loop can continue
    try:
        return self.trainer.step(texts, rewards)
    except Exception as e:
        print(f"Error in PPO step: {e}")
        import traceback
        traceback.print_exc()
        return None

In [None]:
class TrainingOrchestrator:
    """Manages the PPO training process for the portfolio manager

    Experience is gathered by stepping the agent through the environment,
    kept in a bounded buffer, and sampled each iteration to run a PPO
    training pass on the agent.
    """

    def __init__(self, env: "MacroTradingEnv", agent: "PPOPortfolioManager",
                 output_dir="output/ppo_portfolio_manager",
                 use_sequential_training=False, buffer_size=1000,
                 episode_length=21):
        """Store the collaborators and training settings.

        Parameters:
        - env: Environment providing reset(random_start=...) and step(position)
        - agent: Agent providing predict, check_format, ppo_config and the
          trainer-related helpers used by train()
        - output_dir: Directory prefix for periodic checkpoints
        - use_sequential_training: If True, train on the most recent
          experiences instead of a random sample of the buffer
        - buffer_size: Maximum number of experiences kept in the buffer
        - episode_length: Maximum steps per episode (trading days)

        Raises:
        - ValueError: if buffer_size is smaller than 1
        """
        if buffer_size < 1:
            # buffer[-0:] would keep the whole list, i.e. never trim
            raise ValueError("buffer_size must be at least 1")
        self.env = env
        self.agent = agent
        self.buffer = []
        self.buffer_size = buffer_size
        self.episode_length = episode_length  # Trading days per episode
        self.output_dir = output_dir
        self.use_sequential_training = use_sequential_training

    def compute_format_reward(self, response_text: str) -> float:
        """Calculate reward for formatting according to required XML structure

        Returns 0.5 when the agent's own format check passes, 0.3 when all
        three tag pairs are present, 0.1 when exactly two are, else -0.2.
        """
        # Check overall structure
        has_correct_format = self.agent.check_format(response_text)

        # Count how many of the three tag pairs are fully present
        tag_names = ("macro state", "reasoning", "positioning")
        tags_present = sum(
            f"<{name}>" in response_text and f"</{name}>" in response_text
            for name in tag_names
        )

        if has_correct_format:
            return 0.5  # Full format reward
        if tags_present == 3:
            return 0.3  # Tags exist but not in correct order/format
        if tags_present == 2:
            return 0.1  # Some tags exist
        return -0.2  # Format completely wrong

    def collect_experience(self, num_episodes=10):
        """Collect trading experience by running multiple episodes

        Each step's reward is the environment reward plus the format reward.
        Steps are appended to the buffer with their discounted return, and
        the buffer is trimmed to the newest ``buffer_size`` entries.

        Returns the mean per-step (undiscounted) reward, or 0.0 if no step
        was taken.
        """
        all_episode_rewards = []

        for episode in tqdm(range(num_episodes), desc="Collecting experience"):
            state = self.env.reset(random_start=True)
            episode_steps = []
            episode_rewards = []

            # Run one episode
            done = False
            step = 0
            while not done and step < self.episode_length:
                # Get action from agent
                position, response_text, query, response = self.agent.predict(state)

                # Take action in environment
                next_state, reward, done, info = self.env.step(position)

                # Combine trading reward with the format reward
                format_reward = self.compute_format_reward(response_text)
                total_reward = reward + format_reward

                # Store experience; the discounted return is filled in below
                episode_steps.append({
                    'query': query,
                    'response': response,
                    'raw_reward': total_reward,
                    'response_text': response_text,
                    'format_reward': format_reward
                })
                episode_rewards.append(total_reward)

                # Move to next state
                state = next_state
                step += 1

            # Calculate returns with discount and store the episode in the buffer
            returns = self._calculate_returns(episode_rewards)
            for experience, discounted in zip(episode_steps, returns):
                experience['reward'] = discounted
                self.buffer.append(experience)

            # Keep buffer size in check (drop the oldest entries)
            if len(self.buffer) > self.buffer_size:
                self.buffer = self.buffer[-self.buffer_size:]

            all_episode_rewards.extend(episode_rewards)

        # Return average reward per step
        return np.mean(all_episode_rewards) if all_episode_rewards else 0.0

    def _calculate_returns(self, rewards):
        """Calculate discounted returns

        returns[t] = rewards[t] + gamma * returns[t + 1], with gamma taken
        from the agent's PPO config.
        """
        gamma = self.agent.ppo_config.gamma
        returns = []
        running = 0
        # Accumulate from the last step backwards, then flip once; this avoids
        # the quadratic cost of inserting at the front of the list
        for r in reversed(rewards):
            running = r + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def train(self, num_iterations=100, num_episodes_per_iter=5, batch_size=8):
        """Main training loop with full PPO training

        Parameters:
        - num_iterations: Number of collect-then-train iterations
        - num_episodes_per_iter: Episodes collected at the start of each iteration
        - batch_size: Number of buffered experiences used per training pass

        Training errors are printed and the loop continues with the next
        iteration. A checkpoint is saved every 10th iteration.
        """
        for iteration in range(num_iterations):
            # Collect new experience
            avg_reward = self.collect_experience(num_episodes=num_episodes_per_iter)

            # Skip training if buffer is too small
            if len(self.buffer) < batch_size:
                print(f"Iteration {iteration+1}/{num_iterations}: Buffer too small ({len(self.buffer)}), skipping training")
                continue

            try:
                # Select experiences for this training iteration
                if self.use_sequential_training:
                    # Sequential approach: Use most recent experiences
                    start_idx = max(0, len(self.buffer) - batch_size)
                    batch = self.buffer[start_idx:]
                    print(f"Using sequential batch of size {len(batch)} (from idx {start_idx})")
                else:
                    # Random sampling approach (original)
                    batch_indices = np.random.choice(len(self.buffer), min(batch_size, len(self.buffer)), replace=False)
                    batch = [self.buffer[i] for i in batch_indices]
                    print(f"Using random batch of size {len(batch)}")

                # Extract the components
                queries = [item['query'] for item in batch]
                responses = [item['response'] for item in batch]
                rewards = [item['reward'] for item in batch]

                # Create a dataset for training
                train_dataset = self.agent.create_training_dataset(queries, responses, rewards)

                # Initialize a fresh PPOTrainer with this dataset
                self.agent.initialize_trainer(train_dataset=train_dataset)

                # Train for a few steps
                print("Starting PPO training...")

                # Temporarily limit training to one epoch for this batch.
                # NOTE(review): the override is applied after initialize_trainer();
                # whether the trainer honours it depends on when it reads
                # ppo_config -- confirm against initialize_trainer.
                original_num_train_epochs = self.agent.ppo_config.num_train_epochs
                self.agent.ppo_config.num_train_epochs = 1
                try:
                    # Run the full PPO training process
                    self.agent.trainer.train()
                finally:
                    # Always restore, otherwise a failed run would leave the
                    # shared config stuck at one epoch
                    self.agent.ppo_config.num_train_epochs = original_num_train_epochs

                # Log sample response (just the first sample of the batch)
                sample = batch[0]
                print(f"\nSample response: {sample['response_text'][:200]}...")
                print(f"Trading reward: {sample['raw_reward'] - sample['format_reward']:.4f}")
                print(f"Format reward: {sample['format_reward']:.4f}")
                print(f"Total return: {sample['reward']:.4f}")

            except Exception as e:
                # Best-effort: report the failure and keep iterating
                print(f"Error during PPO training: {e}")
                import traceback
                traceback.print_exc()

            # Save model periodically
            if (iteration + 1) % 10 == 0:
                save_path = f"{self.output_dir}/checkpoint-{iteration+1}"
                self.agent.policy_model.save_pretrained(save_path)
                self.agent.tokenizer.save_pretrained(save_path)

            # Logging
            print(f"Iteration {iteration+1}/{num_iterations}")
            print(f"Average Reward: {avg_reward:.4f}")
            print(f"Buffer Size: {len(self.buffer)}")
            print("="*50)