In [None]:
def create_training_dataset(self, query_tensors, response_tensors, rewards):
    """Create a dataset of tokenized experiences for PPOTrainer.

    PPOTrainer expects a dataset with input_ids, not text fields, so every
    query/response tensor is flattened to 1-D, moved to the CPU and stored as
    a NumPy array next to an all-ones attention mask and a float reward.

    An example is added only after all of its fields were converted
    successfully, so the columns of the resulting dataset always have the
    same length even when individual experiences are skipped.

    Parameters:
    - query_tensors: List of query tensors
    - response_tensors: List of response tensors (indexed in parallel with
      query_tensors)
    - rewards: List of reward values; an example whose index lies beyond the
      end of this list gets a reward of 0.0

    Returns:
    - Dataset object with the columns input_ids, query_ids, response_ids,
      attention_mask and rewards. If no example could be processed, two
      copies of a tokenized dummy sentence with reward 0.0 are returned
      instead.
    """
    # PPOTrainer expects tokenized inputs, not text
    formatted_data = {
        "input_ids": [],      # Tokenized prompts (queries)
        "query_ids": [],      # Copy of the queries kept for reference
        "response_ids": [],   # Tokenized responses
        "attention_mask": [], # Attention masks for the input_ids
        "rewards": []         # Reward values
    }

    print(f"Creating dataset from {len(query_tensors)} experiences")

    for i, query in enumerate(query_tensors):
        try:
            response = response_tensors[i]

            # Collapse shapes such as [1, 1, seq_len] or [1, seq_len] to 1-D.
            # reshape() also works for non-contiguous tensors, where view()
            # would raise and the example would be dropped.
            if len(query.shape) > 1:
                query = query.reshape(-1)
            if len(response.shape) > 1:
                response = response.reshape(-1)

            # Every token of the query is a real token, so the mask is all 1s
            attention_mask = torch.ones_like(query, dtype=torch.long)

            reward_value = float(rewards[i]) if i < len(rewards) else 0.0

            # detach() so tensors that still track gradients can be exported
            query_ids = query.detach().cpu().numpy()
            example = {
                "input_ids": query_ids,
                "query_ids": query_ids,
                "response_ids": response.detach().cpu().numpy(),
                "attention_mask": attention_mask.cpu().numpy(),
                "rewards": reward_value,
            }
        except Exception as e:
            print(f"Error processing experience {i}: {e}")
            # Skip this example; nothing has been appended for it yet
            continue

        # Append all fields together so the columns stay aligned
        for column, value in example.items():
            formatted_data[column].append(value)

    # Make sure we have at least some data
    if not formatted_data["input_ids"]:
        print("Warning: No valid examples. Creating dummy data.")
        # Create dummy data with proper token IDs
        dummy_text = "This is a dummy example."
        dummy_encoding = self.tokenizer(dummy_text, return_tensors="pt")
        dummy_ids = dummy_encoding.input_ids[0].cpu().numpy()
        dummy_mask = dummy_encoding.attention_mask[0].cpu().numpy()

        formatted_data["input_ids"] = [dummy_ids] * 2
        formatted_data["query_ids"] = [dummy_ids] * 2
        formatted_data["response_ids"] = [dummy_ids] * 2
        formatted_data["attention_mask"] = [dummy_mask] * 2
        formatted_data["rewards"] = [0.0] * 2

    # Create a dataset from the formatted data
    dataset = Dataset.from_dict(formatted_data)
    print(f"Created dataset with {len(dataset)} examples")
    return dataset

In [None]:
class CustomDataCollator:
    """Custom data collator for PPO training.

    Pads a list of dataset rows to a common length and converts them into a
    batch of tensors. Every row must provide ``input_ids``,
    ``attention_mask`` and ``rewards``.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer and pick the token id used for padding.

        Parameters:
        - tokenizer: object exposing ``pad_token_id`` (and optionally
          ``eos_token_id``, used as a fallback when no pad token is set)
        """
        self.tokenizer = tokenizer
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            # Without a pad token the padded rows would contain None and
            # torch.tensor() would fail; fall back to the EOS token instead.
            pad_token_id = getattr(tokenizer, "eos_token_id", None)
        self.pad_token_id = pad_token_id

    @staticmethod
    def _to_list(values):
        """Return *values* as a plain Python list.

        Accepts lists as well as objects offering ``tolist()`` (NumPy arrays,
        tensors) and other iterables such as tuples.
        """
        to_list = getattr(values, "tolist", None)
        return to_list() if callable(to_list) else list(values)

    def __call__(self, features):
        """Collate *features* into padded tensors.

        Parameters:
        - features: non-empty list of rows with ``input_ids``,
          ``attention_mask`` and ``rewards``

        Returns:
        - dict with ``input_ids`` (long), ``attention_mask`` (long) and
          ``rewards`` (float) tensors; the first two are padded on the right
          to the longest ``input_ids`` in the batch

        Raises:
        - ValueError: if *features* is empty, or if padding is required but
          the tokenizer defines neither a pad nor an EOS token
        """
        rows = [
            (
                self._to_list(feature["input_ids"]),
                self._to_list(feature["attention_mask"]),
                feature["rewards"],
            )
            for feature in features
        ]

        # Longest sequence determines the padded width of the batch
        max_length = max(len(ids) for ids, _, _ in rows)

        needs_padding = any(len(ids) < max_length for ids, _, _ in rows)
        if needs_padding and self.pad_token_id is None:
            raise ValueError(
                "Cannot pad batch: tokenizer defines neither pad_token_id "
                "nor eos_token_id"
            )

        input_ids = []
        attention_mask = []
        rewards = []
        for ids, mask, reward in rows:
            input_ids.append(ids + [self.pad_token_id] * (max_length - len(ids)))
            # Padding positions are masked out with 0
            attention_mask.append(mask + [0] * (max_length - len(mask)))
            rewards.append(reward)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "rewards": torch.tensor(rewards, dtype=torch.float),
        }

class TrainingOrchestrator:
    """Manages the PPO training process for the portfolio manager.

    Experience is collected by letting the agent act in the environment,
    stored in a bounded replay buffer, and then fed to the agent's PPO
    trainer in small batches.
    """

    # Annotations are forward references so that defining the class does not
    # depend on the environment/agent classes being defined first.
    def __init__(self, env: "MacroTradingEnv", agent: "PPOPortfolioManager",
                 output_dir="output/ppo_portfolio_manager",
                 use_sequential_training=False):
        """Set up the orchestrator.

        Parameters:
        - env: environment providing ``reset(random_start=...)`` and ``step(position)``
        - agent: agent providing ``predict``, ``check_format``, ``ppo_config``,
          ``tokenizer``, ``policy_model``, ``create_training_dataset`` and
          ``initialize_trainer``
        - output_dir: directory under which checkpoints are written
        - use_sequential_training: train on the most recent experiences
          instead of a random sample of the buffer
        """
        self.env = env
        self.agent = agent
        self.buffer = []
        self.buffer_size = 1000
        self.episode_length = 21  # Trading days per episode
        self.output_dir = output_dir
        self.use_sequential_training = use_sequential_training

    def compute_format_reward(self, response_text: str) -> float:
        """Calculate reward for formatting according to required XML structure.

        Returns:
        - 0.5 if the agent's ``check_format`` accepts the response
        - 0.3 if all three tag pairs are present but the format check fails
        - 0.1 if exactly two of the three tag pairs are present
        - -0.2 otherwise
        """
        # Check overall structure
        has_correct_format = self.agent.check_format(response_text)

        # Count how many of the required open/close tag pairs are present
        tag_names = ("macro state", "reasoning", "positioning")
        tags_present = sum(
            f"<{tag}>" in response_text and f"</{tag}>" in response_text
            for tag in tag_names
        )

        if has_correct_format:
            return 0.5  # Full format reward
        if tags_present == 3:
            return 0.3  # Tags exist but not in correct order/format
        if tags_present == 2:
            return 0.1  # Some tags exist
        return -0.2  # Format completely wrong

    def collect_experience(self, num_episodes=10):
        """Collect trading experience by running multiple episodes.

        Each step's reward is the environment reward plus the format reward.
        Steps are stored in the buffer with their discounted return; the
        buffer is trimmed to the newest ``buffer_size`` entries.

        Parameters:
        - num_episodes: number of episodes to run

        Returns:
        - mean per-step total reward over all episodes (0.0 if no step ran)
        """
        all_episode_rewards = []

        for _ in tqdm(range(num_episodes), desc="Collecting experience"):
            state = self.env.reset(random_start=True)
            episode_queries = []
            episode_responses = []
            episode_rewards = []
            episode_response_texts = []
            episode_format_rewards = []

            # Run one episode
            done = False
            step = 0
            while not done and step < self.episode_length:
                # Get action from agent
                position, response_text, query, response = self.agent.predict(state)

                # Take action in environment
                next_state, reward, done, info = self.env.step(position)

                # Combine trading and format rewards
                format_reward = self.compute_format_reward(response_text)
                total_reward = reward + format_reward

                # Store experience
                episode_queries.append(query)
                episode_responses.append(response)
                episode_rewards.append(total_reward)
                episode_response_texts.append(response_text)
                episode_format_rewards.append(format_reward)

                # Move to next state
                state = next_state
                step += 1

            # Calculate returns with discount
            returns = self._calculate_returns(episode_rewards)

            # Store the episode's steps in the buffer
            steps = zip(
                episode_queries,
                episode_responses,
                returns,
                episode_rewards,
                episode_response_texts,
                episode_format_rewards,
            )
            for query, response, ret, raw_reward, text, format_reward in steps:
                self.buffer.append({
                    'query': query,
                    'response': response,
                    'reward': ret,
                    'raw_reward': raw_reward,
                    'response_text': text,
                    'format_reward': format_reward
                })

            # Keep buffer size in check (drop the oldest entries)
            if len(self.buffer) > self.buffer_size:
                self.buffer = self.buffer[-self.buffer_size:]

            all_episode_rewards.extend(episode_rewards)

        # Return average reward per step
        return np.mean(all_episode_rewards) if all_episode_rewards else 0.0

    def _calculate_returns(self, rewards):
        """Calculate discounted returns.

        Uses ``gamma`` from the agent's PPO config; element ``t`` of the
        result is ``rewards[t] + gamma * result[t + 1]``.
        """
        gamma = self.agent.ppo_config.gamma
        returns = []
        running_return = 0
        # Accumulate from the last step backwards, then restore step order;
        # appending and reversing avoids repeated front insertions.
        for reward in reversed(rewards):
            running_return = reward + gamma * running_return
            returns.append(running_return)
        returns.reverse()
        return returns

    def _select_batch(self, batch_size):
        """Pick the experiences used for one training iteration."""
        if self.use_sequential_training:
            # Sequential approach: Use most recent experiences
            start_idx = max(0, len(self.buffer) - batch_size)
            batch = self.buffer[start_idx:]
            print(f"Using sequential batch of size {len(batch)} (from idx {start_idx})")
        else:
            # Random sampling approach (original)
            batch_indices = np.random.choice(len(self.buffer), min(batch_size, len(self.buffer)), replace=False)
            batch = [self.buffer[i] for i in batch_indices]
            print(f"Using random batch of size {len(batch)}")
        return batch

    def train(self, num_iterations=100, num_episodes_per_iter=5, batch_size=8):
        """Main training loop with properly formatted data for PPOTrainer.

        Every iteration collects new experience, builds a dataset from a
        batch of buffered experiences, creates a fresh trainer for it and
        runs it. Training errors are printed and the loop continues with the
        next iteration. A checkpoint is saved every 10th iteration.

        Parameters:
        - num_iterations: number of collect/train iterations
        - num_episodes_per_iter: episodes collected per iteration
        - batch_size: number of buffered experiences trained on per iteration
        """
        for iteration in range(num_iterations):
            # Collect new experience
            avg_reward = self.collect_experience(num_episodes=num_episodes_per_iter)

            # Skip training if buffer is too small
            if len(self.buffer) < batch_size:
                print(f"Iteration {iteration+1}/{num_iterations}: Buffer too small ({len(self.buffer)}), skipping training")
                continue

            try:
                # Select experiences for this training iteration
                batch = self._select_batch(batch_size)

                # Extract the components
                queries = [item['query'] for item in batch]
                responses = [item['response'] for item in batch]
                rewards = [item['reward'] for item in batch]

                # Create a dataset with the correct format for PPOTrainer
                train_dataset = self.agent.create_training_dataset(queries, responses, rewards)

                # Create a custom data collator
                data_collator = CustomDataCollator(self.agent.tokenizer)

                # Set shorter training for this batch. The override is applied
                # before the trainer is built so it is in effect no matter
                # whether the trainer reads it at construction or at train
                # time, and it is always undone, even if training fails.
                ppo_config = self.agent.ppo_config
                original_num_train_epochs = ppo_config.num_train_epochs
                ppo_config.num_train_epochs = 1  # Just do one epoch
                try:
                    # Initialize a fresh PPOTrainer with this dataset and collator
                    trainer = self.agent.initialize_trainer(
                        train_dataset=train_dataset,
                        data_collator=data_collator
                    )
                    if trainer is None:
                        # initialize_trainer reports failure by returning None;
                        # do not fall through to a trainer from an earlier batch
                        raise RuntimeError("PPOTrainer initialization failed")

                    print("Starting PPO training...")

                    # Run the full PPO training process
                    trainer.train()
                finally:
                    # Restore original settings
                    ppo_config.num_train_epochs = original_num_train_epochs

                # Log sample response
                sample = batch[0]  # Just use the first sample for logging
                print(f"\nSample response: {sample['response_text'][:200]}...")
                print(f"Trading reward: {sample['raw_reward'] - sample['format_reward']:.4f}")
                print(f"Format reward: {sample['format_reward']:.4f}")
                print(f"Total return: {sample['reward']:.4f}")

            except Exception as e:
                print(f"Error during PPO training: {e}")
                import traceback
                traceback.print_exc()

            # Save model periodically
            if (iteration + 1) % 10 == 0:
                save_path = f"{self.output_dir}/checkpoint-{iteration+1}"
                try:
                    self.agent.policy_model.save_pretrained(save_path)
                    self.agent.tokenizer.save_pretrained(save_path)
                    print(f"Model saved to {save_path}")
                except Exception as e:
                    print(f"Error saving model: {e}")

            # Logging
            print(f"Iteration {iteration+1}/{num_iterations}")
            print(f"Average Reward: {avg_reward:.4f}")
            print(f"Buffer Size: {len(self.buffer)}")
            print("="*50)

In [None]:
def initialize_trainer(self, train_dataset=None, data_collator=None):
    """Initialize the PPO trainer with the given dataset and collator.

    Any trainer built earlier is discarded first, so after a failed
    initialization ``self.trainer`` is ``None`` rather than a trainer that
    belongs to a previous dataset.

    Parameters:
    - train_dataset: Dataset object for training (optional). When omitted, a
      two-row dummy dataset built from a tokenized placeholder sentence is used.
    - data_collator: Custom data collator (optional). When omitted, a
      ``DataCollatorWithPadding`` for ``self.tokenizer`` is used.

    Returns:
    - The initialized PPOTrainer, or None if initialization failed (the error
      and its traceback are printed)
    """
    # Never leave a stale trainer behind if the rebuild below fails
    self.trainer = None

    try:
        # If no train dataset is provided, create a dummy one
        if train_dataset is None:
            # Create a dummy dataset with proper token IDs
            dummy_text = "This is a dummy example."
            dummy_encoding = self.tokenizer(dummy_text, return_tensors="pt")
            dummy_ids = dummy_encoding.input_ids[0].cpu().numpy()
            dummy_mask = dummy_encoding.attention_mask[0].cpu().numpy()

            # Create dataset with proper fields
            dummy_data = {
                "input_ids": [dummy_ids] * 2,
                "attention_mask": [dummy_mask] * 2,
                "rewards": [0.0] * 2
            }
            train_dataset = Dataset.from_dict(dummy_data)

        # If no data collator is provided, create a default one
        if data_collator is None:
            from transformers import DataCollatorWithPadding
            data_collator = DataCollatorWithPadding(self.tokenizer)

        # Initialize the trainer with our dataset and collator
        self.trainer = PPOTrainer(
            args=self.ppo_config,
            processing_class=self.tokenizer,
            model=self.policy_model,
            ref_model=self.ref_model,
            reward_model=self.reward_model,
            train_dataset=train_dataset,
            value_model=self.value_model,
            data_collator=data_collator
        )
    except Exception as e:
        print(f"Error initializing PPOTrainer: {e}")
        import traceback
        traceback.print_exc()
        return None

    print("PPOTrainer initialized successfully!")
    return self.trainer