In [None]:
class PPOPortfolioManager:
    """
    Portfolio manager using PPO to make macro-driven investment decisions
    Optimized for multi-GPU training
    """
    
    def __init__(self, model_name="Qwen/Qwen2.5-0.5B-Instruct", output_dir="output/ppo_portfolio_manager"):
        """Build the PPO configuration and load the tokenizer and all models.

        Args:
            model_name: Hugging Face model id (or local path) used for the
                tokenizer, the policy model and the reference model.
            output_dir: Directory handed to ``PPOConfig`` as ``output_dir``;
                logs are written to ``<output_dir>/logs``.

        Device placement: with more than one visible GPU the policy model is
        loaded with ``device_map="auto"`` and the reference and reward models
        go to GPU 1. Otherwise everything is placed on GPU 0 and
        ``self.ref_model`` stays ``None`` until ``initialize_trainer`` creates
        it.
        """
        from accelerate import Accelerator
        from transformers import AutoModelForCausalLM, GenerationConfig
        # The value-head wrapper is exported by TRL; transformers has no such class.
        from trl import AutoModelForCausalLMWithValueHead

        # Check available GPUs
        self.num_gpus = torch.cuda.device_count()
        print(f"Found {self.num_gpus} GPU(s)")
        multi_gpu = self.num_gpus > 1

        # Query bf16 support once; it drives the Accelerator precision, the
        # PPOConfig precision flags and the dtype used to load the models.
        use_bf16 = torch.cuda.is_bf16_supported()

        # Create accelerator for distributed training
        self.accelerator = Accelerator(
            gradient_accumulation_steps=4,
            mixed_precision="bf16" if use_bf16 else "fp16",
            log_with="tensorboard"
        )

        # Determine device allocation strategy
        self.policy_device = 0  # First GPU for policy model (primary model)
        self.ref_device = 1 if multi_gpu else 0  # Second GPU for reference model if available
        self.value_device = 0  # Value model on first GPU with policy model
        self.reward_device = 1 if multi_gpu else 0  # Reward model on second GPU if available

        # Multi-GPU optimized PPO configuration
        self.ppo_config = PPOConfig(
            # Larger per-device batch size when more than one GPU is present
            per_device_train_batch_size=4 if multi_gpu else 2,

            # Gradient accumulation for effective larger batch sizes
            gradient_accumulation_steps=4,

            # Enable mixed precision training (exactly one of the two flags is set)
            bf16=use_bf16,
            fp16=not use_bf16,

            # Memory optimizations
            optimize_cuda_cache=True,
            gradient_checkpointing=True,

            # DeepSpeed ZeRO stage 2 with CPU optimizer offload, multi-GPU only
            deepspeed={
                "zero_optimization": {
                    "stage": 2,
                    "offload_optimizer": {
                        "device": "cpu"
                    },
                    "contiguous_gradients": True,
                    "overlap_comm": True
                }
            } if multi_gpu else None,

            # Standard parameters
            learning_rate=5e-5,
            max_grad_norm=1.0,
            num_train_epochs=3,
            seed=42,
            report_to=["tensorboard"],
            output_dir=output_dir,
            logging_dir=f"{output_dir}/logs",
            logging_steps=500,
            run_name="ppo_portfolio_manager",
            save_strategy="steps",
            save_steps=500,

            # PPO specific parameters
            num_ppo_epochs=4,
            gamma=0.99,
            lam=0.95,
            cliprange=0.2,
            cliprange_value=0.2,
            vf_coef=0.1,
            kl_coef=0.05,
            whiten_rewards=False,
            temperature=0.7,
            response_length=256,  # Reduced for memory efficiency
            remove_unused_columns=False
        )

        # Initialize tokenizer; fall back to EOS as the padding token when none is defined
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Select precision
        dtype = torch.bfloat16 if use_bf16 else torch.float16

        # 1. Load policy model
        if multi_gpu:
            # A device map must be a strategy string or a mapping from module
            # names to devices. The previous {gpu_index: [layer numbers]} dict
            # is neither, so let the loader place the modules itself.
            device_map = "auto"
            print(f"Loading policy model with custom device map: {device_map}")
        else:
            device_map = f"cuda:{self.policy_device}"
            print(f"Loading policy model on single GPU: {device_map}")

        self.policy_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map=device_map,  # Spread across GPUs or use specific GPU
            low_cpu_mem_usage=True
        )

        # Set generation config; the KV cache is disabled whenever gradient
        # checkpointing is enabled in the PPO config.
        self.policy_model.generation_config = GenerationConfig.from_pretrained(model_name)
        self.policy_model.config.use_cache = not self.ppo_config.gradient_checkpointing

        # 2. Load reference model on second GPU if available
        if multi_gpu:
            print(f"Loading reference model on GPU {self.ref_device}")
            self.ref_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map=f"cuda:{self.ref_device}",  # Use second GPU
                low_cpu_mem_usage=True
            )
        else:
            # For single GPU, a frozen copy of the policy model is created later
            print("Using frozen copy of policy model as reference model")
            self.ref_model = None  # Created in initialize_trainer when needed

        # 3. Create the value model by wrapping the already-loaded policy model
        print(f"Creating value model linked to policy model's backbone")
        self.value_model = AutoModelForCausalLMWithValueHead.from_pretrained(
            self.policy_model,  # Use the existing policy model to share parameters
            torch_dtype=dtype,
        )

        # 4. Create a lightweight reward model on second GPU if available
        if multi_gpu:
            print(f"Creating reward model on GPU {self.reward_device}")
            device = f"cuda:{self.reward_device}"
        else:
            print(f"Creating reward model on the same GPU as policy")
            device = f"cuda:{self.policy_device}"

        class SimpleRewardModel(torch.nn.Module):
            """Placeholder reward model that returns a constant reward of 1 per sequence.

            The linear head is created and moved between devices but is not
            used by ``forward``.
            """

            def __init__(self, device):
                super().__init__()
                self.device = device
                self.reward_head = torch.nn.Linear(1, 1).to(device)

            def forward(self, input_ids, attention_mask=None):
                # Return a dummy reward of shape (batch, 1) on the input's device
                return torch.ones((input_ids.shape[0], 1), device=input_ids.device)

            def to(self, device):
                # Track the target device and move the head along with it
                self.device = device
                self.reward_head = self.reward_head.to(device)
                return self

        self.reward_model = SimpleRewardModel(device)

        # The trainer is built on demand by initialize_trainer
        self.trainer = None

        # Generation parameters
        self.generation_kwargs = {
            "max_new_tokens": 256,
            "temperature": 0.7,
            "top_p": 0.95,
            "do_sample": True,
            "use_cache": True,
        }

        print(f"Initialized PPO Portfolio Manager with {self.num_gpus} GPUs")
        print(f"GPU Memory Usage - Primary GPU: {torch.cuda.memory_allocated(0)/1024**2:.1f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.1f}MB")
        if multi_gpu:
            print(f"GPU Memory Usage - Secondary GPU: {torch.cuda.memory_allocated(1)/1024**2:.1f}MB / {torch.cuda.get_device_properties(1).total_memory/1024**2:.1f}MB")

    def initialize_trainer(self, train_dataset=None, data_collator=None):
        """Build a ``PPOTrainer`` from the models held by this manager.

        Args:
            train_dataset: Dataset to train on. When ``None`` a two-row
                placeholder dataset is created from the text ``"Example."``.
            data_collator: Collator for batching. When ``None`` a
                ``DataCollatorWithPadding`` padding to multiples of 8 is used.

        Returns:
            The new trainer (also stored in ``self.trainer``), or ``None``
            when construction failed. Failures are printed with a traceback
            and not re-raised; ``self.trainer`` is reset to ``None`` so a
            trainer from an earlier call is never reused by mistake.
        """
        import gc
        import traceback

        try:
            # Create a dummy dataset if none provided
            if train_dataset is None:
                dummy_encoding = self.tokenizer("Example.", return_tensors="pt")
                dummy_ids = dummy_encoding.input_ids[0].cpu().numpy()
                dummy_mask = dummy_encoding.attention_mask[0].cpu().numpy()

                train_dataset = Dataset.from_dict({
                    "input_ids": [dummy_ids] * 2,
                    "attention_mask": [dummy_mask] * 2,
                    "rewards": [0.0] * 2
                })

            # Create data collator if needed
            if data_collator is None:
                from transformers import DataCollatorWithPadding
                data_collator = DataCollatorWithPadding(
                    self.tokenizer,
                    pad_to_multiple_of=8
                )

            # Free up memory before the trainer allocates its own state
            gc.collect()
            torch.cuda.empty_cache()

            # For single GPU: create reference model if needed
            if self.num_gpus <= 1 and self.ref_model is None:
                # Import from the installed TRL package. A relative
                # ``from ..models import ...`` only works inside TRL itself.
                from trl import create_reference_model
                print("Creating reference model from policy model")
                self.ref_model = create_reference_model(self.policy_model)

            # Initialize PPOTrainer with multi-GPU support
            self.trainer = PPOTrainer(
                args=self.ppo_config,
                processing_class=self.tokenizer,
                model=self.policy_model,
                ref_model=self.ref_model,
                reward_model=self.reward_model,
                train_dataset=train_dataset,
                value_model=self.value_model,
                data_collator=data_collator
            )

            print("PPOTrainer initialized successfully for multi-GPU training!")
            return self.trainer

        except Exception as e:
            # Best-effort API: report the failure and signal it with None.
            self.trainer = None
            print(f"Error initializing PPOTrainer: {e}")
            traceback.print_exc()
            return None

In [None]:
class TrainingOrchestrator:
    """Manages the PPO training process optimized for multi-GPU setups"""
    
    def __init__(self, env: MacroTradingEnv, agent: PPOPortfolioManager, output_dir="output/ppo_portfolio_manager", 
                 use_sequential_training=False):
        self.env = env
        self.agent = agent
        self.buffer = []
        self.buffer_size = 500
        self.episode_length = 10
        self.output_dir = output_dir
        self.use_sequential_training = use_sequential_training
        
        # Maximum sequence lengths
        self.max_query_length = 512
        self.max_response_length = 256
        
        # Determine GPU allocation
        self.num_gpus = torch.cuda.device_count()
        # For storing experiences, use second GPU if available (to avoid policy model GPU)
        self.experience_device = 1 if self.num_gpus > 1 else 0
        
        # Create device contexts for balanced GPU usage
        self.experience_context = f"cuda:{self.experience_device}"
        
        print(f"Training orchestrator initialized with {self.num_gpus} GPUs")
        print(f"Using GPU {self.experience_device} for experience collection")
    
    def compute_format_reward(self, response_text: str) -> float:
        """Calculate reward for formatting according to required XML structure"""
        # Keep existing implementation
        has_correct_format = self.agent.check_format(response_text)
        
        has_macro_state = "<macro state>" in response_text and "</macro state>" in response_text
        has_reasoning = "<reasoning>" in response_text and "</reasoning>" in response_text
        has_positioning = "<positioning>" in response_text and "</positioning>" in response_text
        
        if has_correct_format:
            return 0.5
        elif has_macro_state and has_reasoning and has_positioning:
            return 0.3
        elif (has_macro_state and has_reasoning) or (has_macro_state and has_positioning) or (has_reasoning and has_positioning):
            return 0.1
        else:
            return -0.2
    
    def print_gpu_memory_stats(self):
        """Print memory stats for all available GPUs"""
        for i in range(self.num_gpus):
            allocated = torch.cuda.memory_allocated(i) / 1024**2
            reserved = torch.cuda.memory_reserved(i) / 1024**2
            total = torch.cuda.get_device_properties(i).total_memory / 1024**2
            print(f"GPU {i}: {allocated:.1f}MB allocated, {reserved:.1f}MB reserved, {total:.1f}MB total ({allocated/total*100:.1f}%)")
    
    def collect_experience(self, num_episodes=3):
        """Roll out episodes in the environment and append the steps to ``self.buffer``.

        At every step the agent's prediction is executed in the environment,
        the environment reward is added to the format reward, and the
        query/response tensors are truncated and kept. After each episode the
        discounted returns are computed and stored as the ``reward`` of each
        buffer entry (the undiscounted value is kept as ``raw_reward``).

        Args:
            num_episodes: Number of episodes to run.

        Returns:
            Mean of the combined per-step rewards over all collected steps,
            or ``0.0`` when no step was collected.
        """
        all_episode_rewards = []

        # Clear experience collection GPU
        with torch.cuda.device(self.experience_device):
            torch.cuda.empty_cache()

        for _episode in tqdm(range(num_episodes), desc="Collecting experience"):
            state = self.env.reset(random_start=True)
            episode_steps = []    # (query, response, response_text, format_reward) per step
            episode_rewards = []  # combined reward per step

            # Run one episode, capped at self.episode_length steps
            done = False
            step = 0
            while not done and step < self.episode_length:
                # Periodically clear cache
                if step % 3 == 0:
                    with torch.cuda.device(self.experience_device):
                        torch.cuda.empty_cache()

                # Get action from agent
                position, response_text, query, response = self.agent.predict(state)

                # Take action in environment
                next_state, reward, done, info = self.env.step(position)

                # Combine the environment reward with the format reward
                format_reward = self.compute_format_reward(response_text)
                total_reward = reward + format_reward

                # Move tensors to experience device (if different)
                if self.experience_device != self.agent.policy_device:
                    query = query.to(f"cuda:{self.experience_device}")
                    response = response.to(f"cuda:{self.experience_device}")

                # Truncate tensors; the response limit allows for query plus response tokens
                query = self._truncate_tensor(query, self.max_query_length)
                response = self._truncate_tensor(response, self.max_query_length + self.max_response_length)

                episode_steps.append((query, response, response_text, format_reward))
                episode_rewards.append(total_reward)

                # Move to next state
                state = next_state
                step += 1

            # Calculate returns with discount
            returns = self._calculate_returns(episode_rewards)

            # Store the episode in the buffer
            self.buffer.extend(
                {
                    'query': query,
                    'response': response,
                    'reward': discounted,
                    'raw_reward': raw_reward,
                    'response_text': response_text,
                    'format_reward': format_reward
                }
                for (query, response, response_text, format_reward), raw_reward, discounted
                in zip(episode_steps, episode_rewards, returns)
            )

            # Trim after appending. Slicing from a computed negative start
            # breaks when an episode is as long as the buffer (``-0`` keeps
            # everything) or longer (the start index turns positive).
            overflow = len(self.buffer) - self.buffer_size
            if overflow > 0:
                self.buffer = self.buffer[overflow:]

            all_episode_rewards.extend(episode_rewards)

            # Print GPU stats after each episode
            if self.num_gpus > 1:
                self.print_gpu_memory_stats()

        return np.mean(all_episode_rewards) if all_episode_rewards else 0.0
    
    def _truncate_tensor(self, tensor, max_length):
        """Truncate tensor to maximum length to save memory"""
        if tensor is None:
            return tensor
            
        # Handle different tensor shapes
        if len(tensor.shape) == 1:
            return tensor[:min(tensor.shape[0], max_length)]
        elif len(tensor.shape) == 2:
            return tensor[:, :min(tensor.shape[1], max_length)]
        elif len(tensor.shape) == 3:
            return tensor[:, :, :min(tensor.shape[2], max_length)]
        else:
            return tensor

    def _calculate_returns(self, rewards):
        """Calculate discounted returns"""
        gamma = self.agent.ppo_config.gamma
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)
        return returns

    def create_training_dataset(self, queries, responses, rewards):
        """Build a ``Dataset`` of prompts and rewards from a batch of experiences.

        Args:
            queries: Sequence of prompt token tensors. Each one is moved to
                CPU, truncated to ``self.max_query_length`` along its last
                dimension and flattened to one dimension.
            responses: Accepted for interface compatibility; response tokens
                are not written to the dataset.
            rewards: Sequence of scalar rewards aligned with ``queries``. A
                missing entry is treated as ``0.0``.

        Returns:
            A ``Dataset`` with ``input_ids``, ``attention_mask`` (all ones)
            and ``rewards`` columns. Examples that fail to convert are
            reported and skipped. At most 50 randomly chosen examples are kept.
        """
        # NOTE(review): only the query was ever stored as ``input_ids``; the
        # response tensors were copied and truncated but never used. That dead
        # work has been removed - confirm a prompt-only dataset is intended.
        formatted_data = {
            "input_ids": [],
            "attention_mask": [],
            "rewards": []
        }

        # Limit examples for memory efficiency
        max_examples = 50
        if len(queries) > max_examples:
            print(f"Limiting to {max_examples} examples for memory efficiency (from {len(queries)} available)")
            indices = np.random.choice(len(queries), max_examples, replace=False)
            queries = [queries[i] for i in indices]
            # Same fallback as below: a query without a reward gets 0.0.
            rewards = [rewards[i] if i < len(rewards) else 0.0 for i in indices]

        # Process each example
        for i, raw_query in enumerate(queries):
            try:
                query = self._truncate_tensor(raw_query.cpu().detach(), self.max_query_length)
                if len(query.shape) > 1:
                    query = query.flatten()

                attention_mask = torch.ones_like(query, dtype=torch.long)
                reward_value = float(rewards[i]) if i < len(rewards) else 0.0

                input_ids_row = query.numpy()
                attention_mask_row = attention_mask.numpy()
            except Exception as e:
                print(f"Error processing experience {i}: {e}")
                continue

            # Append only after every conversion succeeded so the three
            # columns always have the same length.
            formatted_data["input_ids"].append(input_ids_row)
            formatted_data["attention_mask"].append(attention_mask_row)
            formatted_data["rewards"].append(reward_value)

        # Create dataset
        dataset = Dataset.from_dict(formatted_data)
        print(f"Created dataset with {len(dataset)} examples")
        return dataset

    def train(self, num_iterations=20, num_episodes_per_iter=3, batch_size=4):
        """Training loop optimized for multi-GPU systems"""
        for iteration in range(num_iterations):
            # Clear memory on all GPUs
            for i in range(self.num_gpus):
                with torch.cuda.device(i):
                    torch.cuda.empty_cache()
                    
            import gc
            gc.collect()
            
            print(f"\n{'='*20} Iteration {iteration+1}/{num_iterations} {'='*20}")
            print("GPU memory status before collection:")
            self.print_gpu_memory_stats()
            
            # Collect new experience
            avg_reward = self.collect_experience(num_episodes=num_episodes_per_iter)
            
            # Skip training if buffer is too small
            if len(self.buffer) < batch_size:
                print(f"Buffer too small ({len(self.buffer)}), skipping training")
                continue
            
            try:
                # Clear memory before training
                for i in range(self.num_gpus):
                    with torch.cuda.device(i):
                        torch.cuda.empty_cache()
                
                # Select experiences
                if self.use_sequential_training:
                    start_idx = max(0, len(self.buffer) - batch_size)
                    batch = self.buffer[start_idx:]
                else:
                    batch_indices = np.random.choice(len(self.buffer), min(batch_size, len(self.buffer)), replace=False)
                    batch = [self.buffer[i] for i in batch_indices]
                
                print(f"Using batch of size {len(batch)}")
                
                # Extract components
                queries = [item['query'] for item in batch]
                responses = [item['response'] for item in batch]
                rewards = [item['reward'] for item in batch]
                
                # Create dataset
                train_dataset = self.create_training_dataset(queries, responses, rewards)
                
                # Create optimized data collator for multi-GPU
                from transformers import DataCollatorWithPadding
                data_collator = DataCollatorWithPadding(
                    self.agent.tokenizer,
                    pad_to_multiple_of=8
                )
                
                # Clear references to free memory
                del queries, responses
                for i in range(self.num_gpus):
                    with torch.cuda.device(i):
                        torch.cuda.empty_cache()
                
                # Print GPU status before training
                print("GPU memory status before training:")
                self.print_gpu_memory_stats()
                
                # Initialize trainer for this batch
                self.agent.initialize_trainer(
                    train_dataset=train_dataset,
                    data_collator=data_collator
                )
                
                print("Starting PPO training...")
                
                # Temporary adjust training parameters
                original_num_train_epochs = self.agent.ppo_config.num_train_epochs
                self.agent.ppo_config.num_train_epochs = 1
                
                # Run training with automatic multi-GPU distribution
                self.agent.trainer.train()
                
                # Restore settings
                self.agent.ppo_config.num_train_epochs = original_num_train_epochs
                
                # Log sample
                if len(batch) > 0:
                    sample = batch[0]
                    print(f"\nSample response: {sample['response_text'][:100]}...")
                    print(f"Total reward: {sample['reward']:.4f}")
                
                # Free memory after training
                del self.agent.trainer
                self.agent.trainer = None
                for i in range(self.num_gpus):
                    with torch.cuda.device(i):
                        torch.cuda.empty_cache()
                gc.collect()
                
                # Print GPU status after training
                print("GPU memory status after training:")
                self.print_gpu_memory_stats()
                
            except Exception as e:
                print(f"Error during training: {e}")
                import traceback
                traceback.print_exc()
                
                # Free memory after error
                if hasattr(self.agent, 'trainer') and self.agent.trainer is not None:
                    del self.agent.trainer
                    self.agent.trainer = None
                for i in range(self.num_gpus):
                    with torch.cuda.device(i):
                        torch.cuda.empty_cache()
                gc.collect()
            
            # Save model periodically
            if (iteration + 1) % 5 == 0:
                try:
                    save_path = f"{self.output_dir}/checkpoint-{iteration+1}"
                    self.agent.policy_model.save_pretrained(save_path)
                    self.agent.tokenizer.save_pretrained(save_path)
                    print(f"Model saved to {save_path}")
                except Exception as e:
                    print(f"Error saving model: {e}")
            
            # Logging
            print(f"\nIteration {iteration+1}/{num_iterations}")
            print(f"Average Reward: {avg_reward:.4f}")
            print(f"Buffer Size: {len(self.buffer)}")