In [None]:
class PPOPortfolioManager:
    """
    Portfolio manager using PPO to make macro-driven investment decisions
    Optimized for multi-GPU training with simpler, more reliable approach
    """
    
    def __init__(self, model_name="Qwen/Qwen2.5-0.5B-Instruct", output_dir="output/ppo_portfolio_manager"):
        """Set up the PPO configuration, tokenizer and all models used for training.

        Args:
            model_name: Model id or path passed to the tokenizer and to
                ``AutoModelForCausalLM.from_pretrained`` for the policy model
                (and for the reference model when two or more GPUs are present).
            output_dir: Directory given to ``PPOConfig``; logs are written to
                ``<output_dir>/logs``.

        Note:
            Models are placed on ``cuda:<index>`` devices, so at least one CUDA
            GPU must be available.
        """
        # Check available GPUs
        self.num_gpus = torch.cuda.device_count()
        print(f"Found {self.num_gpus} GPU(s)")

        # Device strategy: policy on the first GPU, reference on the second when present
        self.policy_device = 0
        self.ref_device = 1 if self.num_gpus > 1 else 0

        # Query bf16 support once; it drives both the trainer precision flags
        # and the dtype the model weights are loaded in.
        use_bf16 = torch.cuda.is_bf16_supported()
        dtype = torch.bfloat16 if use_bf16 else torch.float16

        # Multi-GPU optimized PPO configuration without DeepSpeed
        self.ppo_config = PPOConfig(
            # Batch size depends on available GPUs
            per_device_train_batch_size=4 if self.num_gpus > 1 else 2,

            # Gradient accumulation for effective larger batch sizes
            gradient_accumulation_steps=4,

            # Enable mixed precision training
            bf16=use_bf16,
            fp16=not use_bf16,

            # Memory optimizations
            # NOTE(review): `optimize_cuda_cache` may not be accepted by every TRL
            # PPOConfig version - confirm against the installed release.
            optimize_cuda_cache=True,
            gradient_checkpointing=True,

            # Standard parameters
            learning_rate=5e-5,
            max_grad_norm=1.0,
            num_train_epochs=3,
            seed=42,
            report_to=["tensorboard"],
            output_dir=output_dir,
            logging_dir=f"{output_dir}/logs",
            logging_steps=500,
            run_name="ppo_portfolio_manager",
            save_strategy="steps",
            save_steps=500,

            # PPO specific parameters
            num_ppo_epochs=4,
            gamma=0.99,
            lam=0.95,
            cliprange=0.2,
            cliprange_value=0.2,
            vf_coef=0.1,
            kl_coef=0.05,
            whiten_rewards=False,
            temperature=0.7,
            response_length=256,  # Reduced for memory efficiency
            remove_unused_columns=False
        )

        # Initialize tokenizer; fall back to EOS as the padding token when none is defined
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load models with proper device placement
        from transformers import AutoModelForCausalLM, GenerationConfig

        # 1. Load policy model on first GPU - simple device mapping
        print(f"Loading policy model on GPU {self.policy_device}")
        self.policy_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map=f"cuda:{self.policy_device}",  # Simple device placement
            low_cpu_mem_usage=True
        )

        # Set generation config; the KV cache is switched off on the model config
        # whenever gradient checkpointing is enabled.
        self.policy_model.generation_config = GenerationConfig.from_pretrained(model_name)
        self.policy_model.config.use_cache = not self.ppo_config.gradient_checkpointing

        # 2. Load reference model on second GPU if available
        if self.num_gpus > 1:
            print(f"Loading reference model on GPU {self.ref_device}")
            self.ref_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map=f"cuda:{self.ref_device}",  # Use second GPU
                low_cpu_mem_usage=True
            )
        else:
            # For single GPU, create a lightweight reference model.
            # `create_reference_model` is provided by TRL; a relative import
            # cannot resolve from a notebook or a top-level script.
            print("Creating lightweight reference model on GPU 0")
            from trl import create_reference_model
            self.ref_model = create_reference_model(self.policy_model)

        # 3. Create a value model linked to the policy model.
        # The value-head wrapper is a TRL class; transformers does not export it.
        from trl import AutoModelForCausalLMWithValueHead

        print(f"Creating value model on GPU {self.policy_device}")
        self.value_model = AutoModelForCausalLMWithValueHead.from_pretrained(
            self.policy_model,  # Use policy model backbone
            torch_dtype=dtype,
        )

        # 4. Create a simple reward model
        # NOTE(review): recent TRL PPOTrainer releases appear to expect
        # sequence-classification style reward/value models - confirm that this
        # stub and the value-head wrapper work with the installed version.
        print(f"Creating simple reward model on GPU {self.policy_device}")

        class SimpleRewardModel(torch.nn.Module):
            """Placeholder reward model that scores every sequence with 1.0."""

            def __init__(self):
                super().__init__()
                # Single parameterised layer so the module owns parameters;
                # the stock nn.Module.to() moves it (device and/or dtype).
                self.reward_head = torch.nn.Linear(1, 1)

            def forward(self, input_ids, attention_mask=None):
                # Constant reward of shape (batch, 1) on the inputs' device
                return torch.ones((input_ids.shape[0], 1), device=input_ids.device)

        self.reward_model = SimpleRewardModel().to(f"cuda:{self.policy_device}")

        # Initialize trainer to None
        self.trainer = None

        # Generation parameters, kept in sync with the PPO configuration
        self.generation_kwargs = {
            "max_new_tokens": self.ppo_config.response_length,
            "temperature": self.ppo_config.temperature,
            "top_p": 0.95,
            "do_sample": True,
            "use_cache": True,
        }

        # Print memory usage information for the (at most two) GPUs used here
        print(f"Initialized PPO Portfolio Manager with {self.num_gpus} GPUs")
        for gpu in range(2 if self.num_gpus > 1 else 1):
            used_mb = torch.cuda.memory_allocated(gpu) / 1024**2
            total_mb = torch.cuda.get_device_properties(gpu).total_memory / 1024**2
            print(f"GPU {gpu} Memory: {used_mb:.1f}MB / {total_mb:.1f}MB")

    def initialize_trainer(self, train_dataset=None, data_collator=None):
        """Initialize the PPO trainer with proper multi-GPU support"""
        try:
            # Create dummy dataset if none provided
            if train_dataset is None:
                # Create minimal dataset
                dummy_text = "Example."
                dummy_encoding = self.tokenizer(dummy_text, return_tensors="pt")
                dummy_ids = dummy_encoding.input_ids[0].cpu().numpy()
                dummy_mask = dummy_encoding.attention_mask[0].cpu().numpy()
                
                dummy_data = {
                    "input_ids": [dummy_ids] * 2,
                    "attention_mask": [dummy_mask] * 2,
                    "rewards": [0.0] * 2
                }
                train_dataset = Dataset.from_dict(dummy_data)
            
            # Create data collator if needed
            if data_collator is None:
                from transformers import DataCollatorWithPadding
                data_collator = DataCollatorWithPadding(
                    self.tokenizer, 
                    pad_to_multiple_of=8
                )
            
            # Free up memory
            import gc
            gc.collect()
            torch.cuda.empty_cache()
            
            # Initialize PPOTrainer
            self.trainer = PPOTrainer(
                args=self.ppo_config,
                processing_class=self.tokenizer,
                model=self.policy_model,
                ref_model=self.ref_model,
                reward_model=self.reward_model,
                train_dataset=train_dataset,
                value_model=self.value_model,
                data_collator=data_collator
            )
            
            print("PPOTrainer initialized successfully!")
            return self.trainer
            
        except Exception as e:
            print(f"Error initializing PPOTrainer: {e}")
            import traceback
            traceback.print_exc()
            return None
            
    def predict(self, state: Dict) -> Tuple[float, str, torch.Tensor, torch.Tensor]:
        """Generate a trading decision for ``state`` with the policy model.

        Args:
            state: Environment state; it is turned into a prompt by
                ``self.format_state``.

        Returns:
            A tuple ``(position, response_text, query_tensor, response_tensor)``:
            the position clipped to ``[-1.0, 1.0]``, the decoded newly generated
            text, the prompt token ids, and the raw ``generate`` output. If
            generation or parsing raises, the error is printed and
            ``(0.0, "", prompt_ids, prompt_ids)`` is returned instead.
        """
        # Format the state into a prompt
        prompt = self.format_state(state)

        # Tokenize the input on the policy model's GPU
        inputs = self.tokenizer(prompt, return_tensors="pt").to(f"cuda:{self.policy_device}")

        try:
            # Defaults first so that entries in self.generation_kwargs win.
            # The attention mask and pad id are passed explicitly because the
            # pad token may equal the EOS token, in which case the mask cannot
            # be inferred reliably from the ids.
            gen_kwargs = {
                "attention_mask": inputs.attention_mask,
                "pad_token_id": self.tokenizer.pad_token_id,
                **self.generation_kwargs,
            }
            with torch.no_grad():
                outputs = self.policy_model.generate(inputs.input_ids, **gen_kwargs)

            # Decode the full sequence for position extraction ...
            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # ... and only the tokens after the prompt for the response text.
            # Slicing the decoded string by len(prompt) is unreliable because
            # decoding does not always reproduce the prompt text exactly.
            prompt_length = inputs.input_ids.shape[1]
            response_text = self.tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            )

            # Extract positioning value and keep it inside the valid range
            position = self.extract_positioning(full_response)
            position = float(np.clip(position, -1.0, 1.0))

            return position, response_text, inputs.input_ids, outputs

        except Exception as e:
            print(f"Error in prediction: {e}")
            import traceback
            traceback.print_exc()

            # Return safe defaults: flat position, empty text, prompt ids for both tensors
            return 0.0, "", inputs.input_ids, inputs.input_ids

In [None]:
class TrainingOrchestrator:
    """Manages the PPO training process with simplified multi-GPU support"""
    
    def __init__(self, env: MacroTradingEnv, agent: PPOPortfolioManager, output_dir="output/ppo_portfolio_manager", 
                 use_sequential_training=False):
        self.env = env
        self.agent = agent
        self.buffer = []
        self.buffer_size = 500
        self.episode_length = 10
        self.output_dir = output_dir
        self.use_sequential_training = use_sequential_training
        
        # Maximum sequence lengths
        self.max_query_length = 512
        self.max_response_length = 256
        
        # Determine optimal GPU allocation
        self.num_gpus = torch.cuda.device_count()
        # For buffer storage, use second GPU if available
        self.buffer_device = 1 if self.num_gpus > 1 else 0
        
        print(f"Training orchestrator initialized with {self.num_gpus} GPUs")
        if self.num_gpus > 1:
            print(f"Using GPU 0 for policy model and GPU 1 for reference model and buffer")
        else:
            print(f"Using single GPU for all operations")
    
    def compute_format_reward(self, response_text: str) -> float:
        """Calculate reward for formatting according to required XML structure"""
        has_correct_format = self.agent.check_format(response_text)
        
        has_macro_state = "<macro state>" in response_text and "</macro state>" in response_text
        has_reasoning = "<reasoning>" in response_text and "</reasoning>" in response_text
        has_positioning = "<positioning>" in response_text and "</positioning>" in response_text
        
        if has_correct_format:
            return 0.5
        elif has_macro_state and has_reasoning and has_positioning:
            return 0.3
        elif (has_macro_state and has_reasoning) or (has_macro_state and has_positioning) or (has_reasoning and has_positioning):
            return 0.1
        else:
            return -0.2
    
    def print_gpu_memory_stats(self):
        """Print memory stats for all available GPUs"""
        for i in range(self.num_gpus):
            allocated = torch.cuda.memory_allocated(i) / 1024**2
            reserved = torch.cuda.memory_reserved(i) / 1024**2
            total = torch.cuda.get_device_properties(i).total_memory / 1024**2
            print(f"GPU {i}: {allocated:.1f}MB allocated, {reserved:.1f}MB reserved, {total:.1f}MB total ({allocated/total*100:.1f}%)")
    
    def collect_experience(self, num_episodes=3):
        """Run episodes with the agent and append the transitions to ``self.buffer``.

        Each step asks the agent for a position, applies it to the environment,
        adds the format reward to the environment reward and stores the
        (truncated) query/response tensors. After an episode the per-step
        rewards are turned into discounted returns and the steps are appended
        to the buffer, which is then trimmed to the newest ``self.buffer_size``
        entries.

        Args:
            num_episodes: Number of episodes to run.

        Returns:
            Mean per-step total reward over all collected steps, or ``0.0``
            when no step was taken.
        """
        all_episode_rewards = []

        # Clear memory before starting collection
        torch.cuda.empty_cache()

        for _ in tqdm(range(num_episodes), desc="Collecting experience"):
            state = self.env.reset(random_start=True)
            episode_records = []

            # Run one episode
            done = False
            step = 0
            while not done and step < self.episode_length:
                # Get action from agent (the agent handles device placement)
                position, response_text, query, response = self.agent.predict(state)

                # Take action in environment
                next_state, reward, done, _info = self.env.step(position)

                # Environment reward plus formatting bonus/penalty
                format_reward = self.compute_format_reward(response_text)
                total_reward = reward + format_reward

                # Always keep buffered tensors on the CPU: otherwise up to
                # `buffer_size` entries would pin GPU memory on single-GPU runs.
                if isinstance(query, torch.Tensor):
                    query = query.detach().cpu()
                if isinstance(response, torch.Tensor):
                    response = response.detach().cpu()

                # Truncate tensors for memory efficiency
                query = self._truncate_tensor(query, self.max_query_length)
                response = self._truncate_tensor(
                    response, self.max_query_length + self.max_response_length
                )

                episode_records.append({
                    'query': query,
                    'response': response,
                    'reward': None,  # replaced by the discounted return below
                    'raw_reward': total_reward,
                    'response_text': response_text,
                    'format_reward': format_reward
                })

                # Move to next state
                state = next_state
                step += 1

                # Periodically clear cache
                if step % 5 == 0:
                    torch.cuda.empty_cache()

            # Calculate returns with discount
            episode_rewards = [record['raw_reward'] for record in episode_records]
            returns = self._calculate_returns(episode_rewards)
            for record, discounted_return in zip(episode_records, returns):
                record['reward'] = discounted_return

            # Append first, then drop the oldest entries. Trimming afterwards is
            # correct for every episode length, including episodes as long as
            # (or longer than) the buffer itself.
            self.buffer.extend(episode_records)
            overflow = len(self.buffer) - self.buffer_size
            if overflow > 0:
                self.buffer = self.buffer[overflow:]

            all_episode_rewards.extend(episode_rewards)

            # Print GPU memory stats after each episode
            if self.num_gpus > 1:
                self.print_gpu_memory_stats()

        return np.mean(all_episode_rewards) if all_episode_rewards else 0.0
    
    def _truncate_tensor(self, tensor, max_length):
        """Truncate tensor to maximum length to save memory"""
        if tensor is None:
            return tensor
            
        # Handle different tensor shapes
        if len(tensor.shape) == 1:
            return tensor[:min(tensor.shape[0], max_length)]
        elif len(tensor.shape) == 2:
            return tensor[:, :min(tensor.shape[1], max_length)]
        elif len(tensor.shape) == 3:
            return tensor[:, :, :min(tensor.shape[2], max_length)]
        else:
            return tensor

    def _calculate_returns(self, rewards):
        """Calculate discounted returns"""
        gamma = self.agent.ppo_config.gamma
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)
        return returns

    def create_training_dataset(self, queries, responses, rewards):
        """Build a ``Dataset`` with ``input_ids``, ``attention_mask`` and ``rewards`` columns.

        Args:
            queries: Prompt token ids, one entry per example (tensors are moved
                to the CPU and flattened to 1-D; other sequences are used as-is).
            responses: Accepted for interface compatibility; the dataset is
                built from the queries only, so this argument is not read.
            rewards: Reward per example; examples without a matching reward
                get ``0.0``.

        Returns:
            The dataset. At most 50 randomly chosen examples are kept. Examples
            that fail to convert are reported and skipped; if none survive, a
            two-row dummy dataset is returned.
        """
        # Limit examples if needed
        max_examples = 50  # Cap for memory efficiency
        if len(queries) > max_examples:
            print(f"Limiting to {max_examples} examples (from {len(queries)} available)")
            indices = np.random.choice(len(queries), max_examples, replace=False)
            # Apply the same "missing reward -> 0.0" rule used per example below
            rewards = [rewards[i] if i < len(rewards) else 0.0 for i in indices]
            queries = [queries[i] for i in indices]

        formatted_data = {
            "input_ids": [],
            "attention_mask": [],
            "rewards": []
        }

        # Process each example
        for i, raw_query in enumerate(queries):
            try:
                if isinstance(raw_query, torch.Tensor):
                    # Ensure CPU storage and a flat 1-D sequence of token ids
                    query = raw_query.detach().cpu()
                    if len(query.shape) > 1:
                        query = query.flatten()
                    attention_mask = torch.ones_like(query, dtype=torch.long)
                    query_numpy = query.numpy()
                else:
                    query_numpy = raw_query
                    attention_mask = torch.ones(len(raw_query), dtype=torch.long)
                mask_numpy = attention_mask.numpy()

                reward_value = float(rewards[i]) if i < len(rewards) else 0.0
            except Exception as e:
                print(f"Error processing example {i}: {e}")
                import traceback
                traceback.print_exc()
                continue

            # Append only after every field converted successfully so the
            # three columns can never end up with different lengths.
            formatted_data["input_ids"].append(query_numpy)
            formatted_data["attention_mask"].append(mask_numpy)
            formatted_data["rewards"].append(reward_value)

        # Create dataset
        if not formatted_data["input_ids"]:
            print("Warning: No valid examples processed. Creating dummy data.")
            # Create minimal dummy data
            dummy_data = {"input_ids": [[0, 1, 2]] * 2,
                          "attention_mask": [[1, 1, 1]] * 2,
                          "rewards": [0.0] * 2}
            dataset = Dataset.from_dict(dummy_data)
        else:
            dataset = Dataset.from_dict(formatted_data)

        print(f"Created dataset with {len(dataset)} examples")
        return dataset

    def train(self, num_iterations=20, num_episodes_per_iter=3, batch_size=4):
        """Training loop with proper multi-GPU utilization"""
        for iteration in range(num_iterations):
            # Clear memory before starting a new iteration
            torch.cuda.empty_cache()
            import gc
            gc.collect()
            
            print(f"\n{'='*20} Iteration {iteration+1}/{num_iterations} {'='*20}")
            print("GPU memory status before collection:")
            self.print_gpu_memory_stats()
            
            # Collect new experience
            avg_reward = self.collect_experience(num_episodes=num_episodes_per_iter)
            
            # Skip training if buffer is too small
            if len(self.buffer) < batch_size:
                print(f"Buffer too small ({len(self.buffer)}), skipping training")
                continue
            
            try:
                # Clear memory before training
                torch.cuda.empty_cache()
                
                # Select experiences based on strategy
                if self.use_sequential_training:
                    start_idx = max(0, len(self.buffer) - batch_size)
                    batch = self.buffer[start_idx:]
                else:
                    batch_indices = np.random.choice(len(self.buffer), min(batch_size, len(self.buffer)), replace=False)
                    batch = [self.buffer[i] for i in batch_indices]
                
                print(f"Using batch of size {len(batch)}")
                
                # Extract components
                queries = [item['query'] for item in batch]
                responses = [item['response'] for item in batch]
                rewards = [item['reward'] for item in batch]
                
                # Create dataset with memory management
                train_dataset = self.create_training_dataset(queries, responses, rewards)
                
                # Create data collator
                from transformers import DataCollatorWithPadding
                data_collator = DataCollatorWithPadding(
                    self.agent.tokenizer,
                    pad_to_multiple_of=8
                )
                
                # Free memory
                del queries, responses
                gc.collect()
                torch.cuda.empty_cache()
                
                # Print memory status before training
                print("GPU memory status before training:")
                self.print_gpu_memory_stats()
                
                # Initialize trainer for this batch
                self.agent.initialize_trainer(
                    train_dataset=train_dataset,
                    data_collator=data_collator
                )
                
                print("Starting PPO training...")
                
                # Shorter training run
                original_num_train_epochs = self.agent.ppo_config.num_train_epochs
                self.agent.ppo_config.num_train_epochs = 1
                
                # Run training
                self.agent.trainer.train()
                
                # Restore settings
                self.agent.ppo_config.num_train_epochs = original_num_train_epochs
                
                # Log sample
                if len(batch) > 0:
                    sample = batch[0]
                    print(f"\nSample response: {sample['response_text'][:100]}...")
                    print(f"Total reward: {sample['reward']:.4f}")
                
                # Clean up trainer
                del self.agent.trainer
                self.agent.trainer = None
                torch.cuda.empty_cache()
                gc.collect()
                
            except Exception as e:
                print(f"Error during training: {e}")
                import traceback
                traceback.print_exc()
                
                # Clean up on error
                if hasattr(self.agent, 'trainer') and self.agent.trainer is not None:
                    del self.agent.trainer
                    self.agent.trainer = None
                torch.cuda.empty_cache()
                gc.collect()
            
            # Save model periodically
            if (iteration + 1) % 5 == 0:
                try:
                    save_path = f"{self.output_dir}/checkpoint-{iteration+1}"
                    self.agent.policy_model.save_pretrained(save_path)
                    self.agent.tokenizer.save_pretrained(save_path)
                    print(f"Model saved to {save_path}")
                except Exception as e:
                    print(f"Error saving model: {e}")
            
            # Logging
            print(f"\nIteration {iteration+1}/{num_iterations}")
            print(f"Average Reward: {avg_reward:.4f}")
            print(f"Buffer Size: {len(self.buffer)}")
            print("GPU memory status after iteration:")
            self.print_gpu_memory_stats()