# In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from typing import Dict, List, Tuple
from collections import deque
import json
import re
import random
from tqdm import tqdm
from datasets import Dataset

# System prompt for the portfolio manager.
# It is prepended (after .strip()) to every prompt built by
# PPOPortfolioManager.format_state. The three XML sections it asks for are the
# same tags that check_format / extract_positioning search for in the model's
# reply, so the tag names here and the regexes there must stay in sync.
SYSTEM_PROMPT = """
You are a macro event driven portfolio manager, you make positioning decision of S&P500 index based on market context and news.

Only edit macro state if there are changes in the macro regime that would impact returns of S&P500.

Positioning should be a float that ranges from -1 (full short) to 1 (full long).

You must respond in the following XML format:

<macro state>
...
</macro state>
<reasoning>
...
</reasoning>
<positioning>
...
</positioning>
"""

# =============================================================================
# 1. Trading Environment
# =============================================================================

class MacroTradingEnv:
    """
    Trading environment for macro-driven portfolio management.

    Walks through ``data`` one row per step, tracking the current position,
    a rolling window of headlines and the most recent positions taken.
    The per-step reward is ``returns * position`` minus a proportional
    transaction cost on the change in position.

    Args:
        data: Frame with at least a 'headline' and a 'returns' column. Every
            other column except 'date' is exposed as market context.
        window_size: Number of headlines kept in the rolling window; also the
            earliest step an episode can start from.
        history_size: Number of past positions reported in the state.
        transaction_cost: Cost charged per unit of absolute position change
            (0.001 = 0.1% friction).
        min_episode_steps: Rows kept free after a random start so an episode
            has data ahead of it.

    Raises:
        AssertionError: If a required column is missing.
    """

    # Placeholder used when the headline window reaches before the first row.
    _NO_HEADLINE = "No headline available"
    # Columns that are never exposed through ``market_context``.
    _NON_CONTEXT_COLS = ('headline', 'returns', 'date')

    def __init__(self, data: pd.DataFrame, window_size: int = 7,
                 history_size: int = 5, transaction_cost: float = 0.001,
                 min_episode_steps: int = 30):
        self.data = data.copy()
        self.position = 0.0  # -1 (full short) to 1 (full long)
        self.current_step = 0
        self.window_size = window_size
        self.history_size = history_size
        self.transaction_cost = transaction_cost
        self.min_episode_steps = min_episode_steps
        self.headline_window = deque(maxlen=window_size)
        self.action_history = deque(maxlen=history_size)

        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type and message are kept
        # so existing callers see the same failure.
        for col in ('headline', 'returns'):
            if col not in self.data.columns:
                raise AssertionError(f"DataFrame must contain '{col}' column")

        self._reset_action_history()

    def _reset_action_history(self) -> None:
        """Refill the action history with flat (0.0) positions."""
        self.action_history.clear()
        self.action_history.extend([0.0] * self.history_size)

    def reset(self, random_start: bool = True):
        """
        Reset the environment and return the initial state.

        Args:
            random_start: When True and the data is long enough, start at a
                random step in ``[window_size, len(data) - min_episode_steps]``;
                otherwise start at ``window_size``.

        Returns:
            The state dictionary produced by ``get_state``.
        """
        last_index = len(self.data) - 1
        max_start = len(self.data) - self.min_episode_steps
        if random_start and max_start > self.window_size:
            self.current_step = random.randint(self.window_size, max_start)
        else:
            self.current_step = self.window_size
        # Data shorter than the window would otherwise index past the end.
        self.current_step = min(self.current_step, last_index)

        # Reset state
        self.position = 0.0
        self.headline_window.clear()

        # Seed the window with the headlines ending AT the current step.
        # step() appends the headline of the step it moves to, so the window
        # always ends at `current_step`; seeding it one row earlier (as the
        # code did before) made the first step() skip the starting headline.
        headlines = self.data['headline']
        for offset in range(self.window_size):
            idx = self.current_step - offset
            if 0 <= idx <= last_index:
                self.headline_window.appendleft(headlines.iloc[idx])
            else:
                self.headline_window.appendleft(self._NO_HEADLINE)

        self._reset_action_history()

        return self.get_state()

    def step(self, new_position: float) -> Tuple[Dict, float, bool, Dict]:
        """
        Execute a position adjustment.

        Args:
            new_position: Target position for the current step.

        Returns:
            ``(state, reward, done, info)``. Once the last row is reached the
            call is a no-op returning reward 0.0, ``done=True`` and
            ``{'status': 'completed'}``.
        """
        if self.current_step >= len(self.data) - 1:
            return self.get_state(), 0.0, True, {'status': 'completed'}

        # Friction is proportional to how far the position moved.
        position_change = new_position - self.position
        transaction_cost = abs(position_change) * self.transaction_cost

        self.position = new_position

        # Return earned by holding the new position over the current row
        # (using pre-calculated returns from the dataframe).
        next_return = self.data['returns'].iloc[self.current_step]
        position_return = next_return * self.position

        # Move to next day
        self.current_step += 1

        # Update headline window and action history
        if self.current_step < len(self.data):
            self.headline_window.append(self.data['headline'].iloc[self.current_step])
        self.action_history.append(new_position)

        # Reward is the position's return net of transaction cost.
        reward = position_return - transaction_cost

        done = (self.current_step >= len(self.data) - 1)

        info = {
            'return': position_return,
            'transaction_cost': transaction_cost,
            'position_change': position_change
        }

        return self.get_state(), reward, done, info

    def get_state(self) -> Dict:
        """
        Return the current environment state.

        Returns:
            Dict with 'market_context' (every column of the current row except
            'headline', 'returns' and 'date'), 'headlines', 'position' and
            'action_history'.
        """
        # Clamp so a step past the end still reports the last row.
        if self.current_step >= len(self.data):
            self.current_step = len(self.data) - 1

        current_row = self.data.iloc[self.current_step]

        context = {
            col: current_row[col]
            for col in current_row.index
            if col not in self._NON_CONTEXT_COLS
        }

        return {
            'market_context': context,
            'headlines': list(self.headline_window),
            'position': self.position,
            'action_history': list(self.action_history)
        }

# =============================================================================
# 2. PPO Portfolio Manager
# =============================================================================

class PPOPortfolioManager:
    """
    Portfolio manager using PPO to make macro-driven investment decisions.

    Wraps a causal language model with a value head: it turns an environment
    state into a text prompt, samples a reply, parses the ``<positioning>``
    value from it and feeds (query, response, reward) triples to a TRL
    ``PPOTrainer``.

    NOTE(review): the configuration fields, ``AutoModelForCausalLMWithValueHead``
    and ``trainer.step(...)`` used here belong to TRL's classic PPO API
    (the ``PPOTrainer(config, model, ref_model, tokenizer)`` generation).
    Confirm the pinned ``trl`` version provides that API.

    Args:
        model_name: Hugging Face model id used for policy, reference model
            and tokenizer.
        batch_size: Number of samples per PPO step. The classic
            ``PPOTrainer.step`` rejects a batch whose length differs from the
            configured value, so the default matches the ``batch_size=8`` that
            ``train_portfolio_manager`` passes to the training loop.
        mini_batch_size: Mini-batch size used inside each PPO step.
    """

    # Contents of the <positioning> tag (may span several lines).
    _POSITION_TAG_RE = re.compile(r"<positioning>(.*?)</positioning>", re.DOTALL)
    # A signed number. The alternatives are grouped so the sign applies to
    # integers too: the previous pattern `[-+]?\d*\.\d+|\d+` dropped the sign
    # of "-1" and turned a full short into a full long.
    _NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+\.?)(?:[eE][-+]?\d+)?")
    # All three sections, in the order the system prompt requires.
    _FORMAT_RE = re.compile(
        r"<macro state>.*?</macro state>.*?<reasoning>.*?</reasoning>.*?<positioning>.*?</positioning>",
        re.DOTALL,
    )

    def __init__(self, model_name="Qwen/Qwen2.5-0.5B-Instruct",
                 batch_size: int = 8, mini_batch_size: int = 4):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name

        # PPO configuration
        self.ppo_config = PPOConfig(
            model_name=model_name,
            learning_rate=5e-6,
            batch_size=batch_size,
            mini_batch_size=mini_batch_size,
            gradient_accumulation_steps=1,
            optimize_cuda_cache=True,
            ppo_epochs=4,
            gamma=0.99,
            remove_unused_columns=False,
            target_kl=0.1,
            seed=42,
            log_with="tensorboard",
            max_grad_norm=0.5
        )

        # Initialize tokenizer; fall back to EOS when no pad token is defined.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Initialize model with value head for PPO
        self.model = AutoModelForCausalLMWithValueHead.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )

        # Created lazily by initialize_trainer()
        self.trainer = None

        # Define text generation parameters
        self.generation_kwargs = {
            "max_new_tokens": 512,
            "temperature": 0.7,
            "top_p": 0.95,
            "do_sample": True,
        }

    def _model_device(self):
        """Return the device the policy lives on, without assuming the wrapper exposes ``.device``."""
        device = getattr(self.model, "device", None)
        if device is None:
            device = next(self.model.parameters()).device
        return device

    def format_state(self, state: Dict) -> str:
        """
        Create the prompt for one environment state.

        Args:
            state: Dict with 'market_context', 'headlines', 'position' and
                'action_history', as returned by ``MacroTradingEnv.get_state``.

        Returns:
            The system prompt followed by market context, current position,
            recent headlines and previous positions.
        """
        context_parts = []
        for key, value in state['market_context'].items():
            if isinstance(value, (int, float)):
                # More decimals for smaller magnitudes so tiny values are not
                # rounded away.
                magnitude = abs(value)
                if magnitude < 0.01:
                    formatted_value = f"{value:.6f}"
                elif magnitude < 1:
                    formatted_value = f"{value:.4f}"
                else:
                    formatted_value = f"{value:.2f}"
            else:
                formatted_value = str(value)
            context_parts.append(f"{key}: {formatted_value}")

        headlines_str = "\n".join(f"- {h}" for h in state['headlines'])
        positions_str = ", ".join(f"{pos:.2f}" for pos in state['action_history'])

        sections = [
            f"{SYSTEM_PROMPT.strip()}\n\n",
            "Market Context:\n",
            ", ".join(context_parts) + "\n\n",
            f"Current Position: {state['position']:.2f}\n\n",
            "Recent Headlines:\n",
            headlines_str + "\n\n",
            f"Previous Positions: [{positions_str}]",
        ]
        return "".join(sections)

    def extract_positioning(self, text: str) -> float:
        """
        Extract the positioning value from an XML response.

        Args:
            text: Raw model reply.

        Returns:
            The first signed number inside ``<positioning>...</positioning>``,
            or 0.0 (flat) when the tag is missing or holds no number.
        """
        if not isinstance(text, str):
            print(f"Error extracting position: expected str, got {type(text).__name__}")
            return 0.0

        tag = self._POSITION_TAG_RE.search(text)
        if tag is None:
            return 0.0

        position_str = tag.group(1).strip()
        number = self._NUMBER_RE.search(position_str)
        if number is None:
            # Includes "nan"/"inf", which float() would accept but which are
            # not usable positions.
            print(f"Could not convert position to float: {position_str}")
            return 0.0
        return float(number.group())

    def check_format(self, text: str) -> bool:
        """Return True when the reply contains the three required XML sections in order."""
        return self._FORMAT_RE.search(text) is not None

    def predict(self, state: Dict) -> Tuple[float, str, torch.Tensor, torch.Tensor]:
        """
        Generate a trading decision for the given state.

        Args:
            state: Environment state dictionary (see ``format_state``).

        Returns:
            ``(position, response_text, query_ids, output_ids)`` where
            ``position`` is clipped to [-1, 1], ``query_ids`` are the prompt
            token ids as returned by the tokenizer and ``output_ids`` is the
            raw ``generate`` output (prompt followed by the reply).
        """
        prompt = self.format_state(state)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self._model_device())

        # Pass the attention mask and pad id explicitly: pad may equal EOS
        # here, in which case generate cannot infer the mask on its own.
        # User-supplied generation_kwargs win over the pad default.
        gen_kwargs = {"pad_token_id": self.tokenizer.pad_token_id, **self.generation_kwargs}
        outputs = self.model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            **gen_kwargs
        )

        # Decode only the newly generated tokens.
        prompt_len = inputs.input_ids.shape[1]
        response = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        position = float(np.clip(self.extract_positioning(response), -1.0, 1.0))

        return position, response, inputs.input_ids, outputs

    def initialize_trainer(self):
        """
        Create the PPO trainer and a frozen reference copy of the base model.

        NOTE(review): uses the classic TRL constructor
        ``PPOTrainer(config, model, ref_model, tokenizer)``, which is the API
        that ``self.ppo_config`` and ``ppo_step`` are written against.
        """
        # Reference model: a fresh copy of the base weights used for the KL term.
        ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
            self.model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )

        self.trainer = PPOTrainer(
            config=self.ppo_config,
            model=self.model,
            ref_model=ref_model,
            tokenizer=self.tokenizer
        )

    def ppo_step(self, query_tensors, response_tensors, rewards):
        """
        Run one PPO optimization step.

        Args:
            query_tensors: Prompt token ids, one tensor per sample, as
                returned by ``predict`` (1-D or with a leading batch axis of 1).
            response_tensors: Full ``generate`` outputs (prompt + reply), one
                per sample, same layout as ``query_tensors``.
            rewards: One scalar reward per sample.

        Returns:
            The statistics returned by ``PPOTrainer.step``.
        """
        if self.trainer is None:
            self.initialize_trainer()

        device = self._model_device()
        queries, responses, scores = [], [], []
        for query, full_output, reward in zip(query_tensors, response_tensors, rewards):
            # predict() hands back (1, seq) tensors; flatten so the prompt is
            # stripped along the token axis rather than the batch axis.
            query_ids = torch.as_tensor(query).reshape(-1).to(device)
            output_ids = torch.as_tensor(full_output).reshape(-1).to(device)
            queries.append(query_ids)
            responses.append(output_ids[query_ids.shape[0]:])
            scores.append(torch.tensor(float(reward), dtype=torch.float, device=device))

        # Classic TRL signature: lists of token tensors plus scalar scores.
        return self.trainer.step(queries, responses, scores)

# =============================================================================
# 3. Training Orchestrator
# =============================================================================

class TrainingOrchestrator:
    """
    Manages the PPO training process for the portfolio manager.

    Alternates between rolling out episodes in the environment (storing each
    step in a bounded replay buffer) and running PPO updates on random
    batches drawn from that buffer.

    Args:
        env: Trading environment providing ``reset`` / ``step``.
        agent: Portfolio manager providing ``predict``, ``check_format``,
            ``ppo_step``, ``ppo_config``, ``model`` and ``tokenizer``.
        output_dir: Directory under which checkpoints are written.
    """

    def __init__(self, env: "MacroTradingEnv", agent: "PPOPortfolioManager", output_dir="output/ppo_portfolio_manager"):
        self.env = env
        self.agent = agent
        self.buffer = []
        self.buffer_size = 1000
        self.episode_length = 21  # Trading days per episode
        self.output_dir = output_dir

        # Ensure the agent's PPO trainer is initialized
        if agent.trainer is None:
            agent.initialize_trainer()

    def compute_format_reward(self, response_text: str) -> float:
        """
        Score how well a reply follows the required XML structure.

        Returns:
            0.5 when all sections appear in the required order, 0.3 when all
            three tag pairs exist but not in that layout, 0.1 when exactly two
            tag pairs exist, and -0.2 otherwise.
        """
        if self.agent.check_format(response_text):
            return 0.5  # Full format reward

        # Count how many sections have both their opening and closing tag.
        tags_present = sum(
            f"<{tag}>" in response_text and f"</{tag}>" in response_text
            for tag in ("macro state", "reasoning", "positioning")
        )
        if tags_present == 3:
            return 0.3  # Tags exist but not in correct order/format
        if tags_present == 2:
            return 0.1  # Some tags exist
        return -0.2  # Format completely wrong

    def collect_experience(self, num_episodes=10):
        """
        Run ``num_episodes`` episodes and store every step in the buffer.

        Each stored item carries the query/response tensors, the discounted
        return from that step onward ('reward'), the undiscounted step reward
        ('raw_reward'), the reply text and its format reward.

        Returns:
            Mean undiscounted per-step reward over all collected steps, or
            0.0 when nothing was collected.
        """
        all_episode_rewards = []

        for episode in tqdm(range(num_episodes), desc="Collecting experience"):
            state = self.env.reset(random_start=True)
            episode_queries = []
            episode_responses = []
            episode_rewards = []
            episode_response_texts = []
            episode_format_rewards = []

            # Run one episode, capped at episode_length steps.
            done = False
            step = 0
            while not done and step < self.episode_length:
                position, response_text, query, response = self.agent.predict(state)
                next_state, reward, done, info = self.env.step(position)

                # Trading reward plus a shaping term for well-formed replies.
                format_reward = self.compute_format_reward(response_text)
                total_reward = reward + format_reward

                episode_queries.append(query)
                episode_responses.append(response)
                episode_rewards.append(total_reward)
                episode_response_texts.append(response_text)
                episode_format_rewards.append(format_reward)

                state = next_state
                step += 1

            returns = self._calculate_returns(episode_rewards)

            steps = zip(episode_queries, episode_responses, returns,
                        episode_rewards, episode_response_texts, episode_format_rewards)
            for query, response, ret, raw, text, fmt in steps:
                self.buffer.append({
                    'query': query,
                    'response': response,
                    'reward': ret,
                    'raw_reward': raw,
                    'response_text': text,
                    'format_reward': fmt
                })

            # Keep only the most recent buffer_size items.
            if len(self.buffer) > self.buffer_size:
                self.buffer = self.buffer[-self.buffer_size:]

            all_episode_rewards.extend(episode_rewards)

        # Return average reward per step
        return np.mean(all_episode_rewards) if all_episode_rewards else 0.0

    def _calculate_returns(self, rewards):
        """
        Calculate discounted returns.

        Args:
            rewards: Per-step rewards of one episode, in time order.

        Returns:
            List of the same length where entry ``t`` is
            ``sum(gamma**k * rewards[t + k])``, using the agent's PPO gamma.
        """
        gamma = self.agent.ppo_config.gamma
        discounted = []
        running = 0
        # Accumulate back-to-front, then flip once; avoids the quadratic cost
        # of inserting at the head of the list on every step.
        for r in reversed(rewards):
            running = r + gamma * running
            discounted.append(running)
        discounted.reverse()
        return discounted

    def train(self, num_iterations=100, num_episodes_per_iter=5, batch_size=8,
              updates_per_iter=4, save_every=10):
        """
        Main training loop with experience replay.

        Args:
            num_iterations: Number of collect-then-update rounds.
            num_episodes_per_iter: Episodes rolled out per round.
            batch_size: Samples drawn from the buffer for each PPO update.
            updates_per_iter: PPO updates run per round.
            save_every: Write a checkpoint every this many rounds; a falsy
                value disables periodic checkpoints.
        """
        for iteration in range(num_iterations):
            avg_reward = self.collect_experience(num_episodes=num_episodes_per_iter)

            # Skip training if buffer is too small
            if len(self.buffer) < batch_size:
                print(f"Iteration {iteration+1}/{num_iterations}: Buffer too small ({len(self.buffer)}), skipping training")
                continue

            for update in range(updates_per_iter):
                # Sample without replacement from the buffer.
                batch_indices = np.random.choice(len(self.buffer), min(batch_size, len(self.buffer)), replace=False)
                batch = [self.buffer[i] for i in batch_indices]

                queries = [item['query'] for item in batch]
                responses = [item['response'] for item in batch]
                rewards = [item['reward'] for item in batch]

                self.agent.ppo_step(queries, responses, rewards)

                # Log sample response and reward for first update
                if update == 0:
                    sample = batch[np.random.choice(len(batch))]
                    print(f"\nSample response: {sample['response_text'][:200]}...")
                    print(f"Trading reward: {sample['raw_reward'] - sample['format_reward']:.4f}")
                    print(f"Format reward: {sample['format_reward']:.4f}")
                    print(f"Total return: {sample['reward']:.4f}")

            # Save model periodically
            if save_every and (iteration + 1) % save_every == 0:
                save_path = f"{self.output_dir}/checkpoint-{iteration+1}"
                self.agent.model.save_pretrained(save_path)
                self.agent.tokenizer.save_pretrained(save_path)

            # Logging
            print(f"Iteration {iteration+1}/{num_iterations}")
            print(f"Average Reward: {avg_reward:.4f}")
            print(f"Buffer Size: {len(self.buffer)}")
            print("="*50)

# =============================================================================
# 4. Execution Function
# =============================================================================

def train_portfolio_manager(df, model_name="Qwen/Qwen2.5-0.5B-Instruct", 
                           num_iterations=50, output_dir="output/ppo_portfolio_manager"):
    """
    Train the PPO portfolio manager end to end.

    Seeds NumPy, PyTorch and ``random`` with 42, builds the environment, the
    agent and the orchestrator, runs training (5 episodes per iteration,
    batch size 8) and saves the final model and tokenizer under
    ``{output_dir}/final_model``.

    Args:
        df: Market data frame handed to ``MacroTradingEnv``.
        model_name: Model id handed to ``PPOPortfolioManager``.
        num_iterations: Number of training iterations.
        output_dir: Directory for checkpoints and the final model.

    Returns:
        ``(agent, env)``: the trained manager and the environment it used.
    """
    # Same seed for every RNG, applied in the order NumPy, PyTorch, random.
    seed = 42
    for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
        seed_rng(seed)

    # Build the pieces and wire them together.
    trading_env = MacroTradingEnv(df)
    manager = PPOPortfolioManager(model_name=model_name)
    trainer_loop = TrainingOrchestrator(trading_env, manager, output_dir=output_dir)

    trainer_loop.train(
        num_iterations=num_iterations,
        num_episodes_per_iter=5,
        batch_size=8,
    )

    # Model first, then tokenizer, both into the same directory.
    final_dir = f"{output_dir}/final_model"
    for artifact in (manager.model, manager.tokenizer):
        artifact.save_pretrained(final_dir)

    print("Training completed successfully!")
    return manager, trading_env

# Usage example: train on a CSV of market data when run as a script.
if __name__ == "__main__":
    # Load data (example)
    df = pd.read_csv('market_data.csv')

    # Derive 'returns' when the file does not provide it: the percentage
    # change of 'close', shifted up one row so each row carries the change
    # to the following row.
    if 'returns' not in df:
        df['returns'] = df['close'].pct_change().shift(-1)

    # Train portfolio manager
    agent, env = train_portfolio_manager(df, num_iterations=50)