# In [None]:  (notebook cell marker kept as a comment so the file is valid Python)
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from peft import LoraConfig, get_peft_model
from typing import Dict, List, Tuple, Optional
from collections import deque
import json
import re
import random
from tqdm import tqdm

# =============================================================================
# 1. Stateful Trading Environment with Daily Rollover
# =============================================================================

class DailyTradingEnv:
    """
    Daily trading environment over a single price series.

    - Precomputes rolling volatility, SMA and RSI(14) columns from ``close``
    - Tracks the target position, a rolling news window and recent actions
    - Marks the portfolio to market every day and returns an incremental reward
    - Sizes the holding automatically from the requested exposure

    ``position`` is the target exposure expressed as a fraction of portfolio
    equity, from -1 (fully short) to 1 (fully long). The number of units held
    is derived from it in ``step`` and stored in ``shares``; ``cash`` is the
    remainder, so ``cash + shares * close`` is always the portfolio equity.
    """

    TRANSACTION_COST_RATE = 0.001  # 0.1% friction on the traded notional
    WARMUP_STEPS = 7  # rows of history wanted before the first decision

    def __init__(self, data: pd.DataFrame, initial_cash: float = 1e6):
        """
        Args:
            data: Daily rows with at least a ``close`` column. Optional
                ``news`` and ``date`` columns are surfaced in the state.
                Rows are ordered by index and then renumbered from 0.
            initial_cash: Starting equity of every episode.
        """
        self.data = data.sort_index().reset_index(drop=True)
        self.initial_cash = initial_cash
        self.position = 0.0  # -1 (full short) to 1 (full long)
        self.shares = 0.0  # units held, derived from `position` in step()
        self.cash = initial_cash
        self.current_step = 0
        self.news_window = deque(maxlen=7)  # 7-day news memory
        self.action_history = deque(maxlen=5)  # Last 5 actions
        self.value_history = [initial_cash]  # equity per visited day

        # Precompute indicator columns
        self._init_normalization()

    def _init_normalization(self):
        """Add the rolling indicator columns (vol_*, sma_*, rsi_14) to the data."""
        close = self.data['close']
        daily_returns = close.pct_change()

        for w in (5, 20, 60):
            self.data[f'vol_{w}d'] = daily_returns.rolling(w).std().fillna(0)
            # min_periods=1 averages whatever history exists, so the warm-up
            # rows carry a real price level instead of a misleading 0.
            self.data[f'sma_{w}d'] = close.rolling(w, min_periods=1).mean()

        # RSI = 100 - 100 / (1 + avg_gain / avg_loss), rewritten as
        # 100 * avg_gain / (avg_gain + avg_loss) so that a window without
        # losses evaluates to 100 rather than dividing by zero.
        gain = daily_returns.clip(lower=0)
        loss = -daily_returns.clip(upper=0)
        avg_gain = gain.rolling(14).mean()
        avg_loss = loss.rolling(14).mean()
        rsi = 100 * avg_gain / (avg_gain + avg_loss)
        # NaN here means "not enough history" or "no price movement at all";
        # both are reported as the neutral value 50.
        self.data['rsi_14'] = rsi.clip(0, 100).fillna(50)

    def reset(self, random_start: bool = True):
        """
        Reset the environment and return the initial state.

        Args:
            random_start: When True, start at a random row that leaves at
                least 30 rows of future data; otherwise start right after
                the warm-up rows.
        """
        # Never start beyond the last row, even on very short data sets.
        warmup = min(self.WARMUP_STEPS, len(self.data) - 1)
        max_start = len(self.data) - 30  # Minimum 30 days of future data
        if random_start and max_start > warmup:
            self.current_step = random.randint(warmup, max_start)
        else:
            self.current_step = warmup

        # Reset portfolio and memories
        self.position = 0.0
        self.shares = 0.0
        self.cash = self.initial_cash
        self.news_window.clear()
        self.action_history.clear()
        self.value_history = [self.get_portfolio_value()]

        # Seed the news window with up to 7 days preceding the start (oldest first)
        for idx in range(max(0, self.current_step - 7), self.current_step):
            self.news_window.append(self.data.iloc[idx].get('news', 'No news'))

        # Seed the action history with flat positions
        self.action_history.extend([0.0] * 5)

        return self.get_state()

    def get_portfolio_value(self) -> float:
        """Current equity: cash plus holdings marked at today's close."""
        return self.cash + self.shares * self.data.iloc[self.current_step]['close']

    def step(self, new_position: float) -> Tuple[Dict, float, bool, Dict]:
        """
        Rebalance to ``new_position`` at today's close and advance one day.

        Args:
            new_position: Target exposure as a fraction of equity; values
                outside [-1, 1] are clipped.

        Returns:
            ``(state, reward, done, info)``. ``reward`` is the position-weighted
            price return of the day minus the transaction cost as a fraction of
            the pre-trade equity.
        """
        if self.current_step >= len(self.data) - 1:
            return self.get_state(), 0.0, True, {'status': 'completed'}

        prev_value = self.get_portfolio_value()
        if prev_value <= 0:
            # Nothing left to trade with; also avoids dividing by zero below.
            return self.get_state(), 0.0, True, {'status': 'bankrupt'}

        current_price = self.data.iloc[self.current_step]['close']
        new_position = float(np.clip(new_position, -1.0, 1.0))

        # Cost is charged on the notional that changes hands.
        # NOTE: the change is measured against the previous *target*; drift of
        # the actual exposure caused by price moves is not charged.
        position_change = new_position - self.position
        transaction_cost = abs(position_change) * prev_value * self.TRANSACTION_COST_RATE

        # Convert the target exposure into units held and residual cash.
        equity = prev_value - transaction_cost
        self.shares = new_position * equity / current_price
        self.cash = equity - self.shares * current_price
        self.position = new_position

        # Move to next day
        self.current_step += 1
        new_price = self.data.iloc[self.current_step]['close']

        # Position-weighted price return of the day
        daily_return = (new_price / current_price - 1) * self.position
        new_value = self.get_portfolio_value()
        self.value_history.append(new_value)

        # Roll the memories forward
        self.news_window.append(self.data.iloc[self.current_step].get('news', 'No news'))
        self.action_history.append(new_position)

        # Daily P&L net of friction
        reward = daily_return - (transaction_cost / prev_value)

        done = (self.current_step >= len(self.data) - 1) or (new_value <= 0)

        info = {
            'value': new_value,
            'return': daily_return,
            'volatility': self.data.iloc[self.current_step]['vol_5d'],
            'max_drawdown': self._calculate_drawdown(),
            'position_change': position_change,
            'transaction_cost': transaction_cost
        }

        return self.get_state(), reward, done, info

    def get_state(self) -> Dict:
        """Return the current observation as a dictionary."""
        current_data = self.data.iloc[self.current_step]
        return {
            'date': current_data.get('date', f'Day {self.current_step}'),
            'close': current_data['close'],
            'vol_5d': current_data['vol_5d'],
            'vol_20d': current_data['vol_20d'],
            'sma_5d': current_data['sma_5d'],
            'sma_20d': current_data['sma_20d'],
            'rsi_14': current_data['rsi_14'],
            'position': self.position,
            'portfolio_value': self.get_portfolio_value(),
            'news_window': list(self.news_window),
            'action_history': list(self.action_history)
        }

    def _calculate_drawdown(self, window=21) -> float:
        """
        Maximum drawdown over the last ``window`` recorded equity values.

        Returns:
            The worst peak-to-trough decline as a non-positive fraction
            (e.g. -0.1 for a 10% drop); 0.0 when equity never fell.
        """
        recent = self.value_history[-window:]
        if not recent:
            return 0.0
        peak = recent[0]
        worst = 0.0
        for value in recent:
            peak = max(peak, value)
            if peak > 0:
                worst = min(worst, value / peak - 1)
        return float(worst)

# =============================================================================
# 2. Context-Aware LLM Trader with Memory (using PPO)
# =============================================================================

class DailyTrader:
    """
    LLM trader with:
    - Rolling news context
    - Action history memory
    - Position-aware decision making
    - PPO-based reinforcement learning (TRL ``PPOTrainer`` with LoRA adapters)
    """

    def __init__(self, model_name="Qwen/Qwen2.5-0.5B-Instruct", log_dir="output/ppo_trading/logs"):
        """
        Args:
            model_name: Hugging Face id or path of the causal LM to fine-tune.
            log_dir: Directory handed to the tracker configured by
                ``log_with``; the tensorboard tracker needs a logging
                directory to be set.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name

        # PPO configuration
        self.ppo_config = PPOConfig(
            learning_rate=1.2e-5,
            batch_size=32,
            mini_batch_size=8,
            gradient_accumulation_steps=1,
            optimize_cuda_cache=True,
            ppo_epochs=3,
            gamma=0.98,
            cliprange=0.15,
            cliprange_value=0.1,
            vf_coef=0.1,
            adap_kl_ctrl=True,
            init_kl_coef=0.02,
            target_kl=0.01,
            seed=42,
            log_with="tensorboard",
            project_kwargs={"logging_dir": log_dir},
            use_score_scaling=True,
            use_score_norm=True,
            score_clip=None,
        )

        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # LoRA for parameter-efficient fine-tuning
        peft_config = LoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]  # Adjust based on model architecture
        )

        # The adapter is attached through `peft_config` so the result is still
        # the value-head wrapper PPOTrainer expects. Wrapping the finished
        # wrapper with get_peft_model() would hide it behind a PeftModel and
        # freeze the value head together with the base weights.
        self.model = AutoModelForCausalLMWithValueHead.from_pretrained(
            model_name,
            peft_config=peft_config,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )

        # Built lazily by initialize_trainer()
        self.trainer = None

        # Define text generation parameters
        self.generation_kwargs = {
            "max_new_tokens": 256,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True,
        }

    def _model_device(self):
        """Device of the model parameters (the value-head wrapper has no ``.device``)."""
        return next(self.model.parameters()).device

    def format_state(self, state: Dict) -> str:
        """
        Create the daily trading prompt from an environment state.

        Args:
            state: Dictionary with the keys produced by the environment's
                ``get_state`` (date, close, vol_5d, vol_20d, sma_5d, sma_20d,
                rsi_14, position, portfolio_value, news_window,
                action_history).
        """
        # Joined outside the f-string: backslashes inside f-string expressions
        # are a syntax error before Python 3.12. str() guards against
        # non-string news entries such as NaN.
        news_block = '\n'.join(str(item) for item in state['news_window'])
        actions_block = '\n'.join(
            f"Day-{i+1}: {pos:.2f}"
            for i, pos in enumerate(reversed(state['action_history']))
        )
        return f"""Daily S&P 500 Trading Decision - {state['date']}
Market Context:
- Price: {state['close']:.2f}
- 5D Volatility: {state['vol_5d']:.4f}
- 20D Volatility: {state['vol_20d']:.4f}
- 5D SMA: {state['sma_5d']:.2f}
- 20D SMA: {state['sma_20d']:.2f}
- RSI(14): {state['rsi_14']:.1f}
- Current Position: {state['position']:.2f}
- Portfolio Value: ${state['portfolio_value']:,.2f}

Recent News:
{news_block}

Previous Actions (Last 5 Days):
{actions_block}

Output Format (JSON):
{{
  "analysis": "<market_analysis>",
  "decision": {{
    "position_target": [-1.0 to 1.0],
    "rationale": "<risk-adjusted reasoning>",
    "confidence": [0.0-1.0]
  }},
  "risk_management": {{
    "stop_loss": <optional_price>,
    "profit_target": <optional_price>
  }}
}}"""

    @staticmethod
    def _extract_decision_json(response: str) -> Dict:
        """Return the first JSON object in ``response`` that has a 'decision' key."""
        decoder = json.JSONDecoder()
        for brace in re.finditer(r'\{', response):
            try:
                # raw_decode stops at the end of the object, so markdown
                # fences or chatter after the JSON do not break parsing.
                candidate, _ = decoder.raw_decode(response, brace.start())
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict) and 'decision' in candidate:
                return candidate
        raise ValueError("no JSON object with a 'decision' field found")

    def parse_response(self, response: str) -> Dict:
        """
        Extract the trading decision from the model's text output.

        Returns:
            A dictionary that always has the keys ``position`` (clipped to
            [-1, 1]), ``confidence`` (clipped to [0, 1]), ``stop_loss`` and
            ``take_profit``. When no usable decision is found the neutral
            fallback (position 0.0, confidence 0.5, no risk levels) is
            returned and the problem is printed.
        """
        fallback = {'position': 0.0, 'confidence': 0.5, 'stop_loss': None, 'take_profit': None}
        try:
            decision = self._extract_decision_json(response)
            trade = decision['decision']

            position = float(trade['position_target'])
            if not np.isfinite(position):
                raise ValueError(f"non-finite position_target: {position}")
            position = float(np.clip(position, -1.0, 1.0))

            # A malformed confidence should not throw away a valid position.
            try:
                confidence = float(trade.get('confidence', 0.5))
            except (TypeError, ValueError):
                confidence = 0.5
            if not np.isfinite(confidence):
                confidence = 0.5
            confidence = float(np.clip(confidence, 0.0, 1.0))

            # The risk section is optional.
            risk = decision.get('risk_management')
            if not isinstance(risk, dict):
                risk = {}

            return {
                'position': position,
                'confidence': confidence,
                'stop_loss': risk.get('stop_loss'),
                'take_profit': risk.get('profit_target')
            }
        except (KeyError, TypeError, ValueError, AttributeError) as e:
            print(f"Parse error: {e}")
            print(f"Response: {response}")
            return fallback

    def predict(self, state: Dict) -> Tuple[Dict, str, "torch.Tensor", "torch.Tensor"]:
        """
        Generate a trading decision for ``state``.

        Returns:
            ``(decision, response_text, query_tensor, response_tensor)`` where
            ``query_tensor`` holds the prompt token ids and
            ``response_tensor`` only the newly generated token ids, both 1-D,
            which is the per-sample layout ``PPOTrainer.step`` takes.
        """
        prompt = self.format_state(state)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self._model_device())
        # Explicit pad token avoids the open-ended-generation warning; entries
        # in generation_kwargs take precedence.
        gen_kwargs = {"pad_token_id": self.tokenizer.pad_token_id, **self.generation_kwargs}
        outputs = self.model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            **gen_kwargs
        )
        prompt_length = inputs.input_ids.shape[1]
        query_tensor = inputs.input_ids[0]
        response_tensor = outputs[0][prompt_length:]  # drop the echoed prompt
        response = self.tokenizer.decode(response_tensor, skip_special_tokens=True)
        decision = self.parse_response(response)
        return decision, response, query_tensor, response_tensor

    def initialize_trainer(self, buffer_size=1000):
        """
        Initialize the PPO trainer.

        Args:
            buffer_size: Unused; kept for backward compatibility.
        """
        # A small placeholder dataset; training batches are supplied directly
        # to ppo_step() rather than read from the trainer's dataloader.
        dummy_data = {"prompt": [""] * 10, "response": [""] * 10, "reward": [0.0] * 10}
        from datasets import Dataset
        dummy_dataset = Dataset.from_dict(dummy_data)

        self.trainer = PPOTrainer(
            config=self.ppo_config,
            model=self.model,
            tokenizer=self.tokenizer,
            dataset=dummy_dataset
        )

    def ppo_step(self, query_tensors, response_tensors, rewards):
        """
        Run one PPO optimization step.

        Args:
            query_tensors: Prompt token ids, one entry per sample.
            response_tensors: Generated token ids, one entry per sample.
            rewards: One scalar reward per sample.

        Returns:
            The statistics dictionary returned by ``PPOTrainer.step``.

        Note:
            The number of samples has to equal ``self.ppo_config.batch_size``;
            the trainer rejects any other batch length.
        """
        if self.trainer is None:
            self.initialize_trainer()

        device = self._model_device()
        # The trainer wants lists of 1-D id tensors; reshape(-1) also flattens
        # (1, seq_len) tensors passed by older callers.
        queries = [torch.as_tensor(t).reshape(-1).to(device) for t in query_tensors]
        responses = [torch.as_tensor(t).reshape(-1).to(device) for t in response_tensors]
        # Scores must be a list of scalar tensors, not one stacked tensor.
        scores = [torch.tensor(float(r), dtype=torch.float, device=device) for r in rewards]

        return self.trainer.step(queries, responses, scores)

# =============================================================================
# 3. Training Loop with Continuous Learning
# =============================================================================

class DailyTrainingOrchestrator:
    """
    Manages the daily training process with:
    - Experience replay buffer
    - Discounted, confidence-weighted rewards
    - Periodic PPO policy updates and checkpoints

    NOTE(review): PPO is an on-policy method, while the buffer replays
    samples generated by earlier versions of the policy. Confirm this is
    intended, or shrink ``buffer_size`` so only fresh episodes are used.
    """

    PPO_UPDATES_PER_ITERATION = 4

    def __init__(self, env: "DailyTradingEnv", trader: "DailyTrader", output_dir="output/ppo_trading"):
        """
        Args:
            env: Trading environment providing ``reset`` and ``step``.
            trader: Policy wrapper providing ``predict``, ``ppo_step``,
                ``ppo_config``, ``model`` and ``tokenizer``.
            output_dir: Directory under which checkpoints are written.
        """
        self.env = env
        self.trader = trader
        self.buffer = []  # Experience replay buffer
        self.buffer_size = 1000
        self.episode_length = 21  # Rolling 1-month episodes
        self.output_dir = output_dir

        # Ensure the trader's PPO trainer is initialized
        if trader.trainer is None:
            trader.initialize_trainer()

    def collect_experience(self, num_episodes=10):
        """
        Run ``num_episodes`` episodes and append their steps to the buffer.

        Returns:
            The mean raw per-step reward over all collected steps, or 0.0
            when no step was collected.
        """
        all_episode_rewards = []

        for _ in tqdm(range(num_episodes), desc="Collecting experience"):
            state = self.env.reset(random_start=True)
            transitions = []  # (query, response, response_text, action, reward)

            done = False
            while not done and len(transitions) < self.episode_length:
                action, response_text, query, response = self.trader.predict(state)
                state, reward, done, _info = self.env.step(action['position'])
                transitions.append((query, response, response_text, action, reward))

            episode_rewards = [step[-1] for step in transitions]
            returns = self._calculate_returns(episode_rewards)

            for (query, response, response_text, action, reward), ret in zip(transitions, returns):
                self.buffer.append({
                    'query': query,
                    'response': response,
                    'reward': ret,
                    'raw_reward': reward,
                    'response_text': response_text,
                    'confidence': action['confidence']
                })

            # Keep only the newest `buffer_size` samples
            if len(self.buffer) > self.buffer_size:
                del self.buffer[:-self.buffer_size]

            all_episode_rewards.extend(episode_rewards)

        return np.mean(all_episode_rewards) if all_episode_rewards else 0.0

    def _calculate_returns(self, rewards):
        """Return the discounted return for every step, using the PPO config's gamma."""
        gamma = self.trader.ppo_config.gamma
        returns = []
        running = 0
        for r in reversed(rewards):
            running = r + gamma * running
            returns.append(running)
        returns.reverse()  # built back-to-front; restore chronological order
        return returns

    def _resolve_batch_size(self, requested):
        """
        Pick the batch size used for PPO updates.

        ``PPOTrainer.step`` only accepts batches whose length equals the
        ``batch_size`` of its config, so that value wins over ``requested``
        whenever the trader exposes it.
        """
        config = getattr(self.trader, 'ppo_config', None)
        configured = getattr(config, 'batch_size', None)
        if configured is None:
            return requested
        if configured != requested:
            print(f"Using PPO config batch size {configured} instead of requested {requested}")
        return configured

    def train(self, num_iterations=100, num_episodes_per_iter=5, batch_size=8):
        """
        Main training loop with experience replay.

        Args:
            num_iterations: Number of collect-then-update rounds.
            num_episodes_per_iter: Episodes collected per round.
            batch_size: Requested samples per PPO update; replaced by the
                trainer's configured batch size when the two differ.
        """
        batch_size = self._resolve_batch_size(batch_size)

        for iteration in range(num_iterations):
            # Collect new experience
            avg_reward = self.collect_experience(num_episodes=num_episodes_per_iter)

            # Skip training if buffer is too small
            if len(self.buffer) < batch_size:
                print(f"Iteration {iteration+1}/{num_iterations}: Buffer too small ({len(self.buffer)}), skipping training")
                continue

            for update_idx in range(self.PPO_UPDATES_PER_ITERATION):
                # Sample without replacement from the buffer
                batch_indices = np.random.choice(len(self.buffer), batch_size, replace=False)
                batch = [self.buffer[i] for i in batch_indices]

                queries = [item['query'] for item in batch]
                responses = [item['response'] for item in batch]
                # Discounted return scaled by the model's self-reported confidence
                rewards = [item['reward'] * item['confidence'] for item in batch]

                self.trader.ppo_step(queries, responses, rewards)

                # Log one sample response per iteration
                if update_idx == 0:
                    sample_idx = np.random.choice(len(batch))
                    print(f"\nSample response: {batch[sample_idx]['response_text'][:100]}...")
                    print(f"Reward: {batch[sample_idx]['raw_reward']:.4f}, Return: {batch[sample_idx]['reward']:.4f}")

            # Save model periodically
            if (iteration + 1) % 10 == 0:
                save_path = f"{self.output_dir}/checkpoint-{iteration+1}"
                self.trader.model.save_pretrained(save_path)
                self.trader.tokenizer.save_pretrained(save_path)

            # Logging
            print(f"Iteration {iteration+1}/{num_iterations}")
            print(f"Average Reward: {avg_reward:.4f}")
            print(f"Buffer Size: {len(self.buffer)}")
            print("="*50)

# =============================================================================
# 4. Execution Workflow
# =============================================================================

def run_trading_training(data_path, model_name="Qwen/Qwen2.5-0.5B-Instruct", num_iterations=50):
    """
    Train the PPO trader on a CSV of daily market data.

    Args:
        data_path: CSV file with at least ``date`` and ``close`` columns; a
            ``news`` column is optional.
        model_name: Hugging Face id or path of the causal LM to fine-tune.
        num_iterations: Number of collect-then-update training rounds.

    Returns:
        ``(trader, env)`` after training has finished.
    """
    # Load and prepare data
    data = pd.read_csv(data_path, parse_dates=['date'])

    # The environment replays rows in order, so they must be chronological.
    # read_csv yields a plain RangeIndex, which means sorting by index later
    # would not fix a newest-first file. A stable sort keeps the file order
    # of rows that share a date.
    data = data.sort_values('date', kind='stable').reset_index(drop=True)

    # News is optional context: fill gaps, or the whole column when absent.
    if 'news' in data.columns:
        data['news'] = data['news'].fillna("No significant news")
    else:
        data['news'] = "No significant news"

    # Initialize components
    env = DailyTradingEnv(data)
    trader = DailyTrader(model_name=model_name)
    orchestrator = DailyTrainingOrchestrator(env, trader)

    # Start training
    orchestrator.train(num_iterations=num_iterations, num_episodes_per_iter=5)

    return trader, env

if __name__ == "__main__":
    import os

    # Seed every random number generator used here so runs are repeatable
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    # Example configuration
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Use appropriate model name
    data_path = 'sp500_daily.csv'

    # Train the trader on the configured data set
    trader, env = run_trading_training(data_path, model_name, num_iterations=50)

    # Persist the final model, then its tokenizer, side by side
    for artifact in (trader.model, trader.tokenizer):
        artifact.save_pretrained("output/ppo_trading/final_model")

    print("Training completed successfully!")