# In [None]:
import json
from collections import deque
from typing import Deque, Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# =============================================================================
# 1. Stateful Trading Environment with Daily Rollover
# =============================================================================

class DailyTradingEnv:
    """
    Daily trading environment over a price/news DataFrame.

    - Precomputes rolling volatility, moving averages and RSI(14) columns
    - Tracks the current position, a 7-day news window and the last 5 actions
    - Marks the account to market on every step and keeps an equity curve
      for drawdown reporting

    Accounting model: ``position`` is the fraction of account equity exposed
    to the index (-1 = fully short, 1 = fully long) and ``cash`` holds the
    account equity in dollars. The attribute keeps the name ``cash`` so that
    callers which reset it by direct assignment keep working.

    ``data`` must provide ``close`` and ``news`` columns.
    """

    # Proportional friction charged on the traded notional (0.1%).
    TRANSACTION_COST_RATE = 0.001

    def __init__(self, data: pd.DataFrame, initial_cash: float = 1e6):
        """
        Args:
            data: Daily rows; sorted by index and re-indexed 0..n-1 on a copy.
            initial_cash: Starting account equity in dollars.
        """
        self.data = data.sort_index().reset_index(drop=True)
        self.initial_cash = initial_cash
        self.position = 0.0  # -1 (full short) to 1 (full long)
        self.cash = initial_cash
        self.current_step = 0
        self.news_window = deque(maxlen=7)  # 7-day news memory
        self.action_history = deque(maxlen=5)  # Last 5 actions

        # Equity curve of the running episode, oldest value first.
        self._value_history = [float(initial_cash)]
        # Step index at which the equity curve was last extended; used to
        # notice callers that reposition the environment by assignment.
        self._last_step = 0

        # State normalization
        self._init_normalization()

    def _init_normalization(self):
        """Add rolling volatility, SMA and RSI(14) columns to ``self.data``."""
        close = self.data['close']
        returns = close.pct_change()
        for w in (5, 20, 60):
            self.data[f'vol_{w}d'] = returns.rolling(w).std()
            self.data[f'sma_{w}d'] = close.rolling(w).mean()

        # RSI from the ratio of average gain to average loss over 14 days
        # (simple moving averages). A zero average loss yields RS = inf and
        # therefore RSI = 100.
        delta = close.diff()
        avg_gain = delta.clip(lower=0).rolling(14).mean()
        avg_loss = (-delta).clip(lower=0).rolling(14).mean()
        rsi = 100 - 100 / (1 + avg_gain / avg_loss)
        # A completely flat window gives 0/0; report it as neutral.
        flat = (avg_gain == 0) & (avg_loss == 0)
        self.data['rsi_14'] = rsi.mask(flat, 50.0)

    def reset(self, start_step: int = 0) -> None:
        """
        Start a fresh episode at ``start_step``.

        Restores the initial equity, flattens the position and clears the
        news window, action history and equity curve.

        Raises:
            IndexError: If ``start_step`` is not a valid row of ``data``.
        """
        if not 0 <= start_step < len(self.data):
            raise IndexError(f"start_step {start_step} is outside the data range")
        self.current_step = start_step
        self.cash = self.initial_cash
        self.position = 0.0
        self.news_window.clear()
        self.action_history.clear()
        self._value_history = [self.get_portfolio_value()]
        self._last_step = start_step

    def get_portfolio_value(self) -> float:
        """Current total portfolio value (mark-to-market account equity)."""
        return float(self.cash)

    def _sync_value_history(self) -> None:
        """Restart the equity curve if the caller moved or re-funded the env."""
        current_value = self.get_portfolio_value()
        moved = self.current_step != self._last_step
        refunded = bool(self._value_history) and self._value_history[-1] != current_value
        if moved or refunded:
            # current_step / cash were assigned from outside (e.g. a new
            # episode), so the recorded curve belongs to a different run.
            self._value_history.clear()
        if not self._value_history:
            self._value_history.append(current_value)
        self._last_step = self.current_step

    def step(self, new_position: float) -> Tuple[float, Dict]:
        """
        Execute daily position adjustment and advance one day.

        Args:
            new_position: Target exposure as a fraction of equity.

        Returns:
            ``(daily_return, info)``. ``daily_return`` is the position-weighted
            close-to-close return (transaction costs are charged to the
            account but not deducted from this figure). On the last row the
            call is a no-op returning ``(0.0, {'status': 'completed'})``.
        """
        if self.current_step >= len(self.data) - 1:
            return 0.0, {'status': 'completed'}

        self._sync_value_history()
        closes = self.data['close']
        current_price = closes.iat[self.current_step]

        # Friction is proportional to the traded notional: |delta| x equity.
        position_change = new_position - self.position
        transaction_cost = (
            abs(position_change) * self.cash * self.TRANSACTION_COST_RATE
        )
        self.cash -= transaction_cost

        # Update position
        self.position = new_position

        # Move to next day
        self.current_step += 1
        new_price = closes.iat[self.current_step]

        # Position-weighted return, compounded into the account equity.
        daily_return = float((new_price / current_price - 1) * self.position)
        self.cash *= 1 + daily_return
        new_value = self.get_portfolio_value()
        self._value_history.append(new_value)
        self._last_step = self.current_step

        # Update news window
        self.news_window.append(self.data['news'].iat[self.current_step])
        self.action_history.append(new_position)

        return daily_return, {
            'value': new_value,
            'return': daily_return,
            'volatility': self.data['vol_5d'].iat[self.current_step],
            'max_drawdown': self._calculate_drawdown(),
            'position_change': position_change
        }

    def _calculate_drawdown(self, window=21) -> float:
        """
        Maximum drawdown over the last ``window`` recorded equity values.

        Returns:
            The worst peak-to-trough decline as a non-positive fraction
            (-0.1 means 10% below the running peak); 0.0 when fewer than two
            values are available.
        """
        recent = self._value_history[-window:] if window > 0 else []
        if len(recent) < 2:
            return 0.0
        values = np.asarray(recent, dtype=float)
        # Each value is compared with the highest value seen before it, so a
        # trough is only measured against an earlier peak.
        running_peak = np.maximum.accumulate(values)
        return float(np.min(values / running_peak - 1.0))

# =============================================================================
# 2. Context-Aware LLM Trader with Memory
# =============================================================================

class DailyTrader:
    """
    LLM trader with:
    - Rolling news context
    - Action history memory
    - Position-aware decision making

    Wraps a causal language model, renders the environment state into a text
    prompt and turns the model's JSON answer into a bounded trading decision.
    """

    def __init__(self):
        """Load the tokenizer and model and build the trainer objects."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-1.8B")
        self.model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen1.5-1.8B",
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        # Reuse the end-of-sequence token for padding.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # GRPO configuration for daily trading
        # NOTE(review): batch_size / mini_batch_size / ppo_epochs / kl_coeff /
        # cliprange / target_kl look like the legacy PPOConfig keyword set;
        # confirm that the installed TRL GRPOConfig accepts them.
        self.config = GRPOConfig(
            learning_rate=1.2e-5,
            batch_size=32,
            mini_batch_size=8,
            ppo_epochs=3,
            max_grad_norm=0.5,
            kl_coeff=0.02,
            gamma=0.98,
            cliprange=0.15,
            target_kl=0.01,
            seed=42
        )

        # NOTE(review): confirm the GRPOTrainer constructor signature for the
        # installed TRL version (the documented one takes reward_funcs, args
        # and processing_class rather than tokenizer/config).
        self.trainer = GRPOTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            config=self.config
        )

    def format_state(self, env: "DailyTradingEnv") -> str:
        """
        Create daily trading prompt with context.

        Reads the row at ``env.current_step`` (``date``, ``close``, ``vol_5d``
        and ``rsi_14``), the current position and portfolio value, the news
        window and the action history (most recent action first).
        """
        current_data = env.data.iloc[env.current_step]
        # Joined outside the f-string: backslashes inside f-string expressions
        # are a syntax error before Python 3.12.
        news_block = '\n'.join(str(item) for item in env.news_window)
        history_block = '\n'.join(
            f"Day-{i}: {pos:.2%}"
            for i, pos in enumerate(reversed(env.action_history))
        )
        return f"""Daily S&P 500 Trading Decision - {current_data['date']}
Market Context:
- Price: {current_data['close']:.2f}
- 5D Volatility: {current_data['vol_5d']:.2%}
- RSI(14): {current_data['rsi_14']:.1f}
- Position: {env.position:.2%}
- Portfolio Value: ${env.get_portfolio_value():,.2f}

Recent News:
{news_block}

Previous Actions (Last 5 Days):
{history_block}

Output Format (JSON):
{{
  "analysis": "<market_analysis>",
  "decision": {{
    "position_target": [-1.0 to 1.0],
    "rationale": "<risk-adjusted reasoning>",
    "confidence": [0.0-1.0]
  }},
  "risk_management": {{
    "stop_loss": <optional_price>,
    "profit_target": <optional_price>
  }}
}}"""

    @staticmethod
    def _extract_decision(response: str) -> Dict:
        """
        Return the first JSON object in ``response`` with a ``decision`` dict.

        A ```json fenced block is tried first; otherwise every ``{`` is tried
        as the start of a JSON object, so un-fenced answers and answers
        preceded by non-JSON text are accepted as well.

        Raises:
            ValueError: If no such object is found.
        """
        def _usable(payload) -> bool:
            return isinstance(payload, dict) and isinstance(payload.get('decision'), dict)

        if "```json" in response:
            fenced = response.split("```json")[1].split("```")[0]
            try:
                payload = json.loads(fenced)
            except ValueError:
                payload = None
            if _usable(payload):
                return payload

        decoder = json.JSONDecoder()
        start = response.find('{')
        while start != -1:
            try:
                payload, _ = decoder.raw_decode(response, start)
            except ValueError:
                payload = None
            if _usable(payload):
                return payload
            start = response.find('{', start + 1)
        raise ValueError("no JSON object with a 'decision' section found")

    def parse_response(self, response: str) -> Dict:
        """
        Robust JSON parsing with error correction.

        Returns:
            A dict with ``position`` (clipped to [-1, 1]), ``confidence``
            (clipped to [0, 1], default 0.5), ``stop_loss`` and
            ``take_profit`` (``None`` when absent). If the response cannot be
            parsed, or the numbers are not finite, the error is printed and a
            neutral decision (position 0.0, confidence 0.5) is returned.
        """
        try:
            decision = self._extract_decision(response)
            target = decision['decision']
            position = float(target['position_target'])
            confidence = float(target.get('confidence', 0.5))
            # json accepts NaN/Infinity literals; np.clip would let NaN through.
            if not (np.isfinite(position) and np.isfinite(confidence)):
                raise ValueError("non-finite position_target or confidence")
            risk = decision.get('risk_management')
            if not isinstance(risk, dict):
                # Risk levels are optional; a missing section must not
                # discard an otherwise valid decision.
                risk = {}
            return {
                'position': float(np.clip(position, -1, 1)),
                'confidence': float(np.clip(confidence, 0, 1)),
                'stop_loss': risk.get('stop_loss'),
                'take_profit': risk.get('profit_target')
            }
        except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
            print(f"Parse error: {e}")
            return {
                'position': 0.0,
                'confidence': 0.5,
                'stop_loss': None,
                'take_profit': None
            }

# =============================================================================
# 3. Training Loop with Continuous Learning
# =============================================================================

class DailyTrainingOrchestrator:
    """
    Manages daily training process with:
    - Experience replay buffer
    - Dynamic reward calculation
    - Risk-aware policy updates
    """

    def __init__(self, env: "DailyTradingEnv", trader: "DailyTrader"):
        """
        Args:
            env: Trading environment that is stepped once per decision.
            trader: Provides the tokenizer, model, prompt formatting, response
                parsing and the trainer used for policy updates.
        """
        self.env = env
        self.trader = trader
        self.buffer = deque(maxlen=252*5)  # 5 years of daily data
        self.episode_length = 21  # Rolling 1-month episodes

        # Reward calculation parameters
        self.gamma = 0.95  # Discount factor
        self.risk_aversion = 0.5

    def _calculate_risk_adjusted_reward(self, rewards: List[float]) -> float:
        """
        Calculate discounted cumulative reward with risk penalty.

        The discounted sum ``r0 + gamma*r1 + gamma^2*r2 + ...`` is divided by
        ``1 + risk_aversion * std(rewards)``. An empty sequence yields 0.0.
        """
        if len(rewards) == 0:
            return 0.0
        cumulative = 0.0
        # Walking backwards accumulates the discount without explicit powers.
        for r in reversed(rewards):
            cumulative = r + self.gamma * cumulative
        return cumulative / (1 + self.risk_aversion * np.std(rewards))

    def _sample_batch(self, batch_size: int) -> list:
        """Draw up to ``batch_size`` distinct experiences from the buffer."""
        count = min(len(self.buffer), batch_size)
        # Sample positions rather than the items themselves so numpy never
        # has to coerce the stored dicts into an array.
        picks = np.random.choice(len(self.buffer), size=count, replace=False)
        return [self.buffer[int(i)] for i in picks]

    def run_episode(self):
        """
        Execute one trading month (21 days).

        Starts at a random row with the initial cash, a flat position and
        empty news/action context, then generates, parses and executes one
        decision per day, appending each experience to the buffer.

        Returns:
            The risk-adjusted discounted reward of the episode.

        Raises:
            ValueError: If the data has no more rows than one episode needs.
        """
        last_start = len(self.env.data) - self.episode_length
        if last_start <= 0:
            raise ValueError(
                f"need more than {self.episode_length} rows of data "
                f"to run an episode, got {len(self.env.data)}"
            )
        self.env.current_step = np.random.randint(0, last_start)
        self.env.cash = self.env.initial_cash
        self.env.position = 0.0
        # Context left over from the previous episode belongs to a different,
        # randomly chosen period and must not reach this episode's prompts.
        self.env.news_window.clear()
        self.env.action_history.clear()

        tokenizer = self.trader.tokenizer
        episode_rewards = []

        for _ in range(self.episode_length):
            # Generate daily decision
            prompt = self.trader.format_state(self.env)
            inputs = tokenizer(prompt, return_tensors="pt").to(self.trader.device)
            response = self.trader.model.generate(
                **inputs,  # input_ids together with the attention mask
                max_new_tokens=256,
                do_sample=True,  # temperature/top_p only apply when sampling
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id
            )
            # generate() returns prompt + completion; parse only the new
            # tokens so the JSON template in the prompt is not read back.
            prompt_length = inputs.input_ids.shape[1]
            completion = tokenizer.decode(
                response[0][prompt_length:], skip_special_tokens=True
            )
            action = self.trader.parse_response(completion)

            # Execute trade
            reward, _ = self.env.step(action['position'])

            # Store experience
            self.buffer.append({
                'state': inputs.input_ids,
                'response': response,
                'reward': reward,
                'confidence': action['confidence']
            })

            episode_rewards.append(reward)

            # Early termination
            if self.env.current_step >= len(self.env.data) - 1:
                break

        # Calculate risk-adjusted returns
        return self._calculate_risk_adjusted_reward(episode_rewards)

    def train(self, num_episodes: int = 1000):
        """
        Main training loop with experience replay.

        Each iteration runs one episode, samples a batch from the buffer and
        passes it to the trader's trainer with confidence-weighted rewards.
        Progress is printed every 50 episodes.
        """
        for episode in range(num_episodes):
            # Run new episode
            episode_reward = self.run_episode()

            # Sample from buffer
            batch = self._sample_batch(self.trader.config.batch_size)

            # Prepare training data
            queries = [item['state'] for item in batch]
            responses = [item['response'] for item in batch]
            rewards = torch.tensor(
                [item['reward'] * item['confidence'] for item in batch],
                device=self.trader.device
            )

            # Policy update
            # NOTE(review): `step(queries, responses, rewards)` is the legacy
            # PPOTrainer interface; confirm that the trainer built by
            # DailyTrader exposes it.
            self.trader.trainer.step(
                queries=queries,
                responses=responses,
                rewards=rewards
            )

            # Logging
            if (episode+1) % 50 == 0:
                print(f"Episode {episode+1}")
                print(f"Avg Reward: {episode_reward:.2%}")
                print(f"Buffer Size: {len(self.buffer)}")
                print("="*50)

# =============================================================================
# 4. Execution Workflow
# =============================================================================

if __name__ == "__main__":
    # Read the daily history; the 'date' column is parsed into timestamps.
    data = pd.read_csv('sp500_daily.csv', parse_dates=['date'])
    # Rows without a headline get a placeholder text instead of NaN.
    data['news'] = data['news'].fillna(value="No significant news")

    # Wire the pipeline together: environment, then trader, then orchestrator.
    env = DailyTradingEnv(data=data)
    trader = DailyTrader()
    orchestrator = DailyTrainingOrchestrator(env=env, trader=trader)

    # Run 500 training episodes.
    orchestrator.train(500)