# In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
from typing import Tuple, Optional
import gym

class TrueRandomSequenceDataset(Dataset):
    """Dataset that serves randomly positioned, fixed-length row windows.

    The index passed to ``__getitem__`` is ignored: every access draws a fresh
    random start row, so reading the same index twice generally returns
    different windows.

    Args:
        dataframe: Source table; windows are slices of consecutive rows.
        sequence_length: Number of consecutive rows in every returned window.
        max_overlap_ratio: Only used to derive ``min_stride``; the random
            sampling itself does not enforce any overlap limit.

    Raises:
        ValueError: If ``sequence_length`` exceeds the number of rows, because
            no window of that length can be drawn.
    """

    def __init__(self, dataframe, sequence_length, max_overlap_ratio=0.9):
        self.dataframe = dataframe
        self.sequence_length = sequence_length
        self.max_overlap_ratio = max_overlap_ratio
        # Kept as a public attribute; nothing in this class reads it.
        self.min_stride = max(1, int(sequence_length * (1 - max_overlap_ratio)))
        # Largest start row (inclusive) that still leaves a full window.
        self.max_start_idx = len(dataframe) - sequence_length
        if self.max_start_idx < 0:
            # Fail here with a clear message instead of later, inside
            # __getitem__, with random.randint's "empty range" error.
            raise ValueError(
                f"sequence_length ({sequence_length}) exceeds the number of "
                f"rows in dataframe ({len(dataframe)})"
            )

    def __len__(self):
        # Nominal epoch size: at least 10000 draws even for short tables,
        # since items are sampled at random rather than enumerated.
        return max(10000, self.max_start_idx)

    def __getitem__(self, idx):
        # random.randint is inclusive on both ends, so the last valid window
        # (ending on the final row) can be selected.
        start_idx = random.randint(0, self.max_start_idx)
        end_idx = start_idx + self.sequence_length
        sequence = self.dataframe.iloc[start_idx:end_idx].values
        return torch.tensor(sequence, dtype=torch.float32)

class MDPEnvironment:
    """Episodic MDP that walks through one fixed sequence, one row per step.

    Each observation is the current row of the sequence with two extra
    entries appended: the normalized step index and the normalized time
    remaining. An episode ends once the step counter reaches ``T - 1``.
    """

    def __init__(self, sequence_data: torch.Tensor, reward_function=None):
        """
        Initialize MDP environment with sequence data

        Args:
            sequence_data: Tensor of shape (T, E) - one sequence from dataset
            reward_function: Optional callable invoked as
                ``reward_function(observation, action, step, sequence_data)``;
                defaults to ``_default_reward_function``.
        """
        self.sequence_data = sequence_data  # Shape: (T, E)
        self.T, self.E = sequence_data.shape
        self.current_step = 0
        self.reward_function = reward_function or self._default_reward_function

        # State space: current observation + additional features
        self.observation_dim = self.E + 2  # features + step + time_remaining

    def reset(self) -> torch.Tensor:
        """Rewind to step 0 and return the first observation."""
        self.current_step = 0
        return self._get_observation()

    def step(self, action: int) -> Tuple[torch.Tensor, float, bool, dict]:
        """
        Take action and return (next_state, reward, done, info)

        Args:
            action: Action to take (0: hold, 1: buy, 2: sell for trading example)

        Returns:
            ``(next_obs, reward, done, info)``. When ``done`` is True,
            ``next_obs`` is an all-zero vector of length ``observation_dim``
            and ``info['raw_data']`` is None.
        """
        # The reward is computed for the state the action was taken in,
        # i.e. before the step counter advances.
        current_obs = self._get_observation()
        reward = self.reward_function(current_obs, action, self.current_step, self.sequence_data)

        self.current_step += 1

        # The final row is never acted on: the default reward needs a
        # following row to measure the price change against.
        done = self.current_step >= self.T - 1

        next_obs = torch.zeros(self.observation_dim) if done else self._get_observation()

        info = {
            'current_step': self.current_step,
            'action': action,
            'raw_data': None if done else self.sequence_data[self.current_step],
        }

        return next_obs, reward, done, info

    def _get_observation(self) -> torch.Tensor:
        """Return the current row plus the two time features, or zeros past the end."""
        if self.current_step >= self.T:
            return torch.zeros(self.observation_dim)

        current_features = self.sequence_data[self.current_step]

        # Both time features are fractions of the sequence length T.
        step_feature = torch.tensor([float(self.current_step) / self.T])
        time_remaining = torch.tensor([float(self.T - self.current_step) / self.T])

        return torch.cat([current_features, step_feature, time_remaining], dim=0)

    def _default_reward_function(self, observation, action, step, sequence_data):
        """Default reward function - customize based on your problem

        Example trading reward: the relative change of column 0 (assumed to
        be the price) between ``step`` and ``step + 1``, signed by the action.

        Returns:
            The relative price change for buy (1), its negation for sell (2),
            and 0.0 for any other action, for the last row, or when the
            current price is zero.
        """
        # No following row to compare against on the last step.
        if step >= len(sequence_data) - 1:
            return 0.0

        current_price = sequence_data[step, 0]  # Assume first feature is price
        next_price = sequence_data[step + 1, 0]

        # A zero price makes the relative change undefined (inf/NaN); treat
        # it as "no reward" so one bad row cannot poison the returns.
        if float(current_price) == 0.0:
            return 0.0

        price_change = (next_price - current_price) / current_price

        if action == 1:  # Buy: profit from price increase
            reward = price_change
        elif action == 2:  # Sell: profit from price decrease
            reward = -price_change
        else:  # Hold
            reward = 0.0

        return float(reward)

class SimplePolicyNetwork(nn.Module):
    """Feed-forward policy: two ReLU hidden layers, then a softmax over actions."""

    def __init__(self, observation_dim: int, action_dim: int, hidden_dim: int = 64):
        super().__init__()
        # Hidden stack: observation -> hidden -> hidden, each followed by ReLU.
        widths = (observation_dim, hidden_dim, hidden_dim)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Output head: one probability per action along the last dimension.
        stack.append(nn.Linear(hidden_dim, action_dim))
        stack.append(nn.Softmax(dim=-1))
        self.network = nn.Sequential(*stack)

    def forward(self, observation: torch.Tensor) -> torch.Tensor:
        """Return the action probabilities for ``observation``."""
        probabilities = self.network(observation)
        return probabilities

class MDPAgent:
    """Agent that learns to act in the MDP environment.

    Owns a ``SimplePolicyNetwork`` and an Adam optimizer over its parameters,
    and updates the policy with the REINFORCE rule.
    """

    def __init__(self, observation_dim: int, action_dim: int, learning_rate: float = 0.001):
        self.policy_network = SimplePolicyNetwork(observation_dim, action_dim)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)
        self.action_dim = action_dim

    def select_action(self, observation: torch.Tensor, deterministic: bool = False) -> int:
        """Select action based on current observation.

        Args:
            observation: A single, unbatched observation; a batch dimension
                is added before it is passed to the policy network.
            deterministic: If True, return the most probable action;
                otherwise sample from the policy's probabilities.

        Returns:
            The chosen action index.
        """
        # No gradients are needed just to choose an action.
        with torch.no_grad():
            action_probs = self.policy_network(observation.unsqueeze(0))
        if deterministic:
            return action_probs.argmax().item()
        return torch.multinomial(action_probs, 1).item()

    def update_policy(self, log_probs: list, rewards: list, gamma: float = 0.99):
        """Update policy using REINFORCE algorithm.

        Args:
            log_probs: Log-probabilities of the actions taken, one per step,
                still attached to the policy network's autograd graph.
            rewards: Rewards received, one per step.
            gamma: Discount factor applied to future rewards.

        An empty episode is a no-op.
        """
        if not log_probs:
            # Nothing to learn from; torch.stack would raise on an empty list.
            return

        # Discounted returns, accumulated back-to-front and then reversed
        # (avoids the quadratic cost of inserting at the list head).
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + gamma * running_return
            returns.append(running_return)
        returns.reverse()

        # Force a float dtype: integer rewards would otherwise produce an
        # int64 tensor, for which mean()/std() are undefined.
        returns = torch.tensor(returns, dtype=torch.float32)
        # Normalize returns (std of a single element is undefined, so skip it).
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # REINFORCE loss: minimize -log pi(a|s) * G for every step.
        step_losses = [-log_prob * ret for log_prob, ret in zip(log_probs, returns)]
        policy_loss = torch.stack(step_losses).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

def _run_episode(agent, env):
    """Roll the agent's current policy through ``env`` once.

    Returns:
        ``(log_probs, rewards)``: the log-probability of each action taken
        (attached to the autograd graph) and the reward received for it.
    """
    log_probs = []
    rewards = []
    observation = env.reset()
    done = False
    while not done:
        # One forward pass with gradients enabled serves both purposes:
        # sampling the action (from detached probabilities) and recording
        # the differentiable log-probability of that action.
        action_probs = agent.policy_network(observation.unsqueeze(0))
        action = torch.multinomial(action_probs.detach(), 1).item()

        next_observation, reward, done, _info = env.step(action)

        # The small epsilon keeps log() finite if a probability underflows to 0.
        log_probs.append(torch.log(action_probs[0, action] + 1e-8))
        rewards.append(reward)

        observation = next_observation
    return log_probs, rewards


def train_mdp_with_sequences(dataframe, num_epochs: int = 10, batch_size: int = 32,
                             sequence_length: int = 50):
    """Train MDP using sequences from TrueRandomSequenceDataset.

    Every sequence in every batch is treated as one episode, and the policy
    is updated once per episode.

    Args:
        dataframe: Source table; each column becomes one observation feature.
        num_epochs: Number of passes over the (randomly sampled) dataset.
        batch_size: Number of sequences fetched per DataLoader batch.
        sequence_length: Number of rows in each sampled sequence.

    Returns:
        The trained ``MDPAgent``.
    """
    # The dataset samples windows at random on every access, so shuffling
    # in the DataLoader would add nothing.
    dataset = TrueRandomSequenceDataset(dataframe, sequence_length=sequence_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Initialize agent
    observation_dim = len(dataframe.columns) + 2  # features + step + time_remaining
    action_dim = 3  # hold, buy, sell (example actions)
    agent = MDPAgent(observation_dim, action_dim)

    print("Training MDP:")
    print(f"  Sequence length: {sequence_length}")
    print(f"  Observation dim: {observation_dim}")
    print(f"  Action dim: {action_dim}")
    print(f"  Batch size: {batch_size}")
    print()

    for epoch in range(num_epochs):
        epoch_rewards = []  # per-step rewards across the whole epoch

        for batch_idx, batch in enumerate(dataloader):
            batch_rewards = []  # per-episode reward totals for this batch

            for sequence in batch:  # each sequence has shape (T, E)
                env = MDPEnvironment(sequence)
                log_probs, rewards = _run_episode(agent, env)

                # Update policy with this episode's data
                if log_probs:
                    agent.update_policy(log_probs, rewards)

                batch_rewards.append(sum(rewards))
                epoch_rewards.extend(rewards)

            # Print progress
            if batch_idx % 10 == 0:
                avg_batch_reward = np.mean(batch_rewards)
                print(f"Epoch {epoch+1}, Batch {batch_idx}: "
                      f"Avg Reward = {avg_batch_reward:.4f}")

            # Limit batches per epoch for demo
            if batch_idx >= 50:
                break

        avg_epoch_reward = np.mean(epoch_rewards)
        print(f"Epoch {epoch+1} completed. Average reward: {avg_epoch_reward:.4f}")

    return agent

# Example usage: build a synthetic price table and train an agent on it.
if __name__ == "__main__":
    # Fixed seed so the synthetic data is reproducible between runs.
    np.random.seed(42)
    timestamps = 1000
    price = 100.0
    prices = []

    # Multiplicative random walk: Gaussian noise plus a small upward drift.
    for i in range(timestamps):
        price_change = np.random.normal(0, 0.02) + 0.0001  # Small upward drift
        price = price * (1 + price_change)
        prices.append(price)

    # 'price' must stay the first column: the environment's default reward
    # reads column 0 as the price. Apart from 'ma_5', the remaining features
    # are random placeholders, not derived from the price series.
    df = pd.DataFrame({
        'price': prices,
        'volume': np.random.exponential(1000, timestamps),  # Trading volume
        # 5-period moving average; the first 4 rows have no full window and
        # are filled with the first price.
        'ma_5': pd.Series(prices).rolling(5).mean().fillna(prices[0]),
        'rsi': np.random.uniform(30, 70, timestamps),  # Relative Strength Index
        'volatility': np.abs(np.random.normal(0, 0.01, timestamps))  # Volatility measure
    })

    print("Sample data:")
    print(df.head())
    print(f"Data shape: {df.shape}")
    print()

    # Train MDP agent
    print("Starting MDP training...")
    trained_agent = train_mdp_with_sequences(df, num_epochs=3, batch_size=16)

    print("\nTraining completed!")
    print("Agent can now be used for inference on new sequences.")