Course:        Introduction to Reinforcement Learning
Instructor:    Dr. Teddy Lazebnik

Students:      
               -Lior Vanono
               -Daniel Yesharim
               -Dima Levin  
               -Shiraz Hemo

Group number : 11      

In [1]:
# Import the libraries I need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from gymnasium import spaces
import warnings
warnings.filterwarnings('ignore')

# Try to import stable-baselines3, with fallback if not available
try:
    from stable_baselines3 import PPO, DQN, A2C
    STABLE_BASELINES_AVAILABLE = True
    print("stable-baselines3 imported successfully")
except ImportError:
    print(" stable-baselines3 not available. Using simplified RL implementation.")
    STABLE_BASELINES_AVAILABLE = False

# Load the data from CSV
df = pd.read_csv("BTCUSDT_data.csv")

# Convert date column to proper date format
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

print("Period:", df.index[0].strftime('%Y-%m-%d'), "to", df.index[-1].strftime('%Y-%m-%d'))
print("Number of days:", len(df))



stable-baselines3 imported successfully
Period: 2025-01-01 to 2025-06-16
Number of days: 167


In [2]:
# SECTION 1: ENHANCED STATE SPACE - מרחב מצבים מורחב
def calculate_advanced_indicators(df):
    """
    חישוב אינדיקטורים טכניים מתקדמים
    """
    # Moving Averages
    df['ma20'] = df['close'].rolling(20, min_periods=1).mean()
    df['ma50'] = df['close'].rolling(50, min_periods=1).mean()

    # RSI (Relative Strength Index)
    price_change = df['close'].diff()
    gains = price_change.where(price_change > 0, 0)
    losses = -price_change.where(price_change < 0, 0)
    avg_gains = gains.rolling(14, min_periods=1).mean()
    avg_losses = losses.rolling(14, min_periods=1).mean()
    rs = avg_gains / avg_losses
    df['rsi'] = 100 - (100 / (1 + rs))

    # MACD (Moving Average Convergence Divergence)
    exp1 = df['close'].ewm(span=12).mean()
    exp2 = df['close'].ewm(span=26).mean()
    df['macd'] = exp1 - exp2
    df['macd_signal'] = df['macd'].ewm(span=9).mean()
    df['macd_histogram'] = df['macd'] - df['macd_signal']

    # Bollinger Bands
    df['bb_middle'] = df['close'].rolling(20, min_periods=1).mean()
    bb_std = df['close'].rolling(20, min_periods=1).std()
    df['bb_upper'] = df['bb_middle'] + (bb_std * 2)
    df['bb_lower'] = df['bb_middle'] - (bb_std * 2)
    df['bb_position'] = (df['close'] - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'])

    # Volatility (20-day rolling standard deviation)
    df['volatility'] = df['close'].pct_change().rolling(20, min_periods=1).std()

    # Volume indicators (if volume data exists)
    if 'volume' in df.columns:
        df['volume_ma'] = df['volume'].rolling(20, min_periods=1).mean()
        df['volume_ratio'] = df['volume'] / df['volume_ma']
    else:
        # Create synthetic volume data for demonstration
        df['volume'] = np.random.lognormal(mean=10, sigma=0.5, size=len(df))
        df['volume_ma'] = df['volume'].rolling(20, min_periods=1).mean()
        df['volume_ratio'] = df['volume'] / df['volume_ma']

    # Price momentum
    df['momentum_5'] = df['close'].pct_change(5)
    df['momentum_10'] = df['close'].pct_change(10)

    # Support/Resistance levels (simplified)
    df['support'] = df['low'].rolling(20, min_periods=1).min()
    df['resistance'] = df['high'].rolling(20, min_periods=1).max()
    df['support_distance'] = (df['close'] - df['support']) / df['close']
    df['resistance_distance'] = (df['resistance'] - df['close']) / df['close']

    return df



In [3]:
# Apply advanced indicators
df = calculate_advanced_indicators(df)

print("✅ Enhanced technical indicators calculated")
print("New features:", ['ma20', 'ma50', 'rsi', 'macd', 'bb_position', 'volatility', 'volume_ratio', 'momentum_5', 'momentum_10'])

# SECTION 2: ENHANCED ACTION SPACE - מרחב פעולות מורחב
class EnhancedBTCTradingEnv(gym.Env):
    """
    Enhanced Bitcoin trading environment with multiple actions and advanced features
    Actions: 0=Wait, 1=Buy_25%, 2=Buy_50%, 3=Buy_100%, 4=Sell_25%, 5=Sell_50%, 6=Sell_100%
    """

    def __init__(self, data, initial_balance=10000):
        super().__init__()
        self.data = data.reset_index(drop=True)
        self.initial_balance = initial_balance

        # ENHANCED ACTION SPACE - 7 possible actions
        self.action_space = spaces.Discrete(7)
        self.action_names = {
            0: "Wait", 1: "Buy_25%", 2: "Buy_50%", 3: "Buy_100%",
            4: "Sell_25%", 5: "Sell_50%", 6: "Sell_100%"
        }

        # ENHANCED OBSERVATION SPACE - 15 features
        self.observation_space = spaces.Box(
            low=-10, high=10, shape=(15,), dtype=np.float32
        )

        print(f"Enhanced trading environment created with {len(self.data)} days")
        print(f"Action space: {len(self.action_names)} actions")
        print(f"Observation space: {self.observation_space.shape[0]} features")

        self.reset()

    def reset(self, seed=None):
        """Reset environment to initial state"""
        self.current_step = 50  # Start after enough data for indicators
        self.balance = self.initial_balance
        self.btc_held = 0
        self.total_trades = 0
        self.winning_trades = 0
        self.trade_history = []
        self.portfolio_values = []
        self.max_portfolio_value = self.initial_balance
        self.drawdown_history = []

        return self._get_observation(), {}

    def _get_observation(self):
        """
        Get enhanced observation with 15 features
        """
        if self.current_step >= len(self.data):
            return np.zeros(15, dtype=np.float32)

        row = self.data.iloc[self.current_step]

        # Technical indicators (normalized)
        features = [
            # 1-2: Moving average relationships
            (row['close'] - row['ma20']) / row['close'],
            (row['ma20'] - row['ma50']) / row['ma20'],

            # 3: RSI (normalized to -1 to 1)
            (row['rsi'] - 50) / 50,

            # 4-5: MACD signals
            row['macd'] / row['close'] * 100,
            row['macd_histogram'] / row['close'] * 100,

            # 6: Bollinger Bands position
            row['bb_position'] * 2 - 1,  # Convert 0-1 to -1 to 1

            # 7: Volatility (normalized)
            min(row['volatility'] * 10, 2),  # Cap at 2

            # 8: Volume ratio
            np.log(row['volume_ratio']) if row['volume_ratio'] > 0 else 0,

            # 9-10: Momentum indicators
            row['momentum_5'] * 5,  # 5-day momentum
            row['momentum_10'] * 3,  # 10-day momentum

            # 11-12: Support/Resistance distances
            row['support_distance'] * 10,
            row['resistance_distance'] * 10,

            # 13-15: Portfolio state
            self.btc_held / 1000,  # BTC position (normalized)
            self.balance / self.initial_balance - 1,  # Cash ratio change
            self._get_current_portfolio_value() / self.initial_balance - 1  # Total return
        ]

        # Convert to numpy array and handle NaN values
        obs = np.array(features, dtype=np.float32)
        obs = np.nan_to_num(obs, nan=0.0, posinf=2.0, neginf=-2.0)
        obs = np.clip(obs, -10, 10)  # Clip extreme values

        return obs

    def _get_current_portfolio_value(self):
        """Calculate total portfolio value"""
        if self.current_step >= len(self.data):
            return self.balance

        current_price = self.data.iloc[self.current_step]['close']
        return self.balance + (self.btc_held * current_price)

    # SECTION 3: ADVANCED REWARD MECHANISM - מנגנון תגמולים מתקדם
    def _calculate_advanced_reward(self, action, old_portfolio_value, new_portfolio_value):
        """
        Advanced reward calculation considering multiple factors
        """
        # Base return reward
        portfolio_return = (new_portfolio_value - old_portfolio_value) / old_portfolio_value
        base_reward = portfolio_return * 100

        # Risk-adjusted reward components
        current_drawdown = (self.max_portfolio_value - new_portfolio_value) / self.max_portfolio_value

        # Update max portfolio value
        if new_portfolio_value > self.max_portfolio_value:
            self.max_portfolio_value = new_portfolio_value
            drawdown_penalty = 0
        else:
            # Penalty for drawdown
            drawdown_penalty = -current_drawdown * 5

        # Volatility penalty (encourage stable growth)
        if len(self.portfolio_values) > 10:
            recent_returns = np.diff(self.portfolio_values[-10:]) / self.portfolio_values[-11:-1]
            volatility = np.std(recent_returns)
            volatility_penalty = -volatility * 10
        else:
            volatility_penalty = 0

        # Action-specific rewards
        action_reward = 0
        if action == 0:  # Wait
            action_reward = 0.001  # Small reward for patience
        elif action in [1, 2, 3]:  # Buy actions
            # Reward buying in oversold conditions
            current_rsi = self.data.iloc[self.current_step]['rsi']
            if current_rsi < 40:
                action_reward = 0.01
            elif current_rsi > 60:
                action_reward = -0.01
        elif action in [4, 5, 6]:  # Sell actions
            # Reward selling in overbought conditions
            current_rsi = self.data.iloc[self.current_step]['rsi']
            if current_rsi > 60:
                action_reward = 0.01
            elif current_rsi < 40:
                action_reward = -0.01

        # Trade frequency penalty (discourage overtrading)
        if len(self.trade_history) > 0:
            recent_trades = sum(1 for t in self.trade_history[-10:] if t['step'] > self.current_step - 10)
            if recent_trades > 3:
                overtrading_penalty = -0.02 * recent_trades
            else:
                overtrading_penalty = 0
        else:
            overtrading_penalty = 0

        # Combine all reward components
        total_reward = (base_reward +
                       drawdown_penalty +
                       volatility_penalty +
                       action_reward +
                       overtrading_penalty)

        return total_reward, {
            'base_reward': base_reward,
            'drawdown_penalty': drawdown_penalty,
            'volatility_penalty': volatility_penalty,
            'action_reward': action_reward,
            'overtrading_penalty': overtrading_penalty,
            'current_drawdown': current_drawdown
        }

    def step(self, action):
        """Execute trading action with enhanced logic"""
        if self.current_step >= len(self.data) - 1:
            return self._get_observation(), 0, True, False, self._get_info()

        # Get current state
        current_price = self.data.iloc[self.current_step]['close']
        old_portfolio_value = self._get_current_portfolio_value()

        # Execute action
        executed = self._execute_action(action, current_price)

        # Calculate new portfolio value
        new_portfolio_value = self._get_current_portfolio_value()
        self.portfolio_values.append(new_portfolio_value)

        # Calculate advanced reward
        reward, reward_components = self._calculate_advanced_reward(
            action, old_portfolio_value, new_portfolio_value
        )

        # Move to next step
        self.current_step += 1
        done = self.current_step >= len(self.data) - 1

        # Update info
        info = self._get_info()
        info.update(reward_components)
        info['action_executed'] = executed
        info['action_name'] = self.action_names[action]

        return self._get_observation(), reward, done, False, info

    def _execute_action(self, action, current_price):
        """Execute the trading action"""
        executed = False

        if action == 0:  # Wait
            executed = True

        elif action in [1, 2, 3]:  # Buy actions
            buy_percentages = {1: 0.25, 2: 0.50, 3: 1.0}
            buy_amount = self.balance * buy_percentages[action]

            if buy_amount > 10:  # Minimum trade size
                btc_to_buy = buy_amount / current_price
                self.btc_held += btc_to_buy
                self.balance -= buy_amount
                self.total_trades += 1
                executed = True

                self.trade_history.append({
                    'step': self.current_step,
                    'action': 'BUY',
                    'amount': btc_to_buy,
                    'price': current_price,
                    'percentage': buy_percentages[action]
                })

        elif action in [4, 5, 6]:  # Sell actions
            sell_percentages = {4: 0.25, 5: 0.50, 6: 1.0}
            btc_to_sell = self.btc_held * sell_percentages[action]

            if btc_to_sell > 0.001:  # Minimum BTC to sell
                sell_amount = btc_to_sell * current_price
                self.btc_held -= btc_to_sell
                self.balance += sell_amount
                self.total_trades += 1
                executed = True

                self.trade_history.append({
                    'step': self.current_step,
                    'action': 'SELL',
                    'amount': btc_to_sell,
                    'price': current_price,
                    'percentage': sell_percentages[action]
                })

        return executed

    def _get_info(self):
        """Get comprehensive trading statistics"""
        current_portfolio_value = self._get_current_portfolio_value()
        total_return = (current_portfolio_value / self.initial_balance - 1) * 100

        # Calculate Sharpe ratio (simplified)
        if len(self.portfolio_values) > 1:
            returns = np.diff(self.portfolio_values) / self.portfolio_values[:-1]
            sharpe_ratio = np.mean(returns) / (np.std(returns) + 1e-8) * np.sqrt(252)
        else:
            sharpe_ratio = 0

        # Calculate maximum drawdown
        if self.portfolio_values:
            peak = np.maximum.accumulate(self.portfolio_values)
            drawdown = (peak - self.portfolio_values) / peak
            max_drawdown = np.max(drawdown) if len(drawdown) > 0 else 0
        else:
            max_drawdown = 0

        return {
            'portfolio_value': current_portfolio_value,
            'total_return': total_return,
            'balance': self.balance,
            'btc_held': self.btc_held,
            'total_trades': self.total_trades,
            'current_step': self.current_step,
            'sharpe_ratio': sharpe_ratio,
            'max_drawdown': max_drawdown,
            'trade_history_length': len(self.trade_history)
        }



✅ Enhanced technical indicators calculated
New features: ['ma20', 'ma50', 'rsi', 'macd', 'bb_position', 'volatility', 'volume_ratio', 'momentum_5', 'momentum_10']


In [4]:
# Create and test the enhanced environment
print("\n=== Creating Enhanced Trading Environment ===")
enhanced_env = EnhancedBTCTradingEnv(df)

# Test the enhanced environment
obs, info = enhanced_env.reset()
print(f"Initial observation shape: {obs.shape}")
print(f"Initial portfolio value: ${info['portfolio_value']:.2f}")

# Test a few actions
test_actions = [0, 1, 4, 2, 5, 0]  # Wait, Buy25%, Sell25%, Buy50%, Sell50%, Wait
print(f"\nTesting enhanced environment:")

for i, action in enumerate(test_actions):
    obs, reward, done, _, info = enhanced_env.step(action)
    print(f"Step {i+1}: {enhanced_env.action_names[action]:>8} -> "
          f"Reward: {reward:>7.3f}, Portfolio: ${info['portfolio_value']:>8.0f}, "
          f"BTC: {info['btc_held']:>6.3f}")

    if done:
        break

print(f"\n✅ Enhanced environment ready with:")
print(f"   • {enhanced_env.observation_space.shape[0]} state features")
print(f"   • {enhanced_env.action_space.n} possible actions")
print(f"   • Advanced reward system with risk management")
print(f"   • Portfolio tracking and statistics")




=== Creating Enhanced Trading Environment ===
Enhanced trading environment created with 167 days
Action space: 7 actions
Observation space: 15 features
Initial observation shape: (15,)


KeyError: 'portfolio_value'