In [11]:
import numpy as np
import pandas as pd
from pandas_datareader import data as pdr
import random
import seaborn as sns
from IPython.display import display
import matplotlib.pyplot as plt
# from matplotlib import animation
# from JSAnimation.IPython_display import display_animation
# import gym
import scipy.stats as stats
from scipy.stats import norm
from collections import namedtuple
import statistics
import time
import os
from collections import deque
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

import yfinance as yf
import datetime as dt

import pickle

In [12]:

class Environment:
    """Hedging environment replaying historical stock and option data.

    At ``reset`` the environment credits the cash account with the premium of
    ``num_sold_opt`` options and picks a random starting row. At every ``step``
    the agent chooses its *total* stock holdings; the environment rebalances,
    accrues interest on cash, advances one row and returns a reward and a PnL.

    Columns read by the code:
      * ``stock_data``: ``'Adj Close'`` (and ``'RiskFreeRate'`` when
        ``features_data`` is ``None``).
      * ``option_data``: ``'OptionPrice'``, ``'impliedVolatility'``, ``'strike'``.
      * ``features_data`` (optional): ``'RiskFreeRate'`` and ``'delta'``; every
        column is appended to the state.

    Args:
        stock_data: DataFrame of stock prices.
        option_data: DataFrame of option quotes.
        features_data: Optional DataFrame of extra state features, or ``None``.
        T: Horizon value used in ``dt = T / n_steps`` and in the reward/PnL
            compounding terms.
        n_steps: Number of steps per episode.
        num_sold_opt: Number of options sold at the start of the episode.
        kappa: Weight of the hedging-variance penalty (requires ``'delta'``).
        alpha: Transaction-cost coefficient; ``0`` disables costs.
        gamma: Discount factor used to rescale rewards.
        beta: Weight of the rolling PnL standard-deviation penalty.
        seed: Seed of the private RNG that picks episode start rows.
    """

    def __init__(self, stock_data, option_data, features_data, T, n_steps, num_sold_opt, kappa, alpha=0, gamma=0.99, beta=10, seed=42):
        self.stock_data = stock_data
        self.option_data = option_data
        self.features_data = features_data
        self.T = T
        self.n_steps = n_steps
        self.num_sold_opt = num_sold_opt
        self.kappa = kappa
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.dt = T / n_steps
        self.state = None
        self.idx_time = 0
        self.initial_idx_time = 0
        self.current_max_steps = 0  # Set properly by reset()
        self.min_holdings = -100    # Short selling up to 100 stocks
        self.max_holdings = 100     # Long position up to 100 stocks
        self.num_stk = 0            # Stocks currently held
        self.cash_balance = 0       # Cash account, funded in reset()

        # Rolling window of the 30 most recent PnL values
        self.pnl_history = deque(maxlen=30)

        # Private RNG so episode starts are reproducible and independent of
        # the global `random` state.
        self.seed = seed
        self.random_state = random.Random(self.seed)

        # Core state variables are T, S, num_stk, cash_balance; every feature
        # column adds one more entry.
        if self.features_data is not None:
            self.num_states = 4 + self.features_data.shape[1]
        else:
            self.num_states = 4

        self.num_actions = 1

    def reset(self):
        """Start a new episode at a random row and return the initial state.

        Returns:
            pd.Series: the initial state (see ``_get_state``).

        Raises:
            ValueError: if the data has fewer than ``n_steps + 1`` rows.
        """
        # Features may have fewer rows than stock_data (e.g. after dropping
        # NaNs), so they bound the episode when present.
        data_length = len(self.features_data) if self.features_data is not None else len(self.stock_data)
        last_start = data_length - self.n_steps - 1
        if last_start < 0:
            raise ValueError(
                f"Not enough data rows ({data_length}) for an episode of {self.n_steps} steps"
            )

        self.initial_idx_time = self.random_state.randint(0, last_start)
        self.idx_time = self.initial_idx_time
        self.current_max_steps = self.idx_time + self.n_steps

        # No stock position at the start; cash holds the option premium.
        self.num_stk = 0
        initial_option_price = self.option_data['OptionPrice'].iloc[self.idx_time]
        self.cash_balance = self.num_sold_opt * initial_option_price

        self.pnl_history.clear()

        self.state = self._get_state()
        return self.state

    def step(self, action):
        """Rebalance to ``action`` stocks and advance one time step.

        Args:
            action: Desired total stock holdings; clipped to
                ``[min_holdings, max_holdings]``.

        Returns:
            tuple: ``(next_state, reward, done, pnl)``. ``next_state`` is
            ``None`` when ``step`` is called after the episode has ended.

        Raises:
            RuntimeError: if ``reset`` has not been called yet.
            ValueError: if ``kappa > 0`` but no ``features_data`` (and hence no
                ``'delta'``) is available.
        """
        if self.state is None:
            raise RuntimeError("reset() must be called before step()")

        if self.idx_time >= self.n_steps + self.initial_idx_time:
            # Episode already finished: no next state, terminal reward/PnL.
            reward = self._calculate_reward(terminal=True)
            pnl_value = self._calculate_pnl(terminal=True)
            self.pnl_history.append(pnl_value)
            return None, reward, True, pnl_value

        # Fail before mutating anything: the variance penalty needs 'delta'.
        if self.features_data is None and self.kappa > 0:
            raise ValueError("kappa > 0 requires features_data with a 'delta' column")

        T0 = self.state['T']
        S0 = self.state['S']
        nS0 = self.num_stk  # Holdings before the trade

        # The action is the desired total holdings, subject to position limits.
        nS1 = np.clip(action, self.min_holdings, self.max_holdings)
        delta_nS = nS1 - nS0

        # Buying stock consumes cash, selling releases it.
        cash_flow_stocks = -delta_nS * S0

        transaction_cost = 0
        if self.alpha > 0:
            transaction_cost = self.__get_cost(S=S0, chg_nS=delta_nS)

        r = self._get_risk_free_rate()

        # Accrue interest first, then settle the trade and its cost.
        self.cash_balance *= np.exp(r * self.dt)
        self.cash_balance += cash_flow_stocks - transaction_cost

        self.num_stk = nS1

        # Move to the next time step
        self.idx_time += 1
        next_state = self._get_state()
        T1 = next_state['T']
        S1 = next_state['S']

        # option_data is indexed modulo its length; `idx - 1` relies on
        # negative indexing to wrap to the last row when idx == 0.
        idx = self.idx_time % len(self.option_data)
        vol = self.option_data['impliedVolatility'].iloc[idx]
        C0 = self.option_data['OptionPrice'].iloc[idx - 1]
        C1 = self.option_data['OptionPrice'].iloc[idx]

        if self.features_data is not None:
            delta0 = self.features_data['delta'].iloc[idx - 1]
            delta1 = self.features_data['delta'].iloc[idx]
        else:
            # kappa == 0 here, so delta never enters the reward; use a
            # neutral value that passes the NaN check.
            delta0 = delta1 = 0.0

        reward = self._calculate_reward(T0, T1, S0, S1, delta0, delta1, nS0, nS1, vol, action, C0, C1, terminal=False)
        done = (self.idx_time >= self.current_max_steps)
        pnl_value = self._calculate_pnl(T0, T1, S0, S1, delta0, delta1, nS0, nS1, vol, action, C0, C1, terminal=False)

        self.pnl_history.append(pnl_value)

        self.state = next_state
        # Hand out a copy so callers cannot mutate the internal state.
        return next_state.copy(), reward, done, pnl_value

    def _get_risk_free_rate(self):
        """Return the risk-free rate for the current time index.

        Reads ``'RiskFreeRate'`` from ``features_data`` when it is provided
        (supporting both flat and MultiIndex columns), otherwise from
        ``stock_data``.
        """
        if self.features_data is None:
            return self.stock_data['RiskFreeRate'].iloc[self.idx_time]

        # NOTE(review): this is a label lookup using stock_data's index, while
        # features are otherwise read positionally -- confirm both frames
        # share the same index.
        row_label = self.stock_data.index[self.idx_time]
        if isinstance(self.features_data.columns, pd.MultiIndex):
            return self.features_data.loc[row_label, ('RiskFreeRate', '')]
        return self.features_data.loc[row_label, 'RiskFreeRate']

    def _get_state(self):
        """Build the state for the current time index.

        Returns:
            pd.Series: entries ``'T'`` (steps remaining in the episode),
            ``'S'`` (current ``'Adj Close'``), ``'num_stk'`` (stocks held),
            ``'cash_balance'``, followed by every column of ``features_data``
            when it is provided.
        """
        state_dict = {
            'T': self.current_max_steps - self.idx_time,
            'S': self.stock_data['Adj Close'].iloc[self.idx_time],
            'num_stk': self.num_stk,
            'cash_balance': self.cash_balance,
        }

        if self.features_data is not None:
            features = self.features_data.iloc[self.idx_time]
            state_dict.update(features.to_dict())

        return pd.Series(state_dict)

    def _calculate_reward(
        self, T0=None, T1=None, S0=None, S1=None, delta0=None, delta1=None,
        nS0=None, nS1=None, vol=None, action=None, C0=None, C1=None, terminal=False
    ):
        """Compute the reward for one transition, or the terminal reward.

        The non-terminal reward is the step PnL (stock value change, minus the
        change in option value, minus compounded trade and transaction costs),
        reduced by the ``kappa`` variance penalty and the ``beta`` penalty on
        the standard deviation of recent PnL, then divided by ``gamma ** T0``.
        Returns ``0.0`` if any numeric input is NaN.
        """
        if terminal:
            # Interest earned on the initial premium over the horizon.
            initial_option_price = self.option_data['OptionPrice'].iloc[self.initial_idx_time]
            r = self.num_sold_opt * (np.exp(self._get_risk_free_rate() * self.T) - 1) * initial_option_price
            r = r / np.power(self.gamma, self.n_steps - self.T)
            return r

        # Bad data should not poison training: report it and give no reward.
        if any(np.isnan([S0, S1, delta0, delta1, nS0, nS1, vol, action, C0, C1])):
            print(f"NaN detected in reward calculation inputs: S0={S0}, S1={S1}, delta0={delta0}, delta1={delta1}, nS0={nS0}, nS1={nS1}, vol={vol}, action={action}, C0={C0}, C1={C1}")
            return 0.0

        r = self._get_risk_free_rate()
        # NOTE(review): self.T is the constructor horizon while T0 is a count
        # of remaining steps -- confirm the intended units of (self.T - T0).
        compounding = np.exp(r * (self.T - T0))

        reward = nS1 * S1 - nS0 * S0
        reward -= self.num_sold_opt * (C1 - C0)
        reward -= (nS1 - nS0) * S0 * compounding

        if self.alpha > 0:
            cost = self.__get_cost(S=S0, chg_nS=(nS1 - nS0))
            reward -= cost * compounding

        if self.kappa > 0:
            # Penalise the variance of the position's deviation from delta.
            var = vol * S0 * (nS1 - delta0)
            var = var**2 * self.dt
            reward -= self.kappa * var / 2

        # Penalise volatile recent PnL once there is enough history.
        if len(self.pnl_history) > 1:
            reward -= self.beta * np.std(self.pnl_history)

        return reward / np.power(self.gamma, T0)

    def _calculate_pnl(
        self, T0=None, T1=None, S0=None, S1=None, delta0=None, delta1=None,
        nS0=None, nS1=None, vol=None, action=None, C0=None, C1=None, terminal=False
    ):
        """Compute the PnL of one transition, or the terminal PnL.

        In terminal mode any argument left as ``None`` falls back to the
        environment's own state: ``S1`` to the current stock price, ``C0`` to
        the option price at the episode start, ``nS1`` to the current
        holdings, ``nS0`` to ``0`` and ``S0`` to ``S1`` (so the stock term
        becomes the liquidation value of the current holdings).
        """
        if terminal:
            if S1 is None:
                S1 = self.stock_data['Adj Close'].iloc[self.idx_time]
            if C0 is None:
                C0 = self.option_data['OptionPrice'].iloc[self.initial_idx_time]
            if nS1 is None:
                nS1 = self.num_stk
            if nS0 is None:
                nS0 = 0
            if S0 is None:
                S0 = S1

            strike_price = self.option_data['strike'].iloc[0]  # Assumes a single strike
            option_payoff = max(S1 - strike_price, 0)  # Call payoff at expiration

            # NOTE(review): cash_balance already contains the premium credited
            # in reset(), so with the default C0 the premium appears twice in
            # this sum -- confirm this is intended.
            option_pnl = self.num_sold_opt * (C0 - option_payoff)
            stock_pnl = nS1 * S1 - nS0 * S0
            return option_pnl + stock_pnl + self.cash_balance

        risk_free_rate = self._get_risk_free_rate()
        compounding = np.exp(risk_free_rate * (self.T - T0))

        pnl = nS1 * S1 - nS0 * S0                    # Change in stock value
        pnl -= self.num_sold_opt * (C1 - C0)         # Change in option value
        pnl -= (nS1 - nS0) * S0 * compounding        # Cost of the rebalancing trade

        if self.alpha > 0:
            cost = self.__get_cost(S=S0, chg_nS=(nS1 - nS0))
            pnl -= cost * compounding

        return pnl

    def __get_cost(self, S, chg_nS):
        """Transaction cost: linear plus quadratic in the traded quantity."""
        return self.alpha * S * (np.abs(chg_nS) + 0.01 * chg_nS**2)
