In [1]:
import os

# The next cell imports the `src` package, so the working directory must be the
# folder that contains it. Step up only when `src` is not already visible:
# an unconditional chdir('..') climbs one more level every time this cell is
# re-run and silently breaks the imports.
if not os.path.isdir('src'):
    os.chdir('..')

In [2]:
import torch
from src.agent import *
from src.agent_interface import *
from src.prior_measure import *
from src.robust import *

In [3]:
# Seed the global RNG first so every random tensor drawn later in the notebook
# is reproducible under Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Prefer the GPU when one is present, but fall back to the CPU so the notebook
# still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training Controller

In [4]:
# Scheduling hyper-parameters handed to the controller below.
train_steps, clone_steps = 5, 50
batch_size, n_updates = 32, 10

training_controller = TrainingController(
    train_steps=train_steps,
    clone_steps=clone_steps,
    batch_size=batch_size,
    n_batches=n_updates,  # the controller's keyword for the update count
)

In [5]:
# Query the controller with a tiny and a very large sample count.
has_too_few = training_controller.has_samples(1)
print('Insufficient Sample Case:', has_too_few)

has_plenty = training_controller.has_samples(1e6)
print('Sufficient Sample Case:', has_plenty)

Insufficient Sample Case: False
Sufficient Sample Case: True


In [6]:
# Ask both schedules before any step has been taken.
train_due = training_controller.should_train()
print('Training Needed Case:', train_due)

clone_due = training_controller.should_clone_q()
print('Cloning Needed Case:', clone_due)

Training Needed Case: True
Cloning Needed Case: True


In [7]:
# Advance the controller by one step, then query both schedules again.
training_controller.step_increment()

train_due = training_controller.should_train()
print('Training Not Needed Case:', train_due)

clone_due = training_controller.should_clone_q()
print('Cloning Not Needed Case:', clone_due)

Training Not Needed Case: False
Cloning Not Needed Case: False


# Prior Distribution

In [8]:
# Build the prior on the selected device and draw `batch_size` rows from its
# support.
prior_measure = PriorStudentDistribution(device=device)
support = prior_measure.sample_from_support(batch_size)
# Bare last expression so the notebook renders the tensor.
support

tensor([[-1.4997, -1.0601, -0.8653,  ...,  0.8653,  1.0601,  1.4997],
        [-1.4997, -1.0601, -0.8653,  ...,  0.8653,  1.0601,  1.4997],
        [-1.4997, -1.0601, -0.8653,  ...,  0.8653,  1.0601,  1.4997],
        ...,
        [-1.4997, -1.0601, -0.8653,  ...,  0.8653,  1.0601,  1.4997],
        [-1.4997, -1.0601, -0.8653,  ...,  0.8653,  1.0601,  1.4997],
        [-1.4997, -1.0601, -0.8653,  ...,  0.8653,  1.0601,  1.4997]],
       device='cuda:0')

In [9]:
# Show which device type the sampled support tensor lives on.
support.device.type

'cuda'

# Duality HQ Operator

In [10]:
# Operator settings; the names are kept as globals because the agent section
# builds a second operator from them.
discount_rate, delta, sinkhorn_radius = 0.99, 1e-4, 0.003

duality_operator = DualityHQOperator(
    discount_rate=discount_rate,
    delta=delta,
    sinkhorn_radius=sinkhorn_radius,
)

In [11]:
# Example inputs
# NOTE: batch_size is rebound here (it was 32 for the first training
# controller); every later cell that reads batch_size sees 4.
batch_size, n_samples = 4, 5

# Random stand-in tensors (created on the default device, not on `device`).
reference_r = torch.rand(batch_size, 1)                  # reference returns, (batch_size, 1)
prior_r = torch.rand(batch_size, n_samples)              # prior returns, (batch_size, n_samples)
q_max = torch.rand(batch_size, n_samples)                # maximum Q-values, (batch_size, n_samples)
not_terminal = torch.ones(batch_size, dtype=torch.bool)  # all True, 1-D: (batch_size,)
lamda = torch.rand(batch_size)                           # lambda values, 1-D: (batch_size,)
cost = torch.rand(batch_size, n_samples)                 # random placeholder cost, (batch_size, n_samples)

# Compute cost
cost_output = duality_operator.compute_cost(reference_r, prior_r)
print("Cost Output:", cost_output)

Cost Output: tensor([[0.4561, 0.0919, 0.1363, 0.3344, 0.4668],
        [0.5399, 0.6378, 0.7873, 0.5294, 0.1902],
        [0.3020, 0.1753, 0.7027, 0.4825, 0.3270],
        [0.1787, 0.2273, 0.5994, 0.0040, 0.6672]])


In [12]:
# Compute cij
# NOTE(review): this passes the random placeholder `cost`, not the
# `cost_output` computed in the previous cell -- confirm that is intended.
cij_output = duality_operator.compute_cij(
    prior_r,
    q_max,
    not_terminal,
    lamda,
    cost,
)
print("Cij Output:", cij_output)

Cij Output: tensor([[ -18368.5254,  -20886.2168,  -16897.5938,  -10696.3711,  -23196.3008],
        [ -61874.2695,  -93767.9609,  -40809.2148,  -97187.5469, -140172.1094],
        [ -12975.9590,  -17403.3887,  -19746.6836,  -18188.1230,  -14593.5625],
        [ -46192.0000,  -36694.7344,  -53319.6484,  -69326.7266,  -18421.0156]])


In [13]:
# Compute inner expectation
# In the recorded run this turns the (batch_size, n_samples) cij tensor into
# one value per batch row.
inner_exp_output = duality_operator.inner_expectation(cij_output)
print("Inner Expectation Output:", inner_exp_output)

Inner Expectation Output: tensor([-10697.9805, -40810.8242, -12977.5684, -18422.6250])


In [14]:
# Update Sinkhorn radius
# Uses the cost computed by compute_cost above.
# NOTE(review): every value in the recorded output is negative, which is
# surprising for something named a radius -- confirm against the operator.
epsilon_bar = duality_operator.update_sinkhorn_radius(cost_output)
print("Updated Sinkhorn Radius (epsilon_bar):", epsilon_bar)

Updated Sinkhorn Radius (epsilon_bar): tensor([-0.0890, -0.1874, -0.1724, -0.0011])


In [15]:
# Compute HQ value
# Softplus of lambda. The built-in is used instead of log1p(exp(x)) because the
# hand-rolled form overflows to inf for large inputs. `lamda` is created 1-D
# (one value per batch row), so no squeeze is needed; squeezing would collapse
# it to a 0-d tensor if batch_size were 1.
lamda_plus = torch.nn.functional.softplus(lamda)
hq_value = duality_operator.hq_value(lamda_plus, inner_exp_output)
print("HQ Value:", hq_value)

HQ Value: tensor([1.3234, 3.0770, 1.4768, 1.5075])


# Optimize Lambda

In [16]:
# Settings forwarded unchanged to hq_opt_with_nn.
lr, max_iter = 0.1, 100
step_size, gamma = 10, 0.9

# One random starting lambda per batch row, and a mask with every row True.
lamda_from_buffer = torch.rand(batch_size, dtype=torch.float32)
lambda_mask = torch.ones(batch_size, dtype=torch.bool)

# Note: `hq_value` is rebound here, replacing the value from the previous cell.
hq_value, lamda_star, n_iter = hq_opt_with_nn(
    duality_operator=duality_operator,
    reference_r=reference_r, prior_r=prior_r, q_max=q_max,
    not_terminal=not_terminal,
    lamda_from_buffer=lamda_from_buffer, lambda_mask=lambda_mask,
    lr=lr, max_iter=max_iter, step_size=step_size, gamma=gamma,
)


In [17]:
# Report the three values returned by the optimisation, one per line.
for label, value in (
    ("HQ Value:", hq_value),
    ("Optimized Lambda (lamda_star):", lamda_star),
    ("Number of Iterations:", n_iter),
):
    print(label, value)

HQ Value: tensor([ 2.3425,  1.6139, -6.3124,  1.4024])
Optimized Lambda (lamda_star): tensor([ 0.1986,  0.7429, -0.0998,  0.2153])
Number of Iterations: 2


# Replay Buffer

In [18]:
state_dim = 63
action_dim = 1  # Restricted to single action using epsilon greedy
buffer = ReplayBuffer(
    state_dim,
    action_dim,
    batch_size,
    max_len=1000,
    device=device,
)

# Add data to the buffer: one random transition per batch row, every tensor
# created directly on `device`.
state = torch.rand(batch_size, state_dim, device=device)
action = torch.rand(batch_size, 1, device=device)
reward = torch.rand(batch_size, 1, device=device)
next_state = torch.rand(batch_size, state_dim, device=device)
terminal_state = torch.zeros(batch_size, 1, dtype=torch.bool, device=device)  # all False
lambda_val = torch.rand(batch_size, 1, device=device)
risk_free_rate = torch.rand(batch_size, 1, device=device)
transaction_cost = torch.rand(batch_size, 1, device=device)

buffer.add(
    state, action, reward, next_state,
    terminal_state, lambda_val, risk_free_rate, transaction_cost,
)

# Sample from the buffer and list the shape of every returned tensor.
sampled_data = buffer.sample()
print("Sampled Data Shapes:")
for data in sampled_data:
    print(data.shape)

Sampled Data Shapes:
torch.Size([4, 63])
torch.Size([4, 1])
torch.Size([4])
torch.Size([4, 63])
torch.Size([4])
torch.Size([4])
torch.Size([4])
torch.Size([4])
torch.Size([4])


# Agent

In [None]:
def generate_financial_samples(batch_size:int, device:"torch.device | str"='cpu', generator:"torch.Generator | None"=None):
    """
    Generate synthetic financial samples aligned with asset returns.

    State structure (63-dim):
    - [0:60]: Past 60 returns
    - [60]: Log portfolio value
    - [61]: Current position (portfolio weight)
    - [62]: dt (time delta)

    Inputs:
        batch_size: Number of samples
        device: Torch device, given as a torch.device or a device string
        generator: Optional torch.Generator used for every random draw so the
            batch can be reproduced. It must live on `device`. None (default)
            draws from the global RNG.

    Outputs:
        Dict with keys: state, action, reward, next_state, terminal_state,
                        lambda_val, risk_free_rate, transaction_cost.
        state and next_state are (batch_size, 63); every other entry is
        (batch_size, 1). terminal_state is boolean.
    """
    n_returns = 60
    daily_dt = 1.0 / 252

    def _returns(n_cols):
        # Normal draws with mean ~0.05% and std ~1%.
        noise = torch.randn((batch_size, n_cols), device=device, generator=generator)
        return noise * 0.01 + 0.0005

    def _uniform_column():
        # One uniform [0, 1) value per row.
        return torch.rand((batch_size, 1), device=device, generator=generator)

    def _constant_column(value):
        # One identical value per row.
        return torch.full((batch_size, 1), value, device=device)

    # NOTE: the order of the random draws below is kept fixed so that a given
    # RNG state always produces the same batch.

    # Past 60 returns (mean ~0.05%, std ~1%)
    past_returns = _returns(n_returns)

    # Log portfolio value (centred on 0, std 0.1)
    log_portfolio_value = torch.randn((batch_size, 1), device=device, generator=generator) * 0.1

    # Current position/portfolio weight in [0, 1)
    current_position = _uniform_column()

    # dt: time delta (daily = 1/252)
    dt = _constant_column(daily_dt)

    # State: concatenate all components -> (batch_size, 63)
    state = torch.cat([past_returns, log_portfolio_value, current_position, dt], dim=1)

    # Action: portfolio weight in [0, 1) - shape (batch_size, 1)
    action = _uniform_column()

    # Reward: same distribution as a single return - shape (batch_size, 1)
    reward = _returns(1)

    # Next state: fresh return window, portfolio value moved by the reward,
    # position replaced by the action, same dt. torch.cat copies its inputs,
    # so `dt` can be reused here.
    next_returns = _returns(n_returns)
    next_log_portfolio_value = log_portfolio_value + reward
    next_state = torch.cat([next_returns, next_log_portfolio_value, action, dt], dim=1)

    # Terminal: mostly False, ~5% True - shape (batch_size, 1), dtype bool
    terminal_prob = _constant_column(0.05)
    terminal_state = torch.bernoulli(terminal_prob, generator=generator).bool()

    # Lambda: initialized to 1.0 - shape (batch_size, 1)
    lambda_val = torch.ones((batch_size, 1), device=device)

    # Risk-free rate: constant 4% annual (0.04/252 per day) - shape (batch_size, 1)
    risk_free_rate = _constant_column(0.04 / 252)

    # Transaction cost: constant 0.2% (0.002) - shape (batch_size, 1)
    transaction_cost = _constant_column(0.002)

    return {
        'state': state,                    # (batch_size, 63)
        'action': action,                  # (batch_size, 1)
        'reward': reward,                  # (batch_size, 1)
        'next_state': next_state,          # (batch_size, 63)
        'terminal_state': terminal_state,  # (batch_size, 1)
        'lambda_val': lambda_val,          # (batch_size, 1)
        'risk_free_rate': risk_free_rate,  # (batch_size, 1)
        'transaction_cost': transaction_cost  # (batch_size, 1)
    }

In [20]:
# Start from a fresh, empty buffer.
buffer = ReplayBuffer(state_dim, action_dim, batch_size, max_len=1000, device=device)

# Generate as many rows as the buffer's batch size instead of a hard-coded 4,
# so the two cannot drift apart. The dict gets its own name because
# `sampled_data` is used for the tuple returned by buffer.sample().
generated_samples = generate_financial_samples(batch_size=batch_size, device=device)

# The dict keys are passed as keyword arguments to buffer.add.
buffer.add(**generated_samples)
print("Sampled Data Shapes:")
for data in generated_samples.values():
    print(data.shape)

Sampled Data Shapes:
torch.Size([4, 63])
torch.Size([4, 1])
torch.Size([4, 1])
torch.Size([4, 63])
torch.Size([4, 1])
torch.Size([4, 1])
torch.Size([4, 1])
torch.Size([4, 1])


In [21]:
# Sample from the buffer
# `sampled_data` is rebound to whatever buffer.sample() returns; the recorded
# output lists nine tensors, one more than the eight passed to buffer.add.
sampled_data = buffer.sample()
print("Sampled Data Shapes:")
for data in sampled_data:
    print(data.shape)

Sampled Data Shapes:
torch.Size([4, 63])
torch.Size([4, 1])
torch.Size([4])
torch.Size([4, 63])
torch.Size([4])
torch.Size([4])
torch.Size([4])
torch.Size([4])
torch.Size([4])


In [22]:
epsilon, lamda_init = 0.1, 1.0

# Fresh collaborators for the agent. batch_size is 4 at this point (it was
# rebound in the duality-operator example), not the 32 used for the first
# controller.
training_controller = TrainingController(
    train_steps=train_steps,
    clone_steps=clone_steps,
    batch_size=batch_size,
    n_batches=n_updates,
)
prior_measure = PriorStudentDistribution(device=device)
duality_operator = DualityHQOperator(
    discount_rate=discount_rate,
    delta=delta,
    sinkhorn_radius=sinkhorn_radius,
)

agent = PORDQN(
    state_dim=state_dim, action_dim=action_dim,
    batch_size=batch_size, n_updates=n_updates,
    training_controller=training_controller,
    prior_measure=prior_measure,
    duality_operator=duality_operator,
    epsilon=epsilon, lamda_init=lamda_init,
    device=device,
)

In [23]:
# Random observations, one row per batch element. They are created on the
# default device, not explicitly on `device`.
observation = torch.rand(batch_size, state_dim)

actions = agent.get_action(observation)
print("Actions Shape:", actions.shape)

Actions Shape: torch.Size([4, 1])


In [24]:
# Unpack the tensors returned by buffer.sample() as positional arguments.
agent.train_batch(*sampled_data)
# Reached only if train_batch returned without raising.
print("Training batch completed successfully.")

Training batch completed successfully.
