# Case 1: Risky Asset always better
Condition: a > r & b > r

In [4]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# ---- Experiment configuration (Case 1) ----
# Upper bound on the number of training epochs.
EPOCHS = 20000
# Number of time periods.
T = 10
# Starting value of epsilon for the epsilon-greedy policy.
INIT_EPSILON = 0.3
# Starting value of the learning rate alpha.
INIT_ALPHA = 0.01

# The state is the time index itself (0..T inclusive), following
# section 8.4 of Rao and Jelvis.
states = np.arange(T + 1)

# Action grid; the Q-table printed below has columns 0.00, 0.25, ..., 1.00.
actions = ActionSpace(Decimal("0"), Decimal("1.0"), Decimal("0.25"))

# Policy object holding the Q-table that training fills in.
policy = Policy(
    epochs=EPOCHS, init_epsilon=INIT_EPSILON, init_alpha=INIT_ALPHA,
    state_space=states, action_space=actions,
    training=True,
)

# Assets for this case: per the condition above, both risky outcomes
# (0.15 and 0.1) exceed the risk-free rate of 0.04.
risky_asset = RiskyAsset(0.15, 0.1, 0.5)
risk_free_asset = RiskFreeAsset(0.04)

# CARA Coefficient
CARA_COEF = 1


# Run training. Note this cell uses an early-stopping threshold of 2e-4.
train(
    epochs=EPOCHS, T=T,
    risky_asset=risky_asset, risk_free_asset=risk_free_asset,
    policy=policy, reward_eval=reward_eval,
    cara_coef=CARA_COEF, early_stopping=2e-4,
)

# Show the learned Q-table, then the policy derived from it.
display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 82/20000 [00:00<01:08, 292.10it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


 10%|█         | 2049/20000 [00:06<00:52, 344.46it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.001567.


 20%|██        | 4053/20000 [00:12<00:44, 358.76it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.000540.


 30%|███       | 6069/20000 [00:17<00:39, 352.71it/s]

Epoch: 6000
Alpha: 0.007000, Epsilon: 0.191982.
Last 10 max delta mean: 0.000451.


 39%|███▉      | 7758/20000 [00:22<00:36, 338.71it/s]

Converged. Total Epochs: 7758.
Last 10 max delta mean: 0.000196.





Unnamed: 0,0.00,0.25,0.50,0.75,1.00
0,-1.34698,-1.340056,-1.334005,-1.327652,-1.311897
1,-1.131985,-1.127873,-1.125567,-1.117617,-1.09578
2,-0.93909,-0.933971,-0.927258,-0.918816,-0.8983
3,-0.758378,-0.753505,-0.744548,-0.742859,-0.721444
4,-0.597816,-0.59519,-0.585731,-0.584677,-0.562957
5,-0.458126,-0.456017,-0.448151,-0.438482,-0.426213
6,-0.337991,-0.333224,-0.326777,-0.324532,-0.308205
7,-0.231384,-0.228028,-0.220714,-0.21994,-0.207718
8,-0.145932,-0.141086,-0.13498,-0.13204,-0.124055
9,-0.073816,-0.069462,-0.064102,-0.059995,-0.055133


Optimal policy:
0     1.00
1     1.00
2     1.00
3     1.00
4     1.00
5     1.00
6     1.00
7     1.00
8     1.00
9     1.00
10    0.00
dtype: object


# Case 2: Risk-Free Asset always better
Condition: a < r & b < r

In [5]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# ---- Experiment configuration (Case 2) ----
# Upper bound on the number of training epochs.
EPOCHS = 20000
# Number of time periods.
T = 10
# Starting value of epsilon for the epsilon-greedy policy.
INIT_EPSILON = 0.3
# Starting value of the learning rate alpha.
INIT_ALPHA = 0.01

# The state is the time index itself (0..T inclusive), following
# section 8.4 of Rao and Jelvis.
states = np.arange(T + 1)

# Action grid; the Q-table printed below has columns 0.00, 0.25, ..., 1.00.
actions = ActionSpace(Decimal("0"), Decimal("1.0"), Decimal("0.25"))

# Policy object holding the Q-table that training fills in.
policy = Policy(
    epochs=EPOCHS, init_epsilon=INIT_EPSILON, init_alpha=INIT_ALPHA,
    state_space=states, action_space=actions,
    training=True,
)

# Assets for this case: per the condition above, both risky outcomes
# (-0.04 and -0.10) are below the risk-free rate of 0.04.
risky_asset = RiskyAsset(-0.04, -0.10, 0.5)
risk_free_asset = RiskFreeAsset(0.04)

# CARA Coefficient
CARA_COEF = 1


# Run training with an early-stopping threshold of 1e-4.
train(
    epochs=EPOCHS, T=T,
    risky_asset=risky_asset, risk_free_asset=risk_free_asset,
    policy=policy, reward_eval=reward_eval,
    cara_coef=CARA_COEF, early_stopping=1e-4,
)

# Show the learned Q-table, then the policy derived from it.
display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 53/20000 [00:00<01:18, 255.58it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


 10%|█         | 2052/20000 [00:06<00:53, 335.26it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.002694.


 20%|██        | 4057/20000 [00:11<00:49, 322.24it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.001458.


 30%|███       | 6065/20000 [00:17<00:40, 347.82it/s]

Epoch: 6000
Alpha: 0.007000, Epsilon: 0.191982.
Last 10 max delta mean: 0.001237.


 40%|████      | 8061/20000 [00:23<00:34, 347.32it/s]

Epoch: 8000
Alpha: 0.006000, Epsilon: 0.155982.
Last 10 max delta mean: 0.000528.


 50%|█████     | 10067/20000 [00:29<00:29, 338.39it/s]

Epoch: 10000
Alpha: 0.005000, Epsilon: 0.119982.
Last 10 max delta mean: 0.000482.


 60%|██████    | 12059/20000 [00:34<00:23, 340.53it/s]

Epoch: 12000
Alpha: 0.004000, Epsilon: 0.083982.
Last 10 max delta mean: 0.000456.


 70%|███████   | 14056/20000 [00:40<00:16, 350.62it/s]

Epoch: 14000
Alpha: 0.003000, Epsilon: 0.047982.
Last 10 max delta mean: 0.000528.


 80%|████████  | 16043/20000 [00:46<00:11, 338.51it/s]

Epoch: 16000
Alpha: 0.002000, Epsilon: 0.011982.
Last 10 max delta mean: 0.000456.


 90%|█████████ | 18057/20000 [00:52<00:05, 355.28it/s]

Epoch: 18000
Alpha: 0.001000, Epsilon: 0.000000.
Last 10 max delta mean: 0.000449.


100%|██████████| 20000/20000 [00:57<00:00, 344.99it/s]


Unnamed: 0,0.00,0.25,0.50,0.75,1.00
0,-2.104203,-2.11564,-2.126299,-2.135093,-2.148439
1,-1.947092,-1.956412,-1.966097,-1.977427,-1.986385
2,-1.780708,-1.792085,-1.800697,-1.809786,-1.822768
3,-1.605411,-1.617583,-1.622787,-1.635472,-1.648316
4,-1.41798,-1.427579,-1.440653,-1.448652,-1.460249
5,-1.220252,-1.225978,-1.238248,-1.251915,-1.255545
6,-1.009519,-1.017846,-1.026684,-1.037498,-1.043202
7,-0.780399,-0.787675,-0.802041,-0.813272,-0.821921
8,-0.538494,-0.549251,-0.555743,-0.571367,-0.580379
9,-0.279884,-0.293659,-0.301339,-0.310746,-0.321995


Optimal policy:
0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
dtype: object


# Case 3: Risky Asset expected return same as Risk-Free Rate
Condition: (a > r) & (b < r) & (a == -b) & (ap + b(1-p) == r)

In [6]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# ---- Experiment configuration (Case 3) ----
# Upper bound on the number of training epochs.
EPOCHS = 20000
# Number of time periods.
T = 10
# Starting value of epsilon for the epsilon-greedy policy.
INIT_EPSILON = 0.3
# Starting value of the learning rate alpha.
INIT_ALPHA = 0.01

# The state is the time index itself (0..T inclusive), following
# section 8.4 of Rao and Jelvis.
states = np.arange(T + 1)

# Action grid; the Q-table printed below has columns 0.00, 0.25, ..., 1.00.
actions = ActionSpace(Decimal("0"), Decimal("1.0"), Decimal("0.25"))

# Policy object holding the Q-table that training fills in.
policy = Policy(
    epochs=EPOCHS, init_epsilon=INIT_EPSILON, init_alpha=INIT_ALPHA,
    state_space=states, action_space=actions,
    training=True,
)

# Assets for this case.
# NOTE(review): assuming RiskyAsset takes (a, b, p) as in the condition
# stated for this case, these values give a == r (not a > r) and an expected
# return a*p + b*(1-p) = 0.04*0.5 - 0.04*0.5 = 0.0, not r = 0.04. Confirm
# whether the parameters or the stated condition reflect the intent.
risky_asset = RiskyAsset(0.04, -0.04, 0.5)
risk_free_asset = RiskFreeAsset(0.04)

# CARA Coefficient
CARA_COEF = 1


# Run training with an early-stopping threshold of 1e-4.
train(
    epochs=EPOCHS, T=T,
    risky_asset=risky_asset, risk_free_asset=risk_free_asset,
    policy=policy, reward_eval=reward_eval,
    cara_coef=CARA_COEF, early_stopping=1e-4,
)

# Show the learned Q-table, then the policy derived from it.
display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 68/20000 [00:00<00:59, 334.63it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


 10%|█         | 2071/20000 [00:06<00:56, 318.30it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.002257.


 20%|██        | 4100/20000 [00:12<00:45, 347.73it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.001114.


 31%|███       | 6106/20000 [00:18<00:38, 356.91it/s]

Epoch: 6000
Alpha: 0.007000, Epsilon: 0.191982.
Last 10 max delta mean: 0.000501.


 41%|████      | 8103/20000 [00:24<00:33, 355.25it/s]

Epoch: 8000
Alpha: 0.006000, Epsilon: 0.155982.
Last 10 max delta mean: 0.000348.


 51%|█████     | 10106/20000 [00:29<00:29, 334.28it/s]

Epoch: 10000
Alpha: 0.005000, Epsilon: 0.119982.
Last 10 max delta mean: 0.000239.


 61%|██████    | 12102/20000 [00:35<00:22, 350.20it/s]

Epoch: 12000
Alpha: 0.004000, Epsilon: 0.083982.
Last 10 max delta mean: 0.000205.


 70%|███████   | 14077/20000 [00:40<00:16, 357.41it/s]

Epoch: 14000
Alpha: 0.003000, Epsilon: 0.047982.
Last 10 max delta mean: 0.000217.


 80%|████████  | 16049/20000 [00:46<00:11, 353.80it/s]

Epoch: 16000
Alpha: 0.002000, Epsilon: 0.011982.
Last 10 max delta mean: 0.000225.


 90%|█████████ | 18050/20000 [00:52<00:05, 353.78it/s]

Epoch: 18000
Alpha: 0.001000, Epsilon: 0.000000.
Last 10 max delta mean: 0.000180.


100%|██████████| 20000/20000 [00:57<00:00, 345.97it/s]


Unnamed: 0,0.00,0.25,0.50,0.75,1.00
0,-2.016049,-2.018674,-2.022735,-2.027868,-2.029826
1,-1.848015,-1.850295,-1.854036,-1.858615,-1.862466
2,-1.674976,-1.676623,-1.679499,-1.684606,-1.689599
3,-1.495243,-1.496573,-1.501343,-1.503361,-1.504467
4,-1.30839,-1.310181,-1.314741,-1.319816,-1.3207
5,-1.115442,-1.118805,-1.120524,-1.124764,-1.127521
6,-0.914575,-0.916855,-0.919466,-0.921963,-0.929443
7,-0.704454,-0.706093,-0.708512,-0.712508,-0.717461
8,-0.481601,-0.485588,-0.487493,-0.49167,-0.495185
9,-0.247076,-0.250462,-0.254522,-0.257802,-0.261614


Optimal policy:
0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
dtype: object


# Case 4: Risky Asset usually gives the same return as Risk-Free, but sometimes incurs a huge loss
Condition: (a == r) & (r >> b) & (p > (1-p))

In [7]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# ---- Experiment configuration (Case 4) ----
# Upper bound on the number of training epochs (larger than in Cases 1-3).
EPOCHS = 30000
# Number of time periods.
T = 10
# Starting value of epsilon for the epsilon-greedy policy.
INIT_EPSILON = 0.3
# Starting value of the learning rate alpha (smaller than in Cases 1-3).
INIT_ALPHA = 0.005

# The state is the time index itself (0..T inclusive), following
# section 8.4 of Rao and Jelvis.
states = np.arange(T + 1)

# Action grid; the Q-table printed below has columns 0.00, 0.25, ..., 1.00.
actions = ActionSpace(Decimal("0"), Decimal("1.0"), Decimal("0.25"))

# Policy object holding the Q-table that training fills in.
# gamma is passed explicitly in this cell only (presumably the discount
# factor -- confirm against Policy).
policy = Policy(
    epochs=EPOCHS, init_epsilon=INIT_EPSILON, init_alpha=INIT_ALPHA,
    state_space=states, action_space=actions,
    training=True, gamma=1.0,
)

# Assets for this case: per the condition above, a == r == 0.04 with
# p = 0.80, while b = -0.3 is far below r.
risky_asset = RiskyAsset(0.04, -0.3, 0.80)
risk_free_asset = RiskFreeAsset(0.04)

# CARA Coefficient
CARA_COEF = 1


# Run training with an early-stopping threshold of 1e-4.
train(
    epochs=EPOCHS, T=T,
    risky_asset=risky_asset, risk_free_asset=risk_free_asset,
    policy=policy, reward_eval=reward_eval,
    cara_coef=CARA_COEF, early_stopping=1e-4,
)

# Show the learned Q-table, then the policy derived from it.
display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 68/30000 [00:00<01:29, 334.75it/s]

Epoch: 0
Alpha: 0.005000, Epsilon: 0.299988.


 10%|█         | 3063/30000 [00:08<01:16, 351.69it/s]

Epoch: 3000
Alpha: 0.004500, Epsilon: 0.263988.
Last 10 max delta mean: 0.001981.


 20%|██        | 6052/30000 [00:17<01:08, 351.88it/s]

Epoch: 6000
Alpha: 0.004000, Epsilon: 0.227988.
Last 10 max delta mean: 0.001645.


 30%|███       | 9040/30000 [00:26<01:03, 330.49it/s]

Epoch: 9000
Alpha: 0.003500, Epsilon: 0.191988.
Last 10 max delta mean: 0.001042.


 40%|████      | 12042/30000 [00:35<00:52, 341.14it/s]

Epoch: 12000
Alpha: 0.003000, Epsilon: 0.155988.
Last 10 max delta mean: 0.000674.


 50%|█████     | 15046/30000 [00:44<00:46, 324.17it/s]

Epoch: 15000
Alpha: 0.002500, Epsilon: 0.119988.
Last 10 max delta mean: 0.000448.


 60%|██████    | 18068/30000 [00:53<00:34, 350.82it/s]

Epoch: 18000
Alpha: 0.002000, Epsilon: 0.083988.
Last 10 max delta mean: 0.000310.


 70%|███████   | 21039/30000 [01:01<00:25, 355.72it/s]

Epoch: 21000
Alpha: 0.001500, Epsilon: 0.047988.
Last 10 max delta mean: 0.000252.


 80%|████████  | 24061/30000 [01:10<00:16, 354.90it/s]

Epoch: 24000
Alpha: 0.001000, Epsilon: 0.011988.
Last 10 max delta mean: 0.000381.


 90%|█████████ | 27041/30000 [01:18<00:08, 355.27it/s]

Epoch: 27000
Alpha: 0.000500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000311.


100%|██████████| 30000/30000 [01:26<00:00, 345.85it/s]


Unnamed: 0,0.00,0.25,0.50,0.75,1.00
0,-3.068152,-3.099246,-3.107551,-3.110378,-3.124791
1,-2.713314,-2.744986,-2.747955,-2.757147,-2.767963
2,-2.37139,-2.398677,-2.403949,-2.4081,-2.420212
3,-2.040932,-2.066214,-2.065036,-2.076818,-2.083233
4,-1.718805,-1.738985,-1.741675,-1.746789,-1.760895
5,-1.405879,-1.423288,-1.428389,-1.434124,-1.446798
6,-1.104281,-1.126101,-1.12291,-1.129631,-1.143152
7,-0.812027,-0.822961,-0.83225,-0.844887,-0.844759
8,-0.534263,-0.540621,-0.549625,-0.557401,-0.574526
9,-0.262229,-0.268685,-0.281004,-0.284015,-0.291729


Optimal policy:
0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
dtype: object
