# Case 1: Risky Asset always better
Condition: a > r & b > r

In [None]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# Parameters Configuration
EPOCHS = 20000
# Total time periods
T = 10
# init for epsilon-greedy
INIT_EPSILON = 0.3
# init for learning rate
INIT_ALPHA = 0.01
# state space: this is dependent on time, per section 8.4 of Rao and Jelvis.
states = np.arange(0, T + 1, 1)
# action space
actions = ActionSpace(
    Decimal("0"),
    Decimal("1.0"),
    Decimal("0.2"))
# policy
policy = Policy(
    epochs=EPOCHS,
    init_epsilon=INIT_EPSILON,
    init_alpha=INIT_ALPHA,
    state_space=states,
    action_space=actions,
    training=True
)
# assets
risky_asset = RiskyAsset(0.15, 0.1, 0.5)
risk_free_asset = RiskFreeAsset(0.04)
# CARA Coefficient
CARA_COEF = 1


train(
    epochs=EPOCHS,
    T=T,
    risky_asset=risky_asset,
    risk_free_asset=risk_free_asset,
    policy=policy,
    reward_eval=reward_eval,
    cara_coef=CARA_COEF,
    early_stopping=2e-4
)

display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 70/20000 [00:00<00:57, 348.01it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


  5%|▌         | 1059/20000 [00:02<00:52, 360.91it/s]

Epoch: 1000
Alpha: 0.009500, Epsilon: 0.281982.
Last 10 max delta mean: 0.001369.


 10%|█         | 2047/20000 [00:05<00:50, 358.19it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.000587.


 15%|█▌        | 3044/20000 [00:08<00:48, 351.01it/s]

Epoch: 3000
Alpha: 0.008500, Epsilon: 0.245982.
Last 10 max delta mean: 0.000340.


 20%|██        | 4046/20000 [00:11<00:46, 339.56it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.000358.


 25%|██▌       | 5051/20000 [00:14<00:42, 354.43it/s]

Epoch: 5000
Alpha: 0.007500, Epsilon: 0.209982.
Last 10 max delta mean: 0.000267.


 30%|███       | 6065/20000 [00:17<00:39, 350.46it/s]

Epoch: 6000
Alpha: 0.007000, Epsilon: 0.191982.
Last 10 max delta mean: 0.000312.


 31%|███       | 6187/20000 [00:17<00:39, 352.72it/s]

Converged. Total Epochs: 6187.
Last 10 max delta mean: 0.000199.





Unnamed: 0,0.00,0.20,0.40,0.60,0.80,1.00
0,-0.59976,-0.593824,-0.590448,-0.585772,-0.581778,-0.574959
1,-0.528117,-0.525299,-0.518418,-0.513292,-0.509651,-0.503433
2,-0.460607,-0.456245,-0.45155,-0.444642,-0.442162,-0.434212
3,-0.392718,-0.390746,-0.387635,-0.383279,-0.375211,-0.367587
4,-0.333446,-0.328749,-0.325086,-0.317792,-0.318017,-0.306468
5,-0.278323,-0.274195,-0.26981,-0.260013,-0.260813,-0.251934
6,-0.224339,-0.218904,-0.216483,-0.21236,-0.201795,-0.200747
7,-0.17622,-0.173037,-0.168716,-0.166101,-0.162601,-0.150571
8,-0.12581,-0.122534,-0.121952,-0.115964,-0.11279,-0.105154
9,-0.074189,-0.070822,-0.071092,-0.066568,-0.065762,-0.057116


Optimal policy:
0     1.00
1     1.00
2     1.00
3     1.00
4     1.00
5     1.00
6     1.00
7     1.00
8     1.00
9     1.00
10    0.00
dtype: object


# Case 2: Risk-Free Asset always better
Condition: a < r & b < r

In [5]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# Parameters Configuration
EPOCHS = 20000
# Total time periods
T = 10
# init for epsilon-greedy
INIT_EPSILON = 0.3
# init for learning rate
INIT_ALPHA = 0.01
# state space: this is dependent on time, per section 8.4 of Rao and Jelvis.
states = np.arange(0, T + 1, 1)
# action space
actions = ActionSpace(
    Decimal("0"),
    Decimal("1.0"),
    Decimal("0.2"))
# policy
policy = Policy(
    epochs=EPOCHS,
    init_epsilon=INIT_EPSILON,
    init_alpha=INIT_ALPHA,
    state_space=states,
    action_space=actions,
    training=True
)
# assets
risky_asset = RiskyAsset(-0.04, -0.10, 0.5)
risk_free_asset = RiskFreeAsset(0.04)
# CARA Coefficient
CARA_COEF = 1


train(
    epochs=EPOCHS,
    T=T,
    risky_asset=risky_asset,
    risk_free_asset=risk_free_asset,
    policy=policy,
    reward_eval=reward_eval,
    cara_coef=CARA_COEF,
    early_stopping=1e-4
)

display(policy.q_table)

print("Optimal policy:")
policy.print()


  1%|          | 103/20000 [00:00<00:58, 337.44it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


  5%|▌         | 1064/20000 [00:03<00:54, 345.32it/s]

Epoch: 1000
Alpha: 0.009500, Epsilon: 0.281982.
Last 10 max delta mean: 0.002021.


 10%|█         | 2043/20000 [00:05<00:50, 354.96it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.000946.


 15%|█▌        | 3049/20000 [00:08<00:48, 350.56it/s]

Epoch: 3000
Alpha: 0.008500, Epsilon: 0.245982.
Last 10 max delta mean: 0.000530.


 20%|██        | 4060/20000 [00:11<00:47, 338.83it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.000697.


 25%|██▌       | 5075/20000 [00:14<00:41, 359.77it/s]

Epoch: 5000
Alpha: 0.007500, Epsilon: 0.209982.
Last 10 max delta mean: 0.000546.


 30%|███       | 6052/20000 [00:17<00:38, 358.08it/s]

Epoch: 6000
Alpha: 0.007000, Epsilon: 0.191982.
Last 10 max delta mean: 0.000488.


 35%|███▌      | 7063/20000 [00:20<00:36, 354.84it/s]

Epoch: 7000
Alpha: 0.006500, Epsilon: 0.173982.
Last 10 max delta mean: 0.000411.


 40%|████      | 8061/20000 [00:23<00:34, 348.01it/s]

Epoch: 8000
Alpha: 0.006000, Epsilon: 0.155982.
Last 10 max delta mean: 0.000428.


 45%|████▌     | 9069/20000 [00:25<00:32, 339.99it/s]

Epoch: 9000
Alpha: 0.005500, Epsilon: 0.137982.
Last 10 max delta mean: 0.000447.


 50%|█████     | 10064/20000 [00:28<00:28, 351.07it/s]

Epoch: 10000
Alpha: 0.005000, Epsilon: 0.119982.
Last 10 max delta mean: 0.000375.


 55%|█████▌    | 11043/20000 [00:31<00:25, 356.42it/s]

Epoch: 11000
Alpha: 0.004500, Epsilon: 0.101982.
Last 10 max delta mean: 0.000367.


 60%|██████    | 12057/20000 [00:34<00:22, 353.47it/s]

Epoch: 12000
Alpha: 0.004000, Epsilon: 0.083982.
Last 10 max delta mean: 0.000603.


 65%|██████▌   | 13067/20000 [00:37<00:19, 355.37it/s]

Epoch: 13000
Alpha: 0.003500, Epsilon: 0.065982.
Last 10 max delta mean: 0.000454.


 70%|███████   | 14041/20000 [00:40<00:16, 358.19it/s]

Epoch: 14000
Alpha: 0.003000, Epsilon: 0.047982.
Last 10 max delta mean: 0.000397.


 75%|███████▌  | 15054/20000 [00:43<00:13, 355.91it/s]

Epoch: 15000
Alpha: 0.002500, Epsilon: 0.029982.
Last 10 max delta mean: 0.000340.


 80%|████████  | 16069/20000 [00:45<00:11, 350.20it/s]

Epoch: 16000
Alpha: 0.002000, Epsilon: 0.011982.
Last 10 max delta mean: 0.000418.


 85%|████████▌ | 17061/20000 [00:48<00:08, 327.37it/s]

Epoch: 17000
Alpha: 0.001500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000403.


 90%|█████████ | 18059/20000 [00:51<00:05, 354.84it/s]

Epoch: 18000
Alpha: 0.001000, Epsilon: 0.000000.
Last 10 max delta mean: 0.000408.


 95%|█████████▌| 19066/20000 [00:54<00:02, 344.39it/s]

Epoch: 19000
Alpha: 0.000500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000314.


100%|██████████| 20000/20000 [00:57<00:00, 349.38it/s]


Unnamed: 0,0.00,0.20,0.40,0.60,0.80,1.00
0,-0.689519,-0.697223,-0.705219,-0.713325,-0.722622,-0.731676
1,-0.672988,-0.679755,-0.687493,-0.697202,-0.705994,-0.712844
2,-0.657022,-0.662586,-0.669563,-0.678823,-0.686477,-0.69665
3,-0.636401,-0.646002,-0.649932,-0.663242,-0.668939,-0.675136
4,-0.615275,-0.621778,-0.633057,-0.639593,-0.648478,-0.654927
5,-0.593679,-0.599825,-0.602379,-0.612838,-0.621739,-0.630329
6,-0.558664,-0.570847,-0.574277,-0.583627,-0.587036,-0.5978
7,-0.510051,-0.516701,-0.524067,-0.528856,-0.542495,-0.551755
8,-0.424515,-0.433741,-0.443076,-0.453176,-0.459664,-0.467901
9,-0.279052,-0.287449,-0.293001,-0.307112,-0.317388,-0.3208


Optimal policy:
0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
dtype: object


# Case 3: Risky Asset expected return same as Risk-Free Rate
Condition: (a > r) & (b < r) & (a == -b) & (ap + b(1-p) == r)

In [6]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# Parameters Configuration
EPOCHS = 20000
# Total time periods
T = 10
# init for epsilon-greedy
INIT_EPSILON = 0.3
# init for learning rate
INIT_ALPHA = 0.01
# state space: this is dependent on time, per section 8.4 of Rao and Jelvis.
states = np.arange(0, T + 1, 1)
# action space
actions = ActionSpace(
    Decimal("0"),
    Decimal("1.0"),
    Decimal("0.2"))
# policy
policy = Policy(
    epochs=EPOCHS,
    init_epsilon=INIT_EPSILON,
    init_alpha=INIT_ALPHA,
    state_space=states,
    action_space=actions,
    training=True
)
# assets
risky_asset = RiskyAsset(0.04, -0.04, 0.5)
risk_free_asset = RiskFreeAsset(0.04)
# CARA Coefficient
CARA_COEF = 1


train(
    epochs=EPOCHS,
    T=T,
    risky_asset=risky_asset,
    risk_free_asset=risk_free_asset,
    policy=policy,
    reward_eval=reward_eval,
    cara_coef=CARA_COEF,
    early_stopping=1e-4
)

display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 67/20000 [00:00<01:00, 330.80it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


  5%|▌         | 1067/20000 [00:03<00:55, 343.65it/s]

Epoch: 1000
Alpha: 0.009500, Epsilon: 0.281982.
Last 10 max delta mean: 0.001583.


 10%|█         | 2066/20000 [00:06<00:51, 345.96it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.000774.


 15%|█▌        | 3063/20000 [00:08<00:49, 343.05it/s]

Epoch: 3000
Alpha: 0.008500, Epsilon: 0.245982.
Last 10 max delta mean: 0.000404.


 21%|██        | 4107/20000 [00:11<00:44, 356.39it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.000330.


 25%|██▌       | 5039/20000 [00:14<00:43, 347.57it/s]

Epoch: 5000
Alpha: 0.007500, Epsilon: 0.209982.
Last 10 max delta mean: 0.000255.


 30%|███       | 6069/20000 [00:17<00:39, 350.85it/s]

Epoch: 6000
Alpha: 0.007000, Epsilon: 0.191982.
Last 10 max delta mean: 0.000219.


 35%|███▌      | 7040/20000 [00:20<00:36, 351.82it/s]

Epoch: 7000
Alpha: 0.006500, Epsilon: 0.173982.
Last 10 max delta mean: 0.000289.


 40%|████      | 8054/20000 [00:23<00:37, 319.53it/s]

Epoch: 8000
Alpha: 0.006000, Epsilon: 0.155982.
Last 10 max delta mean: 0.000227.


 45%|████▌     | 9063/20000 [00:26<00:31, 349.06it/s]

Epoch: 9000
Alpha: 0.005500, Epsilon: 0.137982.
Last 10 max delta mean: 0.000295.


 50%|█████     | 10050/20000 [00:29<00:28, 345.44it/s]

Epoch: 10000
Alpha: 0.005000, Epsilon: 0.119982.
Last 10 max delta mean: 0.000248.


 55%|█████▌    | 11050/20000 [00:32<00:25, 345.20it/s]

Epoch: 11000
Alpha: 0.004500, Epsilon: 0.101982.
Last 10 max delta mean: 0.000223.


 60%|██████    | 12073/20000 [00:35<00:22, 351.32it/s]

Epoch: 12000
Alpha: 0.004000, Epsilon: 0.083982.
Last 10 max delta mean: 0.000203.


 65%|██████▌   | 13038/20000 [00:37<00:20, 342.05it/s]

Epoch: 13000
Alpha: 0.003500, Epsilon: 0.065982.
Last 10 max delta mean: 0.000222.


 70%|███████   | 14054/20000 [00:40<00:19, 310.48it/s]

Epoch: 14000
Alpha: 0.003000, Epsilon: 0.047982.
Last 10 max delta mean: 0.000230.


 75%|███████▌  | 15042/20000 [00:43<00:14, 353.18it/s]

Epoch: 15000
Alpha: 0.002500, Epsilon: 0.029982.
Last 10 max delta mean: 0.000227.


 80%|████████  | 16041/20000 [00:46<00:11, 350.78it/s]

Epoch: 16000
Alpha: 0.002000, Epsilon: 0.011982.
Last 10 max delta mean: 0.000217.


 85%|████████▌ | 17062/20000 [00:49<00:08, 339.66it/s]

Epoch: 17000
Alpha: 0.001500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000174.


 90%|█████████ | 18045/20000 [00:52<00:05, 348.82it/s]

Epoch: 18000
Alpha: 0.001000, Epsilon: 0.000000.
Last 10 max delta mean: 0.000194.


 95%|█████████▌| 19073/20000 [00:55<00:02, 352.82it/s]

Epoch: 19000
Alpha: 0.000500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000229.


100%|██████████| 20000/20000 [00:58<00:00, 344.01it/s]


Unnamed: 0,0.00,0.20,0.40,0.60,0.80,1.00
0,-0.682006,-0.685543,-0.688727,-0.690608,-0.693962,-0.697076
1,-0.656924,-0.660267,-0.664424,-0.666809,-0.670871,-0.672643
2,-0.632014,-0.634821,-0.639719,-0.64092,-0.645231,-0.649841
3,-0.605579,-0.610874,-0.61204,-0.615374,-0.619357,-0.620827
4,-0.578458,-0.583528,-0.583977,-0.587855,-0.591624,-0.591503
5,-0.54674,-0.554242,-0.556918,-0.558827,-0.559891,-0.562162
6,-0.509344,-0.516824,-0.516827,-0.520228,-0.525905,-0.527796
7,-0.457559,-0.465498,-0.466033,-0.471464,-0.472566,-0.475901
8,-0.378701,-0.386397,-0.387881,-0.38912,-0.392337,-0.39631
9,-0.244378,-0.250981,-0.251788,-0.257291,-0.256563,-0.25863


Optimal policy:
0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
dtype: object
