# Case 1: Risky Asset always better
Condition: a > r & b > r

In [35]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# --- Experiment configuration ---------------------------------------------
# Upper bound on training epochs; handed to both Policy and train().
EPOCHS = 20000
# Number of time periods; also fixes the size of the state grid below.
T = 10
# Starting values for the epsilon-greedy exploration rate and the learning rate.
INIT_EPSILON = 0.3
INIT_ALPHA = 0.01
# CARA coefficient forwarded to train().
CARA_COEF = 1

# --- State and action spaces ----------------------------------------------
# The state is the time index 0..T (time-dependent formulation, per
# section 8.4 of Rao and Jelvis).
states = np.arange(T + 1)
# Action grid built from Decimal("0"), Decimal("1.0") and Decimal("0.2");
# the Q-table columns in the saved output run 0.00, 0.20, ..., 1.00.
actions = ActionSpace(Decimal("0"), Decimal("1.0"), Decimal("0.2"))

# --- Agent ----------------------------------------------------------------
policy = Policy(
    state_space=states,
    action_space=actions,
    epochs=EPOCHS,
    init_alpha=INIT_ALPHA,
    init_epsilon=INIT_EPSILON,
    training=True,
)

# --- Environment ----------------------------------------------------------
# NOTE(review): the positional arguments of RiskyAsset look like (a, b, p),
# i.e. two possible returns and a probability -- confirm against
# mdp_env.risky_asset. Under that reading both returns (0.15, 0.1) exceed the
# risk-free rate 0.04, which is this case's condition.
risky_asset = RiskyAsset(0.15, 0.1, 0.5)
risk_free_asset = RiskFreeAsset(0.04)

# --- Training -------------------------------------------------------------
# NOTE(review): no random seed is set in this cell; if training samples
# returns/actions randomly, a rerun will not reproduce the saved output
# exactly -- confirm whether mdp_env seeds internally.
train(
    policy=policy,
    risky_asset=risky_asset,
    risk_free_asset=risk_free_asset,
    reward_eval=reward_eval,
    cara_coef=CARA_COEF,
    T=T,
    epochs=EPOCHS,
    early_stopping=2e-4,
)

# --- Results --------------------------------------------------------------
# Show the learned Q-table, then the policy read from it.
display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 65/20000 [00:00<01:02, 319.60it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


  5%|▌         | 1067/20000 [00:03<00:54, 345.25it/s]

Epoch: 1000
Alpha: 0.009500, Epsilon: 0.281982.
Last 10 max delta mean: 0.001377.


 10%|█         | 2042/20000 [00:05<00:51, 347.50it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.000614.


 15%|█▌        | 3045/20000 [00:08<00:49, 344.97it/s]

Epoch: 3000
Alpha: 0.008500, Epsilon: 0.245982.
Last 10 max delta mean: 0.000359.


 20%|██        | 4053/20000 [00:11<00:44, 354.91it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.000371.


 25%|██▌       | 5056/20000 [00:14<00:42, 349.94it/s]

Epoch: 5000
Alpha: 0.007500, Epsilon: 0.209982.
Last 10 max delta mean: 0.000310.


 27%|██▋       | 5357/20000 [00:15<00:42, 345.11it/s]

Converged. Total Epochs: 5357.
Last 10 max delta mean: 0.000197.





Unnamed: 0,0.00,0.20,0.40,0.60,0.80,1.00
0,-0.596708,-0.592095,-0.589359,-0.586241,-0.58199,-0.577446
1,-0.529349,-0.524054,-0.520353,-0.515168,-0.511318,-0.504321
2,-0.461472,-0.457848,-0.452351,-0.448635,-0.443722,-0.433624
3,-0.396842,-0.392082,-0.385308,-0.387641,-0.384292,-0.365803
4,-0.334894,-0.33137,-0.326327,-0.328803,-0.322325,-0.305294
5,-0.279195,-0.273424,-0.27006,-0.272802,-0.266629,-0.246743
6,-0.226582,-0.223883,-0.22067,-0.214492,-0.214919,-0.194915
7,-0.174355,-0.173222,-0.175364,-0.171043,-0.164476,-0.148268
8,-0.127899,-0.123666,-0.122466,-0.120169,-0.115593,-0.103102
9,-0.075325,-0.074165,-0.072323,-0.070595,-0.066795,-0.056589


Optimal policy:
0     1.00
1     1.00
2     1.00
3     1.00
4     1.00
5     1.00
6     1.00
7     1.00
8     1.00
9     1.00
10    0.00
dtype: object


# Case 2: Risk-Free Asset always better
Condition: a < r & b < r

In [36]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# --- Experiment configuration ---------------------------------------------
# Upper bound on training epochs; handed to both Policy and train().
EPOCHS = 20000
# Number of time periods; also fixes the size of the state grid below.
T = 10
# Starting values for the epsilon-greedy exploration rate and the learning rate.
INIT_EPSILON = 0.3
INIT_ALPHA = 0.01
# CARA coefficient forwarded to train().
CARA_COEF = 1

# --- State and action spaces ----------------------------------------------
# The state is the time index 0..T (time-dependent formulation, per
# section 8.4 of Rao and Jelvis).
states = np.arange(T + 1)
# Action grid built from Decimal("0"), Decimal("1.0") and Decimal("0.2");
# the Q-table columns in the saved output run 0.00, 0.20, ..., 1.00.
actions = ActionSpace(Decimal("0"), Decimal("1.0"), Decimal("0.2"))

# --- Agent ----------------------------------------------------------------
policy = Policy(
    state_space=states,
    action_space=actions,
    epochs=EPOCHS,
    init_alpha=INIT_ALPHA,
    init_epsilon=INIT_EPSILON,
    training=True,
)

# --- Environment ----------------------------------------------------------
# NOTE(review): the positional arguments of RiskyAsset look like (a, b, p),
# i.e. two possible returns and a probability -- confirm against
# mdp_env.risky_asset. Under that reading both returns (-0.04, -0.10) are
# below the risk-free rate 0.04, which is this case's condition.
risky_asset = RiskyAsset(-0.04, -0.10, 0.5)
risk_free_asset = RiskFreeAsset(0.04)

# --- Training -------------------------------------------------------------
# NOTE(review): no random seed is set in this cell; if training samples
# returns/actions randomly, a rerun will not reproduce the saved output
# exactly -- confirm whether mdp_env seeds internally.
train(
    policy=policy,
    risky_asset=risky_asset,
    risk_free_asset=risk_free_asset,
    reward_eval=reward_eval,
    cara_coef=CARA_COEF,
    T=T,
    epochs=EPOCHS,
    early_stopping=1e-4,
)

# --- Results --------------------------------------------------------------
# Show the learned Q-table, then the policy read from it.
display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 66/20000 [00:00<01:01, 326.42it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


  6%|▌         | 1104/20000 [00:03<00:52, 359.47it/s]

Epoch: 1000
Alpha: 0.009500, Epsilon: 0.281982.
Last 10 max delta mean: 0.001894.


 10%|█         | 2045/20000 [00:05<00:49, 359.17it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.000918.


 15%|█▌        | 3041/20000 [00:08<00:50, 336.58it/s]

Epoch: 3000
Alpha: 0.008500, Epsilon: 0.245982.
Last 10 max delta mean: 0.000583.


 20%|██        | 4069/20000 [00:11<00:44, 354.81it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.000765.


 25%|██▌       | 5053/20000 [00:14<00:41, 360.58it/s]

Epoch: 5000
Alpha: 0.007500, Epsilon: 0.209982.
Last 10 max delta mean: 0.000602.


 30%|███       | 6037/20000 [00:17<00:39, 351.43it/s]

Epoch: 6000
Alpha: 0.007000, Epsilon: 0.191982.
Last 10 max delta mean: 0.000537.


 35%|███▌      | 7052/20000 [00:20<00:36, 356.47it/s]

Epoch: 7000
Alpha: 0.006500, Epsilon: 0.173982.
Last 10 max delta mean: 0.000413.


 40%|████      | 8068/20000 [00:22<00:34, 349.24it/s]

Epoch: 8000
Alpha: 0.006000, Epsilon: 0.155982.
Last 10 max delta mean: 0.000457.


 45%|████▌     | 9047/20000 [00:25<00:30, 354.34it/s]

Epoch: 9000
Alpha: 0.005500, Epsilon: 0.137982.
Last 10 max delta mean: 0.000416.


 50%|█████     | 10046/20000 [00:28<00:28, 353.14it/s]

Epoch: 10000
Alpha: 0.005000, Epsilon: 0.119982.
Last 10 max delta mean: 0.000394.


 55%|█████▌    | 11061/20000 [00:31<00:25, 350.94it/s]

Epoch: 11000
Alpha: 0.004500, Epsilon: 0.101982.
Last 10 max delta mean: 0.000474.


 60%|██████    | 12041/20000 [00:34<00:22, 354.48it/s]

Epoch: 12000
Alpha: 0.004000, Epsilon: 0.083982.
Last 10 max delta mean: 0.000431.


 65%|██████▌   | 13044/20000 [00:37<00:19, 349.22it/s]

Epoch: 13000
Alpha: 0.003500, Epsilon: 0.065982.
Last 10 max delta mean: 0.000418.


 70%|███████   | 14058/20000 [00:40<00:16, 351.14it/s]

Epoch: 14000
Alpha: 0.003000, Epsilon: 0.047982.
Last 10 max delta mean: 0.000405.


 75%|███████▌  | 15068/20000 [00:42<00:14, 340.54it/s]

Epoch: 15000
Alpha: 0.002500, Epsilon: 0.029982.
Last 10 max delta mean: 0.000344.


 80%|████████  | 16068/20000 [00:45<00:11, 356.33it/s]

Epoch: 16000
Alpha: 0.002000, Epsilon: 0.011982.
Last 10 max delta mean: 0.000455.


 85%|████████▌ | 17048/20000 [00:48<00:08, 352.65it/s]

Epoch: 17000
Alpha: 0.001500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000347.


 90%|█████████ | 18062/20000 [00:51<00:05, 348.67it/s]

Epoch: 18000
Alpha: 0.001000, Epsilon: 0.000000.
Last 10 max delta mean: 0.000375.


 95%|█████████▌| 19041/20000 [00:54<00:02, 355.99it/s]

Epoch: 19000
Alpha: 0.000500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000437.


100%|██████████| 20000/20000 [00:57<00:00, 350.59it/s]


Unnamed: 0,0.00,0.20,0.40,0.60,0.80,1.00
0,-0.689948,-0.697319,-0.706563,-0.714205,-0.723433,-0.731066
1,-0.674113,-0.679329,-0.689386,-0.695875,-0.704315,-0.716018
2,-0.655483,-0.663037,-0.671594,-0.679666,-0.688509,-0.696803
3,-0.637858,-0.646252,-0.653583,-0.660752,-0.66983,-0.681422
4,-0.617531,-0.625028,-0.629117,-0.64097,-0.64773,-0.65657
5,-0.591614,-0.603755,-0.604291,-0.614931,-0.623818,-0.63136
6,-0.558573,-0.568181,-0.577657,-0.584577,-0.592263,-0.601301
7,-0.509092,-0.519821,-0.526946,-0.530812,-0.542976,-0.547417
8,-0.426714,-0.436381,-0.444967,-0.450259,-0.461273,-0.471346
9,-0.278527,-0.288566,-0.301872,-0.308788,-0.3121,-0.316343


Optimal policy:
0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
dtype: object


# Case 3: Risky Asset expected return same as Risk-Free Rate
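Condition (as stated): (a > r) & (b < r) & (a == -b) & (ap + b(1-p) == r)

**Note:** the cell below uses `RiskyAsset(0.04, -0.04, 0.5)` together with `RiskFreeAsset(0.04)`. If those arguments are (a, b, p), then a == r rather than a > r, and the expected risky return is ap + b(1-p) = 0.5(0.04) + 0.5(-0.04) = 0, which is below r = 0.04. With a == -b and p = 0.5 the expected return is always 0, so the stated condition can only hold when r = 0. The results below should therefore be read as "risky asset with zero expected return vs. a positive risk-free rate".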

In [None]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# --- Experiment configuration ---------------------------------------------
# Upper bound on training epochs; handed to both Policy and train().
EPOCHS = 20000
# Number of time periods; also fixes the size of the state grid below.
T = 10
# Starting values for the epsilon-greedy exploration rate and the learning rate.
INIT_EPSILON = 0.3
INIT_ALPHA = 0.01
# CARA coefficient forwarded to train().
CARA_COEF = 1

# --- State and action spaces ----------------------------------------------
# The state is the time index 0..T (time-dependent formulation, per
# section 8.4 of Rao and Jelvis).
states = np.arange(T + 1)
# Action grid built from Decimal("0"), Decimal("1.0") and Decimal("0.2");
# the Q-table columns in the saved output run 0.00, 0.20, ..., 1.00.
actions = ActionSpace(Decimal("0"), Decimal("1.0"), Decimal("0.2"))

# --- Agent ----------------------------------------------------------------
policy = Policy(
    state_space=states,
    action_space=actions,
    epochs=EPOCHS,
    init_alpha=INIT_ALPHA,
    init_epsilon=INIT_EPSILON,
    training=True,
)

# --- Environment ----------------------------------------------------------
# NOTE(review): assuming RiskyAsset's positional arguments are (a, b, p) --
# confirm against mdp_env.risky_asset -- these values give a == r (0.04) and
# an expected risky return of 0.5 * 0.04 + 0.5 * (-0.04) = 0, which is below
# the risk-free rate 0.04. That does not match a condition of the form
# "a > r and expected risky return == r"; verify which one is intended.
risky_asset = RiskyAsset(0.04, -0.04, 0.5)
risk_free_asset = RiskFreeAsset(0.04)

# --- Training -------------------------------------------------------------
# NOTE(review): no random seed is set in this cell; if training samples
# returns/actions randomly, a rerun will not reproduce the saved output
# exactly -- confirm whether mdp_env seeds internally.
train(
    policy=policy,
    risky_asset=risky_asset,
    risk_free_asset=risk_free_asset,
    reward_eval=reward_eval,
    cara_coef=CARA_COEF,
    T=T,
    epochs=EPOCHS,
    early_stopping=1e-4,
)

# --- Results --------------------------------------------------------------
# Show the learned Q-table, then the policy read from it.
display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 67/20000 [00:00<01:00, 330.80it/s]

Epoch: 0
Alpha: 0.010000, Epsilon: 0.299982.


  5%|▌         | 1067/20000 [00:03<00:55, 343.65it/s]

Epoch: 1000
Alpha: 0.009500, Epsilon: 0.281982.
Last 10 max delta mean: 0.001583.


 10%|█         | 2066/20000 [00:06<00:51, 345.96it/s]

Epoch: 2000
Alpha: 0.009000, Epsilon: 0.263982.
Last 10 max delta mean: 0.000774.


 15%|█▌        | 3063/20000 [00:08<00:49, 343.05it/s]

Epoch: 3000
Alpha: 0.008500, Epsilon: 0.245982.
Last 10 max delta mean: 0.000404.


 21%|██        | 4107/20000 [00:11<00:44, 356.39it/s]

Epoch: 4000
Alpha: 0.008000, Epsilon: 0.227982.
Last 10 max delta mean: 0.000330.


 25%|██▌       | 5039/20000 [00:14<00:43, 347.57it/s]

Epoch: 5000
Alpha: 0.007500, Epsilon: 0.209982.
Last 10 max delta mean: 0.000255.


 30%|███       | 6069/20000 [00:17<00:39, 350.85it/s]

Epoch: 6000
Alpha: 0.007000, Epsilon: 0.191982.
Last 10 max delta mean: 0.000219.


 35%|███▌      | 7040/20000 [00:20<00:36, 351.82it/s]

Epoch: 7000
Alpha: 0.006500, Epsilon: 0.173982.
Last 10 max delta mean: 0.000289.


 40%|████      | 8054/20000 [00:23<00:37, 319.53it/s]

Epoch: 8000
Alpha: 0.006000, Epsilon: 0.155982.
Last 10 max delta mean: 0.000227.


 45%|████▌     | 9063/20000 [00:26<00:31, 349.06it/s]

Epoch: 9000
Alpha: 0.005500, Epsilon: 0.137982.
Last 10 max delta mean: 0.000295.


 50%|█████     | 10050/20000 [00:29<00:28, 345.44it/s]

Epoch: 10000
Alpha: 0.005000, Epsilon: 0.119982.
Last 10 max delta mean: 0.000248.


 55%|█████▌    | 11050/20000 [00:32<00:25, 345.20it/s]

Epoch: 11000
Alpha: 0.004500, Epsilon: 0.101982.
Last 10 max delta mean: 0.000223.


 60%|██████    | 12073/20000 [00:35<00:22, 351.32it/s]

Epoch: 12000
Alpha: 0.004000, Epsilon: 0.083982.
Last 10 max delta mean: 0.000203.


 65%|██████▌   | 13038/20000 [00:37<00:20, 342.05it/s]

Epoch: 13000
Alpha: 0.003500, Epsilon: 0.065982.
Last 10 max delta mean: 0.000222.


 70%|███████   | 14054/20000 [00:40<00:19, 310.48it/s]

Epoch: 14000
Alpha: 0.003000, Epsilon: 0.047982.
Last 10 max delta mean: 0.000230.


 75%|███████▌  | 15042/20000 [00:43<00:14, 353.18it/s]

Epoch: 15000
Alpha: 0.002500, Epsilon: 0.029982.
Last 10 max delta mean: 0.000227.


 80%|████████  | 16041/20000 [00:46<00:11, 350.78it/s]

Epoch: 16000
Alpha: 0.002000, Epsilon: 0.011982.
Last 10 max delta mean: 0.000217.


 85%|████████▌ | 17062/20000 [00:49<00:08, 339.66it/s]

Epoch: 17000
Alpha: 0.001500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000174.


 90%|█████████ | 18045/20000 [00:52<00:05, 348.82it/s]

Epoch: 18000
Alpha: 0.001000, Epsilon: 0.000000.
Last 10 max delta mean: 0.000194.


 95%|█████████▌| 19073/20000 [00:55<00:02, 352.82it/s]

Epoch: 19000
Alpha: 0.000500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000229.


100%|██████████| 20000/20000 [00:58<00:00, 344.01it/s]


Unnamed: 0,0.00,0.20,0.40,0.60,0.80,1.00
0,-0.682006,-0.685543,-0.688727,-0.690608,-0.693962,-0.697076
1,-0.656924,-0.660267,-0.664424,-0.666809,-0.670871,-0.672643
2,-0.632014,-0.634821,-0.639719,-0.64092,-0.645231,-0.649841
3,-0.605579,-0.610874,-0.61204,-0.615374,-0.619357,-0.620827
4,-0.578458,-0.583528,-0.583977,-0.587855,-0.591624,-0.591503
5,-0.54674,-0.554242,-0.556918,-0.558827,-0.559891,-0.562162
6,-0.509344,-0.516824,-0.516827,-0.520228,-0.525905,-0.527796
7,-0.457559,-0.465498,-0.466033,-0.471464,-0.472566,-0.475901
8,-0.378701,-0.386397,-0.387881,-0.38912,-0.392337,-0.39631
9,-0.244378,-0.250981,-0.251788,-0.257291,-0.256563,-0.25863


Optimal policy:
0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
dtype: object


# Case 4: Risky Asset usually gives the same return as the Risk-Free Asset, but occasionally incurs a huge loss
Condition: (a == r) & (r >> b) & (p > (1-p))

In [3]:
from mdp_agent.policy import Policy
from mdp_agent.action_space import ActionSpace
from mdp_env.risky_asset import RiskyAsset
from mdp_env.risk_free_asset import RiskFreeAsset
from mdp_env.train import train
from mdp_env.reward import reward_eval
import numpy as np
from decimal import Decimal


# --- Experiment configuration ---------------------------------------------
# Upper bound on training epochs; handed to both Policy and train().
EPOCHS = 30000
# Number of time periods; also fixes the size of the state grid below.
T = 10
# Starting values for the epsilon-greedy exploration rate and the learning rate.
INIT_EPSILON = 0.3
INIT_ALPHA = 0.005
# CARA coefficient forwarded to train().
CARA_COEF = 1

# --- State and action spaces ----------------------------------------------
# The state is the time index 0..T (time-dependent formulation, per
# section 8.4 of Rao and Jelvis).
states = np.arange(T + 1)
# Action grid built from Decimal("0"), Decimal("1.0") and Decimal("0.2");
# the Q-table columns in the saved output run 0.00, 0.20, ..., 1.00.
actions = ActionSpace(Decimal("0"), Decimal("1.0"), Decimal("0.2"))

# --- Agent ----------------------------------------------------------------
# gamma is passed explicitly here (1.0).
# NOTE(review): presumably this is the discount factor and overrides a
# Policy default -- confirm in mdp_agent.policy.
policy = Policy(
    state_space=states,
    action_space=actions,
    epochs=EPOCHS,
    init_alpha=INIT_ALPHA,
    init_epsilon=INIT_EPSILON,
    gamma=1.0,
    training=True,
)

# --- Environment ----------------------------------------------------------
# NOTE(review): the positional arguments of RiskyAsset look like (a, b, p),
# i.e. two possible returns and a probability -- confirm against
# mdp_env.risky_asset. Under that reading the risky asset matches the
# risk-free rate (0.04) with probability 0.80 and otherwise returns -0.3.
risky_asset = RiskyAsset(0.04, -0.3, 0.80)
risk_free_asset = RiskFreeAsset(0.04)

# --- Training -------------------------------------------------------------
# NOTE(review): no random seed is set in this cell; if training samples
# returns/actions randomly, a rerun will not reproduce the saved output
# exactly -- confirm whether mdp_env seeds internally.
train(
    policy=policy,
    risky_asset=risky_asset,
    risk_free_asset=risk_free_asset,
    reward_eval=reward_eval,
    cara_coef=CARA_COEF,
    T=T,
    epochs=EPOCHS,
    early_stopping=1e-4,
)

# --- Results --------------------------------------------------------------
# Show the learned Q-table, then the policy read from it.
display(policy.q_table)

print("Optimal policy:")
policy.print()


  0%|          | 102/30000 [00:00<01:29, 335.09it/s]

Epoch: 0
Alpha: 0.005000, Epsilon: 0.299988.


 10%|█         | 3066/30000 [00:09<01:19, 337.49it/s]

Epoch: 3000
Alpha: 0.004500, Epsilon: 0.263988.
Last 10 max delta mean: 0.002013.


 20%|██        | 6044/30000 [00:18<01:23, 285.21it/s]

Epoch: 6000
Alpha: 0.004000, Epsilon: 0.227988.
Last 10 max delta mean: 0.001727.


 30%|███       | 9057/30000 [00:27<01:02, 336.69it/s]

Epoch: 9000
Alpha: 0.003500, Epsilon: 0.191988.
Last 10 max delta mean: 0.001327.


 40%|████      | 12072/30000 [00:36<00:51, 350.66it/s]

Epoch: 12000
Alpha: 0.003000, Epsilon: 0.155988.
Last 10 max delta mean: 0.000864.


 50%|█████     | 15040/30000 [00:44<00:42, 351.02it/s]

Epoch: 15000
Alpha: 0.002500, Epsilon: 0.119988.
Last 10 max delta mean: 0.000785.


 60%|██████    | 18064/30000 [00:54<00:34, 345.63it/s]

Epoch: 18000
Alpha: 0.002000, Epsilon: 0.083988.
Last 10 max delta mean: 0.000468.


 70%|███████   | 21055/30000 [01:03<00:26, 340.61it/s]

Epoch: 21000
Alpha: 0.001500, Epsilon: 0.047988.
Last 10 max delta mean: 0.000374.


 80%|████████  | 24062/30000 [01:11<00:16, 352.85it/s]

Epoch: 24000
Alpha: 0.001000, Epsilon: 0.011988.
Last 10 max delta mean: 0.000368.


 90%|█████████ | 27062/30000 [01:20<00:08, 348.35it/s]

Epoch: 27000
Alpha: 0.000500, Epsilon: 0.000000.
Last 10 max delta mean: 0.000354.


100%|██████████| 30000/30000 [01:28<00:00, 337.98it/s]


Unnamed: 0,0.00,0.20,0.40,0.60,0.80,1.00
0,-3.088462,-3.090119,-3.09505,-3.100485,-3.108175,-3.1084
1,-2.736157,-2.736957,-2.743392,-2.747204,-2.750538,-2.759635
2,-2.392726,-2.394028,-2.400717,-2.408235,-2.412088,-2.417892
3,-2.061085,-2.064172,-2.071289,-2.078984,-2.078165,-2.080773
4,-1.738917,-1.742883,-1.747897,-1.750939,-1.759249,-1.761635
5,-1.425593,-1.431798,-1.432294,-1.434722,-1.443203,-1.455802
6,-1.12076,-1.129756,-1.134605,-1.132642,-1.145437,-1.141226
7,-0.824442,-0.833727,-0.837687,-0.842563,-0.844324,-0.850231
8,-0.536457,-0.549971,-0.550791,-0.554463,-0.558611,-0.570662
9,-0.262308,-0.273295,-0.277487,-0.282219,-0.290944,-0.294


Optimal policy:
0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
dtype: object
