In [1]:
import numpy as np

In [2]:
from drmdp import feats, task
from drmdp.envs import gympg

## Control: Current Implementation

In [3]:
# Step-budget constant.
# NOTE(review): no cell visible in this notebook references MAX_STEPS, and the
# environment is created with max_episode_steps=2500 — confirm whether this
# constant is still needed or should drive that value.
MAX_STEPS = 1_000

In [4]:
# Build the MountainCar-v0 environment with an episode cap of 2500 steps.
env = gympg.make(
    "MountainCar-v0",
    max_episode_steps=2500,
)

In [5]:
# Wrap the environment with the project's monitor wrapper; the call returns the
# wrapped env together with the monitor object used by the control loops.
# NOTE(review): `env` is rebound in place, so re-running this cell without
# re-creating the env presumably stacks a second wrapper — confirm.
env, monitor = task.monitor_wrapper(env)
env, monitor

(<EnvMonitorWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<MountainCarEnv<MountainCar-v0>>>>>>,
 <drmdp.core.EnvMonitor at 0x31a942050>)

In [6]:
# Spec for the reward-delay distribution. `None` is passed for this control
# run; an alternative fixed-delay spec would look like
#   {"name": "fixed", "args": {"delay": 8}}
rew_delay_spec = None
rew_delay = task.reward_delay_distribution(rew_delay_spec)

In [7]:
# Display the delay distribution built above (the saved run shows no output
# for this cell).
rew_delay

In [8]:
# Apply the reward-delay wrapper using the distribution built earlier.
# In the saved run the env repr is unchanged after this call (no additional
# wrapper layer appears in the output).
env = task.delay_wrapper(env, rew_delay)
env

<EnvMonitorWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<MountainCarEnv<MountainCar-v0>>>>>>

In [9]:
# Feature-transform spec shared by the reward mapper and the feature
# transformer: tile features with tiling_dim=6.
feats_spec = dict(
    name="tiles",
    args=dict(tiling_dim=6),
)

In [10]:
# Apply the reward mapper with the "identity" mapping spec (no args) and the
# shared feature spec, then display the resulting env.
identity_mapping_spec = {"name": "identity", "args": None}
env = task.reward_mapper(
    env, mapping_spec=identity_mapping_spec, feats_spec=feats_spec
)
env

<EnvMonitorWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<MountainCarEnv<MountainCar-v0>>>>>>

In [11]:
# Build the feature transformer for this env; `feats_spec` supplies the
# `name` and `args` keyword arguments via unpacking.
feats_tfx = feats.create_feat_transformer(env=env, **feats_spec)
feats_tfx

<drmdp.feats.TileFeatTransform at 0x323bbb4d0>

## Uniform Random Policy

In [12]:
# Uniform-random policy baseline: run three turns of policy control on the
# same env and print the monitor's running mean return at regular intervals.
lr = task.learning_rate(name="constant", args={"initial_lr": 0.01})
num_episodes = 500
# Report roughly five times per turn. Clamp to 1 so the modulo below cannot
# raise ZeroDivisionError when num_episodes < 5.
report_every = max(1, num_episodes // 5)
for turn in range(3):
    print("Turn:", turn)
    # Clear the monitor at the start of each turn (the saved output shows the
    # mean restarting from the first episode's return on every turn).
    monitor.clear()
    algorithm = task.create_algorithm(
        env=env,
        feats_transform=feats_tfx,
        delay_reward=rew_delay,
        lr=lr,
        gamma=1.0,
        epsilon=0.2,
        policy_type="uniform-random",
        base_seed=0,
    )

    results = task.policy_control(
        env=env,
        algorithm=algorithm,
        num_episodes=num_episodes,
        monitor=monitor,
    )

    # Per-episode returns for this turn, taken from each snapshot.
    returns = []
    for episode, snapshot in enumerate(results):
        returns.append(snapshot.returns)
        if episode % report_every == 0:
            # Mean over the returns recorded by the monitor plus its current
            # `rewards` value.
            mean_returns = np.mean(monitor.returns + [monitor.rewards]).item()
            print(
                f"Episode: {episode}; Steps: {snapshot.steps}, Mean returns: {mean_returns};"
            )
            # print(np.reshape(snapshot.weights, (-1, env.action_space.n)))

Turn: 0
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-16 15:05:36,800 INFO root: Episode 100 mean returns: -2494.940000


Episode: 100; Steps: 2500, Mean returns: -2494.990099009901;


2025-04-16 15:05:47,468 INFO root: Episode 200 mean returns: -2492.395000


Episode: 200; Steps: 2500, Mean returns: -2492.4328358208954;


2025-04-16 15:05:58,111 INFO root: Episode 300 mean returns: -2491.276667


Episode: 300; Steps: 2500, Mean returns: -2491.3056478405315;


2025-04-16 15:06:08,789 INFO root: Episode 400 mean returns: -2491.747500


Episode: 400; Steps: 2500, Mean returns: -2491.7680798004985;


2025-04-16 15:06:19,481 INFO root: Episode 500 mean returns: -2493.074000


Turn: 1
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-16 15:06:30,206 INFO root: Episode 100 mean returns: -2490.670000


Episode: 100; Steps: 2500, Mean returns: -2490.762376237624;


2025-04-16 15:06:40,908 INFO root: Episode 200 mean returns: -2493.910000


Episode: 200; Steps: 2500, Mean returns: -2493.9402985074626;


2025-04-16 15:06:51,590 INFO root: Episode 300 mean returns: -2493.703333


Episode: 300; Steps: 2500, Mean returns: -2493.724252491694;


2025-04-16 15:07:02,305 INFO root: Episode 400 mean returns: -2495.272500


Episode: 400; Steps: 2500, Mean returns: -2495.284289276808;


2025-04-16 15:07:12,965 INFO root: Episode 500 mean returns: -2494.940000


Turn: 2
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-16 15:07:23,777 INFO root: Episode 100 mean returns: -2491.160000


Episode: 100; Steps: 2500, Mean returns: -2491.2475247524753;


2025-04-16 15:07:34,440 INFO root: Episode 200 mean returns: -2493.020000


Episode: 200; Steps: 2500, Mean returns: -2493.054726368159;


2025-04-16 15:07:45,103 INFO root: Episode 300 mean returns: -2493.556667


Episode: 300; Steps: 2500, Mean returns: -2493.578073089701;


2025-04-16 15:07:55,776 INFO root: Episode 400 mean returns: -2494.745000


Episode: 400; Steps: 2500, Mean returns: -2494.7581047381545;


2025-04-16 15:08:06,438 INFO root: Episode 500 mean returns: -2494.544000


## ε-greedy Policy

In [13]:
# Control run with policy_type="markovian" and epsilon=0.2: run three turns of
# policy control on the same env and print the monitor's running mean return
# at regular intervals.
lr = task.learning_rate(name="constant", args={"initial_lr": 0.01})
num_episodes = 500
# Report roughly five times per turn. Clamp to 1 so the modulo below cannot
# raise ZeroDivisionError when num_episodes < 5.
report_every = max(1, num_episodes // 5)
for turn in range(3):
    print("Turn:", turn)
    # Clear the monitor at the start of each turn (the saved output shows the
    # mean restarting from the first episode's return on every turn).
    monitor.clear()
    algorithm = task.create_algorithm(
        env=env,
        feats_transform=feats_tfx,
        delay_reward=rew_delay,
        lr=lr,
        gamma=1.0,
        epsilon=0.2,
        policy_type="markovian",
        base_seed=0,
    )

    results = task.policy_control(
        env=env,
        algorithm=algorithm,
        num_episodes=num_episodes,
        monitor=monitor,
    )

    # Per-episode returns for this turn, taken from each snapshot.
    returns = []
    for episode, snapshot in enumerate(results):
        returns.append(snapshot.returns)
        if episode % report_every == 0:
            # Mean over the returns recorded by the monitor plus its current
            # `rewards` value.
            mean_returns = np.mean(monitor.returns + [monitor.rewards]).item()
            print(
                f"Episode: {episode}; Steps: {snapshot.steps}, Mean returns: {mean_returns};"
            )
            # print(np.reshape(snapshot.weights, (-1, env.action_space.n)))

Turn: 0
Episode: 0; Steps: 2423, Mean returns: -2423.0;


2025-04-16 15:08:08,108 INFO root: Episode 100 mean returns: -368.760000


Episode: 100; Steps: 158, Mean returns: -366.6732673267327;


2025-04-16 15:08:08,894 INFO root: Episode 200 mean returns: -271.230000


Episode: 200; Steps: 153, Mean returns: -270.64179104477614;


2025-04-16 15:08:09,619 INFO root: Episode 300 mean returns: -234.186667


Episode: 300; Steps: 148, Mean returns: -233.90033222591362;


2025-04-16 15:08:10,345 INFO root: Episode 400 mean returns: -215.755000


Episode: 400; Steps: 150, Mean returns: -215.59102244389027;


2025-04-16 15:08:11,052 INFO root: Episode 500 mean returns: -203.706000


Turn: 1
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-16 15:08:12,632 INFO root: Episode 100 mean returns: -351.270000


Episode: 100; Steps: 161, Mean returns: -349.38613861386136;


2025-04-16 15:08:13,392 INFO root: Episode 200 mean returns: -259.625000


Episode: 200; Steps: 155, Mean returns: -259.1044776119403;


2025-04-16 15:08:14,118 INFO root: Episode 300 mean returns: -226.626667


Episode: 300; Steps: 150, Mean returns: -226.37209302325581;


2025-04-16 15:08:14,834 INFO root: Episode 400 mean returns: -209.630000


Episode: 400; Steps: 148, Mean returns: -209.47630922693267;


2025-04-16 15:08:15,518 INFO root: Episode 500 mean returns: -197.960000


Turn: 2
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-16 15:08:17,165 INFO root: Episode 100 mean returns: -363.920000


Episode: 100; Steps: 159, Mean returns: -361.8910891089109;


2025-04-16 15:08:17,926 INFO root: Episode 200 mean returns: -265.705000


Episode: 200; Steps: 157, Mean returns: -265.1641791044776;


2025-04-16 15:08:18,647 INFO root: Episode 300 mean returns: -230.056667


Episode: 300; Steps: 160, Mean returns: -229.82392026578074;


2025-04-16 15:08:19,355 INFO root: Episode 400 mean returns: -211.445000


Episode: 400; Steps: 121, Mean returns: -211.21945137157107;


2025-04-16 15:08:20,042 INFO root: Episode 500 mean returns: -199.468000
