In [1]:
import copy

In [2]:
from gym_electric_motor import reward_functions
import pandas as pd
import numpy as np

In [3]:
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt

In [4]:
from drmdp import feats, task
from drmdp.envs import gympg

## Control: Current Implementation

In [5]:
# Step-budget constant.
# NOTE(review): not referenced by any cell shown here; the environment below is
# created with max_episode_steps=2500 instead — confirm which value is intended.
MAX_STEPS = 1000

In [6]:
# Build the MountainCar-v0 environment with max_episode_steps=2500; the run logs
# further down report "Steps: 2500" for episodes that run to this limit.
env = gympg.make("MountainCar-v0", max_episode_steps=2500)

In [7]:
# Wrap the environment for monitoring; returns the wrapped env plus the monitor
# object that later cells read (`monitor.returns`, `monitor.rewards`) and reset
# (`monitor.clear()`).
# NOTE(review): `env` is rebound to its own wrapper, so re-running this cell
# wraps the environment a second time.
env, monitor = task.monitor_wrapper(env)
env, monitor

(<EnvMonitorWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<MountainCarEnv<MountainCar-v0>>>>>>,
 <drmdp.core.EnvMonitor at 0x31fe9a350>)

In [8]:
# Reward-delay distribution for this control run: built from None rather than
# from a delay spec. The spec tried previously is kept here for reference:
#   {"name": "fixed", "args": {"delay": 8}}
rew_delay = task.reward_delay_distribution(None)

In [9]:
# Inspect the delay distribution. No output is rendered for this cell, which is
# consistent with the value being None.
rew_delay

In [10]:
# Apply the reward-delay wrapper using the distribution built above. With this
# `rew_delay`, the displayed wrapper chain is unchanged (EnvMonitorWrapper is
# still outermost).
env = task.delay_wrapper(env, rew_delay)
env

<EnvMonitorWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<MountainCarEnv<MountainCar-v0>>>>>>

In [11]:
# Feature spec ("tiles", tiling_dim=6) shared by the reward mapper and the
# feature transformer created in the following cells.
feats_spec = {"name": "tiles", "args": {"tiling_dim": 6}}

In [12]:
# Apply the reward mapper with the "identity" mapping spec and the shared
# feature spec. The displayed wrapper chain is unchanged after this call.
env = task.reward_mapper(
    env,
    mapping_spec={"name": "identity", "args": None},
    feats_spec=feats_spec,
)
env

<EnvMonitorWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<MountainCarEnv<MountainCar-v0>>>>>>

In [13]:
# Build the feature transformer from the shared spec; `feats_spec` is unpacked
# into the `name` and `args` keyword arguments. The repr below shows a
# TileFeatTransform instance.
feats_tfx = feats.create_feat_transformer(env=env, **feats_spec)
feats_tfx

<drmdp.feats.TileFeatTransform at 0x3279977d0>

## Uniform Random Policy

In [14]:
# Control run with the uniform-random policy: repeat the same configuration for
# three "turns" and print progress during each one.
lr = task.learning_rate(name="constant", args={"initial_lr": 0.01})
num_episodes = 500
# Report five times per turn. Clamp to 1 so that num_episodes < 5 does not make
# the modulo below divide by zero.
report_every = max(1, num_episodes // 5)

for turn in range(3):
    print("Turn:", turn)
    # Clear the monitor before each turn; the mean printed at episode 0 restarts
    # for every turn.
    monitor.clear()
    # NOTE(review): every turn is created with base_seed=0 — confirm whether the
    # turns are meant to be seeded differently.
    algorithm = task.create_algorithm(
        env=env,
        feats_transform=feats_tfx,
        delay_reward=rew_delay,
        lr=lr,
        gamma=1.0,
        epsilon=0.2,
        policy_type="uniform-random",
        base_seed=0,
    )

    results = task.policy_control(
        env=env,
        algorithm=algorithm,
        num_episodes=num_episodes,
        monitor=monitor,
    )

    # Per-episode returns for this turn. The name is rebound every turn, so only
    # the last turn's list is left after the loop.
    returns = []
    for episode, snapshot in enumerate(results):
        returns.append(snapshot.returns)
        if episode % report_every == 0:
            # Mean over the monitor's recorded returns plus `monitor.rewards`
            # (presumably the reward accumulated so far in the current
            # episode — confirm against EnvMonitor).
            mean_returns = np.mean(monitor.returns + [monitor.rewards]).item()
            print(
                f"Episode: {episode}; Steps: {snapshot.steps}, Mean returns: {mean_returns};"
            )

Turn: 0
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-08 16:17:58,247 INFO root: Episode 100 mean returns: -2496.810000


Episode: 100; Steps: 2500, Mean returns: -2496.841584158416;


2025-04-08 16:18:09,061 INFO root: Episode 200 mean returns: -2493.880000


Episode: 200; Steps: 2500, Mean returns: -2493.910447761194;


2025-04-08 16:18:19,846 INFO root: Episode 300 mean returns: -2490.700000


Episode: 300; Steps: 2500, Mean returns: -2490.7308970099666;


2025-04-08 16:18:30,504 INFO root: Episode 400 mean returns: -2488.702500


Episode: 400; Steps: 2500, Mean returns: -2488.7306733167084;


2025-04-08 16:18:41,187 INFO root: Episode 500 mean returns: -2487.132000


Turn: 1
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-08 16:18:51,949 INFO root: Episode 100 mean returns: -2495.600000


Episode: 100; Steps: 2500, Mean returns: -2495.6435643564355;


2025-04-08 16:19:02,692 INFO root: Episode 200 mean returns: -2497.800000


Episode: 200; Steps: 2500, Mean returns: -2497.8109452736317;


2025-04-08 16:19:13,266 INFO root: Episode 300 mean returns: -2488.063333


Episode: 300; Steps: 2500, Mean returns: -2488.102990033223;


2025-04-08 16:19:23,929 INFO root: Episode 400 mean returns: -2489.780000


Episode: 400; Steps: 2500, Mean returns: -2489.805486284289;


2025-04-08 16:19:34,644 INFO root: Episode 500 mean returns: -2489.776000


Turn: 2
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-08 16:19:45,363 INFO root: Episode 100 mean returns: -2488.520000


Episode: 100; Steps: 2500, Mean returns: -2488.633663366337;


2025-04-08 16:19:56,073 INFO root: Episode 200 mean returns: -2493.680000


Episode: 200; Steps: 2500, Mean returns: -2493.7114427860697;


2025-04-08 16:20:07,042 INFO root: Episode 300 mean returns: -2491.676667


Episode: 300; Steps: 2500, Mean returns: -2491.704318936877;


2025-04-08 16:20:17,853 INFO root: Episode 400 mean returns: -2491.567500


Episode: 400; Steps: 2500, Mean returns: -2491.588528678304;


2025-04-08 16:20:28,431 INFO root: Episode 500 mean returns: -2489.636000


## Epsilon-greedy Policy

In [15]:
# Control run with policy_type="markovian" and epsilon=0.2: repeat the same
# configuration for three "turns" and print progress during each one.
lr = task.learning_rate(name="constant", args={"initial_lr": 0.01})
num_episodes = 500
# Report five times per turn. Clamp to 1 so that num_episodes < 5 does not make
# the modulo below divide by zero.
report_every = max(1, num_episodes // 5)

for turn in range(3):
    print("Turn:", turn)
    # Clear the monitor before each turn; the mean printed at episode 0 restarts
    # for every turn.
    monitor.clear()
    # NOTE(review): every turn is created with base_seed=0 — confirm whether the
    # turns are meant to be seeded differently.
    algorithm = task.create_algorithm(
        env=env,
        feats_transform=feats_tfx,
        delay_reward=rew_delay,
        lr=lr,
        gamma=1.0,
        epsilon=0.2,
        policy_type="markovian",
        base_seed=0,
    )

    results = task.policy_control(
        env=env,
        algorithm=algorithm,
        num_episodes=num_episodes,
        monitor=monitor,
    )

    # Per-episode returns for this turn. The name is rebound every turn, so only
    # the last turn's list is left after the loop.
    returns = []
    for episode, snapshot in enumerate(results):
        returns.append(snapshot.returns)
        if episode % report_every == 0:
            # Mean over the monitor's recorded returns plus `monitor.rewards`
            # (presumably the reward accumulated so far in the current
            # episode — confirm against EnvMonitor).
            mean_returns = np.mean(monitor.returns + [monitor.rewards]).item()
            print(
                f"Episode: {episode}; Steps: {snapshot.steps}, Mean returns: {mean_returns};"
            )

Turn: 0
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-08 16:20:30,045 INFO root: Episode 100 mean returns: -358.030000


Episode: 100; Steps: 156, Mean returns: -356.029702970297;


2025-04-08 16:20:30,788 INFO root: Episode 200 mean returns: -261.930000


Episode: 200; Steps: 164, Mean returns: -261.44278606965173;


2025-04-08 16:20:31,502 INFO root: Episode 300 mean returns: -227.413333


Episode: 300; Steps: 147, Mean returns: -227.14617940199335;


2025-04-08 16:20:32,175 INFO root: Episode 400 mean returns: -208.107500


Episode: 400; Steps: 152, Mean returns: -207.96758104738154;


2025-04-08 16:20:32,867 INFO root: Episode 500 mean returns: -197.540000


Turn: 1
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-08 16:20:34,439 INFO root: Episode 100 mean returns: -352.970000


Episode: 100; Steps: 162, Mean returns: -351.0792079207921;


2025-04-08 16:20:35,196 INFO root: Episode 200 mean returns: -261.295000


Episode: 200; Steps: 149, Mean returns: -260.7363184079602;


2025-04-08 16:20:35,895 INFO root: Episode 300 mean returns: -226.596667


Episode: 300; Steps: 146, Mean returns: -226.32890365448506;


2025-04-08 16:20:36,601 INFO root: Episode 400 mean returns: -209.125000


Episode: 400; Steps: 154, Mean returns: -208.98753117206982;


2025-04-08 16:20:37,278 INFO root: Episode 500 mean returns: -197.486000


Turn: 2
Episode: 0; Steps: 2500, Mean returns: -2500.0;


2025-04-08 16:20:38,851 INFO root: Episode 100 mean returns: -353.560000


Episode: 100; Steps: 158, Mean returns: -351.6237623762376;


2025-04-08 16:20:39,582 INFO root: Episode 200 mean returns: -258.970000


Episode: 200; Steps: 160, Mean returns: -258.4776119402985;


2025-04-08 16:20:40,291 INFO root: Episode 300 mean returns: -225.640000


Episode: 300; Steps: 146, Mean returns: -225.37541528239203;


2025-04-08 16:20:40,966 INFO root: Episode 400 mean returns: -207.080000


Episode: 400; Steps: 148, Mean returns: -206.93266832917706;


2025-04-08 16:20:41,637 INFO root: Episode 500 mean returns: -195.652000
