In [1]:
import gym
import numpy as np

In [2]:
# Create the environment with an on-screen ("human") render mode and
# report its observation/action spaces.
env = gym.make('MountainCar-v0', render_mode="human")
observation_space = env.observation_space
action_space = env.action_space
print('观测空间 O: {}'.format(observation_space))
print('动作空间 A: {}'.format(action_space))
# Per-dimension lower and upper limits of the observation vector.
print('观测范围 : {} ~ {}'.format(observation_space.low, observation_space.high))
# Number of discrete actions available to the agent.
print('动作数 n: {}'.format(action_space.n))

观测空间 O: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
动作空间 A: Discrete(3)
观测范围 : [-1.2  -0.07] ~ [0.6  0.07]
动作数 n: 3


In [3]:
class BespokeAgent:
    """Hand-crafted, non-learning policy driven by a velocity band.

    For a given position the policy computes a lower and an upper velocity
    limit. It returns action 2 when the current velocity lies strictly
    between them and action 0 otherwise.
    NOTE(review): in MountainCar-v0 these are presumably "push right" and
    "push left" -- confirm against the environment documentation.
    """

    def __init__(self, env):
        """Accept the environment for interface compatibility; nothing is stored."""

    def decide(self, observation):
        """Pick an action for a ``(position, velocity)`` observation.

        Returns:
            2 if the velocity is strictly inside the position-dependent
            band, otherwise 0.
        """
        position, velocity = observation
        # The lower limit is the smaller of two polynomial curves in position.
        quadratic_floor = -0.09 * (position + 0.25) ** 2 + 0.03
        quartic_floor = 0.3 * (position + 0.9) ** 4 - 0.008
        floor = min(quadratic_floor, quartic_floor)
        ceiling = -0.07 * (position + 0.38) ** 2 + 0.07
        return 2 if floor < velocity < ceiling else 0

    def learn(self, *args):
        """Do nothing: the policy is fixed, so any arguments are ignored."""

# Instantiate the hand-crafted agent; its constructor accepts `env` but does not use it.
agent = BespokeAgent(env)

In [4]:
def play_montecarlo(env, agent, render=False, train=False, seed=None):
    """Run one full episode and return the summed reward.

    Args:
        env: Environment whose ``reset(seed=...)`` returns ``(observation, info)``
            and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        agent: Object exposing ``decide(observation)`` and, when ``train`` is
            set, ``learn(observation, action, reward, terminated, truncated)``.
        render: If true, ``env.render()`` is called before every step.
        train: If true, ``agent.learn`` is called after every step.
        seed: Forwarded unchanged to ``env.reset``.

    Returns:
        The sum of all step rewards, starting from ``0.0``.
    """
    # Reset the environment to begin a new episode.
    observation, _ = env.reset(seed=seed)
    total_reward = 0.0
    finished = False
    while not finished:
        if render:
            env.render()
        action = agent.decide(observation)
        next_observation, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        if train:
            # The agent is given the observation the action was chosen from.
            agent.learn(observation, action, reward, terminated, truncated)
        # The episode ends on either a terminal state or a truncation.
        finished = terminated or truncated
        observation = next_observation
    return total_reward

In [5]:
# Play one rendered episode with a fixed reset seed. The seed is a constant
# (not derived from the clock), so reusing the same number reproduces the run.
try:
    episode_reward = play_montecarlo(env, agent, render=True, seed=0)
    print('回合奖励: {}'.format(episode_reward))
finally:
    # Always release the environment (and its render window), even if the
    # episode is interrupted or raises part-way through.
    env.close()

回合奖励: -123.0


In [6]:
# Number of episodes to average over.
N_EPISODES = 100

# Fresh environment created without a render_mode for the batch evaluation.
env = gym.make('MountainCar-v0')
try:
    # Play N_EPISODES episodes and collect each episode's total reward.
    episode_rewards = [play_montecarlo(env, agent) for _ in range(N_EPISODES)]
finally:
    # Release the environment once the evaluation is done (or has failed).
    env.close()
print(f'平均回合奖励 {np.mean(episode_rewards):.2f}')

平均回合奖励 -105.82
