In [2]:
#Basic information about the game
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class GridWorldEnv(gym.Env):
    """2D网格世界环境，代理需导航至随机目标位置"""
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}

    def __init__(self, size=5, render_mode=None):
        """
        参数:
            size (int): 网格边长（生成size x size的网格）
            render_mode (str): 渲染模式，可选 "human" 或 "rgb_array"
        """
        self.size = size
        self.window_size = 512  # 渲染窗口尺寸（像素）

        # 定义动作空间：4个离散动作 [右, 上, 左, 下]
        self.action_space = spaces.Discrete(4)

        # 定义观测空间：代理位置（x,y） + 目标位置（x,y）
        self.observation_space = spaces.Dict({
            "agent": spaces.Box(0, size-1, shape=(2,), dtype=int),
            "target": spaces.Box(0, size-1, shape=(2,), dtype=int)
        })

        # 动作到方向向量的映射 [Δx, Δy]
        self._action_to_direction = {
            0: np.array([1, 0]),   # 右
            1: np.array([0, 1]),   # 上
            2: np.array([-1, 0]),  # 左
            3: np.array([0, -1]),  # 下
        }

        self.render_mode = render_mode
        self.window = None  # 渲染窗口（PyGame）

    def _get_obs(self):
        """返回当前观察值"""
        return {"agent": self._agent_location, "target": self._target_location}

    def _get_info(self):
        """返回调试信息（可选）"""
        return {
            "distance": np.linalg.norm(
                self._agent_location - self._target_location, ord=1
            )
        }

    def reset(self, seed=None, options=None):
        """重置环境，随机生成代理和目标位置"""
        super().reset(seed=seed)

        # 随机初始化代理位置（允许任何位置）
        self._agent_location = self.np_random.integers(0, self.size, size=2, dtype=int)

        # 随机生成目标位置，确保不与代理初始位置重合
        self._target_location = self._agent_location
        while np.array_equal(self._target_location, self._agent_location):
            self._target_location = self.np_random.integers(
                0, self.size, size=2, dtype=int
            )

        # 渲染初始化（若需要）
        if self.render_mode == "human":
            self._render_frame()

        return self._get_obs(), self._get_info()

    def step(self, action):
        """执行一个动作"""
        direction = self._action_to_direction[action]

        # 计算新位置（不越界）
        self._agent_location = np.clip(
            self._agent_location + direction, 0, self.size - 1
        )

        # 判断是否到达目标
        terminated = np.array_equal(self._agent_location, self._target_location)
        reward = 1 if terminated else 0  # 到达目标奖励1，否则0
        truncated = False  # 此处不设置步数限制

        # 渲染更新
        if self.render_mode == "human":
            self._render_frame()

        return self._get_obs(), reward, terminated, truncated, self._get_info()

    def render(self):
        """渲染环境（基于PyGame）"""
        if self.render_mode == "rgb_array":
            return self._render_frame()

    def _render_frame(self):
        """内部渲染逻辑（PyGame实现）"""
        if self.window is None and self.render_mode == "human":
            import pygame
            pygame.init()
            self.window = pygame.display.set_mode((self.window_size, self.window_size))

        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((255, 255, 255))
        pix_square_size = self.window_size / self.size  # 每个网格的像素尺寸

        # 绘制目标（红色）
        pygame.draw.rect(
            canvas,
            (255, 0, 0),
            pygame.Rect(
                pix_square_size * self._target_location,
                (pix_square_size, pix_square_size),
            ),
        )

        # 绘制代理（蓝色）
        pygame.draw.circle(
            canvas,
            (0, 0, 255),
            (self._agent_location + 0.5) * pix_square_size,
            pix_square_size / 3,
        )

        # 绘制网格线
        for x in range(self.size + 1):
            pygame.draw.line(
                canvas,
                0,
                (0, pix_square_size * x),
                (self.window_size, pix_square_size * x),
                width=3,
            )
            pygame.draw.line(
                canvas,
                0,
                (pix_square_size * x, 0),
                (pix_square_size * x, self.window_size),
                width=3,
            )

        if self.render_mode == "human":
            self.window.blit(canvas, canvas.get_rect())
            pygame.event.pump()
            pygame.display.update()
        else:  # rgb_array
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
            )

    def close(self):
        """关闭渲染窗口"""
        if self.window is not None:
            import pygame
            pygame.display.quit()
            pygame.quit()

In [3]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pygame  # 确保已安装pygame（pip install pygame）

class GridWorldEnv(gym.Env):
    """2D网格世界环境，代理需导航至随机目标位置"""
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}

    def __init__(self, size=5, render_mode=None):
        self.size = size  # 网格尺寸（size x size）
        self.window_size = 512  # 渲染窗口大小

        # 动作空间：4个方向 [右, 上, 左, 下]
        self.action_space = spaces.Discrete(4)

        # 观测空间：代理位置 + 目标位置
        self.observation_space = spaces.Dict({
            "agent": spaces.Box(0, size-1, shape=(2,), dtype=int),
            "target": spaces.Box(0, size-1, shape=(2,), dtype=int)
        })

        # 动作映射 [Δx, Δy]
        self._action_to_direction = {
            0: np.array([1, 0]),  # 右
            1: np.array([0, 1]),  # 上
            2: np.array([-1, 0]), # 左
            3: np.array([0, -1]), # 下
        }

        self.render_mode = render_mode
        self.window = None

    def _get_obs(self):
        return {"agent": self._agent_location, "target": self._target_location}

    def _get_info(self):
        return {"distance": np.linalg.norm(
            self._agent_location - self._target_location, ord=1
        )}

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)

        # 随机初始化代理位置
        self._agent_location = self.np_random.integers(0, self.size, size=2, dtype=int)

        # 生成不与代理重合的目标位置
        self._target_location = self._agent_location
        while np.array_equal(self._target_location, self._agent_location):
            self._target_location = self.np_random.integers(0, self.size, size=2, dtype=int)

        if self.render_mode == "human":
            self._render_frame()

        return self._get_obs(), self._get_info()

    def step(self, action):
        direction = self._action_to_direction[action]
        self._agent_location = np.clip(
            self._agent_location + direction, 0, self.size - 1
        )

        terminated = np.array_equal(self._agent_location, self._target_location)
        reward = 1 if terminated else 0
        truncated = False

        if self.render_mode == "human":
            self._render_frame()

        return self._get_obs(), reward, terminated, truncated, self._get_info()

    def render(self):
        if self.render_mode == "rgb_array":
            return self._render_frame()

    def _render_frame(self):
        if self.window is None and self.render_mode == "human":
            pygame.init()
            self.window = pygame.display.set_mode((self.window_size, self.window_size))

        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((255, 255, 255))
        pix_square_size = self.window_size / self.size

        # 绘制目标（红色方块）
        pygame.draw.rect(
            canvas,
            (255, 0, 0),
            pygame.Rect(
                pix_square_size * self._target_location,
                (pix_square_size, pix_square_size),
            ),
        )

        # 绘制代理（蓝色圆形）
        pygame.draw.circle(
            canvas,
            (0, 0, 255),
            (self._agent_location + 0.5) * pix_square_size,
            pix_square_size / 3,
        )

        # 绘制网格线
        for x in range(self.size + 1):
            pygame.draw.line(
                canvas,
                0,
                (0, pix_square_size * x),
                (self.window_size, pix_square_size * x),
                width=3,
            )
            pygame.draw.line(
                canvas,
                0,
                (pix_square_size * x, 0),
                (pix_square_size * x, self.window_size),
                width=3,
            )

        if self.render_mode == "human":
            self.window.blit(canvas, canvas.get_rect())
            pygame.event.pump()
            pygame.display.update()
        else:
            return np.transpose(np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2))

    def close(self):
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()

# ------------------- 测试代码 -------------------
if __name__ == "__main__":
    # 创建可视化环境
    env = GridWorldEnv(size=5, render_mode="human")

    # 运行10步随机动作测试
    obs, info = env.reset()
    for _ in range(10):
        action = env.action_space.sample()  # 随机动作
        obs, reward, terminated, truncated, info = env.step(action)
        print(f"位置: {obs['agent']}, 奖励: {reward}, 结束: {terminated}")
        if terminated:
            obs, info = env.reset()

    env.close()

位置: [2 2], 奖励: 0, 结束: False
位置: [2 3], 奖励: 0, 结束: False
位置: [3 3], 奖励: 0, 结束: False
位置: [3 2], 奖励: 0, 结束: False
位置: [3 3], 奖励: 0, 结束: False
位置: [3 2], 奖励: 0, 结束: False
位置: [4 2], 奖励: 0, 结束: False
位置: [4 3], 奖励: 0, 结束: False
位置: [4 4], 奖励: 0, 结束: False
位置: [4 3], 奖励: 0, 结束: False


In [4]:
from typing import Optional
import numpy as np
import gymnasium as gym


class GridWorldEnv(gym.Env):

    def __init__(self, size: int = 5):
        # The size of the square grid
        self.size = size

        # Define the agent and target location; randomly chosen in `reset` and updated in `step`
        self._agent_location = np.array([-1, -1], dtype=np.int32)
        self._target_location = np.array([-1, -1], dtype=np.int32)

        # Observations are dictionaries with the agent's and the target's location.
        # Each location is encoded as an element of {0, ..., `size`-1}^2
        self.observation_space = gym.spaces.Dict(
            {
                "agent": gym.spaces.Box(0, size - 1, shape=(2,), dtype=int),
                "target": gym.spaces.Box(0, size - 1, shape=(2,), dtype=int),
            }
        )

        # We have 4 actions, corresponding to "right", "up", "left", "down"
        self.action_space = gym.spaces.Discrete(4)
        # Dictionary maps the abstract actions to the directions on the grid
        self._action_to_direction = {
            0: np.array([1, 0]),  # right
            1: np.array([0, 1]),  # up
            2: np.array([-1, 0]),  # left
            3: np.array([0, -1]),  # down
        }

In [None]:
#Constructing Observations
import numpy as np
import matplotlib.pyplot as plt
from grid_world_env import GridWorldEnv  # 假设环境代码已保存为 grid_world_env.py

# ====================== 1. 定义 Q-learning 智能体 ======================
class QLearningAgent:
    def __init__(self, env, learning_rate=0.1, gamma=0.95, epsilon=1.0, epsilon_decay=0.9995, min_epsilon=0.01):
        self.env = env
        self.lr = learning_rate    # 学习率
        self.gamma = gamma        # 折扣因子
        self.epsilon = epsilon    # 初始探索率
        self.epsilon_decay = epsilon_decay  # 探索率衰减系数
        self.min_epsilon = min_epsilon      # 最小探索率

        # 初始化 Q-table：字典键为观测状态元组 (agent_x, agent_y, target_x, target_y)
        self.q_table = {}

    def _get_state_key(self, obs):
        """将观测字典转换为Q-table的键（元组）"""
        agent = tuple(obs["agent"])
        target = tuple(obs["target"])
        return (agent + target)  # 例如：(1, 2, 3, 4)

    def get_action(self, obs):
        """ε-greedy策略选择动作"""
        state_key = self._get_state_key(obs)

        # 初始化未知状态的Q值
        if state_key not in self.q_table:
            self.q_table[state_key] = np.zeros(self.env.action_space.n)

        # 探索：随机选择动作
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        # 利用：选择Q值最高的动作
        else:
            return np.argmax(self.q_table[state_key])

    def update(self, obs, action, reward, next_obs):
        """Q-learning更新规则"""
        state_key = self._get_state_key(obs)
        next_state_key = self._get_state_key(next_obs)

        # 初始化下一状态的Q值（如果未知）
        if next_state_key not in self.q_table:
            self.q_table[next_state_key] = np.zeros(self.env.action_space.n)

        # 计算TD目标
        current_q = self.q_table[state_key][action]
        max_next_q = np.max(self.q_table[next_state_key])
        td_target = reward + self.gamma * max_next_q

        # 更新Q值
        self.q_table[state_key][action] += self.lr * (td_target - current_q)

        # 衰减探索率
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

# ====================== 2. 训练参数设置 ======================
TRAIN_EPISODES = 2000   # 训练总回合数
TEST_EPISODES = 100     # 测试回合数
SHOW_EVERY = 500        # 每N回合渲染一次演示

# 初始化环境和智能体
env = GridWorldEnv(size=5)
agent = QLearningAgent(env)

# ====================== 3. 训练循环 ======================
train_rewards = []
for episode in range(TRAIN_EPISODES):
    obs, info = env.reset()
    total_reward = 0
    done = False

    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # 更新智能体
        agent.update(obs, action, reward, next_obs)

        total_reward += reward
        done = terminated or truncated
        obs = next_obs

    train_rewards.append(total_reward)

    # 定期展示学习效果
    if episode % SHOW_EVERY == 0:
        print(f"Episode: {episode}, Epsilon: {agent.epsilon:.2f}, Avg Reward (last 100): {np.mean(train_rewards[-100:]):.2f}")

        # 渲染演示
        demo_env = GridWorldEnv(size=5, render_mode="human")
        demo_obs, _ = demo_env.reset()
        demo_done = False
        while not demo_done:
            action = agent.get_action(demo_obs)
            demo_obs, _, demo_done, _, _ = demo_env.step(action)
        demo_env.close()

# ====================== 4. 评估训练结果 ======================
# 禁用探索
agent.epsilon = 0.0

test_rewards = []
for _ in range(TEST_EPISODES):
    obs, _ = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.get_action(obs)
        obs, reward, done, _, _ = env.step(action)
        episode_reward += reward

    test_rewards.append(episode_reward)

print(f"\n测试结果 ({TEST_EPISODES} 回合):")
print(f"平均奖励: {np.mean(test_rewards):.2f}")
print(f"达成目标比例: {sum(test_rewards)/TEST_EPISODES * 100:.1f}%")

# ====================== 5. 可视化训练曲线 ======================
plt.figure(figsize=(12, 5))

# 滑动平均奖励（窗口=100）
window_size = 100
smoothed_rewards = [np.mean(train_rewards[i-window_size:i]) for i in range(window_size, len(train_rewards))]

plt.plot(range(window_size, len(train_rewards)), smoothed_rewards, label="滑动平均奖励 (窗口=100)")
plt.xlabel("训练回合")
plt.ylabel("奖励")
plt.title("训练过程表现")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

Episode: 0, Epsilon: 0.99, Avg Reward (last 100): 1.00


In [10]:
#Reset function
# 创建环境
env = GridWorldEnv(size=5)

# 重置环境（设定种子）
obs, info = env.reset(seed=42)
print("初始观测:", obs)  # 例如: {'agent': array([3, 2]), 'target': array([1, 4])}
print("附加信息:", info) # 例如: {'distance': 5.0}

# 再次重置（相同种子，结果一致）
obs, info = env.reset(seed=42)
print("相同种子观测:", obs)  # 与上次相同

初始观测: {'agent': array([0, 3]), 'target': array([3, 2])}
附加信息: {'distance': np.float64(4.0)}
相同种子观测: {'agent': array([0, 3]), 'target': array([3, 2])}


In [17]:
#Step function
# 创建环境
env = GridWorldEnv(size=5)

# 重置环境
obs, info = env.reset(seed=42)
print("初始位置:", obs["agent"])  # 例如: [3, 2]

# 执行动作（向右移动）
action = 0
next_obs, reward, terminated, truncated, info = env.step(action)
print("新位置:", next_obs["agent"])  # 例如: [4, 2]
print("奖励:", reward)              # 0（未到达目标）
print("是否终止:", terminated)       # False

初始位置: [0 3]
新位置: [1 3]
奖励: 0
是否终止: False


In [23]:
#Registering and making the environment
# main.py
import gymnasium as gym

# 创建环境实例（自动从注册表加载）
env = gym.make(
    "GridWorld-v0",
    render_mode="human",  # 开启可视化
    size=10               # 覆盖默认参数为10x10网格
)

# 运行测试
obs, info = env.reset()
print("初始观测:", obs)
print("初始距离:", info["distance"])

for step in range(20):
    action = env.action_space.sample()  # 随机策略
    obs, reward, terminated, truncated, info = env.step(action)

    print(f"\nStep {step + 1}")
    print("动作:", ["右", "上", "左", "下"][action])
    print("新位置:", obs["agent"])
    print("奖励:", reward)
    print("距离:", info["distance"])

    if terminated or truncated:
        print("环境终止，重置...")
        obs, info = env.reset()

env.close()

NameNotFound: Environment `GridWorld` doesn't exist.

In [1]:
#Using Wrappers
from gymnasium.wrappers import FlattenObservation

env = gym.make('gymnasium_env/GridWorld-v0')
env.observation_space

env.reset()

wrapped_env = FlattenObservation(env)
wrapped_env.observation_space

wrapped_env.reset()


NameError: name 'gym' is not defined