### 创建交互环境

首先需要创建一个网格环境，其中包含障碍物、起点和终点，每个元素都有不同的颜色表示。这个环境可以用一个二维数组来表示，其中不同的数字代表不同类型的格子。

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors

class GridEnvironment:
    """Square grid world with obstacles, a start cell and a goal cell.

    Cells are addressed as ``(row, col)`` tuples. ``self.grid`` holds one
    number per cell: 0 for an empty cell, -1 for an impassable obstacle,
    -2 for a negative-reward obstacle and 1 for the goal.
    """

    def __init__(self, grid_size=16, impassable_obstacles=None, negative_obstacles=None, start=(0, 0), goal=(15, 15)):
        """Build the grid and mark obstacles and the goal on it.

        Args:
            grid_size: Number of rows and of columns.
            impassable_obstacles: Iterable of ``(row, col)`` cells that
                cannot be entered; ``None`` means no such cells.
            negative_obstacles: Iterable of ``(row, col)`` cells that can be
                entered but yield a reward of -5; ``None`` means none.
            start: ``(row, col)`` cell the agent starts from.
            goal: ``(row, col)`` cell that ends the episode.
        """
        self.grid_size = grid_size
        self.impassable_obstacles = impassable_obstacles if impassable_obstacles else []
        self.negative_obstacles = negative_obstacles if negative_obstacles else []
        self.start = start
        self.goal = goal
        self.state = start
        self.grid = np.zeros((grid_size, grid_size))

        for obstacle in self.impassable_obstacles:
            self.grid[obstacle] = -1  # Impassable obstacle
        for obstacle in self.negative_obstacles:
            self.grid[obstacle] = -2  # Negative-reward obstacle
        # Written last, so the goal wins if it is also listed as an obstacle.
        self.grid[goal] = 1

    def _draw(self, grid, cell_colors, bounds):
        """Show ``grid`` as a colour-coded image with 'S' and 'G' labels.

        ``cell_colors[i]`` is used for values between ``bounds[i]`` and
        ``bounds[i + 1]``.
        """
        cmap = colors.ListedColormap(cell_colors)
        norm = colors.BoundaryNorm(bounds, cmap.N)

        _, ax = plt.subplots()
        ax.imshow(grid, cmap=cmap, norm=norm)
        ax.grid(which='major', axis='both', linestyle='-', color='k', linewidth=2)
        ax.set_xticks(np.arange(-.5, self.grid_size, 1))
        ax.set_yticks(np.arange(-.5, self.grid_size, 1))

        # imshow puts the column index on the x axis and the row index on the
        # y axis, so a (row, col) cell is labelled at x=col, y=row.
        for (row, col), label in ((self.start, 'S'), (self.goal, 'G')):
            ax.text(col, row, label, va='center', ha='center', color='black', fontsize=12)

        plt.show()

    def render(self):
        """Display the grid: -2 cells white, -1 green, empty red, goal blue."""
        self._draw(self.grid, ['white', 'green', 'red', 'blue'], [-2, -1, 0, 1, 2])

    def step(self, action):
        """Apply one move and return ``(next_state, reward, done)``.

        Args:
            action: Index into the move table: 0 is row - 1, 1 is row + 1,
                2 is col - 1 and 3 is col + 1.

        Returns:
            tuple: the resulting ``(row, col)`` state, the reward (-1 by
            default, -5 on a negative-reward obstacle, 100 on the goal) and
            a flag that is True only when the goal was reached.
        """
        action_mappings = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        d_row, d_col = action_mappings[action]
        row, col = self.state
        next_state = (row + d_row, col + d_col)

        # The bounds checks run first so the grid is never indexed with an
        # out-of-range (or negative, wrapping) coordinate.
        if (0 <= next_state[0] < self.grid_size and
                0 <= next_state[1] < self.grid_size and
                self.grid[next_state] != -1):
            self.state = next_state
        else:
            next_state = self.state  # Invalid move: the agent stays put

        if self.grid[next_state] == -2:
            return next_state, -5, False
        if next_state == self.goal:
            return next_state, 100, True  # Episode ends
        return next_state, -1, False

    def render_path(self, path):
        """Display the grid with the visited empty cells highlighted.

        Path cells are drawn yellow; every other cell type keeps the colour
        used by ``render``.

        Args:
            path: Iterable of ``(row, col)`` cells visited by the agent.
        """
        path_grid = self.grid.copy()
        for cell in path:
            # Only empty cells are recoloured; start, goal and obstacles
            # keep their own value.
            if cell != self.start and cell != self.goal and path_grid[cell] == 0:
                path_grid[cell] = 0.5

        self._draw(
            path_grid,
            ['white', 'green', 'red', 'yellow', 'blue'],
            [-2, -1, 0, 0.5, 1, 2],
        )

# Demo: build an environment with both obstacle kinds and draw it.
env = GridEnvironment(
    impassable_obstacles=[
        (12, 2), (3, 13), (4, 4), (8, 7), (6, 6), (6, 7),
    ],
    negative_obstacles=[
        (8, 2), (2, 11), (3, 14), (7, 7), (7, 8), (8, 8),
        (8, 9), (8, 10), (8, 13), (7, 13), (7, 14),
    ],
)
env.render()


### 构建 Q-learning 模型

Q-learning 是一种无模型的强化学习算法，它使用一个表格（Q-table）来存储在给定状态下采取不同动作的预期收益。我们的目标是通过探索环境来更新这个 Q-table，从而找到最优策略。

在这个网格环境中，每个状态对应网格上的一个位置（行、列坐标），每个动作则是从当前格子移动到相邻的格子，共有上、下、左、右四种。

In [None]:
import random

class QLearningAgent:
    """Tabular Q-learning agent that acts epsilon-greedily.

    The Q-table has shape ``(grid_size, grid_size, 4)``: one value per grid
    cell and per action (up, down, left, right).
    """

    def __init__(self, env, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        """Store the hyper-parameters and start from an all-zero Q-table."""
        side = env.grid_size
        self.q_table = np.zeros((side, side, 4))
        self.env = env
        self.epsilon = epsilon
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate

    def choose_action(self, state):
        """Return an action index for ``state``.

        With probability ``epsilon`` a random action is drawn; otherwise the
        action with the highest stored value for ``state`` is returned.
        """
        explore = random.uniform(0, 1) < self.epsilon
        if explore:
            return random.choice([0, 1, 2, 3])
        row, col = state[0], state[1]
        return np.argmax(self.q_table[row, col])

    def update_q_table(self, state, action, reward, next_state):
        """Move ``Q[state, action]`` towards the one-step TD target."""
        row, col = state[0], state[1]
        next_row, next_col = next_state[0], next_state[1]

        # Value of the greedy action in the successor state.
        best_next_value = self.q_table[next_row, next_col].max()
        target = reward + self.discount_factor * best_next_value

        current = self.q_table[row, col, action]
        self.q_table[row, col, action] = current + self.learning_rate * (target - current)


### 定义训练过程（算法）

In [None]:
# Train the agent, recording each episode's path and total reward.
def train_agent(env, agent, episodes=500):
    """Run Q-learning episodes of ``agent`` on ``env``.

    Every episode starts from ``env.start`` and runs until ``env.step``
    reports that the episode is done. Every tenth episode (starting with
    episode 0) a progress line is printed and the path is drawn with
    ``env.render_path``.

    Args:
        env: Environment exposing ``start``, ``state``, ``step(action)``
            and ``render_path(path)``.
        agent: Agent exposing ``choose_action(state)`` and
            ``update_q_table(state, action, reward, next_state)``.
        episodes: Number of episodes to run.

    Returns:
        list: The summed reward of each episode, in episode order.
    """
    rewards = []
    for episode in range(episodes):
        # Reset the environment to the start cell.
        state = env.start
        env.state = state
        done = False
        path = [state]
        total_reward = 0

        while not done:
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.update_q_table(state, action, reward, next_state)
            state = next_state
            path.append(state)
            total_reward += reward

        rewards.append(total_reward)

        if episode % 10 == 0:
            print(f"Episode {episode}: Agent is learning...")
            env.render_path(path)  # Visualize the path

    return rewards

# Example usage
env = GridEnvironment()
agent = QLearningAgent(env)
# Keep the per-episode rewards: visualize_training_results(rewards) reads them.
rewards = train_agent(env, agent)

In [None]:
def visualize_training_results(rewards, window=100):
    """Plot the rolling average of per-episode rewards.

    Args:
        rewards: One-dimensional sequence of numeric rewards, one per
            episode.
        window: Number of consecutive episodes averaged into each plotted
            point. It is capped at ``len(rewards)`` so that a short run
            still yields genuine averages.

    Raises:
        ValueError: If ``rewards`` is empty or ``window`` is smaller than 1.
    """
    rewards = np.asarray(rewards, dtype=float)
    if rewards.size == 0:
        raise ValueError("rewards must contain at least one episode")
    if window < 1:
        raise ValueError("window must be a positive integer")

    # In 'valid' mode np.convolve swaps its operands when the kernel is the
    # longer one, which would produce values that are not rolling means.
    window = min(int(window), rewards.size)
    rolling_avg = np.convolve(rewards, np.ones(window) / window, mode='valid')

    plt.figure(figsize=(10, 5))
    plt.plot(rolling_avg)
    plt.title("Rewards Rolling Average Over Episodes")
    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.show()

# Visualize the training results.
# NOTE(review): assumes `rewards` holds one total reward per episode --
# confirm it is assigned before this call runs.
visualize_training_results(rewards)
