In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.special import softmax
import numpy as np
import os

import matplotlib.pyplot as plt
from tqdm import tqdm

import draw
import utils

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gym==0.25.2
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pygame==2.5.2
# !pip install  moviepy

In [3]:
from base64 import b64encode  # 导入base64编码函数，用于视频编码
from IPython.display import display, HTML  # 导入display和HTML函数，用于在Jupyter中显示内容
# from moviepy.editor import ImageSequenceClip  # 导入ImageSequenceClip（已注释），用于创建视频剪辑

# 录制CartPole环境的视频
def record_video(agent, env_name='CartPole-v0', video_dir='video'):  # 定义录制视频的函数，参数为智能体、环境名称和视频保存目录
    env = gym.make(env_name)  # 创建指定名称的gym环境
    env = gym.wrappers.RecordVideo(env, video_dir, episode_trigger=lambda x: True)  # 使用RecordVideo包装器记录每个回合的视频
    state = env.reset()  # 重置环境并获取初始状态
    done = False  # 初始化终止标志为False
    cnt = 0  # 初始化步数计数器
    while not done:  # 当回合未结束时循环
        action = agent.take_action(state)  # 智能体根据当前状态选择动作
        state, _, done, _ = env.step(action)  # 执行动作，获取新状态和是否终止的信息
        cnt = cnt+1  # 步数加1
    print(cnt)  # 打印总步数
    env.close()  # 关闭环境

def display_video(file_path):  # 定义显示视频的函数，参数为视频文件路径
    video = open(file_path, "rb").read()  # 以二进制模式读取视频文件
    encoded_video = b64encode(video).decode("ascii")  # 将视频内容编码为base64字符串
    display(HTML(data=f"""  # 使用HTML显示视频
        <video width="640" height="480" controls>  <!-- 创建视频元素，设置宽高和控制栏 -->
            <source src="data:video/mp4;base64,{encoded_video}" type="video/mp4" />  <!-- 设置视频源为base64编码的数据 -->
        </video>
    """))

In [4]:
class PolicyNet(torch.nn.Module):
    """
    输入状态，输出是策略值，action_dim是2
    """
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, 128)
        self.fc2 = torch.nn.Linear(128, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=-1)

In [5]:
class REINFORCE:
    def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, device = torch.device("cpu")):
        self.action_dim = action_dim  # 动作空间维度
        self.state_dim =state_dim  # 状态空间维度
        self.policy_net = PolicyNet(state_dim, hidden_dim, action_dim).to(device)  # 创建策略网络并移至指定设备
        self.device = device  # 计算设备（CPU或GPU）
        
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=learning_rate)  # 使用Adam优化器
        self.gamma = gamma  # 折扣因子
        self.epsilon = 1  # 探索率，初始值为1

    def save_model(self, path):
        torch.save(self.policy_net.state_dict(), path)  # 保存模型参数

    def load_model(self, path):
        self.policy_net.load_state_dict(torch.load(path, map_location=self.device))  # 加载模型参数
        
    def save_model(self, path):
        torch.save(self.policy_net, path)  # 保存整个模型
    def load_model(self, path):
        self.policy_net = torch.load(path)  # 加载整个模型
        
    def take_action(self, state): 
        if np.random.rand() < self.epsilon:  # 以epsilon的概率随机选择动作（探索）
            return np.random.choice(self.action_dim)  # 随机选择一个动作
        else:
            input = torch.tensor(state, dtype=torch.float).to(self.device)  # 将状态转换为张量并移至设备
            output = self.policy_net(input)  # 通过策略网络获取动作概率分布
            now_frame_probabilities  = output.detach().cpu().numpy()  # 将概率分布转换为numpy数组
            action = np.random.choice(self.action_dim, p=now_frame_probabilities)  # 根据概率分布采样动作
            return action  # 返回选择的动作
    
    def update(self, transition_dict):
        reward_list = transition_dict['rewards']  # 获取奖励列表
        state_list = transition_dict['states']  # 获取状态列表
        action_list = transition_dict['actions']  # 获取动作列表
        G = 0  # 初始化累积奖励为0
        self.optimizer.zero_grad()  # 清空梯度
        for i in reversed(range(len(reward_list))):  # 从最后一步算起
            reward = reward_list[i]  # 获取当前步骤的奖励
            state = state_list[i]  # 获取当前步骤的状态
            action = action_list[i]  # 获取当前步骤的动作
            G = self.gamma * G + reward  # 计算折扣累积奖励
            input = torch.tensor(state, dtype=torch.float).to(self.device)  # 将状态转换为张量并移至设备
            log_prob = torch.log(self.policy_net(input)[action])  # 对所选动作的概率求对数，用于计算梯度
            loss = log_prob * G  # 计算损失函数，G为正时鼓励该动作，G为负时惩罚该动作
            loss = -loss  # 梯度上升（最大化期望回报），需要将损失取负
            loss.backward()  # 反向传播计算梯度
        self.optimizer.step()  # 更新网络参数

In [6]:
def train():
    env = gym.make('CartPole-v0')  # 创建CartPole-v0环境
    agent = REINFORCE(state_dim = 4,  # 创建REINFORCE智能体，状态维度为4
              hidden_dim= 128,  # 隐藏层维度为128
              action_dim = 2,  # 动作维度为2
              learning_rate = 0.001,  # 学习率设为0.001
              gamma = 0.98)  # 折扣因子设为0.98

    print(agent.policy_net)  # 打印策略网络结构
    return_list = []  # 初始化回报列表，用于记录每个回合的累积奖励
    agent.epsilon = 0  # 将探索率设为0，即完全依赖策略网络选择动作
    num_episodes = 3000  # 设置训练回合数为300
    pbar = tqdm(range(num_episodes))  # 创建进度条
    for i in pbar:# 10000  # 循环训练指定回合数
        episode_return = 0  # 初始化当前回合的累积奖励为0
        transition_dict = {  # 初始化转移字典，用于存储轨迹数据
            'states': [],  # 状态列表
            'actions': [],  # 动作列表
            'next_states': [],  # 下一状态列表
            'rewards': [],  # 奖励列表
            'dones': []  # 终止标志列表
        }
        state = env.reset()  # 重置环境，获取初始状态
        done = False  # 初始化终止标志为False
        cnt = 0  # 初始化步数计数器为0
        while not done:  # 当回合未结束时循环
            cnt = cnt + 1  # 步数计数器加1
            # print(state)  # (注释掉的代码)打印当前状态
            action = agent.take_action(state) ########  # 智能体根据当前状态选择动作
            # print(action)  # (注释掉的代码)打印选择的动作
            next_state, reward, done, _ = env.step(action)  # 执行动作，获取下一状态、奖励、终止标志和其他信息
            transition_dict['states'].append(state)  # 将当前状态添加到状态列表
            transition_dict['actions'].append(action)  # 将当前动作添加到动作列表
            transition_dict['next_states'].append(next_state)  # 将下一状态添加到下一状态列表
            transition_dict['rewards'].append(reward)  # 将奖励添加到奖励列表
            transition_dict['dones'].append(done)  # 将终止标志添加到终止标志列表
            state = next_state  # 更新当前状态为下一状态
            episode_return += reward  # 累加奖励
        return_list.append(episode_return)  # 将当前回合的累积奖励添加到回报列表
        agent.update(transition_dict)  # 使用收集的轨迹数据更新智能体的策略网络
        if (i + 1) % 10 == 0:  # 每10个回合
            pbar.set_postfix({  # 更新进度条显示的信息
                'episode':
                '%d' % i,  # 显示当前回合数
                'return':
                '%.3f' % np.mean(return_list[-10:]),  # 显示最近10个回合的平均回报
                'cnt': cnt  # 显示最后一个回合的步数
            })
        pbar.update(1)  # 更新进度条

        if(i % 100==0):  # 每100个回合
            agent.save_model('./models/'+f"{i}.pth")  # 保存模型到指定路径

    print(sum(return_list)/len(return_list))  # 打印所有回合的平均回报
    agent.save_model('./models/'+f"{num_episodes}.pth")  # 保存最终模型
    return agent  # 返回训练好的智能体

In [7]:
training_agent = train()

  logger.warn(
  deprecation(
  deprecation(


PolicyNet(
  (fc1): Linear(in_features=4, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)


  if not isinstance(terminated, (bool, np.bool8)):
100%|██████████| 3000/3000 [03:29<00:00, 14.31it/s, episode=2999, return=200.000, cnt=200]

177.664





In [8]:
record_video(training_agent)
display_video('./video/rl-video-episode-0.mp4')

  logger.warn(
  deprecation(
  deprecation(
  logger.warn(
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  from pkg_resources import resource_stream, resource_exists
  elif pkgutil.find_loader("imageio_ffmpeg"):
  if not isinstance(terminated, (bool, np.bool8)):
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


200


In [9]:
agent123 = REINFORCE(state_dim = 4, 
          hidden_dim= 128, 
          action_dim = 2, 
          learning_rate = 0.001, 
          gamma = 0.98)
agent123.epsilon = 0
agent123.load_model('./models/3000.pth')
record_video(agent123)
display_video('./video/rl-video-episode-0.mp4')

UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL __main__.PolicyNet was not an allowed global by default. Please use `torch.serialization.add_safe_globals([__main__.PolicyNet])` or the `torch.serialization.safe_globals([__main__.PolicyNet])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.