In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym

# 超参数
BATCH_SIZE = 32
LR = 0.01                   # learning rate
EPSILON = 0.9               # 最优选择动作百分比
GAMMA = 0.9                 # 奖励递减参数
TARGET_REPLACE_ITER = 100   # Q 现实网络的更新频率
MEMORY_CAPACITY = 2000      # 记忆库大小
env = gym.make('CartPole-v0')   # 立杆子游戏
env = env.unwrapped
N_ACTIONS = env.action_space.n  # 杆子能做的动作
N_STATES = env.observation_space.shape[0]   # 杆子能获取的环境信息数

  f"The environment {id} is out of date. You should consider "


In [2]:
class Net(nn.Module):
    def __init__(self, ):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(N_STATES, 10)
        self.fc1.weight.data.normal_(0, 0.1)   # initialization
        self.out = nn.Linear(10, N_ACTIONS)
        self.out.weight.data.normal_(0, 0.1)   # initialization

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        actions_value = self.out(x)
        return actions_value

In [59]:
class DQN(object):
    def __init__(self):
        self.eval_net, self.target_net = Net(), Net()

        self.learn_step_counter = 0     # 用于 target 更新计时
        self.memory_counter = 0         # 记忆库记数
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))     # 初始化记忆库
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)    # torch 的优化器
        self.loss_func = nn.MSELoss()   # 误差公式

    def choose_action(self, x):
        x = torch.unsqueeze(torch.FloatTensor(x), 0)
        # 这里只输入一个 sample
        if np.random.uniform() < EPSILON:   # 选最优动作
            actions_value = self.eval_net.forward(x)
            action = torch.max(actions_value, 1)[1].data.numpy()[0]     # return the argmax
        else:   # 选随机动作
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        # 如果记忆库满了, 就覆盖老数据
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        # target net 参数更新
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # 抽取记忆库中的批数据
        sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2])
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])

        # 针对做过的动作b_a, 来选 q_eval 的值, (q_eval 原本有所有动作的值)
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()     # q_next 不进行反向传递误差, 所以 detach
        q_target = b_r + GAMMA * q_next.max(1)[0]   # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)

        # 计算, 更新 eval net
        self.optimizer.zero_grad()
        loss.backward()
        print(loss.data.numpy())
        self.optimizer.step()

In [60]:
dqn = DQN() # 定义 DQN 系统

for i_episode in range(400):
    s = env.reset()[0]
    while True:
        env.render()    # 显示实验动画
        a = dqn.choose_action(s)

        # 选动作, 得到环境反馈
        s_, r, done, info = env.step(a)[0:4]

        # 修改 reward, 使 DQN 快速学习
        x, x_dot, theta, theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2

        # 存记忆
        dqn.store_transition(s, a, r, s_)

        if dqn.memory_counter > MEMORY_CAPACITY:
            dqn.learn() # 记忆库满了就进行学习

        if done:    # 如果回合结束, 进入下回合
            break

        s = s_

  "You are calling render method without specifying any render mode. "
  return F.mse_loss(input, target, reduction=self.reduction)


0.24530333
0.16351673
0.1627185
0.14106163
0.12617736
0.15451168
0.14884552
0.1258276
0.11194172
0.09882002
0.10336585
0.07105324
0.09711501
0.03715909
0.07875972
0.083208814
0.036934197
0.0741881
0.060016185
0.062729865
0.04125863
0.061757468
0.04637832
0.046701115
0.04160686
0.035633445
0.062427506
0.039537676
0.046522655
0.041093282
0.048261885
0.029952584
0.034393094
0.016593551
0.031171435
0.06508468
0.04387221
0.053236913
0.03612735
0.040169477
0.04452055
0.034851648
0.031712435
0.030163366
0.03686123
0.026858132
0.028182317
0.029733185
0.030984815
0.029109556
0.03476507
0.041030865
0.024933593
0.021907479
0.033853546
0.025419312
0.04043994
0.035649184
0.0226678
0.033989172
0.025707075
0.026403535
0.034815345
0.022234302
0.029221175
0.019807171
0.043139175
0.019626273
0.03482858
0.02585011
0.03972042
0.019830775
0.027558839
0.027160801
0.03409615
0.02538801
0.026335973
0.01917901
0.013755761
0.021831734
0.028716926
0.02096
0.02465638
0.021884926
0.020617994
0.013662233
0.02537761

In [58]:
dqn.memory

array([[-0.01166239, -0.55088443, -0.00779113, ..., -0.74589914,
         0.00898073,  1.12881577],
       [-0.02268007, -0.74589914,  0.00898073, ..., -0.55089593,
         0.03155705,  0.83896309],
       [-0.03759805, -0.55089593,  0.03155705, ..., -0.74643427,
         0.0483363 ,  1.14140069],
       ...,
       [-0.00198668,  0.03350954, -0.02380514, ..., -0.16126306,
        -0.02416793,  0.26693836],
       [-0.00131649, -0.16126306, -0.02416793, ..., -0.35603192,
        -0.01882917,  0.55190164],
       [-0.00454175, -0.35603192, -0.01882917, ..., -0.55088443,
        -0.00779113,  0.83859313]])

In [35]:
eval_net = Net()
x = env.reset()[0]
x = torch.unsqueeze(torch.FloatTensor(x), 0)
eval_net.forward(x)
env.step(0)[0:4]

(array([ 0.02359051, -0.23325674, -0.04790257,  0.28283146], dtype=float32),
 1.0,
 False,
 False)

In [26]:
x = env.reset()[0]
x = torch.unsqueeze(torch.FloatTensor(x), 0)
        # 这里只输入一个 sample
        # if np.random.uniform() < EPSILON:   # 选最优动作
actions_value = eval_net.forward(x)
print(actions_value)
print(torch.max(actions_value, 1)[1].data.numpy()[0])
        # action = torch.max(actions_value, 1)[1].data.numpy()[0, 0]     # return the argmax
        # else:   # 选随机动作
        #     action = np.random.randint(0, N_ACTIONS)
        # return action

# torch.FloatTensor(env.reset())

tensor([[0.3313, 0.1501]], grad_fn=<AddmmBackward0>)
0


In [44]:
import gym
env = gym.make('CartPole-v1')
env.reset()
for _ in range(1000):
    env.render()
    observation, reward, done, info = env.step(env.action_space.sample())[0:4]
    if done:
        env.reset()
env.close()



  "You are calling render method without specifying any render mode. "


In [45]:
import gym

def main():
    env = gym.make('CartPole-v1', render_mode="human")
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()
            observation, reward, done, info, _ = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break

if __name__ == "__main__":
    main()

DependencyNotInstalled: pygame is not installed, run `pip install gym[classic_control]`