In [4]:
# DQL
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import gym              # this includes many visualisable physical experiments

In [5]:
# Hyper Parameters
BATCH_SIZE = 32             # number of transitions sampled per learning step
LR = 0.01                   # learning rate of the Adam optimizer
EPSILON = 0.9               # probability of taking the greedy action (otherwise a random one)
GAMMA = 0.9                 # reward discount factor
TARGET_REPLACE_ITER = 100   # copy eval_net into target_net every this many learning steps
MEMORY_CAPACITY = 2000      # number of transitions kept in the replay memory

env = gym.make('CartPole-v0')               # create the environment
env = env.unwrapped                         # work on the raw environment without wrappers
N_ACTIONS = env.action_space.n                # number of discrete actions
N_STATES = env.observation_space.shape[0]       # length of the observation vector

# Shape an action must have before it is passed to env.step(): 0 means "plain scalar".
# Some gym versions return a NumPy integer scalar (e.g. np.int64) from sample(), which
# is not an instance of `int`, so both kinds are accepted here. The action is sampled
# once so that the type check and the shape refer to the same value.
_sample_action = env.action_space.sample()
ENV_A_SHAPE = 0 if isinstance(_sample_action, (int, np.integer)) else _sample_action.shape


In [6]:
class Net(nn.Module):
    """Two-layer MLP that maps a state vector to one Q-value per action.

    Input width is N_STATES, hidden width is 50 and output width is N_ACTIONS.
    """

    def __init__(self):
        super().__init__()
        # Each layer is created and then immediately re-initialised, in this order,
        # so that random numbers are drawn in the same sequence under a fixed seed.
        self.fc1 = nn.Linear(N_STATES, 50)
        nn.init.normal_(self.fc1.weight, mean=0, std=0.1)   # weights ~ N(0, 0.1^2)
        self.out = nn.Linear(50, N_ACTIONS)
        nn.init.normal_(self.out.weight, mean=0, std=0.1)   # weights ~ N(0, 0.1^2)

    def forward(self, x):
        """Return the Q-values of every action for the batch of states `x`."""
        hidden = F.relu(self.fc1(x))
        return self.out(hidden)
    
    
class DQN(object):
    """Deep Q-Network agent with a replay memory and a periodically synced target network.

    The class reads the module-level hyper-parameters (BATCH_SIZE, LR, EPSILON, GAMMA,
    TARGET_REPLACE_ITER, MEMORY_CAPACITY) and the environment descriptors
    (N_STATES, N_ACTIONS, ENV_A_SHAPE).
    """

    def __init__(self,):
        # eval_net is updated on every learn() call; target_net is only overwritten
        # with eval_net's weights every TARGET_REPLACE_ITER learning steps.
        self.eval_net, self.target_net = Net(), Net()

        self.learn_step_counter = 0     # number of learn() updates performed so far
        self.memory_counter = 0         # number of transitions stored so far

        # Replay memory, one row per transition: [s (N_STATES), a, r, s_ (N_STATES)].
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def choose_action(self, x):
        """Pick an action for the single observation `x` with an epsilon-greedy rule.

        With probability EPSILON the action with the highest eval_net Q-value is
        returned, otherwise a uniformly random action index. The result is a scalar
        when ENV_A_SHAPE == 0 and an array reshaped to ENV_A_SHAPE otherwise.
        """
        x = torch.unsqueeze(torch.FloatTensor(x), 0)    # add a batch dimension of size 1
        if np.random.uniform() < EPSILON:   # greedy: take the best-valued action
            with torch.no_grad():           # action selection needs no autograd graph
                actions_value = self.eval_net(x)
            action = torch.max(actions_value, 1)[1].numpy()     # argmax over the actions
            action = action[0] if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE)
        else:   # explore: take a random action
            action = np.random.randint(0, N_ACTIONS)
            if ENV_A_SHAPE != 0:
                # np.random.randint returns a plain int, which has no reshape();
                # convert it to an array before applying the requested shape.
                action = np.asarray(action).reshape(ENV_A_SHAPE)
        return action

    def store_transition(self, s, a, r, s_):
        """Store the transition (state s, action a, reward r, next state s_) in the memory.

        Once MEMORY_CAPACITY transitions have been stored the oldest rows are overwritten.
        NOTE(review): no terminal flag is stored, so learn() also bootstraps from the
        value of terminal next states.
        """
        transition = np.hstack((s, [a, r], s_))
        index = self.memory_counter % MEMORY_CAPACITY   # wrap around when the memory is full
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one gradient step on eval_net using a random batch from the replay memory.

        Does nothing while the memory is empty. Only rows that have actually been
        written are sampled, so the zero-initialised placeholder rows are never used.
        """
        n_stored = min(self.memory_counter, MEMORY_CAPACITY)
        if n_stored == 0:
            return

        # Refresh the target network by copying eval_net's parameters (it is never trained directly).
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Draw BATCH_SIZE row indices (with replacement) from the filled part of the memory.
        sample_index = np.random.choice(n_stored, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES + 1].astype(np.int64))     # shape (batch, 1)
        b_r = torch.FloatTensor(b_memory[:, N_STATES + 1:N_STATES + 2])                 # shape (batch, 1)
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])

        # Prediction: Q-value of the action that was actually taken in each sampled state.
        q_eval = self.eval_net(b_s).gather(dim=1, index=b_a)   # shape (batch, 1)
        # Target: reward plus discounted best next-state value from the target network;
        # detached because no gradient should flow into target_net.
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + GAMMA * q_next.max(dim=1, keepdim=True)[0]   # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)

        # Back-propagate the loss and update eval_net only.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [7]:
# training process
dqn = DQN()     # build the DQN agent

print('\nCollecting experience...')
try:
    for i_episode in range(400):
        s = env.reset()         # start a new episode
        ep_r = 0                # accumulated (shaped) reward of this episode
        while True:
            env.render()        # show the animation of the experiment
            a = dqn.choose_action(s)            # pick an action for the current state

            # Apply the action; `done` is True once the episode has ended and the
            # environment has to be reset.
            s_, r, done, info = env.step(a)

            # Replace the environment reward with a shaped one so the agent learns faster:
            # the closer the cart is to the centre and the more upright the pole, the larger
            # the reward.
            x, x_dot, theta, theta_dot = s_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            # Store the transition in the replay memory.
            dqn.store_transition(s, a, r, s_)

            ep_r += r
            if dqn.memory_counter > MEMORY_CAPACITY:
                dqn.learn()          # start learning once the replay memory has been filled
                if done:
                    print('Ep: ', i_episode,
                          '| Ep_r: ', round(ep_r, 2))

            if done:               # episode finished: move on to the next one
                break
            s = s_
finally:
    # Always close the environment so the window opened by env.render() does not
    # linger when training finishes or is interrupted (e.g. KeyboardInterrupt).
    env.close()


Collecting experience...
Ep:  206 | Ep_r:  2.54
Ep:  207 | Ep_r:  1.53
Ep:  208 | Ep_r:  1.34
Ep:  209 | Ep_r:  1.21
Ep:  210 | Ep_r:  2.59
Ep:  211 | Ep_r:  2.8
Ep:  212 | Ep_r:  2.32
Ep:  213 | Ep_r:  3.24
Ep:  214 | Ep_r:  2.02
Ep:  215 | Ep_r:  2.87
Ep:  216 | Ep_r:  3.79
Ep:  217 | Ep_r:  2.34
Ep:  218 | Ep_r:  3.32
Ep:  219 | Ep_r:  10.17
Ep:  220 | Ep_r:  2.27
Ep:  221 | Ep_r:  1.83
Ep:  222 | Ep_r:  2.11
Ep:  223 | Ep_r:  1.05
Ep:  224 | Ep_r:  1.61
Ep:  225 | Ep_r:  1.31
Ep:  226 | Ep_r:  2.2
Ep:  227 | Ep_r:  2.97
Ep:  228 | Ep_r:  2.08
Ep:  229 | Ep_r:  3.74
Ep:  230 | Ep_r:  3.54
Ep:  231 | Ep_r:  2.06
Ep:  232 | Ep_r:  0.27
Ep:  233 | Ep_r:  25.56
Ep:  234 | Ep_r:  41.18
Ep:  235 | Ep_r:  28.19
Ep:  236 | Ep_r:  45.14
Ep:  237 | Ep_r:  15.39
Ep:  238 | Ep_r:  0.17
Ep:  239 | Ep_r:  54.01
Ep:  240 | Ep_r:  55.74
Ep:  241 | Ep_r:  137.35
Ep:  242 | Ep_r:  140.44
Ep:  243 | Ep_r:  157.37


KeyboardInterrupt: 

In [33]:
# torch.gather demo:
#   dim=1 -> out[i][j] = b[i][index[i][j]]
#   dim=0 -> out[i][j] = b[index[i][j]][j]
b = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
print(b)
index_1 = torch.tensor([[0, 1], [2, 0]], dtype=torch.long)
index_2 = torch.tensor([[0, 1, 1], [0, 0, 0]], dtype=torch.long)
print(b.gather(1, index_1))     # pick columns per row
print(b.gather(0, index_2))     # pick rows per column

tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([[1., 2.],
        [6., 4.]])
tensor([[1., 5., 6.],
        [1., 2., 3.]])
