In [1]:
# 同策略蒙特卡洛算法
import random
import time
from yuanyangEnv import YuanYangEnv
import numpy as np

pygame 2.1.2 (SDL 2.0.18, Python 3.9.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
class MC_RL:
    """On-policy (epsilon-greedy) Monte Carlo control on a tabular environment.

    The environment object passed to the constructor must expose:

    * ``states``  - a sized collection; its length is the number of Q-table rows,
    * ``actions`` - an indexable, sized collection of action identifiers,
    * ``gamma``   - the discount factor,
    * ``transform(s, a)`` - returns the triple ``(s_next, r, done)``.

    States are used directly as integer row indices into the Q-table.
    """

    # Episode settings shared by training and testing.
    START_STATE = 0   # every episode starts from this state index
    GOAL_STATE = 9    # ending an episode in this state counts as "task completed"
    MAX_STEPS = 30    # hard cap on the number of steps in one episode

    def __init__(self, yuanyang):
        """Create the Q-table and visit counters for the given environment.

        Args:
            yuanyang: environment object (see the class docstring for the
                attributes that are read from it).
        """
        n_states = len(yuanyang.states)
        n_actions = len(yuanyang.actions)

        # Action-value table, initialised to 0.1 everywhere.
        self.qvalue = np.ones((n_states, n_actions)) * 0.1

        # Visit counts used for the running average q(s,a) = G(s,a) / n(s,a).
        # They start at a small positive value rather than 0.
        self.n = 0.001 * np.ones((n_states, n_actions))
        self.actions = yuanyang.actions
        self.gamma = yuanyang.gamma
        self.yuanyang = yuanyang

    def greedy_policy(self, qfun, state):
        """Return the action with the largest value in row ``state`` of ``qfun``.

        Ties are resolved by ``argmax`` (first maximal column).
        """
        amax = qfun[state, :].argmax()
        return self.actions[amax]

    def epsilon_greedy_policy(self, qfun, state, epsilon):
        """Return the greedy action with probability ``1 - epsilon``.

        Otherwise return an action drawn uniformly from ``self.actions``
        (the greedy action may be drawn again in that branch).
        """
        if np.random.uniform() < 1 - epsilon:
            return self.greedy_policy(qfun, state)
        return self.actions[int(np.random.rand() * len(self.actions))]

    def find_anum(self, a):
        """Return the index of action ``a`` within ``self.actions``.

        Raises:
            ValueError: if ``a`` is not one of ``self.actions``. Returning
                ``None`` here would be dangerous: used as a NumPy index it
                acts as ``np.newaxis`` and silently selects a whole array.
        """
        for i, action in enumerate(self.actions):
            if a == action:
                return i
        raise ValueError("unknown action: %r" % (a,))

    def mc_learning_on_policy(self, num_iter, epsilon):
        """Learn a Q-table with on-policy, every-visit Monte Carlo control.

        The Q-table and the visit counters are reset at the start of the call.
        Each episode starts in ``START_STATE`` and runs until the environment
        reports ``done`` or ``MAX_STEPS`` steps were taken. Training stops at
        the first episode that ends in ``GOAL_STATE``; that episode is not
        used for an update.

        Args:
            num_iter: maximum number of episodes.
            epsilon: initial exploration rate. Episode ``k`` uses
                ``epsilon * exp(-k / 10)``; the argument itself is never
                overwritten, so the decay does not compound across episodes.

        Returns:
            The learned Q-table (the same array object as ``self.qvalue``).
        """
        # Size the tables from the environment stored on the instance, not
        # from a module-level variable that happens to share its name.
        n_states = len(self.yuanyang.states)
        n_actions = len(self.yuanyang.actions)
        self.qvalue = np.zeros((n_states, n_actions))
        self.n = 0.001 * np.ones((n_states, n_actions))

        for episode in range(num_iter):
            # Trajectory buffers: visited states, action indices, rewards.
            s_sample = []
            a_sample = []
            r_sample = []

            # Fixed initial state.
            s = self.START_STATE
            done = False
            step_num = 0

            # Exploration rate decays exponentially with the episode index.
            eps = epsilon * np.exp(-episode / 10)

            # Roll out one episode: s0 -a0-> s1 -a1-> s2 ...
            while not done and step_num < self.MAX_STEPS:
                a = self.epsilon_greedy_policy(self.qvalue, s, eps)

                # Interact with the environment.
                s_next, r, done = self.yuanyang.transform(s, a)
                a_num = self.find_anum(a)

                # Penalise moving into a state already visited this episode.
                if s_next in s_sample:
                    r = -2

                s_sample.append(s)
                r_sample.append(r)
                a_sample.append(a_num)
                step_num += 1

                s = s_next

            # Stop learning the first time the task is completed.
            if s == self.GOAL_STATE:
                print("同策略第一次完成任务需要的次数：", episode)
                break

            # Bootstrap the tail of the return from the current estimate at
            # the last state reached: G = r_0 + gamma*r_1 + ... + gamma^T * q(s_T, a_T).
            a = self.epsilon_greedy_policy(self.qvalue, s, eps)
            g = self.qvalue[s, self.find_anum(a)]

            # One backward pass yields the return from every time step.
            # This avoids recovering them by repeated "subtract reward, divide
            # by gamma", which breaks for gamma == 0 and accumulates error.
            returns = [0.0] * len(s_sample)
            for i in range(len(s_sample) - 1, -1, -1):
                g = r_sample[i] + self.gamma * g
                returns[i] = g

            # Every-visit incremental average, applied in trajectory order.
            for s_i, a_i, g_i in zip(s_sample, a_sample, returns):
                self.n[s_i, a_i] += 1.0
                self.qvalue[s_i, a_i] = (
                    self.qvalue[s_i, a_i] * (self.n[s_i, a_i] - 1) + g_i
                ) / self.n[s_i, a_i]
        return self.qvalue

    def mc_test(self):
        """Run one greedy episode from ``START_STATE`` with the current Q-table.

        Returns:
            1 if the episode ends in ``GOAL_STATE`` (within ``MAX_STEPS``
            steps or when the environment reports ``done``), otherwise 0.
        """
        s = self.START_STATE
        done = False
        step_num = 0
        while not done and step_num < self.MAX_STEPS:
            a = self.greedy_policy(self.qvalue, s)
            # Interact with the environment; the reward is not needed here.
            s, _, done = self.yuanyang.transform(s, a)
            step_num += 1
        return 1 if s == self.GOAL_STATE else 0

In [None]:
# Driver cell: train the on-policy Monte Carlo agent, print the learned
# Q-table, then replay the greedy path in the environment window.
yuanyang = YuanYangEnv()
brain = MC_RL(yuanyang)

# On-policy learning. num_iter is only an upper bound: training stops at the
# first episode that completes the task.
qvalue = brain.mc_learning_on_policy(num_iter=1000000, epsilon=0.9)
print(qvalue)

# Hand the action-value table to the environment
# (presumably so that render() can draw it - confirm in YuanYangEnv).
yuanyang.action_value = qvalue

# Replay the learned greedy policy from state 0.
flag = 1
s = 0
# print(policy_value.pi)   # leftover debug line; `policy_value` is not defined in this notebook
step_num = 0
path = []

# Print and render the greedy path one step at a time.
while flag:
    # Record the current state as a path point; `yuanyang.path` refers to the
    # same list object, so later appends are visible to the environment too.
    path.append(s)
    yuanyang.path = path
    a = brain.greedy_policy(qvalue, s)
    # Show "state->action" followed by the four action values of that state.
    print('%d->%s\t' % (s, a), qvalue[s, 0], qvalue[s, 1], qvalue[s, 2], qvalue[s, 3])
    yuanyang.bird_male_position = yuanyang.state_to_position(s)
    yuanyang.render()
    time.sleep(0.25)
    step_num += 1
    s_, r, t = yuanyang.transform(s, a)
    # Stop on a terminal transition or after more than 30 steps.
    if t == True or step_num > 30:
        flag = 0
    s = s_

# Render the final path point.
yuanyang.bird_male_position = yuanyang.state_to_position(s)
path.append(s)
yuanyang.render()
# NOTE(review): this loop never exits, so the cell never finishes and must be
# interrupted manually. It looks intended to keep the window refreshed - confirm.
while True:
    yuanyang.render()

同策略第一次完成任务需要的次数： 321
[[-23.6780304  -16.52032294 -22.92689534 -23.2813468 ]
 [-23.20202635 -22.76585105 -26.43747283 -23.60202   ]
 [-20.10118267 -23.54247943 -24.84134137 -20.10118267]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [-25.24396996 -17.29970525 -23.38373204 -24.12261185]
 [-22.1869644  -21.51964885 -24.71239772 -24.84134137]
 [ -9.9950025  -15.37720048 -23.41433373 -23.86752329]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0