## 初始化环境
www.gymlibrary.dev/environments/toy_text/blackjack

In [1]:
import numpy as np
import gym
 
env = gym.make("Blackjack-v1")  # load the Blackjack-v1 (twenty-one) game environment

In [2]:
# Inspect the observation space (states are 3-tuples, see the output below)
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [3]:
# Inspect the action space (the cells below use 0 = stick, 1 = hit)
env.action_space

Discrete(2)

## 21点游戏过程

In [10]:
# Example round 1: start a new game and show the initial state.
# NOTE: originally labelled "win", but the recorded run below ends with
# reward -1.0 (a loss); the cards dealt differ from run to run.
state, _ = env.reset()
state

(9, 4, False)

In [11]:
next_state, reward, terminated, truncated, info = env.step(1)  # hit (take another card)
next_state, reward, terminated

((13, 4, False), 0.0, False)

In [12]:
next_state, reward, terminated, truncated, info = env.step(0)  # stick (take no more cards)
next_state, reward, terminated

((13, 4, False), -1.0, True)

In [13]:
# Example round 2 (win): start a new game and show the initial state
state, _ = env.reset()
state

(12, 2, True)

In [32]:
next_state, reward, terminated, truncated, info = env.step(0)  # stick (take no more cards)
next_state, reward, terminated

((21, 2, True), 1.0, True)

In [33]:
# Example round 3 (loss): start a new game and show the initial state
state, _ = env.reset()
state

(5, 2, False)

In [34]:
next_state, reward, terminated, truncated, info = env.step(0)  # stick (take no more cards)
next_state, reward, terminated

((5, 2, False), -1.0, True)

In [37]:
# Example round 4 (loss): start a new game and show the initial state
state, _ = env.reset()
state

(16, 10, False)

In [38]:
next_state, reward, terminated, truncated, info = env.step(1)  # hit (take another card)
next_state, reward, terminated

((26, 10, False), -1.0, True)

## 蒙特卡洛方法

In [14]:
# Hyperparameters and empty lookup tables for the Monte Carlo run
num_episodes = 5000  # total number of episodes to play
explore_rate = 0.2  # exploration rate
Q = dict()  # Q-table: state -> {action: Q(state, action)}
policy = dict()  # policy table: state -> [P(stick), P(hit)]
rewards = dict()  # state -> {action: list of cumulative rewards recorded for that pair}

In [15]:
# Simulate one episode
def generate_episode(Q, policy):
    """Play one episode with the current policy and return its trajectory.

    A state seen for the first time is registered in ``Q`` and ``policy``
    (both mutated in place) and in the module-level ``rewards`` table.
    The module-level ``env`` is reset and stepped.

    Args:
        Q: dict mapping state -> {action: Q-value}; action 0 is stick, 1 is hit.
        policy: dict mapping state -> [P(stick), P(hit)].

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken.
    """
    episode = []
    # Reset the environment and take its initial state
    state, _ = env.reset()
    while True:
        # First visit to this state: create its table entries
        if state not in Q:
            Q[state] = {0: 0, 1: 0}  # Q-values for stick and hit
            policy[state] = [0.5, 0.5]  # start with a uniform random policy
            rewards[state] = {0: [], 1: []}  # recorded cumulative rewards per action
        # Sample an action from the policy; store it as a plain int so the
        # dict keys used by the learning loop are not NumPy scalars
        action = int(np.random.choice([0, 1], p=policy[state]))
        # Apply the action to get the reward and the next state
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, reward))
        # Stop on truncation as well as termination: the environment must not
        # be stepped again after either flag is set
        if terminated or truncated:
            break
        state = next_state
    return episode

In [16]:
# Monte Carlo control: learn from complete episodes
for i in range(num_episodes):
    # Sample one trajectory of (state, action, reward) steps under the current policy
    episode = generate_episode(Q, policy)
    # Cumulative reward, built up from the last step backwards (no discounting)
    episode_sum_reward = 0.0
    for state, action, reward in reversed(episode):
        episode_sum_reward += reward
        # Record the cumulative reward for this visit of (state, action)
        visit_returns = rewards[state][action]
        visit_returns.append(episode_sum_reward)
        # Q(state, action) is the mean of everything recorded so far
        Q[state][action] = np.mean(visit_returns)
        # Policy improvement: the greedy action gets probability
        # 1 - explore_rate, the other action gets explore_rate
        policy_action = np.argmax(list(Q[state].values()))
        action_probs = policy[state]
        action_probs[policy_action] = 1 - explore_rate
        action_probs[1 - policy_action] = explore_rate

In [None]:
# Display the learned Q-table
Q

In [None]:
# Display the cumulative rewards recorded for each (state, action) pair
rewards

In [19]:
# Total reward: add up every value stored in `rewards`, for both actions of
# every state. The learning loop appends one cumulative reward per visited
# step, so this is a sum over visits rather than over episodes.
total_rewards = sum(
    sum(per_action[a]) for per_action in rewards.values() for a in (0, 1)
)
print(total_rewards)

-1318.0


In [None]:
# Display the learned policy (action probabilities per state)
policy

## 时序差分方法

In [21]:
# Hyperparameters and empty lookup tables for the temporal-difference run
num_episodes = 5000  # total number of episodes to play
alpha = 0.1  # learning rate
explore_rate = 0.2  # exploration rate
Q = dict()  # Q-table: state -> {action: Q(state, action)}
policy = dict()  # policy table: state -> [P(stick), P(hit)]
rewards = dict()  # state -> {action: list of cumulative rewards recorded for that pair}

In [22]:
# Generate one episode
def generate_episode(Q, policy):
    """Play one episode with the current policy and return its transitions.

    A state seen for the first time is registered in ``Q`` and ``policy``
    (both mutated in place) and in the module-level ``rewards`` table.
    The module-level ``env`` is reset and stepped.

    Args:
        Q: dict mapping state -> {action: Q-value}; action 0 is stick, 1 is hit.
        policy: dict mapping state -> [P(stick), P(hit)].

    Returns:
        A list of ``(state, action, reward, next_state)`` tuples, one per
        step taken.
    """
    episode = []
    # Reset the environment and take its initial state
    state, _ = env.reset()
    while True:
        # First visit to this state: create its table entries
        if state not in Q:
            Q[state] = {0: 0, 1: 0}  # Q-values for stick and hit
            policy[state] = [0.5, 0.5]  # start with a uniform random policy
            rewards[state] = {0: [], 1: []}  # recorded cumulative rewards per action
        # Sample an action from the policy; store it as a plain int so the
        # dict keys used by the learning loop are not NumPy scalars
        action = int(np.random.choice([0, 1], p=policy[state]))
        # Apply the action to get the reward and the next state
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Keep the full transition; the TD update needs next_state
        episode.append((state, action, reward, next_state))
        # Stop on truncation as well as termination: the environment must not
        # be stepped again after either flag is set
        if terminated or truncated:
            break
        state = next_state
    return episode

In [23]:
# Temporal-difference learning: update Q after every transition
for i in range(num_episodes):
    # Sample one trajectory of (state, action, reward, next_state) transitions
    episode = generate_episode(Q, policy)
    # Cumulative reward collected so far in this episode
    episode_sum_reward = 0.0
    last_step = len(episode) - 1
    # Walk the transitions in the order they occurred
    for t, (state, action, reward, next_state) in enumerate(episode):
        episode_sum_reward += reward
        # Record the cumulative reward at this visit of (state, action)
        rewards[state][action].append(episode_sum_reward)
        if t == last_step:
            # generate_episode stops only when the episode is over, so the
            # last transition has nothing to bootstrap from. This matters
            # after "stick": the environment returns the unchanged
            # observation, so Q[next_state] would be this state's own row.
            next_value = 0.0
        else:
            # Greedy value of the next state; 0.0 if it has no entry yet.
            # Looking it up with .get() avoids inserting states into Q that
            # have no matching policy/rewards entries.
            next_q = Q.get(next_state)
            next_value = max(next_q.values()) if next_q else 0.0
        # TD(0) update towards reward + max_a Q(next_state, a), no discounting
        Q[state][action] += alpha * (reward + next_value - Q[state][action])
        # Policy improvement: the greedy action gets probability
        # 1 - explore_rate, the other action gets explore_rate
        policy_action = np.argmax(list(Q[state].values()))
        policy[state][policy_action] = 1 - explore_rate
        policy[state][1 - policy_action] = explore_rate

In [24]:
# Display the learned policy (action probabilities per state)
policy

{(20, 10, False): [0.8, 0.2],
 (12, 10, False): [0.2, 0.8],
 (15, 8, False): [0.2, 0.8],
 (19, 8, False): [0.8, 0.2],
 (20, 6, False): [0.8, 0.2],
 (21, 6, False): [0.8, 0.2],
 (12, 6, True): [0.8, 0.2],
 (16, 6, True): [0.2, 0.8],
 (20, 6, True): [0.8, 0.2],
 (21, 6, True): [0.8, 0.2],
 (7, 10, False): [0.2, 0.8],
 (14, 6, False): [0.2, 0.8],
 (7, 5, False): [0.8, 0.2],
 (11, 5, False): [0.2, 0.8],
 (21, 5, False): [0.8, 0.2],
 (13, 2, False): [0.2, 0.8],
 (10, 5, False): [0.2, 0.8],
 (15, 2, False): [0.2, 0.8],
 (16, 2, False): [0.2, 0.8],
 (17, 10, False): [0.2, 0.8],
 (13, 3, True): [0.2, 0.8],
 (20, 3, True): [0.8, 0.2],
 (17, 7, True): [0.2, 0.8],
 (6, 3, False): [0.2, 0.8],
 (18, 10, False): [0.2, 0.8],
 (18, 3, False): [0.8, 0.2],
 (9, 3, False): [0.8, 0.2],
 (9, 5, False): [0.2, 0.8],
 (19, 5, False): [0.8, 0.2],
 (12, 4, True): [0.2, 0.8],
 (14, 4, True): [0.2, 0.8],
 (19, 10, False): [0.2, 0.8],
 (7, 1, False): [0.2, 0.8],
 (17, 4, False): [0.8, 0.2],
 (8, 10, False): [0.2, 

In [25]:
# Total reward: add up every value stored in `rewards`, for both actions of
# every state. The learning loop appends the episode's cumulative reward so
# far at each step, so each stored value is a partial sum.
total_rewards = sum(
    sum(per_action[a]) for per_action in rewards.values() for a in (0, 1)
)
print(total_rewards)

-1302.0
