## 使用理论最佳策略进行测试

In [1]:
import numpy as np
import gym
 
env = gym.make("Blackjack-v1")  # create the Blackjack-v1 (twenty-one) game environment

# Fixed hit/stick strategy table for a game where only "stick" and "hit" exist.
def optimal_policy(state):
    """Return the action of a fixed basic-strategy table for one observation.

    Args:
        state: tuple ``(player_sum, dealer_card, usable_ace)``.
            ``dealer_card`` is compared against 2..8 below; a value of 1
            (presumably the ace - confirm against the environment docs) falls
            into the "hit" branch for soft 18 and for hard hands below 17.

    Returns:
        int: 0 to stick, 1 to hit.
    """
    player_sum, dealer_card, usable_ace = state

    if usable_ace:
        # Soft hand (usable ace).
        if player_sum >= 19:
            return 0
        if player_sum == 18:
            # Strategy charts mark soft 18 against 3-6 as "double, otherwise
            # stand". Doubling is not an available action here, so the hand
            # stands against every dealer card from 2 to 8 and hits otherwise.
            return 0 if 2 <= dealer_card <= 8 else 1
        return 1

    # Hard hand (no usable ace).
    if player_sum >= 17:
        return 0
    if 13 <= player_sum <= 16:
        return 0 if 2 <= dealer_card <= 6 else 1
    if player_sum == 12:
        return 0 if 4 <= dealer_card <= 6 else 1
    return 1

# Play 10000 hands with optimal_policy and keep the reward of each hand's last step
rewards = []
for _ in range(10000):
    state, _ = env.reset()

    # Keep acting until the environment reports that the hand is over.
    while True:
        action = optimal_policy(state)
        state, reward, done, _, _ = env.step(action)
        if done:
            break

    rewards.append(reward)

# Report the results
reward_sum = np.sum(rewards)
print(f"得分: {reward_sum}")
# NOTE(review): the figure printed under the "胜率" label is the summed reward
# divided by 10000 (mean reward per hand), not the fraction of hands won.
print(f"胜率: {reward_sum / 10000} ")
print(f"这并不是最优策略, 理论上最优策略能把胜率差异压缩到0.05%, 也可能是这个gym问题, 或者策略的问题, 但总之比梗直哥训练的垃圾策略好多了吧")
# Disabled alternative metrics kept from the original cell:
# print(f"mean reward: {np.mean(rewards):.4f}")
# print(f"win rate: {len([r for r in rewards if r > 0])/10000:.4f}")
# Author's note: even code this simple came back with many errors; AI cannot fully
# replace programmers. Programmers use AI to improve their productivity.

得分: -221.0
胜率: -0.0221 
这并不是最优策略, 理论上最优策略能把胜率差异压缩到0.05%, 也可能是这个gym问题, 或者策略的问题, 但总之比梗直哥训练的垃圾策略好多了吧


## 初始化环境
环境文档: <https://www.gymlibrary.dev/environments/toy_text/blackjack>

In [2]:
import numpy as np
import gym
 
env = gym.make("Blackjack-v1")  # create the Blackjack-v1 (twenty-one) game environment

In [3]:
# Inspect the observation space
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [4]:
# Inspect the action space
env.action_space

Discrete(2)

## 21点游戏过程

In [5]:
# Start a new hand and show its initial observation.
# NOTE(review): the author labelled this cell "win", but the outcome is not known
# at reset time; the recorded resets in this notebook each return a different hand.
state, _ = env.reset()
state

(20, 2, False)

In [6]:
# Take action 1 and show the resulting observation, reward and terminated flag.
next_state, reward, terminated, truncated, info = env.step(1) # hit
next_state, reward, terminated

((26, 2, False), -1.0, True)

In [7]:
# Take action 0 and show the resulting observation, reward and terminated flag.
# NOTE(review): no reset happens between the previous step cell and this one; if
# that step already ended the hand, this call steps a finished episode.
next_state, reward, terminated, truncated, info = env.step(0) # stick (take no more cards)
next_state, reward, terminated

((26, 2, False), -1.0, True)

In [8]:
# Start another hand and show its initial observation.
# NOTE(review): the author labelled this cell "win"; the result depends on the
# cards dealt and on the action taken afterwards.
state, _ = env.reset()
state

(6, 1, False)

In [9]:
# Take action 0 and show the resulting observation, reward and terminated flag.
next_state, reward, terminated, truncated, info = env.step(0) # stick (take no more cards)
next_state, reward, terminated

((6, 1, False), -1.0, True)

In [10]:
# Start another hand and show its initial observation.
# NOTE(review): the author labelled this cell "loss"; the result depends on the
# cards dealt and on the action taken afterwards.
state, _ = env.reset()
state

(4, 10, False)

In [11]:
# Take action 0 and show the resulting observation, reward and terminated flag.
next_state, reward, terminated, truncated, info = env.step(0) # stick (take no more cards)
next_state, reward, terminated

((4, 10, False), -1.0, True)

In [12]:
# Start another hand and show its initial observation.
# NOTE(review): the author labelled this cell "loss"; the result depends on the
# cards dealt and on the action taken afterwards.
state, _ = env.reset()
state

(8, 8, False)

In [13]:
# Take action 1 and show the resulting observation, reward and terminated flag.
next_state, reward, terminated, truncated, info = env.step(1) # hit
next_state, reward, terminated

((15, 8, False), 0.0, False)

## 蒙特卡洛方法

In [14]:
# Settings and (initially empty) tables for the Monte Carlo run
num_episodes = 5000   # total number of episodes
explore_rate = 0.2    # exploration rate
Q = dict()            # Q-value table holding Q(state, action)
policy = dict()       # policy table: probability of choosing each action in a state
rewards = dict()      # cumulative rewards recorded per (state, action) pair

In [15]:
# Simulate one episode
def generate_episode(Q,policy):
    """Play one hand by sampling actions from ``policy`` and return the trajectory.

    A state seen for the first time is registered before an action is sampled:
    ``Q[state]`` becomes ``{0: 0, 1: 0}``, ``policy[state]`` becomes ``[0.5, 0.5]``
    and the module-level ``rewards[state]`` becomes ``{0: [], 1: []}``.

    Args:
        Q: dict of per-state action values; mutated in place for new states.
        policy: dict of per-state action probabilities; mutated in place for new states.

    Returns:
        list of ``(state, action, reward)`` tuples, one per step, in play order.
    """
    trajectory = []
    current, _ = env.reset()
    terminated = False
    # The loop ends as soon as a step reports ``terminated``.
    while not terminated:
        if current not in Q:
            Q[current] = {0:0, 1:0}
            policy[current] = [0.5, 0.5]
            rewards[current] = {0:[] , 1: []}
        # Sample action 0 or 1 according to the stored probabilities.
        action = np.random.choice([0,1], p=policy[current],size=1)[0]
        following, reward, terminated, truncated, info = env.step(action)
        trajectory.append((current, action, reward))
        current = following
    return trajectory

In [16]:
# Learning: simulate num_episodes hands and update Q / policy from their returns
for i in range(num_episodes):
    # One trajectory of (state, action, reward) steps
    episode = generate_episode(Q, policy)
    # Running sum of rewards, built up from the last step backwards
    episode_sum_reward = 0.0
    for state, action, reward in reversed(episode):
        episode_sum_reward += reward
        # Record the sum for this (state, action) and re-estimate Q as the mean
        # of everything recorded for it so far.
        recorded = rewards[state][action]
        recorded.append(episode_sum_reward)
        Q[state][action] = np.mean(recorded)
        # The action with the larger Q value gets probability 1 - explore_rate,
        # the other action gets explore_rate.
        policy_action = np.argmax(list(Q[state].values()))
        action_probs = policy[state]
        action_probs[policy_action] = 1 - explore_rate
        action_probs[1 - policy_action] = explore_rate

In [17]:
# Display the Q table produced by the learning loop
Q

{(16, 10, True): {0: -1.0, 1: -0.2631578947368421},
 (19, 10, True): {0: 0.0, 1: -0.2222222222222222},
 (20, 1, False): {0: 0.13157894736842105, 1: -0.8333333333333334},
 (18, 9, False): {0: -0.17857142857142858, 1: -0.6},
 (19, 9, False): {0: 0.2916666666666667, 1: -0.42857142857142855},
 (21, 9, False): {0: 1.0, 1: -1.0},
 (14, 6, False): {0: -0.3548387096774194, 1: -0.4},
 (18, 6, False): {0: 0.5652173913043478, 1: -1.0},
 (19, 6, False): {0: 0.6190476190476191, 1: -0.6666666666666666},
 (7, 4, False): {0: 0.2, 1: -1.0},
 (15, 6, False): {0: -0.5555555555555556, 1: -0.4583333333333333},
 (17, 6, False): {0: -0.15, 1: -0.25},
 (13, 6, True): {0: -0.3333333333333333, 1: -1.0},
 (16, 2, False): {0: -0.3333333333333333, 1: -0.6666666666666666},
 (19, 4, False): {0: 0.4090909090909091, 1: -0.9},
 (13, 5, False): {0: -0.2413793103448276, 1: -1.0},
 (17, 5, False): {0: 0.17391304347826086, 1: -0.5},
 (20, 5, False): {0: 0.7428571428571429, 1: -0.3333333333333333},
 (10, 5, False): {0: 0.42

In [18]:
# Display the per-(state, action) lists stored in rewards
rewards

{(16, 10, True): {0: [-1.0, -1.0, -1.0, -1.0],
  1: [1.0,
   1.0,
   -1.0,
   -1.0,
   -1.0,
   0.0,
   1.0,
   -1.0,
   -1.0,
   -1.0,
   1.0,
   -1.0,
   -1.0,
   1.0,
   -1.0,
   0.0,
   -1.0,
   1.0,
   -1.0]},
 (19,
  10,
  True): {0: [1.0,
   -1.0,
   -1.0,
   -1.0,
   1.0,
   -1.0,
   1.0,
   1.0,
   -1.0,
   -1.0,
   1.0,
   -1.0,
   0.0,
   1.0,
   1.0,
   -1.0,
   1.0,
   -1.0,
   1.0,
   1.0,
   1.0,
   -1.0,
   -1.0], 1: [-1.0, 1.0, 1.0, 0.0, -1.0, -1.0, -1.0, -1.0, 1.0]},
 (20,
  1,
  False): {0: [1.0,
   0.0,
   -1.0,
   1.0,
   1.0,
   -1.0,
   -1.0,
   1.0,
   0.0,
   -1.0,
   1.0,
   1.0,
   1.0,
   -1.0,
   1.0,
   1.0,
   0.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   -1.0,
   -1.0,
   -1.0,
   -1.0,
   1.0,
   1.0,
   -1.0,
   -1.0,
   1.0,
   -1.0,
   1.0,
   -1.0,
   -1.0,
   1.0,
   -1.0], 1: [-1.0,
   -1.0,
   1.0,
   -1.0,
   -1.0,
   -1.0,
   -1.0,
   -1.0,
   -1.0,
   -1.0,
   -1.0,
   -1.0]},
 (18,
  9,
  False): {0: [-1.0,
   -1.0,
   -1.0,
   1.0,

In [19]:
# Total reward: add up every value stored in rewards, state by state,
# action 0's list first and then action 1's
total_rewards = 0
for per_action in rewards.values():
    for act in (0, 1):
        total_rewards += sum(per_action[act])
print(total_rewards)

-1294.0


In [20]:
# Display the per-state action probabilities
policy

{(16, 10, True): [0.2, 0.8],
 (19, 10, True): [0.8, 0.2],
 (20, 1, False): [0.8, 0.2],
 (18, 9, False): [0.8, 0.2],
 (19, 9, False): [0.8, 0.2],
 (21, 9, False): [0.8, 0.2],
 (14, 6, False): [0.8, 0.2],
 (18, 6, False): [0.8, 0.2],
 (19, 6, False): [0.8, 0.2],
 (7, 4, False): [0.8, 0.2],
 (15, 6, False): [0.2, 0.8],
 (17, 6, False): [0.8, 0.2],
 (13, 6, True): [0.8, 0.2],
 (16, 2, False): [0.8, 0.2],
 (19, 4, False): [0.8, 0.2],
 (13, 5, False): [0.8, 0.2],
 (17, 5, False): [0.8, 0.2],
 (20, 5, False): [0.8, 0.2],
 (10, 5, False): [0.8, 0.2],
 (20, 10, False): [0.8, 0.2],
 (12, 2, False): [0.8, 0.2],
 (19, 2, False): [0.8, 0.2],
 (8, 5, False): [0.2, 0.8],
 (17, 1, False): [0.8, 0.2],
 (13, 2, False): [0.8, 0.2],
 (21, 2, False): [0.8, 0.2],
 (8, 10, False): [0.2, 0.8],
 (12, 10, False): [0.2, 0.8],
 (6, 7, False): [0.8, 0.2],
 (9, 7, False): [0.8, 0.2],
 (19, 7, False): [0.8, 0.2],
 (14, 6, True): [0.8, 0.2],
 (6, 4, False): [0.2, 0.8],
 (20, 7, False): [0.8, 0.2],
 (7, 10, False): [0

## 时序差分方法

In [21]:
# Settings and (initially empty) tables for the temporal-difference run
num_episodes = 5000   # total number of episodes
explore_rate = 0.2    # exploration rate
alpha = 0.1           # learning rate
Q = dict()            # Q-value table holding Q(state, action)
policy = dict()       # policy table: probability of choosing each action in a state
rewards = dict()      # cumulative rewards recorded per (state, action) pair

In [22]:
# Generate one episode
def generate_episode(Q,policy):
    """Play one hand by sampling actions from ``policy`` and return the trajectory.

    A state seen for the first time is registered before an action is sampled:
    ``Q[state]`` becomes ``{0: 0, 1: 0}``, ``policy[state]`` becomes ``[0.5, 0.5]``
    and the module-level ``rewards[state]`` becomes ``{0: [], 1: []}``.

    Args:
        Q: dict of per-state action values; mutated in place for new states.
        policy: dict of per-state action probabilities; mutated in place for new states.

    Returns:
        list of ``(state, action, reward, next_state)`` tuples, one per step,
        in play order.
    """
    trajectory = []
    current, _ = env.reset()
    terminated = False
    # The loop ends as soon as a step reports ``terminated``.
    while not terminated:
        if current not in Q:
            Q[current] = {0:0, 1:0}
            policy[current] = [0.5, 0.5]
            rewards[current] = {0:[] , 1: []}
        # Sample action 0 or 1 according to the stored probabilities.
        action = np.random.choice([0,1], p=policy[current],size=1)[0]
        following, reward, terminated, truncated, info = env.step(action)
        trajectory.append((current, action, reward, following))
        current = following
    return trajectory

In [23]:
# Learning: simulate num_episodes hands and apply a one-step TD update per step
for i in range(num_episodes):
    # One trajectory of (state, action, reward, next_state) steps
    episode = generate_episode(Q, policy)
    # Running sum of the rewards seen so far in this episode
    episode_sum_reward = 0.0
    # generate_episode stops only when a step reports `terminated`, so the last
    # recorded step is the terminal transition.
    last_step = len(episode) - 1
    # Walk the steps in play order (forward)
    for t, (state, action, reward, next_state) in enumerate(episode):
        episode_sum_reward += reward
        # Record the running sum for this (state, action)
        rewards[state][action].append(episode_sum_reward)
        if t == last_step:
            # Terminal transition: nothing follows, so the target is the reward
            # alone. After a stick the terminal next_state is the same tuple as
            # `state`; bootstrapping from Q[next_state] there would feed the
            # state's own estimate back into its target.
            td_target = reward
        else:
            # Non-terminal transition: bootstrap from the best next-state value.
            next_values = Q.get(next_state, {0: 0.0, 1: 0.0})
            td_target = reward + max(next_values.values())
        Q[state][action] += alpha * (td_target - Q[state][action])
        # The action with the larger Q value gets probability 1 - explore_rate,
        # the other action gets explore_rate.
        policy_action = np.argmax(list(Q[state].values()))
        policy[state][policy_action] = 1 - explore_rate
        policy[state][1 - policy_action] = explore_rate

In [None]:
#policy

In [25]:
# Total reward: add up every value stored in rewards, state by state,
# action 0's list first and then action 1's
total_rewards = 0
for per_action in rewards.values():
    for act in (0, 1):
        total_rewards += sum(per_action[act])
print(total_rewards)

-1305.0
