In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import defaultdict

# 1) Environment
#    `sab=True` is forwarded to the Blackjack constructor (presumably selecting
#    the Sutton & Barto rule set -- confirm against the Gym docs).
env = gym.make('Blackjack-v1', sab=True)


# 2) Q-table
def _fresh_action_values():
    """Return an all-zero value vector with one entry per available action."""
    return np.zeros(env.action_space.n)


# Maps state -> array of action values; a state that has never been looked up
# is created on first access with every action valued at 0.
Q = defaultdict(_fresh_action_values)

# 3) Hyperparameters
episodes = 500_000          # number of training episodes
alpha = 0.1                 # learning rate
gamma = 1.0                 # discount factor (1.0 means no discounting)
epsilon = 1.0               # initial exploration probability
epsilon_min = 0.05          # floor applied to epsilon after each decay step
epsilon_decay = 0.999995    # multiplicative decay applied once per episode

# 4) Per-episode total reward collected during training
rewards = []

# 5) Training loop (Q-learning)
#
# The two helpers below hide the difference between the legacy Gym API
# (reset() -> obs, step() -> 4-tuple) and the API used by Gym >= 0.26 and
# Gymnasium (reset() -> (obs, info), step() -> 5-tuple), so the loops work
# with either.
def _reset_env(environment):
    """Reset ``environment`` and return only the initial observation."""
    result = environment.reset()
    # The state here is a 3-tuple (see the comment in the training loop), so a
    # 2-tuple whose second item is a dict can only be the new-style
    # (observation, info) pair.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(environment, action):
    """Take ``action`` and return ``(observation, reward, done)``.

    For the 5-tuple API, ``done`` is ``terminated or truncated``.
    """
    outcome = environment.step(action)
    if len(outcome) == 5:
        observation, reward, terminated, truncated, _ = outcome
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _ = outcome
    return observation, reward, done


report_every = 50_000  # episodes between progress reports

for ep in range(1, episodes + 1):
    state = _reset_env(env)  # state is (player_sum, dealer_card, usable_ace)
    done = False
    ep_reward = 0

    while not done:
        # epsilon-greedy action selection: explore with probability epsilon,
        # otherwise take the action with the highest current estimate.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q[state]))

        next_state, reward, done = _step_env(env, action)
        ep_reward += reward

        # Q-learning update; there is nothing to bootstrap from once the
        # episode has ended, so the next-state value is 0 in that case.
        best_next = 0 if done else np.max(Q[next_state])
        td_target = reward + gamma * best_next
        td_error = td_target - Q[state][action]
        Q[state][action] += alpha * td_error

        state = next_state

    rewards.append(ep_reward)
    # Decay epsilon once per episode, never going below epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Periodic reporting over the most recent `report_every` episodes.
    if ep % report_every == 0:
        avg = np.mean(rewards[-report_every:])
        print(f"Episodes {ep - report_every + 1}-{ep}: Avg Reward = {avg:.4f}, ε = {epsilon:.4f}")

# 6) Evaluate learned policy (purely greedy, no exploration, no learning)
test_episodes = 100_000
test_rewards = []
for _ in range(test_episodes):
    state = _reset_env(env)
    done = False
    tot_r = 0
    while not done:
        action = int(np.argmax(Q[state]))
        state, r, done = _step_env(env, action)
        tot_r += r
    test_rewards.append(tot_r)

print(f"\nTest over {test_episodes} episodes: Avg Reward = {np.mean(test_rewards):.4f}")

# 7) Plot smoothed training reward
# Never use a window longer than the recorded history: with a kernel longer
# than the signal, mode='valid' would divide the total by `window` rather than
# by the number of episodes actually averaged.
window = max(1, min(10000, len(rewards)))
smoothed = np.convolve(rewards, np.ones(window)/window, mode='valid')
# smoothed[i] is the mean over episodes i+1 .. i+window, so place each point at
# the (1-based) episode that closes its window instead of at index i.
episode_axis = np.arange(window, window + len(smoothed))
plt.plot(episode_axis, smoothed)
plt.title(f"Smoothed Reward (window={window})")
plt.xlabel("Episode")
plt.ylabel("Average Reward")
plt.show()

# 8) Derive and print policy table
#    0 = stick, 1 = hit
# Greedy action for every state currently present in the Q-table.
policy = {s: int(np.argmax(a)) for s, a in Q.items()}
print("\nLearned Policy (rows=player sum 12–21, cols=dealer 1–10)")

for usable_ace in [True, False]:
    print(f"\nUsable Ace = {usable_ace}")
    header = "PSum \\ D  " + " ".join(f"{d:2d}" for d in range(1, 11))
    print(header)
    for psum in range(12, 22):
        # States missing from the table fall back to action 0.
        cells = (policy.get((psum, dcard, usable_ace), 0) for dcard in range(1, 11))
        # Row label is padded to the header label's width (10 chars) and each
        # cell uses the header's 2-wide, space-separated format so the action
        # digits sit directly under their dealer-card column.
        row = f"{psum:8d}  " + " ".join(f"{a:2d}" for a in cells)
        print(row)
