In [1]:
import numpy as np
import gym

def generate_episode(env, policy):
    """Roll out a single episode in ``env`` by following ``policy``.

    Args:
        env: Environment object. ``env.reset()`` must return a sequence whose
            first element is the initial state, and ``env.step(action)`` must
            return a 5-tuple ``(next_state, reward, terminated, truncated,
            info)``.
        policy: Callable mapping a state to an action.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken, in
        chronological order.
    """
    episode = []
    state = env.reset()[0]  # reset() returns (state, info); only the state is used
    while True:
        action = policy(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, reward))
        # Stop on either end-of-episode signal. Ignoring `truncated` would
        # keep stepping an environment that has already ended its episode.
        if terminated or truncated:
            break
        state = next_state
    return episode

def monte_carlo_prediction(env, policy, num_episodes, gamma=1.0):
    """Estimate state values for ``policy`` with first-visit Monte Carlo.

    For every sampled episode, the discounted return following the *first*
    occurrence of each state is recorded; the value of a state is the mean of
    all returns recorded for it across episodes.

    Args:
        env: Environment passed through to ``generate_episode``.
        policy: Callable mapping a state to an action.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        A dict mapping each visited state to the mean of its recorded returns.
        States must be hashable because they are used as dict keys. States
        that were never visited are absent from the dict.
    """
    returns = {}  # state -> list of first-visit returns, across all episodes

    for _ in range(num_episodes):
        episode = generate_episode(env, policy)

        # Index of the earliest time step at which each state occurs. Marking
        # states as "seen" during the backward sweep instead would select the
        # LAST occurrence, which is not first-visit MC.
        first_visit = {}
        for t, (state, _, _) in enumerate(episode):
            first_visit.setdefault(state, t)

        G = 0
        for t in range(len(episode) - 1, -1, -1):  # backward: G accumulates incrementally
            state, _, reward = episode[t]
            G = gamma * G + reward
            if first_visit[state] == t:
                returns.setdefault(state, []).append(G)

    # Average once per state at the end rather than after every visit, which
    # re-scanned the whole return list each time.
    return {
        state: np.mean(state_returns)
        for state, state_returns in returns.items()
    }

def sample_policy(state):
    """Threshold policy on the player's sum.

    ``state`` must be a 3-element sequence whose first element is the
    player's sum. Returns 0 (Stick) once the sum is at least 20, and
    1 (Hit) otherwise.
    """
    total, _dealer_card, _usable_ace = state
    if total >= 20:
        return 0
    return 1

if __name__ == "__main__":
    # Build the Blackjack environment and estimate the value function of
    # `sample_policy` from sampled episodes.
    num_episodes = 50000
    env = gym.make("Blackjack-v1", natural=False, sab=False)
    V = monte_carlo_prediction(env, sample_policy, num_episodes)

    # Report the estimates for player sums 12 through 21, holding the rest of
    # the state fixed: dealer's showing card = 2, no usable ace.
    dealer_card, usable_ace = 2, False
    for player_sum in range(12, 22):
        state = (player_sum, dealer_card, usable_ace)
        # States never visited during sampling are shown as 0.
        print(f"Value of state {state}: {V.get(state, 0):.2f}")


  if not isinstance(terminated, (bool, np.bool8)):


Value of state (12, 2, False): -0.56
Value of state (13, 2, False): -0.56
Value of state (14, 2, False): -0.59
Value of state (15, 2, False): -0.61
Value of state (16, 2, False): -0.68
Value of state (17, 2, False): -0.64
Value of state (18, 2, False): -0.70
Value of state (19, 2, False): -0.73
Value of state (20, 2, False): 0.66
Value of state (21, 2, False): 0.88
