<a href="https://colab.research.google.com/github/divyasree-coder/AIML--2025/blob/main/LAB_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install Gymnasium (imported below as `gym`) if it is not already installed
!pip install gymnasium

import gymnasium as gym
import numpy as np
from collections import defaultdict




In [4]:
# Blackjack environment shared by the policy-evaluation and control runs below.
# NOTE(review): `sab=True` presumably selects the Sutton & Barto rule variant —
# confirm against the Gymnasium Blackjack documentation.
env = gym.make("Blackjack-v1", sab=True)

In [6]:
def mc_policy_evaluation(policy, env, num_episodes=100000, gamma=1.0):
    """Estimate the state-value function of ``policy`` with first-visit Monte Carlo.

    For every episode the environment is reset and rolled out by repeatedly
    calling ``policy(state)`` until ``env.step`` reports ``terminated`` or
    ``truncated``. The discounted return following the *first* occurrence of
    each state in the episode is averaged into that state's value estimate.

    Args:
        policy: Callable mapping a state to an action.
        env: Environment exposing ``reset() -> (state, info)`` and
            ``step(action) -> (next_state, reward, terminated, truncated, info)``.
            States must be hashable because they are used as dictionary keys.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        ``defaultdict(float)`` mapping each visited state to the mean of its
        first-visit returns; states never visited read as ``0.0``.
    """
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)
    V = defaultdict(float)

    for _ in range(num_episodes):
        # Roll out one complete episode under the given policy.
        episode = []
        state, _ = env.reset()
        while True:
            action = policy(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            episode.append((state, action, reward))
            if terminated or truncated:
                break
            state = next_state

        # Earliest time step at which each state occurs in this episode.
        # A "seen" set filled while walking backwards would instead select the
        # LAST occurrence, which is not first-visit MC when states repeat.
        first_visit = {}
        for t, (visited_state, _action, _reward) in enumerate(episode):
            first_visit.setdefault(visited_state, t)

        # Accumulate returns backwards so G is the return from step t onwards.
        G = 0.0
        for t in reversed(range(len(episode))):
            state, _action, reward = episode[t]
            G = gamma * G + reward
            if first_visit[state] == t:
                returns_sum[state] += G
                returns_count[state] += 1
                V[state] = returns_sum[state] / returns_count[state]
    return V




def simple_policy(state):
    """Threshold policy: stick (0) on a player sum of 20 or more, otherwise hit (1).

    ``state`` is unpacked as ``(player_sum, dealer_card, usable_ace)``; only the
    player sum influences the decision.
    """
    hand_total, _dealer_card, _usable_ace = state
    if hand_total >= 20:
        return 0  # stick
    return 1  # hit



def mc_control_epsilon_greedy(env, num_episodes=100000, gamma=1.0, epsilon=0.1):
    """On-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    Each episode is generated by acting epsilon-greedily with respect to the
    current action-value estimates ``Q``. After the episode, the discounted
    return following the *first* occurrence of every (state, action) pair is
    averaged into ``Q``. Actions are drawn with ``np.random.choice``, i.e. from
    NumPy's global random state.

    Args:
        env: Environment exposing ``action_space.n``, ``reset() -> (state, info)``
            and ``step(action) -> (next_state, reward, terminated, truncated, info)``.
            States must be hashable because they are used as dictionary keys.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Probability mass spread uniformly over all actions; the
            remaining ``1 - epsilon`` goes to the currently greedy action.

    Returns:
        Tuple ``(policy, Q)`` where ``policy`` is a ``defaultdict(int)`` mapping
        each state present in ``Q`` to its greedy action (other states read as
        ``0``), and ``Q`` is a ``defaultdict`` mapping states to arrays of
        per-action value estimates.
    """
    # The action count is loop-invariant, so read it once.
    n_actions = env.action_space.n
    action_ids = np.arange(n_actions)

    Q = defaultdict(lambda: np.zeros(n_actions))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)
    policy = defaultdict(int)

    for _ in range(num_episodes):
        # Generate one episode with the current epsilon-greedy policy.
        episode = []
        state, _ = env.reset()
        while True:
            probs = np.ones(n_actions) * epsilon / n_actions
            best_action = np.argmax(Q[state])
            probs[best_action] += (1.0 - epsilon)
            action = np.random.choice(action_ids, p=probs)

            next_state, reward, terminated, truncated, _ = env.step(action)
            episode.append((state, action, reward))
            if terminated or truncated:
                break
            state = next_state

        # Earliest time step of each (state, action) pair in this episode.
        # A "seen" set filled while walking backwards would instead select the
        # LAST occurrence, which is not first-visit MC when pairs repeat.
        first_visit = {}
        for t, (visited_state, taken_action, _reward) in enumerate(episode):
            first_visit.setdefault((visited_state, taken_action), t)

        # Accumulate returns backwards so G is the return from step t onwards.
        G = 0.0
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward
            pair = (state, action)
            if first_visit[pair] == t:
                returns_sum[pair] += G
                returns_count[pair] += 1
                Q[state][action] = returns_sum[pair] / returns_count[pair]

    # Extract the greedy policy from the learned action values.
    for state in Q:
        policy[state] = np.argmax(Q[state])
    return policy, Q



# Policy evaluation: estimate state values under the fixed threshold policy
V = mc_policy_evaluation(policy=simple_policy, env=env)
print("Value of state (20, 10, False):", V[20, 10, False])

# Policy control: learn action values and the greedy policy derived from them
policy, Q = mc_control_epsilon_greedy(env=env)
print("Best action for state (20, 10, False):", policy[20, 10, False])




Value of state (20, 10, False): 0.4171390463487829
Best action for state (20, 10, False): 0
