# Solving Blackjack with Q-Learning

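**Actions** (with the integer codes used by the environment):
- stand (action `3`)
- hit (action `2`)
- bet big (action `1`)
- bet small (action `0`)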


## Imports and Environment Setup

In [1]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [3]:
from __future__ import annotations
from collections import defaultdict, deque

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from tqdm import tqdm

import gymnasium as gym
from gym import spaces
from gymnasium.spaces import Discrete, Tuple, Box
from gymnasium.envs.toy_text.blackjack import BlackjackEnv

In [4]:
class CustomBlackjackEnv(gym.Env):
    """Blackjack with a bankroll and an explicit betting step before each hand.

    Every round goes through two phases:

    * ``"bet"``  -- only actions 0 (small bet) and 1 (big bet) are legal.
    * ``"play"`` -- only actions 2 (hit) and 3 (stand) are legal.

    An action that is illegal for the current phase leaves the state untouched
    and yields ``illegal_action_penalty``. Placing a bet deals a fresh hand to
    both player and dealer. An episode ends when a round finishes and the
    balance is zero or less.

    Observations are tuples ``(player total, dealer's first card, usable-ace
    flag, balance)``. ``reset`` returns only the observation and ``step``
    returns the 4-tuple ``(obs, reward, done, info)``.
    """

    STARTING_BALANCE = 500
    SMALL_BET = 50
    BIG_BET = 100

    def __init__(self):
        super().__init__()

        # Action space: 0 = small bet, 1 = big bet, 2 = hit, 3 = stand.
        # The spaces come from the same package as the ``gym.Env`` base class.
        self.action_space = gym.spaces.Discrete(4)

        # NOTE(review): the balance component is declared as 0..1000, but
        # winnings are not clipped, so a long winning streak can exceed it.
        self.observation_space = gym.spaces.Tuple((
            gym.spaces.Discrete(32),    # Player's hand total (0-31 to handle busts)
            gym.spaces.Discrete(11),    # Dealer's showing card (1-10)
            gym.spaces.Discrete(2),     # Usable ace (0 or 1)
            gym.spaces.Discrete(1001),  # Balance (0 to 1000)
        ))

        # Heavy penalty for illegal actions
        self.illegal_action_penalty = -100

        # Initialize environment state
        self.reset()

    def reset(self):
        """Start a new episode and return the first observation.

        The balance is restored to the starting amount, the phase is set to
        ``"bet"`` and an initial hand is dealt so an observation is available.
        """
        self.balance = self.STARTING_BALANCE
        self.phase = "bet"
        self.current_bet = 0
        self.player_hand = []
        self.dealer_hand = []
        self.usable_ace_player = False
        self.done = False
        return self._deal_initial_hands()

    def _deal_initial_hands(self):
        """Deal two cards each to player and dealer; return the observation."""
        self.player_hand = [self._draw_card(), self._draw_card()]
        self.dealer_hand = [self._draw_card(), self._draw_card()]
        self.usable_ace_player = self._has_usable_ace(self.player_hand)
        return self._get_obs()

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, done, info)``.

        Raises:
            ValueError: if the environment is in an unknown phase.
        """
        if self.phase == "bet":
            if action not in [0, 1]:  # Invalid action during betting phase
                return self._get_obs(), self.illegal_action_penalty, False, {}
            return self._bet_phase(action)

        if self.phase == "play":
            if action not in [2, 3]:  # Invalid action during gameplay phase
                return self._get_obs(), self.illegal_action_penalty, False, {}
            return self._play_phase(action)

        raise ValueError("Invalid game phase.")

    def _bet_phase(self, action):
        """Place the bet, deal a new hand and switch to the play phase.

        Raises:
            ValueError: if ``action`` is not a betting action.
        """
        if action == 0:  # Small bet
            stake = self.SMALL_BET
        elif action == 1:  # Big bet
            stake = self.BIG_BET
        else:
            raise ValueError("Invalid action during betting phase.")

        # Never stake more than is available, so the balance cannot go
        # negative (the observation space only covers non-negative balances).
        self.current_bet = min(stake, self.balance)

        # The stake leaves the balance now; it is paid back on a win or push.
        self.balance -= self.current_bet

        # Deal new cards to player and dealer
        self._deal_initial_hands()

        # Transition to gameplay phase
        self.phase = "play"
        return self._get_obs(), 0, False, {}

    def _play_phase(self, action):
        """Handle hit / stand and settle the round when it ends.

        Raises:
            ValueError: if ``action`` is not a gameplay action.
        """
        if action == 2:  # Hit
            self.player_hand.append(self._draw_card())
            self.usable_ace_player = self._has_usable_ace(self.player_hand)
            if self._hand_value(self.player_hand) > 21:  # Player busts
                # The stake was already deducted when the bet was placed.
                return self._finish_round(-self.current_bet)
            return self._get_obs(), 0, False, {}

        if action == 3:  # Stand
            # Dealer draws until reaching at least 17.
            while self._hand_value(self.dealer_hand) < 17:
                self.dealer_hand.append(self._draw_card())

            player_value = self._hand_value(self.player_hand)
            dealer_value = self._hand_value(self.dealer_hand)

            if dealer_value > 21 or player_value > dealer_value:
                reward = self.current_bet
                self.balance += 2 * self.current_bet  # Stake back plus winnings
            elif player_value == dealer_value:
                reward = 0
                self.balance += self.current_bet  # Push: the stake is returned
            else:
                reward = -self.current_bet

            return self._finish_round(reward)

        raise ValueError("Invalid action during gameplay phase.")

    def _finish_round(self, reward):
        """Return to the betting phase and build the round-ending step result."""
        self.phase = "bet"
        self.done = self.balance <= 0
        return self._get_obs(), reward, self.done, {}

    def _draw_card(self):
        """Draw a card value, uniformly from 1 to 10.

        NOTE(review): every value is equally likely here, whereas a real deck
        holds four times as many ten-valued cards -- confirm this is intended.
        """
        return np.random.randint(1, 11)

    @staticmethod
    def _has_usable_ace(hand):
        """Return True if ``hand`` has an ace that can count as 11 without busting."""
        return 1 in hand and sum(hand) + 10 <= 21

    def _hand_value(self, hand):
        """Return the hand total, counting one ace as 11 when that does not bust."""
        value = sum(hand)
        if self._has_usable_ace(hand):
            return value + 10  # Use ace as 11
        return value

    def _get_obs(self):
        """Return ``(player total, dealer's first card, usable ace, balance)``."""
        return (
            self._hand_value(self.player_hand),  # Player's hand total
            self.dealer_hand[0],                 # Dealer's showing card
            int(self.usable_ace_player),         # Usable ace
            self.balance,                        # Balance
        )


## Observing our Custom Environment

In [5]:
# Create the environment; its constructor already calls reset(), so a first hand is dealt.
env = CustomBlackjackEnv()

In [6]:
# Play one episode with uniformly random actions (legal or not) and print
# every transition until the environment reports that the episode is over.
obs = env.reset()
print(f"Initial Observation: {obs}")

while True:
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    print(f"Action: {action}, Observation: {obs}, Reward: {reward}, Done: {done}")
    if done:
        break

Initial Observation: (10, 4, 0, 500)
Action: 2, Observation: (10, 4, 0, 500), Reward: -100, Done: False
Action: 1, Observation: (7, 8, 0, 400), Reward: 0, Done: False
Action: 3, Observation: (7, 8, 0, 400), Reward: -100, Done: False
Action: 0, Observation: (14, 4, 0, 350), Reward: 0, Done: False
Action: 0, Observation: (14, 4, 0, 350), Reward: -100, Done: False
Action: 1, Observation: (14, 4, 0, 350), Reward: -100, Done: False
Action: 1, Observation: (14, 4, 0, 350), Reward: -100, Done: False
Action: 2, Observation: (19, 4, 0, 350), Reward: 0, Done: False
Action: 1, Observation: (19, 4, 0, 350), Reward: -100, Done: False
Action: 0, Observation: (19, 4, 0, 350), Reward: -100, Done: False
Action: 2, Observation: (25, 4, 0, 350), Reward: -50, Done: False
Action: 0, Observation: (14, 10, 0, 300), Reward: 0, Done: False
Action: 0, Observation: (14, 10, 0, 300), Reward: -100, Done: False
Action: 2, Observation: (21, 10, 0, 300), Reward: 0, Done: False
Action: 1, Observation: (21, 10, 0, 300)

## Agent Training

### Setup

In [7]:
import numpy as np
import random
from collections import defaultdict

In [8]:
# Tabular action values keyed by (state, action); unseen pairs read as 0.0.
Q = defaultdict(float)

# Update-rule hyperparameters: learning rate, discount factor, exploration rate.
alpha, gamma, epsilon = 0.1, 0.99, 0.1

# Training schedule: number of episodes and the per-episode cap on rounds.
episodes, max_rounds = 10_000, 20

In [9]:
def discretize_state(state):
    """
    Map a raw observation onto the coarser key used by the Q-table.

    The hand total is capped at 21 and the balance is bucketed into steps
    of 100 with a top bucket of 5; the dealer card and the usable-ace flag
    pass through unchanged.
    """
    hand_total, dealer_card, usable_ace, raw_balance = (
        state[0], state[1], state[2], state[3]
    )
    capped_total = min(hand_total, 21)
    balance_bucket = min(raw_balance // 100, 5)
    return (capped_total, dealer_card, usable_ace, balance_bucket)

In [10]:
def train_q_learning(env, Q, alpha, gamma, epsilon, episodes, max_rounds):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(obs, reward, done, info)``.
        Q: Mapping from ``(state, action)`` to value, updated in place. It is
            indexed for unseen pairs, so it must supply a default value.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Initial exploration rate; multiplied by 0.995 after every
            episode, with a floor of 0.01.
        episodes: Number of episodes to run.
        max_rounds: Maximum number of ``env.step`` calls per episode.

    Returns:
        The same ``Q`` object that was passed in.
    """
    actions = range(env.action_space.n)

    for _ in range(episodes):
        state = discretize_state(env.reset())
        done = False
        rounds = 0

        while not done and rounds < max_rounds:
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()  # Explore
            else:
                action = max(actions, key=lambda a: Q[(state, a)])  # Exploit

            next_state, reward, done, _ = env.step(action)
            next_state = discretize_state(next_state)

            # Nothing follows a terminal transition, so its target is just the
            # reward; bootstrapping there would leak in the value of whatever
            # non-terminal states share the same discretized bucket.
            if done:
                future_value = 0.0
            else:
                future_value = max(Q[(next_state, a)] for a in actions)

            td_error = reward + gamma * future_value - Q[(state, action)]
            Q[(state, action)] += alpha * td_error

            state = next_state
            rounds += 1

        # Decay exploration between episodes
        epsilon = max(0.01, epsilon * 0.995)

    return Q

In [11]:
def evaluate_agent(env, Q, episodes=100, max_steps=1000):
    """Run the greedy policy derived from ``Q`` and return the mean episode reward.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(obs, reward, done, info)``.
        Q: Mapping from ``(state, action)`` to value.
        episodes: Number of evaluation episodes.
        max_steps: Upper bound on ``env.step`` calls per episode. Without it
            a greedy policy can loop forever: when its preferred action is
            rejected as illegal the state does not change, so the same action
            is picked again on every following step.

    Returns:
        Sum of all rewards collected, divided by ``episodes``.
    """
    total_rewards = 0
    actions = range(env.action_space.n)

    for _ in range(episodes):
        state = discretize_state(env.reset())
        done = False
        steps = 0

        while not done and steps < max_steps:
            # Always take the best-valued action
            action = max(actions, key=lambda a: Q[(state, a)])
            next_state, reward, done, _ = env.step(action)
            state = discretize_state(next_state)
            total_rewards += reward
            steps += 1

    return total_rewards / episodes

In [17]:
# Fresh environment for the training run.
env = CustomBlackjackEnv()

# Train the agent with the hyperparameters defined in the setup cell.
Q = train_q_learning(
    env,
    Q,
    alpha=alpha,
    gamma=gamma,
    epsilon=epsilon,
    episodes=episodes,
    max_rounds=max_rounds,
)

# Evaluate the agent (currently disabled)
# avg_reward = evaluate_agent(env, Q, episodes=100)
# print(f"Average Reward Over 100 Episodes: {avg_reward}")


In [None]:
# Completion marker: presumably printed to show that the cells above finished running.
print("Done")