In [156]:
import gymnasium as gym

import numpy as np
import polars as pl

from collections import defaultdict

import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [157]:
# Shared Blackjack environment used by generate_episodes below.
# NOTE(review): `natural` / `sab` are Blackjack-v1 rule switches; see the
# Gymnasium environment docs for their exact meaning before changing them.
env = gym.make(
    "Blackjack-v1",
    natural=False,
    sab=False,
)


In [158]:
class BlackjackAgent:
    """Fixed-threshold policy driven by the first component of the state.

    The agent does not learn: `update` is a no-op kept so the agent exposes
    the same interface a learning agent would.
    """

    def __init__(self, threshold):
        # Cut-off compared against state[0] when choosing an action.
        self.threshold = threshold

    def get_action(self, state):
        """Return 1 while state[0] is strictly below the threshold, else 0.

        NOTE(review): presumably 1 = hit and 0 = stick in Blackjack-v1 —
        confirm against the environment documentation.
        """
        return 1 if state[0] < self.threshold else 0

    def update(self, state, action, reward, next_state):
        """Accept a transition and do nothing; always returns None."""
        return None


In [159]:
def first_visit_mc_prediction(episodes, gamma=1):
    """Estimate state values with first-visit Monte Carlo prediction.

    For every episode the return following each time step is computed as
    ``G_t = reward_t + gamma * G_{t+1}`` by walking the episode backwards.
    The return is recorded for a state only at the first time step at which
    that state occurs in the episode; later occurrences are ignored.

    Args:
        episodes: Iterable of episodes. Each episode is an iterable of
            ``(state, action, reward)`` tuples, where ``reward`` is the
            reward obtained after taking ``action`` in ``state``. States
            must be hashable because they are used as dictionary keys.
        gamma: Discount factor applied to future rewards. Defaults to 1
            (undiscounted).

    Returns:
        dict mapping each visited state to the mean (via ``np.mean``) of its
        recorded first-visit returns. Empty when ``episodes`` is empty.
    """
    returns_by_state = defaultdict(list)

    for sequence in episodes:
        # Materialise so generators work and the episode can be indexed.
        steps = list(sequence)

        # Earliest time step at which each state appears in this episode.
        first_visit = {}
        for t, (state, _action, _reward) in enumerate(steps):
            first_visit.setdefault(state, t)

        # Backward pass: G accumulates the (discounted) reward-to-go.
        G = 0
        for t in reversed(range(len(steps))):
            state, _action, reward = steps[t]
            G = gamma * G + reward
            if first_visit[state] == t:
                returns_by_state[state].append(G)

    return {
        state: np.mean(returns) for state, returns in returns_by_state.items()
    }

In [160]:
def convert_value_function_to_df(value_function):
    """Build a sorted (player, dealer, value) table from a value estimate.

    Args:
        value_function: Either a dict mapping state -> value, or a
            collection of episodes in the format accepted by
            ``first_visit_mc_prediction`` (which is what the cells in this
            notebook pass). Episodes are converted to a value estimate
            first. State keys are indexed as ``key[0]`` (stored in the
            "player" column) and ``key[1]`` (stored in the "dealer" column).

    Returns:
        A polars DataFrame with columns "player", "dealer" and "value",
        sorted by "player" and then "dealer".
    """
    # Use the argument rather than whatever global `episodes` happens to be
    # bound in the notebook, so the result depends only on the input.
    if isinstance(value_function, dict):
        estimated_value_function = value_function
    else:
        estimated_value_function = first_visit_mc_prediction(value_function)

    rows = [
        {"player": k[0], "dealer": k[1], "value": v}
        for k, v in estimated_value_function.items()
    ]

    return pl.DataFrame(rows).sort("player", "dealer")


In [161]:
def generate_episodes(agent, n_episodes=100_000, environment=None):
    """Roll out `agent` and collect every episode as a list of steps.

    Args:
        agent: Object exposing ``get_action(state)`` and
            ``update(state, action, reward, next_state)``.
        n_episodes: Number of episodes to simulate.
        environment: Object with Gymnasium-style ``reset()`` returning
            ``(state, info)`` and ``step(action)`` returning
            ``(state, reward, terminated, truncated, info)``. Defaults to
            the module-level ``env``.

    Returns:
        A list with one entry per episode. Each entry is a list of
        ``(state[:2], action, reward)`` tuples, where ``state`` is the
        observation the action was chosen in.
    """
    if environment is None:
        environment = env

    episodes = []

    for _ in range(n_episodes):
        state, _info = environment.reset()
        done = False
        sequence = []

        while not done:
            previous_state = state

            # Choose an action in the current state and execute it.
            action = agent.get_action(previous_state)
            state, reward, terminated, truncated, _info = environment.step(action)

            # Hand the agent the actual transition: the state acted in and
            # the state that resulted from the action.
            agent.update(previous_state, action, reward, state)

            # Only the first two observation components are recorded.
            # NOTE(review): presumably (player sum, dealer card), dropping a
            # usable-ace flag — confirm against the environment docs.
            sequence.append((previous_state[:2], action, reward))

            # Stop on either end-of-episode signal so a truncated episode
            # cannot loop forever.
            done = terminated or truncated

        episodes.append(sequence)

    return episodes

In [162]:
# Policy evaluated here: BlackjackAgent with threshold 17.
episodes = generate_episodes(BlackjackAgent(threshold=17))
value_function = convert_value_function_to_df(episodes)

# Surface axes: dealer values 1..10 along x, player values 4..21 along y.
# NOTE(review): assumes every (player, dealer) pair occurs in the sample and
# that the pivot emits dealer columns in ascending order — TODO confirm.
x = np.arange(1, 11)
y = np.arange(4, 22)
z = value_function.pivot(index="player", on="dealer", values="value").drop("player").to_numpy()

fig = go.Figure(data=[go.Surface(x=x, y=y, z=z)])
fig.update_layout(
    title="First Visit MC Prediction",
    width=500,
    height=500,
    margin={"l": 65, "r": 50, "b": 65, "t": 90},
    scene={
        "xaxis_title": "Dealer Showing",
        "yaxis_title": "Player Sum",
        "zaxis_title": "Avg. Reward",
    },
)
fig.show()


In [163]:
# Policy evaluated here: BlackjackAgent with threshold 18.
episodes = generate_episodes(BlackjackAgent(threshold=18), n_episodes=100_000)
value_function = convert_value_function_to_df(episodes)

# Surface axes: dealer values 1..10 along x, player values 4..21 along y.
# NOTE(review): assumes every (player, dealer) pair occurs in the sample and
# that the pivot emits dealer columns in ascending order — TODO confirm.
x = np.arange(1, 11)
y = np.arange(4, 22)
z = value_function.pivot(index="player", on="dealer", values="value").drop("player").to_numpy()

fig = go.Figure(data=[go.Surface(x=x, y=y, z=z)])
fig.update_layout(
    title="First Visit MC Prediction",
    width=500,
    height=500,
    margin={"l": 65, "r": 50, "b": 65, "t": 90},
    scene={
        "xaxis_title": "Dealer Showing",
        "yaxis_title": "Player Sum",
        "zaxis_title": "Avg. Reward",
    },
)
fig.show()


In [164]:
# Policy evaluated here: BlackjackAgent with threshold 19.
episodes = generate_episodes(BlackjackAgent(threshold=19))
value_function = convert_value_function_to_df(episodes)

# Surface axes: dealer values 1..10 along x, player values 4..21 along y.
# NOTE(review): assumes every (player, dealer) pair occurs in the sample and
# that the pivot emits dealer columns in ascending order — TODO confirm.
x = np.arange(1, 11)
y = np.arange(4, 22)
z = (
    value_function.pivot(index="player", on="dealer", values="value")
    .drop("player")
    .to_numpy()
)

fig = go.Figure(data=[go.Surface(x=x, y=y, z=z)])
fig.update_layout(
    title="First Visit MC Prediction",
    width=500,
    height=500,
    margin={"l": 65, "r": 50, "b": 65, "t": 90},
    scene={
        "xaxis_title": "Dealer Showing",
        "yaxis_title": "Player Sum",
        "zaxis_title": "Avg. Reward",
    },
)
fig.show()