## Gymnasium

In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
import gymnasium as gym
from gymnasium.wrappers import FlattenObservation
from skrl.models.torch.deterministic import DeterministicMixin
from skrl.models.torch import Model
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG


### Blackjack

In [2]:
env = gym.make('Blackjack-v1', natural=False, sab=False)

In [10]:
class DeterministicActor(DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device="cpu", clip_acitons=False):
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_acitons)
        
        # if isinstance(observation_space, gym.spaces.Tuple):
        #     self.observation_spaces = observation_space.spaces
        #     self.num_observations = sum([space.shape[0] for space in self.observation_spaces])
        # else:
        #     raise ValueError("Observation space must be of type gym.spaces.Tuple")
        
        self.linear_layer_1 = nn.Linear(self.num_observations, 32)
        self.linear_layer_2 = nn.Linear(32, 16)
        self.action_layer = nn.Linear(16, self.num_actions)
    
    def compute(self, inputs, role):
        concatenated_inputs = torch.cat(
            [inputs["states"][space_idx] for space_idx in range(len(self.observation_spaces))],
            dim=1
        )
        
        x = nn.functional.relu(self.linear_layer_1(concatenated_inputs))
        x = nn.functional.relu(self.linear_layer_2(x))
        return torch.sigmoid(self.action_layer(x)), {}

In [11]:
class DeterministicCritic(DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device="cpu", clip_acitons=False):
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_acitons)
        
        self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 32)
        self.linear_layer_2 = nn.Linear(32, 16)
        self.linear_layer_3 = nn.Linear(16, 1)
    
    def compute(self, inputs, role):
        x = nn.functional.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_action"]], dim=1)))
        x = nn.functional.relu(self.linear_layer_2(x))
        return self.linear_layer_3(x), {}

In [12]:
memory = RandomMemory(memory_size=1000)

In [13]:
models = {}
models["policy"] = DeterministicActor(gym.spaces.flatten_space(env.observation_space), env.action_space)
models["target_policy"] = DeterministicActor(gym.spaces.flatten_space(env.observation_space), env.action_space)
models["policy"] = DeterministicCritic(gym.spaces.flatten_space(env.observation_space), env.action_space)
models["target_policy"] = DeterministicCritic(gym.spaces.flatten_space(env.observation_space), env.action_space)

In [14]:
for model in models.values():
    model.init_parameters(method_name="normal_", mean=0.0, std=0.1)

In [15]:
cfg_agent = DDPG_DEFAULT_CONFIG.copy()
cfg_agent["discount_factor"] = 0.9
agent = DDPG(models=models,
             memory=memory,  # only required during training
             cfg=cfg_agent,
             observation_space=env.observation_space,
             action_space=env.action_space,
             device="cpu")

In [16]:
num_episodes = 1000  # Set the number of episodes for training
for episode in range(num_episodes):
    obs = env.reset()
    done = False
    episode_reward = 0
    timestep = 0 
    while not done:
        action = agent.act(obs, timestep, num_episodes)
        next_obs, reward, done, _ = env.step(action)
        episode_reward += reward
        agent.replay_buffer.push(obs, action, reward, next_obs, done)
        obs = next_obs
        agent.update()
        timestep += timestep
    print(f"Episode {episode + 1}: Total Reward: {episode_reward}")

KeyError: 'taken_action'

In [19]:
env.reset()

((20, 10, 0), {})