In [None]:
!pip install stable-baselines3[extra]


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.6.2-py3-none-any.whl (170 kB)
     |████████████████████████████████| 170 kB 5.2 MB/s 
Collecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
     |████████████████████████████████| 1.5 MB 51.0 MB/s 
Collecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting rich
  Downloading rich-12.6.0-py3-none-any.whl (237 kB)
     |████████████████████████████████| 237 kB 53.5 MB/s 
Collecting ale-py==0.7.4
  Downloading ale_py-0.7.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
     |████████████████████████████████| 1.6 MB 34.3 MB/s 
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ...

In [None]:
import numpy as np
import gym
from gym import spaces

In [None]:
from gym.utils.seeding import np_random
class YatzeeEnv(gym.Env):
  """
  Yatzee environment that follows the gym interface.

  One episode is a single Yatzee turn: five dice are thrown, a target slot
  of the score sheet is drawn, and the agent decides per step which dice to
  keep and which to throw again so that the target slot scores as high as
  possible according to Yatzee rules.

  Observation (np.array of 7 ints):
    [dice-1 ; dice-2 ; dice-3 ; dice-4 ; dice-5 ; #attempts-left ; target-slot]

  Action (MultiBinary(5)): 1 = throw that dice again, 0 = keep it.

  Target slots:
    1..6 sum of dice showing that many pips, 7 three of a kind,
    8 four of a kind, 9 full house, 10 short street, 11 long street,
    12 yatzee, 13 (or anything else) chance.
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {'render.modes': ['console']}

  NUM_DICE = 5
  MAX_ATTEMPTS = 3
  NUM_SLOTS = 13
  # reward handed out when the agent re-throws although the slot already has its maximum score
  RETHROW_PENALTY = -50
  # Highest target slot drawn by reset(). The original code only ever drew
  # slots 1..6 (upper section of the sheet); raise this up to NUM_SLOTS to
  # train on the lower section as well.
  max_target_slot = 6

  def __init__(self, grid_size=10):
    """
    :param grid_size: not used by the game logic; only stored so that
      existing ``YatzeeEnv(grid_size=...)`` calls keep working
    """
    super().__init__()
    self.grid_size = grid_size

    # action space of type MultiBinary tells which dice to throw again (1) and which to keep (0)
    self.action_space = spaces.MultiBinary(self.NUM_DICE)

    # MultiDiscrete components take values 0..n-1, so each bound is (highest value + 1):
    # dice 1..6 -> 7, attempts 0..3 -> 4, target slot 1..13 -> 14
    self.observation_space = spaces.MultiDiscrete(
        [7] * self.NUM_DICE + [self.MAX_ATTEMPTS + 1, self.NUM_SLOTS + 1])

    # current observation; kept apart from observation_space, which must stay a gym Space
    self.state = None
    # action of the most recent step(); default for the helpers that take an optional action
    self.last_action = np.zeros(self.NUM_DICE, dtype=np.int64)

  def reset(self):
    """
    Start a new turn: throw all dice, draw a target slot, reset the attempts.

    Important: the observation must be a numpy array
    :return: (np.array) the initial observation
    """
    self.state = np.empty(self.NUM_DICE + 2, dtype=np.int64)
    # np.random.randint excludes `high`, so high=7 yields pips 1..6
    self.state[:self.NUM_DICE] = np.random.randint(low=1, high=7, size=self.NUM_DICE)
    # initiate number of attempts (counting 3-2-1)
    self.state[-2] = self.MAX_ATTEMPTS
    # indicator what yatzee slot to fill
    self.state[-1] = np.random.randint(low=1, high=self.max_target_slot + 1)
    self.last_action = np.zeros(self.NUM_DICE, dtype=np.int64)

    return self.state.copy()

  # some helper functions
  def _asRethrowMask(self, action):
    """Return a boolean array of length 5, True where the action asks for a re-throw."""
    mask = np.asarray(action).reshape(-1) == 1
    if mask.size != self.NUM_DICE:
      raise ValueError(
          f"action must have {self.NUM_DICE} entries, got {mask.size}")
    return mask

  @staticmethod
  def _hasEqualRun(sorted_dices, length):
    """
    Tell whether `length` neighbouring entries of the ascending-sorted dice are equal.
    In a sorted array equal end points imply that everything in between is equal too.
    """
    last_start = len(sorted_dices) - length
    return any(sorted_dices[start] == sorted_dices[start + length - 1]
               for start in range(last_start + 1))

  def attemptsLeft(self):
    """:return: (int) number of attempts left in this turn"""
    return int(self.state[-2])

  def decAttempts(self):
    """Count down the attempts; never below 0 so the observation stays inside its space."""
    self.state[-2] = max(int(self.state[-2]) - 1, 0)

  def actionRequiresRethrow(self, action=None):
    """
    :param action: action to inspect; defaults to the action of the last step()
    :return: (bool) True if at least one dice is marked for a re-throw
    """
    chosen = self.last_action if action is None else action
    return bool(np.any(self._asRethrowMask(chosen)))

  def countSamePips(self, pipsToCount):
    """
    :param pipsToCount: pip value (1..6) to look for
    :return: (maxSum, actualSum) best possible score (all 5 dice match) and
      the pips of the dice that actually match
    """
    dices = self.state[:self.NUM_DICE]
    maxSum = self.NUM_DICE * int(pipsToCount)
    actualSum = int(np.sum(dices[dices == pipsToCount]))
    return maxSum, actualSum

  def countAllPips(self):
    """:return: (int) sum of the pips of all dice"""
    return int(np.sum(self.state[:self.NUM_DICE]))

  def throwDices(self, action=None):
    """
    Throw again every dice marked with 1.

    :param action: dice selection; defaults to the action of the last step()
    """
    chosen = self.last_action if action is None else action
    mask = self._asRethrowMask(chosen)
    # slicing returns a view, so the assignment below writes into self.state
    dices = self.state[:self.NUM_DICE]
    dices[mask] = np.random.randint(low=1, high=7, size=int(mask.sum()))

  def getReward(self):
    """
    Score the current dice for the target slot.

    :return: (max_reward, reward) highest score the slot can give and the
      score of the current dice
    """
    max_reward, reward = 0, 0
    target_slot = int(self.state[-1])
    sorted_dices = np.sort(self.state[:self.NUM_DICE])
    distinct_pips = set(sorted_dices.tolist())

    # sum of 1-pip dices .. 6-pip dices
    if 1 <= target_slot <= 6:
      max_reward, reward = self.countSamePips(target_slot)

    # three-the-same
    elif target_slot == 7:
      max_reward = 3 * 6
      if self._hasEqualRun(sorted_dices, 3):
        # the middle dice always belongs to a triple of five sorted dice
        reward = 3 * int(sorted_dices[2])

    # four-the-same
    elif target_slot == 8:
      max_reward = 4 * 6
      if self._hasEqualRun(sorted_dices, 4):
        reward = 4 * int(sorted_dices[2])

    # full house (triple + pair in either order; five equal dice qualify as well)
    elif target_slot == 9:
      max_reward = 25
      triple_then_pair = (sorted_dices[0] == sorted_dices[2]) and (sorted_dices[3] == sorted_dices[4])
      pair_then_triple = (sorted_dices[0] == sorted_dices[1]) and (sorted_dices[2] == sorted_dices[4])
      if triple_then_pair or pair_then_triple:
        reward = 25

    # short street: four consecutive pip values, the fifth dice is arbitrary
    elif target_slot == 10:
      max_reward = 30
      if any({low, low + 1, low + 2, low + 3} <= distinct_pips for low in (1, 2, 3)):
        reward = 30

    # long street: five different values spanning exactly 4 are five consecutive values
    elif target_slot == 11:
      max_reward = 40
      if len(distinct_pips) == self.NUM_DICE and int(sorted_dices[-1] - sorted_dices[0]) == 4:
        reward = 40

    # yatzee: all dice equal
    elif target_slot == 12:
      max_reward = 50
      if sorted_dices[0] == sorted_dices[-1]:
        reward = 50

    # chance
    else:
      max_reward = 30
      reward = self.countAllPips()

    return max_reward, reward

  def step(self, action):
    """
    Use up one attempt and, unless it was the last one, re-throw the selected dice.

    The reward is always the score of the dice *before* the re-throw. When
    the last attempt is reached the episode ends and `action` is not applied.

    :param action: array-like of 5 entries, 1 = throw again, 0 = keep
    :return: (observation, reward, done, info)
    :raises RuntimeError: if reset() was not called before
    :raises ValueError: if `action` does not have 5 entries
    """
    if self.state is None:
      raise RuntimeError("reset() must be called before step()")
    rethrow_mask = self._asRethrowMask(action)
    self.last_action = rethrow_mask.astype(np.int64)

    # countdown #attempts
    self.decAttempts()

    # calculate reward in any case
    max_reward, reward = self.getReward()

    if self.attemptsLeft() <= 0:
      return self.state.copy(), reward, True, {"msg": "reached last attempt"}

    # perform re-throw
    self.throwDices(rethrow_mask)

    # re-throw in spite of max points reached
    if rethrow_mask.any() and max_reward == reward:
      return self.state.copy(), self.RETHROW_PENALTY, False, {"msg": "rethrow in spite of max points reached"}

    # regular return
    return self.state.copy(), reward, False, {"msg": "regular return not done yet"}

  def render(self, mode='console'):
    """
    Print dice, attempts left and target slot to the console.

    :raises NotImplementedError: for any mode other than 'console'
    """
    if mode != 'console':
      raise NotImplementedError()
    if self.state is None:
      print("environment has not been reset yet")
      return
    dices = " ".join(str(int(pips)) for pips in self.state[:self.NUM_DICE])
    print(f"dices: {dices} | attempts left: {self.attemptsLeft()} | target slot: {int(self.state[-1])}")

  def close(self):
    """Nothing to release."""
    pass
