adapted from: https://medium.com/@paulswenson2/an-introduction-to-building-custom-reinforcement-learning-environment-using-openai-gym-d8a5e7cf07ea

Creates a basic environment with a single player on a 2D grid.

In [None]:
import random
import numpy as np
import os
import time

from IPython.display import clear_output

from gymnasium import Env
from gymnasium import spaces
# from stable_baselines3.common.env_checker import check_env
# you can use check_env to check whether your environment works properly, but
# the tutorial this code is based on seems to be outdated: check_env is not satisfied with many things

In [None]:
# cell values stored in the flattened grid: 0, 1, 2, 3
NOTHING, PLAYER, WIN, LOSE = range(4)

# action values: 0, 1, 2, 3
UP, DOWN, LEFT, RIGHT = range(4)

In [None]:
class BasicEnv(Env):
    """
    A single-player grid world on an m x n board.

    The board is stored flattened in row-major order as a 1D array of length
    m * n, so cell (row, col) lives at index row * n + col. Every cell holds
    one of NOTHING, PLAYER, WIN or LOSE. The player moves one cell per step;
    landing on the WIN cell ends the episode with reward +1, landing on the
    LOSE cell ends it with reward -1, and any other step yields -0.01.
    """

    def __init__(self, m, n) -> None:
        """
        Build the grid and randomly place the player, WIN and LOSE cells.

        m: number of rows
        n: number of columns

        Raises ValueError if the grid cannot hold three distinct cells.
        """
        if m < 1 or n < 1 or m * n < 3:
            raise ValueError(
                f"grid must have positive dimensions and at least 3 cells, got {m}x{n}"
            )

        self.m = m  # rows
        self.n = n  # columns
        self.total_states = m * n

        # dtype matches the int32 state array built in reset()
        self.observation_space = spaces.Box(0, 3, (self.total_states,), dtype=np.int32)
        self.action_space = spaces.Discrete(4)  # shortcut for defining the actions 0-3

        # init state
        self.reset()

    def print_grid_idx(self):
        """
        helper func to print the flat index of every cell, laid out as the grid
        """
        print(np.arange(self.total_states, dtype=np.int32).reshape(self.m, self.n))

    def print_state_as_grid(self):
        """
        helper func to print the current state, laid out as the grid
        """
        print(self.state.reshape(self.m, self.n))

    def step(self, action):
        """
        Move the player one cell and report the outcome.

        action: one of UP, DOWN, LEFT, RIGHT. A move that would leave the
            grid keeps the player where it is (the step penalty still applies).

        Returns a 4-tuple (state, reward, done, info): the flattened grid,
        the reward of this step, whether the episode ended, and an empty dict.

        Raises ValueError for any other action value.
        """
        info = {}

        done = False
        reward = -0.01
        previous_position = self.player_pos
        row, col = divmod(self.player_pos, self.n)

        # one row up/down is a jump of n (the number of columns) in the flat array
        if action == UP:
            if row > 0:
                self.player_pos -= self.n
        elif action == DOWN:
            if row < self.m - 1:
                self.player_pos += self.n
        elif action == LEFT:
            if col > 0:
                self.player_pos -= 1
        elif action == RIGHT:
            if col < self.n - 1:
                self.player_pos += 1
        else:
            # check for invalid actions
            raise ValueError("invalid action")

        outcome = None
        if self.state[self.player_pos] == WIN:
            reward = 1.0
            done = True
            outcome = 'YOU WIN!!!!'
        elif self.state[self.player_pos] == LOSE:
            reward = -1.0
            done = True
            outcome = 'YOU LOSE'

        # add the step reward exactly once, before it is reported
        self.cumulative_reward += reward

        if done:
            clear_screen()
            print(f'Cumulative Reward: {self.cumulative_reward}')
            print(outcome)
        else:
            # the grid is only redrawn while the episode is running, so the
            # WIN/LOSE marker is never overwritten by the player
            self.state[previous_position] = NOTHING
            self.state[self.player_pos] = PLAYER

        return self.state, reward, done, info

    def render(self):
        """
        Print the cumulative reward and the current grid.
        """
        pretty_print(self.state, self.cumulative_reward, self.m, self.n)

    def reset(self):
        """
        Start a new episode: zero the cumulative reward and place the player,
        WIN and LOSE cells on three distinct, randomly chosen positions.
        """
        self.cumulative_reward = 0

        # sampling without replacement guarantees three different cells
        self.player_pos, self.win_pos, self.lose_pos = random.sample(
            range(self.total_states), 3
        )

        # state is a numpy array (gym requirement)
        self.state = np.full(self.total_states, NOTHING, dtype=np.int32)

        # assign player and win/lose positions
        self.state[self.player_pos] = PLAYER
        self.state[self.win_pos] = WIN
        self.state[self.lose_pos] = LOSE
    
def pretty_print(state_array, cumulative_reward, m, n):
    """
    Clear the screen, then print the cumulative reward and the grid.

    state_array: flattened row-major grid holding at least m * n cell values
    cumulative_reward: reward total shown above the grid
    m: number of rows to print
    n: number of columns to print
    """
    clear_screen()
    print(f'Cumulative Reward: {cumulative_reward}')
    print()
    for i in range(m):
        for j in range(n):
            # the row stride is the column count n, not a fixed 6
            print('{:4}'.format(state_array[i * n + j]), end="")
        print()

def clear_screen():
    """
    Clear the IPython display output and the terminal screen.
    """
    clear_output()
    # "cls" only exists on Windows; other platforms use "clear"
    os.system("cls" if os.name == "nt" else "clear")

In [None]:
# manual entry of actions: type 0-3 (UP, DOWN, LEFT, RIGHT) at each prompt
env = BasicEnv(6, 6)
done = False

# show the grid, read one action, apply it; repeat until the episode ends
while not done:
    env.render()
    time.sleep(0.001)
    action = int(input("Enter action:"))
    state, reward, done, info = env.step(action)

In [None]:
# random actions, capped at MAX_STEPS so a wandering agent cannot run forever
MAX_STEPS = 15

env = BasicEnv(6, 6)
done = False
steps = 0

while not done:
    env.render()
    time.sleep(0.01)
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    steps += 1  # every step counts, including the first one

    # only report the cap when the episode did not already end on this step
    if not done and steps >= MAX_STEPS:
        print(f'{steps} Steps reached. Ending.')
        break