In [1]:
from typing import Union
from enum import Enum, auto

import numpy as np
import matplotlib.pyplot as plt
import random

from tqdm import trange

from maze import Maze, Status, Enviroment

In [2]:
class Agent:
    """Tabular agent that estimates state values by sampling episodes in a maze.

    The agent keeps one value per grid cell (``values``) and one row of move
    weights per grid cell (``policy``). Episodes are sampled from the current
    policy through the ``Enviroment`` object stored in ``envi``.
    """

    # Move labels handed to the environment; column j of ``policy`` weights MOVES[j].
    MOVES = ('n', 's', 'w', 'e')
    # (dx, dy) grid offset paired with each entry of MOVES, in the same order.
    OFFSETS = ((0, -1), (0, 1), (-1, 0), (1, 0))
    # Side length of the square grid; states are indexed as y * GRID_SIZE + x.
    GRID_SIZE = 5

    def __init__(self, seed: Union[str, bytes, int]) -> None:
        """Create the value/policy tables and the environment.

        Args:
            seed: Forwarded unchanged to ``Enviroment(seed=...)``.
        """
        n_states = self.GRID_SIZE * self.GRID_SIZE
        # Every state starts with value 1 and equal weight on all four moves.
        self.values = np.full(n_states, 1, dtype=np.float64)
        self.policy = np.full((n_states, len(self.MOVES)), 1, dtype=np.float64)
        self.envi = Enviroment(seed=seed)

    def _sample_move(self, loc) -> str:
        """Draw one move label for state ``loc`` using its policy row as weights."""
        return random.choices(self.MOVES, weights=self.policy[loc], k=1)[0]

    def vis_pol(self) -> None:
        """Roll out one episode with the current policy and print the visited path.

        Prints the status after every step, then the list of visited locations
        and whatever ``envi.trace_path`` returns for that list.
        """
        self.envi.reset()
        status = Status.IN_PROGRESS
        # assumes the environment starts at state 0 after reset() -- TODO confirm
        current_loc = 0
        path = []
        while status == Status.IN_PROGRESS:
            try:
                move = self._sample_move(current_loc)
                status, new_loc = self.envi.step(move)
                print(status)
                path.append(new_loc)
                # Advance the agent so the next move is sampled from the policy
                # row of the state it is actually in (previously stuck at 0).
                current_loc = new_loc
            except AssertionError:
                # Best-effort visualisation: presumably the environment asserts
                # on an invalid move -- stop and show the path walked so far.
                break
        print(path)
        print(self.envi.trace_path(path))

    def trace(self) -> dict:
        """Sample one episode with the current policy.

        Returns:
            A dict mapping ``(location, step_index)`` to the reward returned by
            ``envi.next`` for that step, in the order the steps were taken.
        """
        self.envi.reset()
        state_reward = {}
        status = Status.IN_PROGRESS
        # assumes the environment starts at state 0 after reset() -- TODO confirm
        current_loc = 0
        step = 0
        while status == Status.IN_PROGRESS:
            move = self._sample_move(current_loc)
            status, reward, new_loc = self.envi.next(move)
            state_reward[(new_loc, step)] = reward
            # Follow the agent: the next move must come from the new state's
            # policy row, not from the start state's.
            current_loc = new_loc
            step += 1
        return state_reward

    def approx_values(self, episodes: int = 10000) -> None:
        """Estimate ``values`` from sampled episodes and refresh ``policy``.

        For every episode, the reward recorded at the first visit of each state
        is averaged into that state's value; the policy is then recomputed from
        the updated values.

        Args:
            episodes: Number of episodes to sample (default 10000).
        """
        # Number of first-visit samples folded into each state's value so far.
        visits = np.zeros(self.values.shape[0], dtype=np.int64)
        for _ in trange(episodes):
            rewards = self.trace()
            seen = set()
            for (state, _step), reward in rewards.items():
                if state in seen:
                    continue
                seen.add(state)
                visits[state] += 1
                # Incremental mean: the first sample replaces the initial value,
                # later samples are averaged in without re-scanning a list.
                self.values[state] += (reward - self.values[state]) / visits[state]
            self._update_policy()

    def _update_policy(self) -> None:
        """Set each state's move weights to a softmax over negated neighbour values."""
        size = self.GRID_SIZE
        for state in range(size * size):
            x, y = state % size, state // size
            # NOTE(review): off-grid neighbours keep value 0, so moves leaving
            # the grid still receive weight exp(0) -- confirm this is intended.
            neighbors = np.zeros(len(self.OFFSETS))
            for j, (dx, dy) in enumerate(self.OFFSETS):
                nx, ny = x + dx, y + dy
                if 0 <= nx < size and 0 <= ny < size:
                    neighbors[j] = self.values[ny * size + nx]
            # Subtracting the largest logit leaves the softmax unchanged but
            # keeps np.exp from overflowing when values are large and negative.
            logits = -neighbors
            logits -= logits.max()
            weights = np.exp(logits)
            self.policy[state] = weights / weights.sum()

In [None]:
# Seed Python's global RNG so the moves drawn with random.choices inside Agent
# are reproducible on a fresh "Restart & Run All"; the same seed is handed to
# the environment.
SEED = b'imbored'
random.seed(SEED)

agent = Agent(SEED)
# Print the environment's `envi` attribute (presumably the maze layout).
print(agent.envi.envi)
# Estimate state values from sampled episodes, then walk one episode and print it.
agent.approx_values()
agent.vis_pol()