In [75]:
from collections import defaultdict
import numpy as np 

class Env:
    """Small grid world that reports the reward attached to a grid cell.

    The grid is ``HEIGHT`` x ``WIDTH`` (4 x 4). Every cell has reward 0 except
    ``(0, 3)``, which pays +1, and ``(3, 3)``, which pays -1.
    """

    def __init__(self):
        self.WIDTH = 4
        self.HEIGHT = 4
        self.reward_map = np.zeros((self.HEIGHT, self.WIDTH), dtype=int)
        self.reward_map[0, 3] = 1  # reward
        self.reward_map[3, 3] = -1  # punish

    def next_step(self, state):
        """Return the reward for stepping onto ``state``.

        Args:
            state: ``(row, col)`` position; any indexable pair works (tuple,
                list or NumPy array).

        Returns:
            The ``reward_map`` entry for an on-grid position, or ``-1`` when
            the position falls outside the grid.
        """
        row, col = state[0], state[1]
        # Bounds follow the configured grid size rather than a literal 4.
        if 0 <= row < self.HEIGHT and 0 <= col < self.WIDTH:
            # Index with explicit (row, col): indexing with an ndarray state
            # would select whole rows instead of a single cell.
            return self.reward_map[row, col]
        return -1


class Agent:
    """Grid-world agent that tracks its position and per-state action values.

    Two independent value stores are kept:

    * ``memory`` -- dict based, read through ``expect`` / ``best_action``.
    * ``new_memory`` -- array based, read through ``expect2`` / ``best_action2``.

    Actions are the integer ids in ``actions``: 1 = up, 2 = right, 3 = down,
    4 = left. States are ``(row, col)`` pairs.
    """

    # (row, col) offset produced by each action id.
    _MOVES = {1: (-1, 0), 2: (0, 1), 3: (1, 0), 4: (0, -1)}

    def __init__(self, env: "Env"):
        """Create an agent at ``(3, 0)`` sized to the grid of ``env``.

        Args:
            env: Environment exposing integer ``HEIGHT`` and ``WIDTH``.
        """
        self.state = np.array((3, 0))  # initial location, current location
        self.memory = defaultdict(dict)  # (row, col) -> {action: value}
        self.initial_action_rewards = {1: 0, 2: 0, 3: 0, 4: 0}
        self.actions = [1, 2, 3, 4]  # up, right, down, left
        self.reward = 0  # reward total; no method of this class modifies it

        # Remember the grid size so bounds checks agree with the environment.
        self.height = env.HEIGHT
        self.width = env.WIDTH

        # Shape (H, W, number of actions); slot k holds the value of actions[k].
        self.new_memory = np.zeros((self.height, self.width, len(self.actions)))

    def expect2(self, state: tuple[int, int]):
        """Return the action-value row of ``new_memory`` for ``state``.

        The row is indexed by position in ``actions`` (slot 0 is action 1).
        """
        return self.new_memory[state[0], state[1]]

    def best_action2(self, history_of_state):
        """Return the action id with the highest value in ``history_of_state``.

        Ties resolve to the earliest action because ``np.argmax`` returns the
        first maximal index.
        """
        return self.actions[np.argmax(history_of_state)]

    def expect(self, state: tuple[int, int]):
        """Return the ``{action: value}`` dict stored in ``memory`` for ``state``.

        A state with no recorded values is seeded with a copy of
        ``initial_action_rewards``. Copying matters: sharing one dict would
        make an update for a single state show up in every other state and in
        the defaults themselves.
        """
        key = tuple(state)
        if not self.memory[key]:
            self.memory[key] = dict(self.initial_action_rewards)
        return self.memory[key]

    def best_action(self, state, history_of_state):
        """Return the action id with the highest value in ``history_of_state``.

        Args:
            state: Unused; kept so existing callers keep working.
            history_of_state: Non-empty ``{action: value}`` mapping.

        Returns:
            The key holding the largest value; ties resolve to the key that
            comes first in the mapping.

        Raises:
            ValueError: If ``history_of_state`` is empty.
        """
        # Select the key directly instead of mapping a positional argmax back
        # through self.actions, which is only right when the dict holds every
        # action in the same order as self.actions.
        return max(history_of_state, key=history_of_state.get)

    def state_transition(self, action):
        """Return the position reached by applying ``action`` to ``self.state``.

        Neither ``self.state`` nor the grid bounds are touched here; pass the
        result to ``move_to_state`` to clamp it to the grid.

        Raises:
            ValueError: If ``action`` is not one of 1, 2, 3, 4.
        """
        try:
            offset = self._MOVES[action]
        except (KeyError, TypeError):
            raise ValueError(f"Invalid action: {action!r}") from None
        return self.state + np.array(offset)

    def move_to_state(self, new_state):
        """Return ``new_state`` if it lies on the grid, else the current state.

        ``self.state`` is not modified; the caller assigns the result.
        """
        if 0 <= new_state[0] < self.height and 0 <= new_state[1] < self.width:
            return new_state
        return self.state



In [76]:
env = Env()


In [77]:
env.reward_map

array([[ 0,  0,  0,  1],
       [ 0,  0,  0,  0],
       [ 0,  0,  0,  0],
       [ 0,  0,  0, -1]])

In [78]:
agent = Agent(env)

In [79]:
agent.state

array([3, 0])

In [87]:
agent.new_memory

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

In [90]:
agent.new_memory[0, 0]
agent.actions

[1, 2, 3, 4]

In [92]:
agent.actions.index(3)

2

In [85]:
agent.new_memory[0, 3]

array([0., 0., 0., 0.])

In [52]:
# Action-value row (one slot per action) for the agent's current cell.
hist = agent.expect2(agent.state)
hist 

array([0., 0., 0., 0.])

In [54]:
# Greedy choice over hist; with every value tied at 0 this yields action 1.
action = agent.best_action2(hist)
action

1

In [59]:
# Candidate position after applying the action; bounds are not checked here.
new_state = agent.state_transition(action)
new_state 

array([2, 0])

In [66]:
# Reward the environment assigns to the candidate position (-1 if off the grid).
reward = env.next_step(tuple(new_state))
reward 

0

In [67]:
# Keep the candidate only if it is on the grid; otherwise agent.state comes back.
# agent.state itself is not changed by this call.
move_to_new_state = agent.move_to_state(new_state)
move_to_new_state

array([2, 0])

In [72]:
# One interaction step: choose greedily, score the candidate position,
# record the reward for the chosen action, then move (or stay put).
history = agent.expect2(agent.state)
action = agent.best_action2(history)
new_state = agent.state_transition(action)
reward = env.next_step(tuple(new_state))
# Action ids are 1-based but the value row is 0-based, so translate the id to
# its slot; indexing with the raw id updates the wrong action and overflows
# the row for action 4.
agent.new_memory[tuple(agent.state)][agent.actions.index(action)] += reward

move_to_new_state = agent.move_to_state(new_state)

agent.state = move_to_new_state
agent.reward += reward

In [None]:

# Scratch array with the same 4 x 4 x 4 shape as agent.new_memory, for indexing experiments.
m = np.zeros((4,4,4))


array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

In [33]:
# Overwrite all four values stored at cell (2, 3) in one assignment.
m[(2, 3)] = [0, 0, -20, 0]

In [21]:
# m[0, 3] is a view into m, so writing to its slot 1 updates m in place.
m[0, 3][1] = -10

In [16]:
env.reward_map[3, 3]

-1

In [36]:
# With several entries tied for the maximum, argmax reports the first one (index 0 here).
np.argmax(m[2, 3])

0

In [None]:
# Goal: when several actions tie for the maximum value, pick one of them at
# random instead of always taking the first index that argmax returns.
# np.unique lists the distinct values present at this cell as a first look.

np.unique(m[2,3], )

array([-20.,   0.])

In [34]:
m

array([[[  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.],
        [  0., -10.,   0.,   0.]],

       [[  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.]],

       [[  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.],
        [  0.,   0., -20.,   0.]],

       [[  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.],
        [  1., -10.,   0.,   0.]]])

In [86]:
agent.memory[0, 0]

{}

In [97]:
np.random.rand()

0.2280029074692811

In [114]:
# Sanity check: tally 1000 draws of np.random.randint(0, 4) to see how evenly
# the values 0..3 come up (the upper bound 4 is exclusive).
score = {0:0, 1:0, 2:0, 3:0}
for i in range(1000):
    result = np.random.randint(0, 4)
    score[result] += 1  


In [115]:
score 

{0: 264, 1: 250, 2: 254, 3: 232}