In [1]:
import numpy as np
import random
import math
from IPython.display import clear_output
import time

In [9]:
class Grid_World:
    def __init__(self,width,height):
        self.width = width
        self.height = height
        x_goal = random.randint(0,width-1)
        y_goal = random.randint(0,height-1)
        self.goal = (x_goal,y_goal)
        self.reward_goal = 10
        self.reward_step = -1
        
    def get_new_state(self,previous_state, action):
        (x,y) = previous_state
        if action == "up":
            y = y - 1
        if action == "right":
            x = x + 1 
        if action == "down":
            y = y+1
        if action == "left":
            x = x - 1 
        return (x,y)
    
    def possible_actions(self,state):
        actions = []
        x,y = state
        if y > 0:
            actions.append("up")
        if x > 0:
            actions.append("left")
        if x < self.width-1:
            actions.append("right")
        if y < self.height-1:
            actions.append("down")
        return actions
    
    def spawn(self):
        x_spawn = random.randint(0,self.width-1)
        y_spawn = random.randint(0,self.height-1)
        while (x_spawn,y_spawn) == self.goal:
            x_spawn = random.randint(0,self.width)
            y_spawn = random.randint(0,self.height)
        return (x_spawn,y_spawn)
    def viz(self,agent):
        line = "_"*self.width*3
        print(line)
        for j in range(self.height):
            for i in range(self.width):
                s="  "
                if (i,j)==self.goal:
                    s="G."
                if (i,j)==agent:
                    s="A."
                print('|'+s,end="")
            print("|")
            print(line)

In [13]:
def opposite_action(action):
    if action==None:
        return None
    if action == "up":
        return "down"
    if action == "down":
        return "up"
    if action == "left":
        return "right"
    if action == "right":
        return "left"

In [3]:
class State:
    def __init__(self,name):
        self.name = name
        self.occur = 0
        self.value = 0
    def update(self,Gt):
        self.occur = self.occur +1
        self.value = self.value + (Gt - self.value) / self.occur

In [4]:
class Episode:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
    def add_step(self,action,new_state,reward):
        self.states.append(new_state)
        self.actions.append(action)
        self.rewards.append(reward)

In [16]:
class Agent:
    
    def __init__(self,name,discount):
        self.name = name
        self.discount = discount
        self.state_value = {}          # state ===> value
        self.state_action_value = {}   # ( state , action ) ===> value
        self.policy = {}               # state ===> action
        self.state_occur = {}          # state ===> nb of occurences 
        
    def consume_episode(self,episode):
        states = episode.states
        action = episode.actions
        rewards = episode.rewards
        for i in range(len(states)):
            Gt = rewards[i]
            discount = self.discount
            for j in range(i+1,len(rewards)):
                Gt = Gt + discount * rewards[j]
                discount = discount * self.discount
            state = [s for s in self.policy.keys() if s.name == states[i]][0]
            state.update(Gt)
            already_updated.append(states[i])
            
    def update_policy(self):
        my_states = self.policy.keys()
        for state in my_states:
            (x,y) = state.name
            actions = grid.possible_actions((x,y))
            m = None
            for action in actions:
                (new_x,new_y) = grid.get_new_state((x,y), action)
                v = None
                for next_state in my_states:
                    if next_state.name == (new_x,new_y):
                        v=next_state.value
                        if (m == None) :
                            m = v
                            a = action
                        elif (m < v) :
                            m = v
                            a = action
            self.policy[state] = a
            
    def choose_random_action(self,grid,x,y):
        actions = grid.possible_actions((x,y))
        index = random.randint(0,len(actions)-1)
        action = actions[index]
        return action
    
    def explore(self,grid,nb_episodes):
        self.policy = {}
        self.policy[State(grid.goal)] = None
        for i in range(nb_episodes):
            (x,y) = grid.spawn()
            episode = Episode()
            searching = True
            past_action = None
            while searching:
                clear_output(wait=True)
                grid.viz((x,y))
                time.sleep(0.1)
                try:
                    current = [s for s in self.policy.keys() if s.name == (x,y)][0]
                    action = self.policy[current]
                    if action == None:
                        action = self.choose_random_action(grid,x,y)
                except: 
                    self.policy[State((x,y))] = None
                    action = self.choose_random_action(grid,x,y)
                while(action == opposite_action(past_action)):
                    action = self.choose_random_action(grid,x,y)
                past_action = action    
                (x,y) = grid.get_new_state((x,y), action)
                reward = -1
                if (x,y)==grid.goal:
                    reward = 10
                    searching = False
                episode.add_step(action,(x,y),reward)
            episode.add_step(action,grid.goal,10)
            print("complted")
            self.consume_episode(episode)
            self.update_policy()
            
grid = Grid_World(16,8)
agent = Agent("binome",1)
agent.explore(grid,100)

________________________________________________
|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
________________________________________________
|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
________________________________________________
|  |  |  |A.|G.|  |  |  |  |  |  |  |  |  |  |  |
________________________________________________
|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
________________________________________________
|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
________________________________________________
|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
________________________________________________
|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
________________________________________________
|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
________________________________________________
complted


In [17]:
line = "_"*grid.width*3
print(line)
for j in range(grid.height):
    for i in range(grid.width):
        s = "  "
        for state in agent.policy.keys():
            if (i,j)==grid.goal:
                s="G."
            elif (i,j)==state.name:
                try:
                    s = agent.policy[state][0]+"."
                except : 
                    s="N."
        print('|'+s,end="")
    print("|")
    print(line)

________________________________________________
|r.|l.|  |d.|d.|d.|d.|r.|d.|l.|d.|r.|r.|d.|r.|l.|
________________________________________________
|d.|  |d.|d.|d.|l.|l.|l.|d.|d.|d.|d.|d.|d.|l.|u.|
________________________________________________
|d.|r.|r.|r.|G.|  |u.|l.|l.|l.|l.|l.|l.|l.|l.|d.|
________________________________________________
|r.|d.|u.|u.|u.|l.|  |u.|u.|u.|u.|u.|u.|l.|d.|d.|
________________________________________________
|  |r.|r.|u.|u.|u.|l.|u.|u.|u.|u.|l.|u.|r.|r.|l.|
________________________________________________
|  |  |u.|u.|r.|u.|l.|d.|  |r.|u.|l.|d.|l.|u.|u.|
________________________________________________
|d.|r.|r.|r.|u.|u.|r.|l.|l.|r.|u.|l.|l.|l.|d.|l.|
________________________________________________
|u.|l.|u.|  |  |u.|u.|u.|r.|r.|u.|u.|u.|l.|l.|l.|
________________________________________________


In [18]:
line = "_"*grid.width*3
print(line)
for j in range(grid.height):
    for i in range(grid.width):
        s = "  "
        for state in agent.policy.keys():
            
            if (i,j)==state.name:
                s = str(state.value)[0:2]
        print('|'+s,end="")
    print("|")
    print(line)

________________________________________________
|-1|0|  |3.|2.|1.|0|-7|0.|-6|-9|-9|-8|-7|-7|-7|
________________________________________________
|-0|  |0|10|15|13|10|-7|1.|-8|-2|-9|-8|-5|-7|-9|
________________________________________________
|0.|-2|13|16|19|  |8.|6.|5.|3.|2.|1.|-0|-4|-8|-8|
________________________________________________
|3.|4.|0|13|16|13|  |0.|-0|0|0.|-9|-3|-6|-7|-6|
________________________________________________
|  |6.|8.|11|0|11|-0|-9|0|-2|-0|-9|-6|-9|-6|-6|
________________________________________________
|  |  |-4|-9|5.|8.|2.|-9|  |-5|-1|-9|-6|-9|-7|-9|
________________________________________________
|-7|-8|-4|-3|-0|-3|-0|3.|-9|-9|-2|-3|-4|-9|-7|-8|
________________________________________________
|-7|-9|-9|  |  |-9|-9|-9|-9|-7|-7|-7|-5|-6|-6|-9|
________________________________________________
