In [1]:
import numpy as np
import pygame
import random
import math

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
class GridWorld(object):
    """Rectangular grid environment with one agent and one goal, drawn with pygame.

    ``self.grid`` stores one code per cell: 0 = empty, 1 = agent, 2 = endpoint.
    Positions are ``(row, column)`` tuples. Actions accepted by :meth:`step`
    are 0 = up, 1 = right, 2 = down, 3 = left.
    """

    # (action code, row delta, column delta, label stored in info["direction"])
    _MOVES = (
        (0, -1, 0, "up"),
        (1, 0, 1, "right"),
        (2, 1, 0, "down"),
        (3, 0, -1, "left"),
    )

    # Preferred endpoint; clamped into the grid when the grid is smaller than 8x8.
    _PREFERRED_ENDPOINT = (7, 7)

    # RGB channel weights multiplied by a tile's best Q-value when display_q is on.
    _Q_TINT = (255, 191, 240)

    def __init__(self, rows=10, cols=10):
        """Create a ``rows`` x ``cols`` grid with the agent on the start tile.

        Args:
            rows: Number of grid rows.
            cols: Number of grid columns.
        """
        # Window size in pixels
        self.SCREEN_WIDTH = 450
        self.SCREEN_HEIGHT = 450

        # Grid geometry
        self.rows = rows
        self.cols = cols
        self.G_MARGIN = 1
        self.TILE_W = int(self.SCREEN_WIDTH / self.cols)
        self.TILE_H = int(self.SCREEN_HEIGHT / self.rows)

        # Start point and endpoint. The endpoint is clamped so that grids
        # smaller than 8x8 can be constructed instead of raising IndexError.
        self.startpoint = (0, 0)
        self.endpoint = self._default_endpoint()
        self.agent_pos = self.startpoint
        self._place_markers()

        # Colors used when drawing the grid
        self.BLACK = (0, 0, 0)
        self.WHITE = (255, 255, 255)
        self.GREEN = (0, 255, 0)  # agent / start point
        self.RED = (255, 0, 0)  # endpoint
        self.ORANGE = (255, 165, 0)  # Q-value

    def _default_endpoint(self):
        """Return the preferred endpoint, clamped to the last row/column of the grid."""
        pref_row, pref_col = self._PREFERRED_ENDPOINT
        return (min(pref_row, self.rows - 1), min(pref_col, self.cols - 1))

    def _place_markers(self):
        """Replace ``self.grid`` with a fresh grid holding the start and endpoint codes."""
        self.grid = self.create_grid()
        self.grid[self.startpoint[0]][self.startpoint[1]] = 1
        self.grid[self.endpoint[0]][self.endpoint[1]] = 2

    def create_grid(self):
        """Return a new ``rows`` x ``cols`` array of zeros (all tiles empty)."""
        return np.zeros((self.rows, self.cols))

    def _max_q_values(self, qTable):
        """Return the best Q-value per tile as a ``(rows, cols)`` array.

        The maximum is taken over the last axis of ``qTable`` and the result is
        reshaped to this grid's own dimensions rather than a fixed 10x10.
        """
        return np.max(np.asarray(qTable), -1).reshape(self.rows, self.cols)

    def _q_color(self, value):
        """Map one Q-value to an RGB tuple of ints.

        The value is clamped to [0, 1] before scaling so every channel stays
        inside 0..255.
        """
        level = min(1.0, max(0.0, float(value)))
        return tuple(int(channel * level) for channel in self._Q_TINT)

    def render(self, screen, qTable, display_q=False):
        """Draw every tile onto ``screen`` and flip the display.

        Args:
            screen: pygame surface to draw on.
            qTable: Q-values whose last axis indexes actions; only read when
                ``display_q`` is True.
            display_q: When True, tiles other than the agent and the endpoint
                are shaded by their best Q-value instead of being white.
        """
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                quit()

        # Loop-invariant: compute the per-tile maxima once, not once per tile.
        q_values = self._max_q_values(qTable) if display_q else None

        for row in range(self.rows):
            for column in range(self.cols):
                cell = self.grid[row][column]
                if cell == 1:
                    color = self.GREEN  # agent
                elif cell == 2:
                    color = self.RED  # endpoint
                elif display_q:
                    color = self._q_color(q_values[row][column])
                else:
                    color = self.WHITE

                left = (self.G_MARGIN + self.TILE_W) * column + self.G_MARGIN
                top = (self.G_MARGIN + self.TILE_H) * row + self.G_MARGIN
                pygame.draw.rect(screen, color, [left, top, self.TILE_W, self.TILE_H])

        pygame.display.flip()

    def reset(self):
        """Put the agent back on the start tile, rebuild the grid and return the start position."""
        self.endpoint = self._default_endpoint()
        self.agent_pos = self.startpoint
        self._place_markers()
        return self.agent_pos

    def _destination(self, action):
        """Return ``((row, col), label)`` for ``action``, or ``None`` if the move is not allowed.

        A move is not allowed when ``action`` is not one of the four known
        codes or when it would take the agent outside the grid.
        """
        row, col = self.agent_pos
        for code, d_row, d_col, label in self._MOVES:
            if action == code:
                new_row, new_col = row + d_row, col + d_col
                if 0 <= new_row < self.rows and 0 <= new_col < self.cols:
                    return (new_row, new_col), label
                return None
        return None

    def __valid_move(self, action):
        """Return True when ``action`` keeps the agent inside the grid."""
        return self._destination(action) is not None

    def step(self, action):
        """Apply ``action`` and report the outcome.

        An action that is not allowed leaves the agent where it is and leaves
        ``info`` empty.

        Returns:
            Tuple ``(agent_pos, reward, done, info)``. ``reward`` is 1 and
            ``done`` is True when the agent stands on the endpoint after the
            step, otherwise 0 and False. ``info["direction"]`` names the move
            that was performed.
        """
        reward = 0
        done = False
        info = {}

        destination = self._destination(action)
        if destination is not None:
            (new_row, new_col), label = destination
            old_row, old_col = self.agent_pos
            # Clear the tile the agent leaves, then mark the tile it enters.
            self.grid[old_row][old_col] = 0
            self.grid[new_row][new_col] = 1
            self.agent_pos = (new_row, new_col)
            info["direction"] = label

        if self.agent_pos == self.endpoint:
            reward = 1
            done = True

        return (self.agent_pos, reward, done, info)


In [3]:
class GridWorldAgent():
    """Tabular Q-learning agent for a grid whose states are ``(row, col)`` tuples.

    ``self.qTable`` has shape ``(rows, cols, number of actions)``.
    """

    def __init__(self, rows, cols, min_lr=0.1, min_epsilon=0.1, discount=0.95, decay=25):
        """Create an agent with an all-zero Q-table.

        Args:
            rows: Number of grid rows.
            cols: Number of grid columns.
            min_lr: Lower bound returned by :meth:`learning_rate`.
            min_epsilon: Lower bound returned by :meth:`exploration_rate`.
            discount: Factor applied to the best Q-value of the next state.
            decay: Divisor inside the logarithmic decay shared by both rates;
                larger values keep the rates at 1.0 for more episodes.
        """
        self.min_lr = min_lr
        self.min_epsilon = min_epsilon
        self.discount = discount
        self.decay = decay
        self.action_space = [0, 1, 2, 3]
        self.qTable = np.zeros((rows, cols, len(self.action_space)))
        self.actions = {"up": 0, "right": 1, "down": 2, "left": 3}

    def _decayed(self, n, floor):
        """Return ``1 - log10((n + 1) / decay)`` limited to the range ``[floor, 1.0]``."""
        return max(floor, min(1.0, 1.0 - math.log10((n + 1) / self.decay)))

    def exploration_rate(self, n):
        """Return the decaying exploration rate (epsilon) for episode ``n``."""
        return self._decayed(n, self.min_epsilon)

    def learning_rate(self, n):
        """Return the decaying learning rate for episode ``n``."""
        return self._decayed(n, self.min_lr)

    def choose_action(self, state, n):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        With probability ``exploration_rate(n)`` a random member of
        ``self.action_space`` is returned; otherwise the index of the largest
        Q-value stored for ``state``.
        """
        if np.random.random() < self.exploration_rate(n):
            # Sample from the action space instead of a hard-coded 0..3 range.
            return random.choice(self.action_space)
        return np.argmax(self.qTable[state])

    def updateQ(self, state, action, reward, new_state, lr):
        """Blend the stored Q-value for ``(state, action)`` with a new estimate.

        The new estimate is ``reward + discount * max(Q[new_state])``; ``lr``
        is the weight given to it and ``1 - lr`` the weight of the old value.
        """
        future_optimal_value = np.max(self.qTable[new_state])
        learned_value = reward + self.discount * future_optimal_value
        old_value = self.qTable[state][action]
        self.qTable[state][action] = (1 - lr) * old_value + lr * learned_value
        

In [None]:
# Training configuration
N_EPISODES = 1000  # total number of training episodes
MAX_STEPS = 100  # step limit per episode
LOG_EVERY = 25  # print the episode number every LOG_EVERY episodes
RENDER_AFTER_EPISODE = 300  # episodes after this index are drawn on screen
RENDER_FPS = 15  # frame-rate cap while rendering

env = GridWorld(rows=10, cols=10)
agent = GridWorldAgent(env.rows, env.cols, min_epsilon=0.05, discount=0.95)

# INIT PYGAME SETTINGS
pygame.init()
screen = pygame.display.set_mode((env.SCREEN_WIDTH, env.SCREEN_HEIGHT))
pygame.display.set_caption("Gridworld Q-Learning")
clock = pygame.time.Clock()

try:
    for i_episode in range(N_EPISODES):
        current_state = env.reset()
        if i_episode % LOG_EVERY == 0:
            print("Episode:", i_episode)

        # Both values depend only on the episode index, so compute them once.
        render_episode = i_episode > RENDER_AFTER_EPISODE
        lr = agent.learning_rate(i_episode)  # decaying learning rate

        for t in range(MAX_STEPS):
            if render_episode:
                env.render(screen, agent.qTable, display_q=True)
                clock.tick(RENDER_FPS)

            action = agent.choose_action(current_state, i_episode)
            new_state, reward, done, info = env.step(action)

            # Update the Q-table with the observed transition
            agent.updateQ(current_state, action, reward, new_state, lr)

            # The new state becomes the current state
            current_state = new_state
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Release the pygame window without shutting down the interpreter/kernel,
    # even when training is interrupted.
    pygame.quit()

Episode: 0
Episode finished after 85 timesteps
Episode finished after 68 timesteps
Episode finished after 65 timesteps
Episode finished after 41 timesteps
Episode: 25
Episode finished after 99 timesteps
Episode finished after 78 timesteps
Episode: 50
Episode finished after 63 timesteps
Episode finished after 79 timesteps
Episode finished after 51 timesteps
Episode finished after 99 timesteps
Episode finished after 45 timesteps
Episode finished after 29 timesteps
Episode finished after 32 timesteps
Episode finished after 42 timesteps
Episode finished after 60 timesteps
Episode finished after 26 timesteps
Episode finished after 51 timesteps
Episode finished after 33 timesteps
Episode finished after 54 timesteps
Episode: 75
Episode finished after 38 timesteps
Episode finished after 40 timesteps
Episode finished after 35 timesteps
Episode finished after 34 timesteps
Episode finished after 49 timesteps
Episode finished after 37 timesteps
Episode finished after 26 timesteps
Episode finished 

Episode finished after 20 timesteps
Episode finished after 16 timesteps
Episode finished after 16 timesteps
Episode finished after 18 timesteps
Episode finished after 16 timesteps
Episode finished after 16 timesteps
Episode finished after 18 timesteps
Episode finished after 18 timesteps
Episode finished after 16 timesteps
Episode finished after 16 timesteps
Episode finished after 18 timesteps
Episode finished after 18 timesteps
Episode finished after 18 timesteps
Episode finished after 17 timesteps
Episode finished after 18 timesteps
Episode finished after 20 timesteps
Episode finished after 16 timesteps
Episode finished after 17 timesteps
Episode finished after 18 timesteps
Episode finished after 16 timesteps
Episode finished after 16 timesteps
Episode finished after 18 timesteps
Episode finished after 16 timesteps
Episode finished after 18 timesteps
Episode: 325
Episode finished after 20 timesteps
Episode finished after 20 timesteps
Episode finished after 16 timesteps
Episode finishe