## Q-Learning: In Video Games Easily

https://www.youtube.com/watch?v=A5eihauRQvo
    
Rules: The agent (yellow box) has to reach one of the goals to end the game (green or red cell).
- Rewards: Each step gives a negative reward of -0.04. The red cell gives a negative reward of -1. The green one gives a positive reward of +1.
- States: Each cell is a state the agent can be in.
- Actions: There are only 4 actions. Up, Down, Right, Left.

In [8]:
from tkinter import *
# Root Tk window that hosts the grid-world canvas.
master = Tk()

# Size of each Q-value triangle, as a fraction of one cell (used by
# create_triangle for both the half-base and the height).
triangle_size = 0.1
# Q-values are clamped to [cell_score_min, cell_score_max] when they are
# mapped to a red-to-green color in set_cell_score.
cell_score_min = -0.2
cell_score_max = 0.2
# Side length of one grid cell, in canvas units.
Width = 100
# Grid dimensions: x columns by y rows.
(x, y) = (5, 5)
# Action names; their order is relied on by create_triangle and do_action.
actions = ["up", "down", "left", "right"]

board = Canvas(master, width=x*Width, height=y*Width)
# Player position as (column, row); starts in column 0 of the last row.
player = (0, y-1)
# Running score for the current episode.
score = 1
# Set to True by try_move once a special (terminal) cell has been reached.
restart = False
# Reward added to the score on every attempted move.
walk_reward = -0.04

# Cells the player cannot enter, as (column, row).
walls = [(1, 1), (1, 2), (2, 1), (2, 2)]
# Terminal cells, as (column, row, fill color, reward).
specials = [(4, 1, "red", -1), (4, 0, "green", 1)]
# Maps (column, row) -> {action: canvas id of that action's triangle};
# populated by render_grid.
cell_scores = {}


def create_triangle(i, j, action):
    """Draw the small indicator triangle for ``action`` in cell (i, j).

    The triangle sits against the cell edge that matches the action (top
    for "up", bottom for "down", left for "left", right for "right"), with
    its apex on the middle of that edge. It is created white; its fill is
    later changed by set_cell_score.

    Returns the canvas item id, or None if ``action`` is not one of the
    four known actions.
    """
    t = triangle_size
    if action == actions[0]:
        # Up: base just below the top edge, apex on the top edge.
        points = ((i+0.5-t)*Width, (j+t)*Width,
                  (i+0.5+t)*Width, (j+t)*Width,
                  (i+0.5)*Width, j*Width)
    elif action == actions[1]:
        # Down: base just above the bottom edge, apex on the bottom edge.
        points = ((i+0.5-t)*Width, (j+1-t)*Width,
                  (i+0.5+t)*Width, (j+1-t)*Width,
                  (i+0.5)*Width, (j+1)*Width)
    elif action == actions[2]:
        # Left: base just right of the left edge, apex on the left edge.
        points = ((i+t)*Width, (j+0.5-t)*Width,
                  (i+t)*Width, (j+0.5+t)*Width,
                  i*Width, (j+0.5)*Width)
    elif action == actions[3]:
        # Right: base just left of the right edge, apex on the right edge.
        points = ((i+1-t)*Width, (j+0.5-t)*Width,
                  (i+1-t)*Width, (j+0.5+t)*Width,
                  (i+1)*Width, (j+0.5)*Width)
    else:
        return None
    return board.create_polygon(*points, fill="white", width=1)


def render_grid():
    """Draw the whole board onto the canvas.

    Every cell gets a white square plus one triangle per action; the
    triangles' canvas ids are recorded in ``cell_scores[(col, row)]`` keyed
    by action name. Special cells are then painted in their own color and
    walls in black, on top of what was drawn before.
    """
    for col in range(x):
        for row in range(y):
            left, top = col*Width, row*Width
            board.create_rectangle(left, top, (col+1)*Width, (row+1)*Width,
                                   fill="white", width=1)
            cell_scores[(col, row)] = {
                action: create_triangle(col, row, action) for action in actions
            }

    for (col, row, color, _reward) in specials:
        board.create_rectangle(col*Width, row*Width, (col+1)*Width, (row+1)*Width,
                               fill=color, width=1)

    for (col, row) in walls:
        board.create_rectangle(col*Width, row*Width, (col+1)*Width, (row+1)*Width,
                               fill="black", width=1)

# Draw the board once at start-up; this also fills cell_scores with the
# triangle ids that set_cell_score recolors later.
render_grid()


def set_cell_score(state, action, val):
    """Recolor the triangle of (state, action) to reflect the value ``val``.

    ``val`` is clamped to [cell_score_min, cell_score_max] and mapped
    linearly onto a red-to-green scale: the minimum is pure red (#ff0000)
    and the maximum pure green (#00ff00).

    Args:
        state: (column, row) key into ``cell_scores``.
        action: action name key into ``cell_scores[state]``.
        val: the numeric value to visualize.
    """
    triangle = cell_scores[state][action]
    span = cell_score_max - cell_score_min
    green_dec = int(min(255, max(0, (val - cell_score_min) * 255.0 / span)))
    # Each channel must be exactly two hex digits, zero-padded on the LEFT;
    # padding on the right would turn e.g. 0x5 into 0x50 and give the wrong
    # color for channel values below 16.
    color = f"#{255 - green_dec:02x}{green_dec:02x}00"
    board.itemconfigure(triangle, fill=color)


def try_move(dx, dy):
    """Attempt to move the player by (dx, dy) cells and update the score.

    If the previous move ended the episode (``restart`` is set), the game
    is reset first. Every attempt costs ``walk_reward``. The player only
    moves when the target cell is inside the grid and is not a wall. If the
    target is a special cell, the walk cost is refunded, the cell's reward
    is added, the outcome is printed and ``restart`` is set.
    """
    global player, score, restart
    if restart:
        restart_game()

    target_x = player[0] + dx
    target_y = player[1] + dy
    score += walk_reward

    inside_grid = 0 <= target_x < x and 0 <= target_y < y
    if inside_grid and (target_x, target_y) not in walls:
        inset_near = Width*2/10
        inset_far = Width*8/10
        board.coords(me,
                     target_x*Width+inset_near,
                     target_y*Width+inset_near,
                     target_x*Width+inset_far,
                     target_y*Width+inset_far)
        player = (target_x, target_y)

    for (cell_x, cell_y, _color, reward) in specials:
        if (target_x, target_y) != (cell_x, cell_y):
            continue
        # Terminal cell: swap the walk cost for the cell's own reward.
        score -= walk_reward
        score += reward
        label = "Success! score: " if score > 0 else "Fail! score: "
        print(label, score)
        restart = True
        return


def call_up(event):
    """Key handler: try to move the player one cell up (row - 1)."""
    try_move(dx=0, dy=-1)


def call_down(event):
    """Key handler: try to move the player one cell down (row + 1)."""
    try_move(dx=0, dy=1)


def call_left(event):
    """Key handler: try to move the player one cell left (column - 1)."""
    try_move(dx=-1, dy=0)


def call_right(event):
    """Key handler: try to move the player one cell right (column + 1)."""
    try_move(dx=1, dy=0)


def restart_game():
    """Reset the episode: player back to the start cell, score back to 1.

    Also clears the ``restart`` flag and moves the player's rectangle on
    the canvas to the start cell.
    """
    global player, score, restart
    player = (0, y-1)
    score = 1
    restart = False

    left = player[0]*Width
    top = player[1]*Width
    board.coords(me,
                 left+Width*2/10, top+Width*2/10,
                 left+Width*8/10, top+Width*8/10)

def has_restarted():
    """Return the ``restart`` flag (True once a special cell was reached)."""
    return restart

# Arrow keys move the player by hand.
master.bind("<Up>", call_up)
master.bind("<Down>", call_down)
master.bind("<Right>", call_right)
master.bind("<Left>", call_left)

# Canvas rectangle that represents the player, inset inside its cell
# (from 2/10 to 8/10 of the cell width); try_move and restart_game move it.
me = board.create_rectangle(player[0]*Width+Width*2/10, player[1]*Width+Width*2/10,
                            player[0]*Width+Width*8/10, player[1]*Width+Width*8/10,
                            fill="orange", width=1, tag="me")

# Place the canvas in the root window.
board.grid(row=0, column=0)


def start_game():
    """Run the Tk event loop of the root window."""
    master.mainloop()

In [9]:
import threading
import time

# Discount factor applied to the best next-state value in the Q update.
discount = 0.3
# No-op rebind; the action list is the one defined with the grid world.
actions = actions
# Every grid cell, as (column, row), is a state.
states = [(i, j) for i in range(x) for j in range(y)]

# Q[state][action] starts at 0.1 for every pair, and the matching triangle
# is colored accordingly.
Q = {}
for state in states:
    Q[state] = {}
    for action in actions:
        Q[state][action] = 0.1
        set_cell_score(state, action, 0.1)

# In terminal cells every action is worth the cell's own reward.
for (i, j, c, w) in specials:
    for action in actions:
        Q[(i, j)][action] = w
        set_cell_score((i, j), action, w)


def do_action(action):
    """Perform ``action`` in the grid world and report the transition.

    Returns a tuple ``(s, action, r, s2)`` where ``s`` is the player
    position before the move, ``r`` the change in score caused by the move
    and ``s2`` the player position afterwards. Returns None when ``action``
    is not one of the known actions.
    """
    s = player
    score_before = score

    # Offsets are listed in the same order as the action names.
    offsets = ((0, -1), (0, 1), (-1, 0), (1, 0))
    for name, (dx, dy) in zip(actions, offsets):
        if action == name:
            try_move(dx, dy)
            break
    else:
        return

    s2 = player
    r = score - score_before
    return s, action, r, s2


def max_Q(s):
    """Return ``(action, value)`` for the highest-valued action in state ``s``.

    Ties go to the action that comes first in ``Q[s]``. If ``Q[s]`` is
    empty the result is ``(None, None)``.
    """
    return max(Q[s].items(), key=lambda pair: pair[1], default=(None, None))


def inc_Q(s, a, alpha, inc):
    """Blend ``inc`` into ``Q[s][a]`` with learning rate ``alpha``.

    The new value is ``(1 - alpha) * old + alpha * inc``; the triangle for
    (s, a) is recolored to match.
    """
    row = Q[s]
    row[a] = row[a] * (1 - alpha) + alpha * inc
    set_cell_score(s, a, row[a])


def run():
    """Q-learning loop: act greedily, update Q, repeat forever.

    Each iteration takes the currently best-valued action from the player's
    position, then updates that state/action entry towards
    ``r + discount * max_a Q[s2][a]``. The learning rate starts at 1 and
    afterwards follows ``step ** -0.1``, where ``step`` counts moves within
    the current episode and is reset when the game restarts.
    """
    time.sleep(1)
    alpha = 1
    step = 1
    while True:
        # Take the best-valued action from the current position.
        best_action, _ = max_Q(player)
        s, a, r, s2 = do_action(best_action)

        # Move Q[s][a] towards reward + discounted best value of s2.
        _, best_next_value = max_Q(s2)
        inc_Q(s, a, alpha, r + discount * best_next_value)

        # Start a new episode if the last move reached a terminal cell.
        step += 1.0
        if has_restarted():
            restart_game()
            time.sleep(0.01)
            step = 1.0

        # Learning rate decays with the number of steps in the episode.
        alpha = step ** -0.1

        # Increase this delay if the game runs too fast to follow.
        time.sleep(0.1)


# The learner runs on a daemon thread while the Tk event loop runs here.
t = threading.Thread(target=run, daemon=True)
t.start()
start_game()

Fail! score:  -1.6400000000000003
Fail! score:  -2.7600000000000016
Success! score:  0.6799999999999997
Success! score:  0.1999999999999995
Success! score:  0.3999999999999997
Success! score:  0.7999999999999996
Success! score:  1.1999999999999997
Success! score:  1.1999999999999997
Success! score:  0.8399999999999996
Success! score:  1.6799999999999997
Success! score:  1.3199999999999996
Success! score:  1.1199999999999997
Success! score:  1.5999999999999996
Success! score:  1.5999999999999996
Success! score:  1.6799999999999997
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.6399999999999997
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.7199999999999998
Success! score:  1.7199999999999

KeyboardInterrupt: 

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-d128e7f065bc>", line 72, in run
    inc_Q(s, a, alpha, r + discount * max_val)
  File "<ipython-input-9-d128e7f065bc>", line 56, in inc_Q
    set_cell_score(s, a, Q[s][a])
  File "<ipython-input-8-a34c3732a76b>", line 73, in set_cell_score
    board.itemconfigure(triangle, fill=color)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/tkinter/__init__.py", line 2578, in itemconfigure
    return self._configure(('itemconfigure', tagOrId), cnf, kw)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/tkinter/__init__.py", line 1476, in _configure
    self.tk.call(_flatten((self._w, c