# In [None]:
from turtle import Screen
import time
from turtle import Turtle

# Drawing window: 1300x900 pixels on a black background.
screen = Screen()
screen.setup(width=1300, height=900)
screen.bgcolor("black")

# Board geometry (turtle coordinates).
# `a` is the top-left corner where the grid drawing starts; START, WIN and
# LOSE are the centres of the corresponding cells.
a = (-300, 150)
START = (-225, -225)
WIN, LOSE = (225, 75), (225, -75)
# When True, State.nxtPosition applies the requested move exactly.
DETERMINISTIC = True


class Board:
    """Draw the static 4x3 grid world.

    All drawing happens while this class body executes, i.e. as soon as the
    class is defined; the class does not need to be instantiated. The pen
    turtle stays available afterwards as the class attribute ``timmy``.
    """

    timmy = Turtle()
    timmy.shape("turtle")
    timmy.color("white")
    timmy.speed("fastest")

    timmy.penup()
    timmy.goto(a)

    def _square(pen, fill=None, side=150):
        """Trace a clockwise square from the pen's current position.

        The pen ends where it started, with the same heading and lifted.
        When *fill* is given the square is filled with that colour.
        """
        if fill is not None:
            pen.fillcolor(fill)
            pen.begin_fill()
        pen.pendown()
        for _ in range(4):
            pen.forward(side)
            pen.right(90)
        if fill is not None:
            pen.end_fill()
        pen.penup()

    # (row, column) cells that are not outlined here because they are painted
    # as filled squares further down (WIN, WALL, LOSE and START).
    _filled_cells = {(0, 3), (1, 1), (1, 3), (2, 0)}

    # Each entry is where the pen jumps after finishing that row: the start of
    # the next row, and (-300, 0) once the last row is done.
    for _row, _next_start in enumerate(((-300, 0), (-300, -150), (-300, 0))):
        for _col in range(4):
            if (_row, _col) not in _filled_cells:
                _square(timmy)
            # Step (pen up) to the top-left corner of the next cell in the row.
            timmy.forward(150)
        timmy.goto(_next_start)

    # Special cells, given by their top-left corner: LOSE, START, WIN, WALL.
    for _corner, _colour in (
        ((150, 0), "red"),
        ((-300, -150), "yellow"),
        ((150, 150), "green"),
        ((-150, 0), "grey"),
    ):
        timmy.goto(_corner)
        _square(timmy, fill=_colour)

    _font = ("Arial", 15, "normal")
    for _where, _text in ((WIN, "WIN"), (LOSE, "LOSE"), ((-75, -75), "WALL")):
        timmy.goto(_where)
        timmy.write(_text, align="center", font=_font)

    # The START label sits on the yellow cell, so it is written in black.
    timmy.goto(START)
    timmy.color("black")
    timmy.write("START", align="center", font=_font)

    timmy.hideturtle()

    # Drop the drawing scaffolding so only `timmy` remains on the class.
    del _square, _filled_cells, _row, _col, _next_start
    del _corner, _colour, _font, _where, _text


import numpy as np

# Centre point of every grid cell, row by row from the top of the board.
STATES = [
    [[col_x, row_y] for col_x in (-225, -75, 75, 225)]
    for row_y in (75, -75, -225)
]

# The blue turtle that is moved around the board; it starts on the START
# cell with its pen lifted so that moving it draws nothing.
agent = Turtle()
agent.speed("fastest")
agent.shape("turtle")
agent.color("blue")
agent.penup()
agent.goto(-225, -225)


# class for various state functions
class State:
    """A position on the board plus the rules for rewards and moves.

    Attributes:
        state: (x, y) centre of the cell this state represents.
        isEnd: set to True by ``isEndFunc`` when ``state`` is WIN or LOSE.
        determine: copy of the module-level DETERMINISTIC flag.
        x, y: the coordinates of ``state``, kept as separate attributes.
    """

    def __init__(self, state=(-225, -225)):
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC
        # Take the coordinates from the logical state rather than from the
        # on-screen turtle: a freshly created start state must not inherit
        # wherever the turtle happened to stop in the previous episode.
        self.x, self.y = state

    def giveReward(self):
        """Return 1 on the WIN cell, -1 on the LOSE cell and 0 elsewhere."""
        if self.state == WIN:
            return 1
        if self.state == LOSE:
            return -1
        return 0

    def isEndFunc(self):
        """Set ``isEnd`` to True when this state is WIN or LOSE."""
        if self.state in (WIN, LOSE):
            self.isEnd = True

    def nxtPosition(self, action):
        """Return the position reached by taking *action* from this state.

        *action* is "up", "down" or "left"; any other value is treated as
        "right". If the move would leave the board or enter the wall cell at
        (-75, -75), the current position is returned unchanged.

        Returns None when ``self.determine`` is false, because only
        deterministic moves are implemented.
        """
        if not self.determine:
            return None

        step = {"up": (0, 150), "down": (0, -150), "left": (-150, 0)}
        dx, dy = step.get(action, (150, 0))
        # Always move from the logical position held in self.state.
        x, y = self.state
        nxtState = (x + dx, y + dy)

        on_board = (-300 <= nxtState[0] <= 300) and (-300 <= nxtState[1] <= 150)
        if on_board and nxtState != (-75, -75):
            return nxtState
        return self.state


# create an agent
class Agent:
    """Learns a value for every board cell by repeatedly playing episodes.

    Attributes:
        states: positions visited in the current episode, in order.
        actions: the four move names understood by ``State.nxtPosition``.
        State: the agent's current ``State``.
        lr: learning rate used when propagating the final reward backwards.
        exp_rate: probability of choosing a random action instead of the
            greedy one.
        value: turtle used to write the numbers onto the board.
        state_values: mapping from (x, y) cell centre to its learned value.
    """

    def __init__(self):
        self.states = []
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.lr = 0.2
        self.exp_rate = 0.3

        self.value = Turtle()
        self.value.color("white")
        self.value.penup()

        font = ("Arial", 15, "normal")
        # Terminal cells show their reward slightly below the cell centre.
        terminal_labels = {
            (225, 75): ("1", (225, 37)),
            (225, -75): ("-1", (225, -102)),
        }
        # The wall and the start cell get no initial number.
        unlabeled = {(-75, -75), (-225, -225)}

        self.state_values = {}
        for row in STATES:
            for x, y in row:
                cell = (x, y)
                # Every cell gets a value entry, including unlabeled ones.
                self.state_values[cell] = 0
                if cell in unlabeled:
                    continue
                text, where = terminal_labels.get(cell, ("0", cell))
                self.value.goto(where)
                self.value.write(text, align="center", font=font)

    def chooseAction(self):
        """Pick the next action.

        With probability ``exp_rate`` a random action is returned; otherwise
        the action whose resulting position has the highest stored value.
        Ties go to the action listed last in ``self.actions``.
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)

        # Start below every possible value so that an action is always
        # selected, even when all reachable cells have negative values.
        mx_nxt_reward = float("-inf")
        action = ""
        for candidate in self.actions:
            nxt_reward = self.state_values[self.State.nxtPosition(candidate)]
            if nxt_reward >= mx_nxt_reward:
                action = candidate
                mx_nxt_reward = nxt_reward
        return action

    def takeAction(self, action):
        """Move the on-screen turtle according to *action* and return the new State."""
        position = self.State.nxtPosition(action)
        agent.goto(position)
        return State(state=position)

    def reset(self):
        """Clear the episode trace and put the agent back on the START cell."""
        self.states = []
        # Move the turtle back before building the new state, so the drawn
        # position and the logical start position agree for the next episode.
        agent.goto(START)
        self.State = State()

    def play(self, rounds):
        """Play *rounds* episodes, updating ``state_values`` after each one."""
        i = 0
        while i < rounds:
            if self.State.isEnd:
                # Episode finished: propagate the final reward backwards
                # along the visited positions.
                reward = self.State.giveReward()
                # explicitly assign end state to reward values
                self.state_values[self.State.state] = reward
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    # Move V(s) a fraction `lr` towards the value of the
                    # position that followed it; that updated value becomes
                    # the target for the position before it.
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                self.reset()
                i += 1

            else:
                action = self.chooseAction()
                # append trace
                self.states.append(self.State.nxtPosition(action))
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, it reaches the next state
                self.State = self.takeAction(action)
                # mark is end
                self.State.isEndFunc()
                print("nxt state", self.State.state)
                print("---------------------")

    # display each state values
    def showValues(self):
        """Erase everything the value turtle wrote and write the learned values.

        The WIN, LOSE, START and wall cells are skipped.
        """
        skipped = ([225, 75], [225, -75], [-225, -225], [-75, -75])
        self.value.clear()
        for row in STATES:
            for cell in row:
                if cell in skipped:
                    continue
                self.value.goto(cell[0], cell[1])
                self.value.write(
                    str(self.state_values[(cell[0], cell[1])]).ljust(6),
                    align="center",
                    font=("Arial", 15, "normal"),
                )
        self.value.hideturtle()


A = Agent()

# One training run: refresh the window, pause briefly, play 50 episodes, then
# write the learned value of each ordinary cell onto the board.
screen.update()
time.sleep(0.1)
A.play(50)
A.showValues()

# Keep the window open until the user clicks on it.
screen.exitonclick()