In [70]:
import numpy as np
import random
from builtins import range, input


In [115]:
class Environment:

    def __init__(self, size = 5):
        self.board = np.zeros((size, size))

        self.size = size
        self.movement = None
        self.states = []
        self.num_states = None  
        self.gotApple = False

        self.apple = []
        self.snake = [] 
        
        self.reset()

    def reset(self):
        self.set_snake()
        self.set_apple()
        
    def draw_board(self):
        for i in range(self.size):
            for j in range(self.size):
                p = False   
                if(j == 0):
                    print("|", end="")
                if(i == self.apple[0] and j == self.apple[1]):
                    print("🍏", end="") 
                    p = True
                for x, s in enumerate(self.snake):
                    if(i == s[0] and j == s[1]):
                        if(x == len(self.snake) -1 ):
                            print("🌝 ", end="")
                            p = True
                        else:
                            print("🌕 ", end="")
                            p = True
                if(p == False):
                    print(" ⌏ ", end="")
                if(j == self.size - 1):
                    print("|")
        print("")
        
    def game_over(self): 
        #Case hits on itself
        for index, snakePart in enumerate(self.snake):
            # Remove the snakePart to validate if there is a duplicated
            snakeWithoutSnakePart = np.delete(self.snake, index, axis=0)
            for withoutSnakePart in snakeWithoutSnakePart:
                if (snakePart == withoutSnakePart).all():
                    return True 
            
        #Case hits a wall
        for p in self.snake[-1]:
            if(p < 0 or p == self.size):
                return True
        return False
    
    
    def get_state(self):
            # S = Total of all environmental variations
            # | S | = 4 ^ (self.size)
            # Four possible states, being Snake's Head, Snake's Body, Empty or the Apple
            # The head and body are different as the states change according to the snake's direction
            k = 0
            h = 0
            for i in range(self.size):
                for j in range(self.size):
                    p = False  
                    if(i == self.apple[0] and j == self.apple[1]):
                        v = 3
                        p = True
                    for x, s in enumerate(self.snake):
                        if(i == s[0] and j == s[1]):
                            if(x == len(self.snake) -1 ):
                                v = 2
                                p = True
                            else:
                                v = 1
                                p = True
                    if(p == False):
                        v =  0
                    h += (4**k) * v
                    k += 1
            return h

    def get_state_hash(self):
        state = self.get_state()
        if state not in self.states:
            self.states.append(state)
        return self.states.index(state)
    
    def reward(self):
        reward = 0
        if self.gotApple:
            self.gotApple = False
            return 100
        for i in range(2):
            
            if(self.movement[i] != 0):
                diff = self.apple[i] - self.snake[-1][i]
                if(diff >= 0):
                    if(self.movement[i] > 0):
                        reward += 10
                    else:
                        reward -= 5
                if(diff <= 0):
                    if(self.movement[i] < 0):
                        reward += 10
                    else:
                        reward -= 5
        if self.game_over():
            reward -= 200

        return reward
    
    def set_apple(self):
        while True:
            apple = np.array([random.randint(0, self.size -1),random.randint(0, self.size -1)])
            inSnake = any((apple == snakePart).all() for snakePart in self.snake)
            if(not inSnake):
                break
        self.apple = apple
        
    def set_snake(self):
        self.snake = np.array([[2,0], [2,1]]) 

    def get_value_movement(self, movement):
        # up = 0, left = 1, down = 2, right = 3
        return [-1, 0] if movement == 0 else [0, 1] if movement == 3 else [1, 0] if movement == 2 else [0,-1]
    
    def move_snake(self, action):
        # Do the reverse path
        # The head is the FIRST to be moved and it is at the END of the list
        for index in range(len(self.snake)):
            position = (index + 1) * -1
            # Snake's head
            if(position == -1):
                self.movement = self.get_value_movement(action)
                newPosition = np.sum([self.snake[position], np.array(self.movement)], axis=0)
                
                # Get the apple
                if((newPosition == self.apple).all()):
                    self.snake = np.concatenate([self.snake, [self.apple]])
                    self.gotApple = True
                    self.set_apple()
                    return
                else:
                    oldPosition = np.copy(self.snake[position])
                    self.snake[position] = newPosition
            # Snake's body
            else:
                newPosition = np.copy(oldPosition)
                oldPosition = np.copy(self.snake[position])
                self.snake[position] = newPosition

In [133]:
class Agent:

    def __init__(self, size = 5):
        self.Q = np.zeros([10**size, 10])

        # Reward discount factor
        self.discount = 0.9 

        #Learning rate
        self.learning_rate = 0.99
        
        # Initial probability of exploitation
        self.epsilon = 1.0  

        # Epsilon decay
        self.epsilon_decay = 0.99
        
    def get_action(self, state):
        # Epsilon-greedy 
        if random.uniform(0, 1) < self.epsilon:
            # Exploration
            return random.randint(0, 3)
        else:
            return np.argmax(self.Q[state])

    def take_action(self, env, action):
        env.move_snake(action)
        return env.get_state_hash(), env.reward(), env.game_over() 
    
    def train(self, env, epochs = 3000, draw = False):
        for episode in range(1,epochs+1):
            done = False
            G, reward = 0,0
            state = env.get_state_hash()
            firstState = state
            lastAction = 3
            env.reset()
            if draw:
                  env.draw_board()
            while done == False:
                action = self.get_action(state)
                while (action != lastAction) and (action % 2 == lastAction % 2):
                    action = self.get_action(state)
                lastAction = action
                next_state, reward, done = self.take_action(env, action)
                
                self.Q[state,action] += self.learning_rate * (reward + self.discount * np.max(self.Q[next_state]) - self.Q[state,action]) 
                G += reward                
                if draw:
                  env.draw_board()
                state = next_state
            
            if(episode % 1000 == 0):
                print("Episode {} / Reward = {}".format(episode, G))
            self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)
        finalState = state
        print('Reward: ', G)

    def play(self, env):
        done = False
        G, reward = 0,0
        state = env.get_state_hash()
        firstState = state
        lastAction = 3
        env.reset()
        env.draw_board()
        while done == False:
          print("8: up, 4: left, 2: down, 6:right")
          action = int(input("Movement: "))
          action = {8: 0, 6:3, 2:2, 4:1}[action]
          while (action != lastAction) and (action % 2 == lastAction % 2):
                action = int(input("Movement: "))
          lastAction = action
          next_state, reward, done = self.take_action(env, action)
          self.Q[state,action] += self.learning_rate * (reward + self.discount * np.max(self.Q[next_state]) - self.Q[state,action]) 
          G += reward
          env.draw_board()
          state = next_state
        print('Reward: ', G)

In [136]:
env = Environment()
agent = Agent()
agent.train(env, epochs=7000)


Episode 1000 / Reward = 55
Episode 2000 / Reward = 180
Episode 3000 / Reward = 205
Episode 4000 / Reward = 150
Episode 5000 / Reward = 360
Episode 6000 / Reward = 150
Episode 7000 / Reward = 285
Reward:  285


In [137]:
agent.train(env, epochs=1, draw=True)


| ⌏  ⌏  ⌏ 🍏 ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
|🌕 🌝  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |

| ⌏  ⌏  ⌏ 🍏 ⌏ |
| ⌏ 🌝  ⌏  ⌏  ⌏ |
| ⌏ 🌕  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |

| ⌏  ⌏  ⌏ 🍏 ⌏ |
| ⌏ 🌕 🌝  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |

| ⌏  ⌏ 🌝 🍏 ⌏ |
| ⌏  ⌏ 🌕  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |

| ⌏  ⌏ 🌕 🌝  ⌏ |
| ⌏  ⌏ 🌕  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏ 🍏|

| ⌏  ⌏ 🌕 🌕  ⌏ |
| ⌏  ⌏  ⌏ 🌝  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏ 🍏|

| ⌏  ⌏  ⌏ 🌕  ⌏ |
| ⌏  ⌏  ⌏ 🌕 🌝 |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏ 🍏|

| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏ 🌕 🌕 |
| ⌏  ⌏  ⌏  ⌏ 🌝 |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏ 🍏|

| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏ 🌕 |
| ⌏  ⌏  ⌏  ⌏ 🌕 |
| ⌏  ⌏  ⌏  ⌏ 🌝 |
| ⌏  ⌏  ⌏  ⌏ 🍏|

| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏ 🌕 |
| ⌏  ⌏  ⌏  ⌏ 🌕 |
| ⌏ 🍏 ⌏  ⌏ 🌕 |
| ⌏  ⌏  ⌏  ⌏ 🌝 |

| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏ 🌕 |
| ⌏ 🍏 ⌏  ⌏ 🌕 |
| ⌏  ⌏  ⌏ 🌝 🌕 |

| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ 