In [None]:
# Imports
import numpy as np
import random
from builtins import range, input

In [None]:
class Environment:
    """Grid-based Snake game used as a reinforcement-learning environment.

    The board is a ``size`` x ``size`` grid addressed as ``[row, column]``.
    ``self.snake`` is an integer array of shape ``(length, 2)`` ordered from
    tail to head (the head is ``self.snake[-1]``) and ``self.apple`` is an
    array holding the apple's ``[row, column]``.

    The starting snake occupies ``[2, 0]`` and ``[2, 1]``, so the board is
    expected to be at least 3 x 3.
    """

    # Action id -> (row delta, column delta); up = 0, left = 1, down = 2, right = 3
    _MOVES = {0: (-1, 0), 1: (0, -1), 2: (1, 0), 3: (0, 1)}

    # Off-board position used for the apple when the snake covers every cell
    _NO_APPLE = (-1, -1)

    def __init__(self, size = 5):
        """Create a ``size`` x ``size`` board and start the first episode."""
        self.board = np.zeros((size, size))

        self.size = size
        self.movement = None      # last applied [row delta, column delta]
        self.states = []          # encoded states, in order of first visit
        self._state_index = {}    # encoded state -> position in self.states
        self.num_states = None
        self.ate_apple = False    # True when the last move ate the apple

        self.apple = []
        self.snake = []

        self.reset()

    def reset(self):
        """Start a new episode: initial snake, fresh apple, no pending move."""
        self.movement = None
        self.ate_apple = False
        self.set_snake()
        self.set_apple()

    def _cell_codes(self):
        """Map each occupied ``(row, col)`` to 3 = apple, 1 = body, 2 = head.

        Snake segments are written after the apple and in tail-to-head order,
        so on a shared cell the snake wins over the apple and the head wins
        over the body.
        """
        codes = {(int(self.apple[0]), int(self.apple[1])): 3}
        head_index = len(self.snake) - 1
        for index, (row, col) in enumerate(self.snake):
            codes[(int(row), int(col))] = 2 if index == head_index else 1
        return codes

    def draw_board(self):
        """Print the snake coordinates followed by a text rendering of the board."""
        print(self.snake)
        symbols = {1: "🌕", 2: "🌝", 3: "🍏"}
        codes = self._cell_codes()
        for i in range(self.size):
            row = "".join(symbols.get(codes.get((i, j), 0), " ⌏ ") for j in range(self.size))
            print("|" + row + "|")
        print("")

    def game_over(self):
        """Return True when the episode has ended.

        The episode ends when the head overlaps a body segment, when the head
        is outside the board, or when the snake fills the whole board.
        """
        head = np.asarray(self.snake[-1])

        # Case hits on itself: compare the head with every segment except itself
        if any(np.array_equal(segment, head) for segment in self.snake[:-1]):
            return True

        # Case hits a wall: any coordinate below 0 or at/after the board size
        if ((head < 0) | (head >= self.size)).any():
            return True

        # Case board is full: there is nowhere left to place an apple
        return len(self.snake) >= self.size * self.size

    def get_state(self):
        """Encode the whole board as a single integer.

        Every cell holds one of four values (0 = empty, 1 = snake body,
        2 = snake head, 3 = apple) and the cells are read row by row as the
        digits of a base-4 number, so there are at most 4 ** (size * size)
        distinct encodings. Head and body are distinguished because the
        state depends on where the snake is heading.
        """
        codes = self._cell_codes()
        h = 0
        k = 0
        for i in range(self.size):
            for j in range(self.size):
                h += (4**k) * codes.get((i, j), 0)
                k += 1
        return h

    def get_state_hash(self):
        """Return a compact index for the current state.

        Indices are handed out in order of first visit (0, 1, 2, ...), and the
        same encoded state always maps to the same index.
        """
        state = self.get_state()
        index = self._state_index.get(state)
        if index is None:
            index = len(self.states)
            self._state_index[state] = index
            self.states.append(state)
        return index

    def reward(self):
        """Return the reward for the most recent move.

        +100 for eating the apple; otherwise, per axis on which the head is
        not aligned with the apple, +5 if the move went towards the apple and
        -5 if it did not; -100 on top of that when the game is over.
        """
        reward = 0
        if self.ate_apple:
            # The apple has already been relocated by move_snake, so the eaten
            # apple is signalled by the flag rather than by comparing positions.
            reward += 100
        elif self.movement is not None:
            for i in range(2):
                diff = self.apple[i] - self.snake[-1][i]
                if diff == 0:
                    continue
                # Same sign means the step reduced the distance on this axis
                reward += 5 if self.movement[i] * diff > 0 else -5
        if self.game_over():
            reward -= 100

        return reward

    def set_apple(self):
        """Place the apple on a random board cell not occupied by the snake.

        When the snake covers every cell the apple is parked off the board.
        """
        occupied = {(int(row), int(col)) for row, col in self.snake}
        free = [
            (row, col)
            for row in range(self.size)
            for col in range(self.size)
            if (row, col) not in occupied
        ]
        self.apple = np.array(random.choice(free) if free else self._NO_APPLE)

    def set_snake(self):
        """Place the two-segment starting snake (tail at [2, 0], head at [2, 1])."""
        self.snake = np.array([[2,0], [2,1]])

    def get_value_movement(self, movement):
        """Translate an action id into a ``[row delta, column delta]`` list.

        up = 0, left = 1, down = 2, right = 3; any other value is treated as left.
        """
        return list(self._MOVES.get(movement, self._MOVES[1]))

    def move_snake(self, action):
        """Advance the snake one cell in the direction given by ``action``.

        If the new head lands on the apple the snake grows by one segment and
        a new apple is placed; otherwise the tail segment is dropped so the
        length stays the same. No bounds check is done here: use
        ``game_over()`` afterwards.
        """
        self.movement = self.get_value_movement(action)
        new_head = np.asarray(self.snake[-1]) + np.asarray(self.movement)
        if np.array_equal(new_head, self.apple):
            self.snake = np.vstack([self.snake, new_head])
            self.ate_apple = True
            self.set_apple()
        else:
            # Every segment takes the place of the one ahead of it
            self.snake = np.vstack([self.snake[1:], new_head])
            self.ate_apple = False


In [None]:
class Agent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    ``self.Q`` has one row per state index and one column per action
    (4 actions). It starts with ``4 ** size`` rows and grows on demand, since
    the environment hands out state indices incrementally.
    """

    def __init__(self, size = 5):
        """Create the agent; ``size`` only determines the initial Q-table capacity."""
        self.Q = np.zeros([4**size, 4])

        # Reward discount factor
        self.discount = 0.99

        # Learning rate
        self.learning_rate = 0.99

        # Initial probability of exploration (taking a random action)
        self.epsilon = 1.0

        # Epsilon decay, applied once per episode
        self.epsilon_decay = 0.99

        # Total reward collected in each episode of the most recent train() call
        self.episode_returns = []

    def _ensure_capacity(self, state):
        """Grow the Q-table (doubling its rows) until ``state`` is a valid row."""
        rows = self.Q.shape[0]
        if state < rows:
            return
        new_rows = max(rows, 1)
        while new_rows <= state:
            new_rows *= 2
        grown = np.zeros((new_rows, self.Q.shape[1]))
        grown[:rows] = self.Q
        self.Q = grown

    def get_action(self, state):
        """Pick an action for ``state`` with the epsilon-greedy rule.

        Returns an int in 0..3: a random action with probability
        ``self.epsilon``, otherwise the action with the highest Q-value.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Exploration
            return random.randint(0, 3)
        # Exploitation
        self._ensure_capacity(state)
        return int(np.argmax(self.Q[state]))

    def take_action(self, env, action):
        """Apply ``action`` to ``env`` and return ``(next_state, reward, done)``."""
        env.move_snake(action)
        return env.get_state_hash(), env.reward(), env.game_over()

    def train(self, env, epochs = 50, draw = False):
        """Run ``epochs`` episodes of Q-learning on ``env``.

        Each episode starts from ``env.reset()`` and runs until
        ``env.game_over()``. With ``draw=True`` the board is printed at the
        start of the episode and after every move. Per-episode total rewards
        are stored in ``self.episode_returns``.
        """
        self.episode_returns = []
        for episode in range(1, epochs + 1):
            # Reset first so the starting state belongs to the new episode
            env.reset()
            state = env.get_state_hash()
            done = False
            G = 0
            if draw:
                env.draw_board()
            while not done:
                action = self.get_action(state)
                next_state, reward, done = self.take_action(env, action)
                self._ensure_capacity(max(state, next_state))
                # No future value is available once the episode has ended
                future = 0.0 if done else np.max(self.Q[next_state])
                td_error = reward + self.discount * future - self.Q[state, action]
                self.Q[state, action] += self.learning_rate * td_error
                G += reward
                if draw:
                    env.draw_board()
                state = next_state

            self.episode_returns.append(G)
            if episode % 1000 == 0:
                print("Episode {} / Reward = {}".format(episode, G))
            self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)


In [None]:
# Seed the RNG used for apple placement and exploration so that the training
# run gives the same result after Restart Kernel -> Run All.
random.seed(42)

env = Environment()
agent = Agent()
agent.train(env)

In [None]:
# Train for one additional episode, printing the board at the start and after every move.
agent.train(env, epochs=1, draw=True)

[[2 0]
 [2 1]]
| ⌏  ⌏  ⌏ 🍏 ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
|🌕🌝 ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |

[0, 1]
0 [2 1]
1 [2 0]
[[2 2]
 [2 2]]
| ⌏  ⌏  ⌏ 🍏 ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏ 🌕🌝 ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |
| ⌏  ⌏  ⌏  ⌏  ⌏ |



In [None]:

# Scratch check: walk a list backwards while skipping its last element.
values = [2, 3, 6]
for item in values[-2::-1]:
    print(item)

3
2
