In [53]:
import numpy as np
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import pickle
import time

# Apply matplotlib's built-in 'ggplot' stylesheet to figures drawn after this point.
plt.style.use('ggplot')

In [54]:
# --- Environment -----------------------------------------------------------
SIZE = 10                # blobs live on coordinates 0 .. SIZE-1 along each axis
NUM_EPISODES = 25_000    # number of training episodes

# --- Per-step rewards ------------------------------------------------------
MOVE_PENALTY = -1        # any step that lands on neither food nor enemy
ENEMY_PENALTY = -300     # player lands on the enemy cell
FOOD_REWARD = 25         # player lands on the food cell

# --- Exploration schedule --------------------------------------------------
epsilon = 0.9            # starting probability of taking a random action
EPSILON_DECAY = 0.9998   # epsilon is multiplied by this after every episode
SHOW_EVERY = 2_500       # render/report period; also the moving-average window

# Pickled Q-table to resume from, or None to start from a random table.
start_q_table = "qtable-1692090992.pickle"

# --- Q-learning hyper-parameters -------------------------------------------
LEARNING_RATE = 0.1
DISCOUNT = 0.95

# --- Grid object ids -------------------------------------------------------
PLAYER_N = 1
FOOD_N = 2
ENEMY_N = 3

# Pixel colour painted for each object id.
# NOTE(review): frames are displayed with cv2.imshow, so these are presumably
# read as BGR triples -- confirm.
color_d = {
    PLAYER_N: (255, 175, 0),
    FOOD_N: (0, 255, 0),
    ENEMY_N: (0, 0, 255),
}

In [55]:
class Blob:
    """A single entity (player, food or enemy) occupying one cell of the grid.

    Coordinates are integers kept inside ``0 .. SIZE - 1`` on both axes.
    """

    def __init__(self):
        # Spawn on a uniformly random cell (randint's upper bound is exclusive).
        self.x = np.random.randint(0, SIZE)
        self.y = np.random.randint(0, SIZE)

    def __str__(self):
        """Return the position formatted as ``"x, y"``."""
        return f"{self.x}, {self.y}"

    def __sub__(self, other):
        """Return the ``(dx, dy)`` tuple ``self - other``, component-wise."""
        return (self.x - other.x, self.y - other.y)

    def action(self, choice):
        """Apply one of the four diagonal moves selected by ``choice`` (0-3).

        Any other value of ``choice`` leaves the blob where it is.
        """
        if choice == 0:
            self.move(x=1, y=1)
        elif choice == 1:
            self.move(x=-1, y=-1)
        elif choice == 2:
            self.move(x=-1, y=1)
        elif choice == 3:
            self.move(x=1, y=-1)

    def move(self, x=False, y=False):
        """Shift the blob by ``(x, y)`` and clamp it to the grid.

        Args:
            x: Step along the x axis. ``False`` (the default) or ``None``
                means "not given": a random step from {-1, 0, 1} is drawn.
            y: Step along the y axis, with the same convention as ``x``.

        An explicit ``0`` is honoured as "do not move on this axis". Identity
        checks are used for the sentinels because ``0 == False`` in Python, so
        a plain truthiness test would silently replace a requested 0-step with
        a random one.
        """
        if x is False or x is None:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x

        if y is False or y is None:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y

        # Keep the blob on the board: anything past an edge sticks to that edge.
        self.x = min(max(self.x, 0), SIZE - 1)
        self.y = min(max(self.y, 0), SIZE - 1)

In [56]:
if start_q_table is not None:
    # Resume from a previously pickled table.
    with open(start_q_table, 'rb') as table_file:
        q_table = pickle.load(table_file)
else:
    # Build a fresh table with one entry per pair of 2-D offsets. Each offset
    # component ranges over -(SIZE-1) .. SIZE-1, and every entry holds four
    # action values drawn uniformly from [-5, 0).
    # The training loop indexes the table with
    # (player - food, player - enemy), so the first pair is the food offset
    # and the second pair is the enemy offset.
    offsets = range(-SIZE + 1, SIZE)
    q_table = {
        ((food_dx, food_dy), (enemy_dx, enemy_dy)): [np.random.uniform(-5, 0) for _ in range(4)]
        for food_dx in offsets
        for food_dy in offsets
        for enemy_dx in offsets
        for enemy_dy in offsets
    }

In [57]:
rewards_collected = []
MAX_STEPS_PER_EPISODE = 200  # hard cap so every episode terminates
for episode in range(NUM_EPISODES):
    # Each episode starts with all three blobs on freshly randomised cells.
    player = Blob()
    food = Blob()
    enemy = Blob()

    # Report progress and render only on every SHOW_EVERY-th episode.
    show = episode % SHOW_EVERY == 0
    if show:
        print(f"Episode: {episode} | Epsilon: {epsilon}")
        # np.mean of an empty list emits a RuntimeWarning and yields nan, so
        # the summary is skipped until at least one episode has been recorded.
        if rewards_collected:
            print(f"Last {SHOW_EVERY} episodes mean: {np.mean(rewards_collected[-SHOW_EVERY:])}")

    episode_reward = 0
    for _ in range(MAX_STEPS_PER_EPISODE):
        # State key: offsets of the player from the food and from the enemy.
        obs = (player-food, player-enemy)

        # Epsilon-greedy policy: exploit the table with probability
        # 1 - epsilon, otherwise pick one of the four actions at random.
        if np.random.random() > epsilon:
            action = np.argmax(q_table[obs])
        else:
            action = np.random.randint(0, 4)

        player.action(action)

        # The enemy check comes first, so it wins if food and enemy share a cell.
        if player.x == enemy.x and player.y == enemy.y:
            reward = ENEMY_PENALTY
        elif player.x == food.x and player.y == food.y:
            reward = FOOD_REWARD
        else:
            reward = MOVE_PENALTY

        new_obs = (player-food, player-enemy)
        max_future_q = np.max(q_table[new_obs])
        current_q = q_table[obs][action]

        # Terminal outcomes pin the Q-value to the outcome itself; every other
        # step uses the standard Q-learning update.
        if reward == FOOD_REWARD:
            new_q = FOOD_REWARD
        elif reward == ENEMY_PENALTY:
            new_q = ENEMY_PENALTY
        else:
            new_q = current_q * (1 - LEARNING_RATE) + LEARNING_RATE * (reward + DISCOUNT * max_future_q)

        q_table[obs][action] = new_q

        # Account for the reward before rendering so that quitting a rendered
        # episode with 'q' does not drop the step that was just taken.
        episode_reward += reward

        if show:
            # Paint the three blobs on a black SIZE x SIZE image; the player is
            # drawn last so it stays visible when cells coincide.
            env = np.zeros((SIZE, SIZE, 3), dtype=np.uint8)
            env[food.y, food.x] = color_d[FOOD_N]
            env[enemy.y, enemy.x] = color_d[ENEMY_N]
            env[player.y, player.x] = color_d[PLAYER_N]

            img = Image.fromarray(env, 'RGB')
            img = img.resize((300, 300), Image.NEAREST)
            cv2.imshow("D-Ash", np.array(img))
            # Hold the final frame of an episode longer than intermediate ones.
            # Pressing 'q' abandons the current episode only.
            if reward == FOOD_REWARD or reward == ENEMY_PENALTY:
                if cv2.waitKey(500) & 0xFF == ord('q'):
                    break
            else:
                if cv2.waitKey(100) & 0xFF == ord('q'):
                    break

        if reward == FOOD_REWARD or reward == ENEMY_PENALTY:
            break

    rewards_collected.append(episode_reward)
    epsilon *= EPSILON_DECAY

# Release the OpenCV preview window once training has finished; otherwise it
# lingers for as long as the kernel stays alive.
cv2.destroyAllWindows()

# Smooth the per-episode rewards with a box filter SHOW_EVERY episodes wide.
box_kernel = np.full(SHOW_EVERY, 1 / SHOW_EVERY)
moving_avg = np.convolve(rewards_collected, box_kernel, mode='valid')

# Plot the smoothed curve against its own index.
plt.plot(np.arange(len(moving_avg)), moving_avg)
plt.xlabel("Episode #")
plt.ylabel(f"Reward {SHOW_EVERY} MA")
plt.show()

# Persist the table; the file name embeds the current Unix time in whole seconds.
q_table_path = f"qtable-{int(time.time())}.pickle"
with open(q_table_path, "wb") as out_file:
    pickle.dump(q_table, out_file)