In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D,Flatten,Activation
from tensorflow.keras.callbacks import TensorBoard
from collections import deque
import random
import numpy as np
import time
import tensorflow as tf
import os
import random
from tqdm import tqdm
from PIL import Image
import cv2

In [2]:
# Run identity
model_name = '256x2'  # used in the TensorBoard log directory name

# Replay memory / training settings
replay_memory_size = 50_000      # maxlen of the agent's replay deque
min_replay_memory_size = 1_000   # train() returns early below this many transitions
minibatch_size = 64              # transitions sampled per training call
discount = 0.99                  # weight on the max future Q value
update_target_every = 5          # compared against the agent's terminal-state counter

# Model saving
MIN_REWARD = -200  # the model is saved only when the windowed min reward is >= this

# Only referenced by commented-out GPU session code elsewhere in the notebook
MEMORY_FRACTION = 0.20

# Environment settings
EPISODES = 20_000

# Exploration settings
epsilon = 1  # mutated by the training loop (multiplied by EPSILON_DECAY each episode)
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

# Stats settings
AGGREGATE_STATS_EVERY = 50  # episodes per stats window
SHOW_PREVIEW = False

In [3]:
class BlobEnv:
    """Grid world in which a player blob moves towards food while avoiding an enemy.

    Every step costs ``MOVE_PENALTY``; landing on the food pays ``FOOD_REWARD``
    and landing on the enemy costs ``ENEMY_PENALTY``.  An episode ends on either
    of those events, or once ``MAX_EPISODE_STEPS`` steps have been taken.
    """
    SIZE = 10
    RETURN_IMAGES = True
    MOVE_PENALTY = 1
    ENEMY_PENALTY = 300
    FOOD_REWARD = 25
    MAX_EPISODE_STEPS = 200  # step budget per episode (previously a literal inside step())
    OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)  # 4
    ACTION_SPACE_SIZE = 9
    PLAYER_N = 1  # player key in dict
    FOOD_N = 2  # food key in dict
    ENEMY_N = 3  # enemy key in dict
    # the dict! (colors)
    d = {1: (255, 175, 0),
         2: (0, 255, 0),
         3: (0, 0, 255)}

    def reset(self):
        """Place player, food and enemy on distinct cells and return the first observation."""
        self.player = Blob(self.SIZE)

        # Re-roll until the food does not share a cell with the player.
        self.food = Blob(self.SIZE)
        while self.food == self.player:
            self.food = Blob(self.SIZE)

        # Re-roll until the enemy shares a cell with neither of the others.
        self.enemy = Blob(self.SIZE)
        while self.enemy == self.player or self.enemy == self.food:
            self.enemy = Blob(self.SIZE)

        self.episode_step = 0
        return self._get_observation()

    def step(self, action):
        """Apply ``action`` to the player.

        Returns:
            ``(new_observation, reward, done)``.
        """
        self.episode_step += 1
        self.player.action(action)

        #### MAYBE ###
        #self.enemy.move()
        #self.food.move()
        ##############

        new_observation = self._get_observation()

        # The enemy check takes priority over the food check, as before.
        hit_enemy = self.player == self.enemy
        reached_food = (not hit_enemy) and self.player == self.food

        if hit_enemy:
            reward = -self.ENEMY_PENALTY
        elif reached_food:
            reward = self.FOOD_REWARD
        else:
            reward = -self.MOVE_PENALTY

        # Terminal state is decided from the events themselves rather than by
        # comparing the reward value against the reward constants.
        out_of_steps = self.episode_step >= self.MAX_EPISODE_STEPS
        done = bool(hit_enemy or reached_food or out_of_steps)

        return new_observation, reward, done

    def render(self):
        """Show the current grid, enlarged to 300x300, in an OpenCV window."""
        img = self.get_image()
        img = img.resize((300, 300))  # resizing so we can see our agent in all its glory.
        cv2.imshow("image", np.array(img))  # show it!
        cv2.waitKey(1)

    def _get_observation(self):
        """Return the image array, or the relative-position tuple when RETURN_IMAGES is off."""
        if self.RETURN_IMAGES:
            return np.array(self.get_image())
        # Tuple concatenation: (dx_food, dy_food, dx_enemy, dy_enemy).
        return (self.player - self.food) + (self.player - self.enemy)

    # FOR CNN #
    def get_image(self):
        """Return the grid as a SIZE x SIZE PIL image with one coloured cell per blob."""
        grid = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)
        # Painted in this order so the player colour wins if cells coincide.
        grid[self.food.x, self.food.y] = self.d[self.FOOD_N]
        grid[self.enemy.x, self.enemy.y] = self.d[self.ENEMY_N]
        grid[self.player.x, self.player.y] = self.d[self.PLAYER_N]
        # NOTE(review): the original author remarks that the colour tuples are
        # BGR although the image is built with mode 'RGB' -- confirm if the
        # exact colours matter.
        img = Image.fromarray(grid, 'RGB')
        return img


env = BlobEnv()

# For stats: seeded with one entry so the first aggregate window is never empty
ep_rewards = [-200]

# For more repeatable results, seed every RNG the notebook draws from.
# tf.random.set_seed is the TF2 replacement for the TF1 tf.set_random_seed call.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# NOTE(review): the TF1-only GPU memory-fraction setup (tf.GPUOptions / tf.Session)
# that used to be commented out here was dropped; MEMORY_FRACTION is currently unused.

# Create models folder (no error if it already exists)
os.makedirs('models', exist_ok=True)



In [4]:
class Blob:
    """An entity occupying one cell of a square ``size`` x ``size`` grid."""

    # Action index -> (dx, dy).  Four diagonals, four axis moves, and "stay".
    _MOVES = {
        0: (1, 1),
        1: (-1, -1),
        2: (-1, 1),
        3: (1, -1),
        4: (1, 0),
        5: (-1, 0),
        6: (0, 1),
        7: (0, -1),
        8: (0, 0),
    }

    def __init__(self, size):
        """Spawn at a uniformly random cell of the grid."""
        self.size = size
        self.x = np.random.randint(0, size)
        self.y = np.random.randint(0, size)

    def __str__(self):
        return f"Blob ({self.x}, {self.y})"

    def __sub__(self, other):
        """Return the offset ``(dx, dy)`` from ``other`` to this blob."""
        return (self.x-other.x, self.y-other.y)

    def __eq__(self, other):
        """Two blobs are equal when they occupy the same cell."""
        return self.x == other.x and self.y == other.y

    def action(self, choice):
        '''
        Gives us 9 total movement options. (0,1,2,3,4,5,6,7,8)

        Unknown choices are ignored, as before.
        '''
        delta = self._MOVES.get(choice)
        if delta is not None:
            dx, dy = delta
            self.move(x=dx, y=dy)

    def move(self, x=None, y=None):
        """Shift the blob by ``(x, y)`` and clamp it to the grid.

        An axis left as ``None`` (or the legacy default ``False``) moves by a
        random amount in {-1, 0, 1}.  An explicit ``0`` means "do not move on
        this axis"; it used to be mistaken for "no value" and replaced by a
        random step.
        """
        # `is` checks are deliberate: 0 == False, but 0 is a valid delta.
        if x is None or x is False:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x

        if y is None or y is False:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y

        # If we are out of bounds, fix!
        upper = self.size - 1
        self.x = min(max(self.x, 0), upper)
        self.y = min(max(self.y, 0), upper)


In [5]:

# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback that reuses one writer and one step counter across .fit() calls.

    The caller sets ``self.step`` (the training loop assigns the episode number)
    and every scalar is written at that step.
    """

    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = self.log_dir

    # Overriding this method to stop creating default log writer
    def set_model(self, model):
        pass

    # Overridden, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None; unpacking None with ** would raise TypeError.
        self.update_stats(**(logs or {}))

    # Overridden
    # We train for one batch only, no need to save anything at epoch end
    def on_batch_end(self, batch, logs=None):
        pass

    # Overridden, so won't close writer
    def on_train_end(self, _):
        pass

    def _write_logs(self, logs, index):
        """Write every ``name -> value`` pair in ``logs`` as a scalar at step ``index``.

        ``self.step`` is left untouched: it is owned by the caller, and bumping
        it once per metric would desynchronise it from the episode number.
        """
        with self.writer.as_default():
            for name, value in logs.items():
                tf.summary.scalar(name, value, step=index)
            # One flush per call is enough to get the whole set onto disk.
            self.writer.flush()

    def update_stats(self, **stats):
        """Write the given keyword metrics as scalars at the current ``self.step``."""
        self._write_logs(stats, self.step)



In [6]:
class DQNAgent:
    """Deep Q-learning agent with a replay memory and a periodically synced target network."""

    def __init__(self):
        # Main model: fitted on every call to train().
        self.model = self.create_model()
        # Target model: supplies the future-Q estimates; starts as a copy of the main model.
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen=replay_memory_size)

        # os.path.join instead of a hard-coded "logs\..." string: the backslash
        # formed an invalid escape sequence and was a Windows-only separator.
        log_dir = os.path.join("logs", "{}-{}".format(model_name, int(time.time())))
        self.tensorboard = ModifiedTensorBoard(log_dir=log_dir)

        # Counts terminal states seen by train() since the last target sync.
        self.target_update_counter = 0

    def create_model(self):
        """Build and compile the convolutional Q-network (one linear output per action)."""
        model = Sequential()
        model.add(Conv2D(256, (3, 3), activation='relu', input_shape=env.OBSERVATION_SPACE_VALUES))
        model.add(MaxPooling2D(2, 2))
        model.add(Dropout(0.2))

        model.add(Conv2D(256, (3, 3), activation='relu'))
        model.add(MaxPooling2D(2, 2))
        model.add(Dropout(0.2))

        model.add(Flatten())
        model.add(Dense(64))

        model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear'))
        # NOTE(review): 'accuracy' is not a meaningful metric for Q-value
        # regression; kept so the logged metrics stay the same.
        model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

        return model

    def update_replay_memory(self, transition):
        """Append a ``(state, action, reward, new_state, done)`` tuple to the replay memory."""
        self.replay_memory.append(transition)

    @staticmethod
    def _scale(states):
        """Stack ``states`` into one float32 batch scaled by 1/255."""
        return np.asarray(states, dtype=np.float32) / 255

    def get_qs(self, state):
        """Return the main model's Q-values for a single observation."""
        batch = self._scale(state).reshape(-1, *state.shape)
        # Calling the model directly avoids the per-call overhead of
        # Model.predict, which is meant for large inputs; this runs every step.
        return np.array(self.model(batch, training=False))[0]

    def train(self, terminal_state, step):
        """Fit the main model on one random minibatch drawn from the replay memory.

        Does nothing until the memory holds at least ``min_replay_memory_size``
        transitions.  ``terminal_state`` enables TensorBoard logging for this
        fit and advances the target-sync counter.  ``step`` is accepted for
        interface compatibility and is not used.
        """
        # Also require a full minibatch, otherwise random.sample would raise.
        if len(self.replay_memory) < max(min_replay_memory_size, minibatch_size):
            return
        minibatch = random.sample(self.replay_memory, minibatch_size)

        current_states = self._scale([transition[0] for transition in minibatch])
        # np.array(...) copies the tensor into a writable array of targets.
        current_qs_list = np.array(self.model(current_states, training=False))

        new_current_states = self._scale([transition[3] for transition in minibatch])
        future_qs_list = np.array(self.target_model(new_current_states, training=False))

        # Overwrite only the Q-value of the action actually taken in each
        # transition; the other outputs keep the model's own prediction.
        for index, (_, action, reward, _, done) in enumerate(minibatch):
            if done:
                new_q = reward
            else:
                new_q = reward + discount * np.max(future_qs_list[index])
            current_qs_list[index][action] = new_q

        # The scaled states batch is reused instead of being rebuilt and
        # rescaled a second time.
        self.model.fit(
            current_states,
            current_qs_list,
            batch_size=minibatch_size,
            verbose=0,
            shuffle=False,
            callbacks=[self.tensorboard] if terminal_state else None,
        )

        # Count finished episodes to decide when to sync the target model.
        if terminal_state:
            self.target_update_counter += 1

        # `>=` so the sync happens every `update_target_every` terminal states;
        # `>` made it every update_target_every + 1.
        if self.target_update_counter >= update_target_every:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

In [7]:
agent = DQNAgent()

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episode'):
    # TensorBoard scalars for this episode are written at step == episode number.
    agent.tensorboard.step = episode

    episode_reward = 0
    step = 1
    current_state = env.reset()

    done = False

    while not done:
        # Epsilon-greedy: exploit the model with probability 1 - epsilon, else act randomly.
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done = env.step(action)

        episode_reward += reward

        if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
            env.render()

        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        min_reward = min(recent_rewards)
        max_reward = max(recent_rewards)
        agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # Save model, but only when min reward is greater or equal a set value.
        # The config cell defines `model_name`; `MODEL_NAME` does not exist and
        # raised NameError on a fresh kernel.
        # NOTE(review): newer Keras releases may reject the '.model' suffix --
        # confirm against the installed version.
        if min_reward >= MIN_REWARD:
            agent.model.save(f'models/{model_name}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    # Decay epsilon
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)

  0%|                                                                        | 18/20000 [00:11<4:01:22,  1.38episode/s]



  0%|                                                                       | 20/20000 [01:00<80:09:38, 14.44s/episode]



  0%|                                                                       | 22/20000 [01:26<74:08:05, 13.36s/episode]



  0%|                                                                      | 26/20000 [04:42<161:18:26, 29.07s/episode]



  0%|                                                                      | 28/20000 [07:46<343:10:15, 61.86s/episode]



  0%|1                                                                     | 30/20000 [08:25<226:03:45, 40.75s/episode]



  0%|1                                                                     | 32/20000 [10:48<330:32:54, 59.59s/episode]



  0%|1                                                                     | 38/20000 [14:14<160:26:43, 28.94s/episode]



  0%|1                                                                     | 40/20000 [16:01<210:35:37, 37.98s/episode]



  0%|1                                                                     | 42/20000 [17:01<134:46:34, 24.31s/episode]


KeyboardInterrupt: 