In [12]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard
from keras.optimizers import Adam
from collections import deque
import numpy as np

import time
import random
import tensorflow as tf
import os
from tqdm import tqdm
from PIL import Image
import cv2
import keras

from blob import Blob, BlobEnv



In [13]:
# Best window-average reward seen so far. The training loop saves the model only
# when a logged stats window's average reward meets or beats this value, and then
# raises it to that average.
best_avg_award = -201  # initial bar a window average must reach before the first save


In [22]:
# DQN training settings
DISCOUNT = 0.99  # weight on the max future Q-value when building the training target
REPLAY_MEMORY_SIZE = 10_000  # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1_000  # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 32  # How many steps (samples) to use for training
UPDATE_TARGET_EVERY = 2  # sync the target network after this many agent.train() calls that actually fit
MODEL_NAME = 'v3'  # prefix used for the TensorBoard log directory and saved-model filenames

# Environment settings
EPISODES = 1000
# Exploration settings
epsilon = 1  # not a constant, going to be decayed
percent_to_min = 0.95  # fraction of EPISODES after which epsilon should have reached MIN_EPSILON
MIN_EPSILON = 0.1  # floor that epsilon is clamped to



# epsilon starts at 100% and is multiplied by EPSILON_DECAY once per episode.
# Pick the per-episode factor x so that epsilon reaches min_val after
# n*percent_to_min episodes (n = EPISODES):
#   x^(n*percent_to_min) = min_val
#   log(x) * (n*percent_to_min) = log(min_val)
#   x = exp(log(min_val)/(n*percent_to_min))
EPSILON_DECAY = np.exp(np.log(MIN_EPSILON)/(EPISODES*percent_to_min))

#  Stats settings
AGGREGATE_STATS_EVERY = 100  # episodes to record stats AND render
SHOW_PREVIEW = False  # when True, env.render() is called on every AGGREGATE_STATS_EVERY-th episode


In [15]:
# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback that logs every ``.fit()`` call into one log file.

    The caller sets ``self.step`` (the training loop sets it to the episode
    number) and all scalars are written at that step through a single writer
    created in ``__init__``.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``TensorBoard`` and create the shared writer."""
        super().__init__(**kwargs)
        self.step = 1

        # tf.summary.FileWriter raised AttributeError in this environment, so the
        # writer is created with tf.summary.create_file_writer instead.
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = self.log_dir

    def set_model(self, model):
        """Attach ``model`` without letting the parent create its default writer.

        The private attributes assigned here are presumably the ones the parent
        callback reads while fitting (an AttributeError on ``_train_dir`` was
        seen without them) -- NOTE(review): they are TF/Keras internals and may
        change between versions.
        """
        self.model = model
        self._train_dir = os.path.join(self._log_write_dir, 'train')
        self._train_step = self.model._train_counter
        self._val_dir = os.path.join(self._log_write_dir, 'validation')
        self._val_step = self.model._test_counter
        self._should_write_train_graph = False

    def on_epoch_end(self, epoch, logs=None):
        """Write the epoch's metrics at ``self.step`` instead of at ``epoch``.

        ``logs`` may be ``None`` (it is the declared default); treat that as
        "nothing to write" rather than failing on ``**None``.
        """
        self.update_stats(**(logs or {}))

    def on_batch_end(self, batch, logs=None):
        """No-op: we train for one batch only, nothing to save per batch."""
        pass

    def on_train_end(self, _=None):
        """No-op so the shared writer is not closed after each ``.fit()``."""
        pass

    def update_stats(self, **stats):
        """Write each ``name=value`` pair in ``stats`` as a scalar at ``self.step``."""
        with self.writer.as_default():
            for key, value in stats.items():
                tf.summary.scalar(key, value, step=self.step)
            # One flush per call is enough; no need to flush after every scalar.
            self.writer.flush()

    def _write_logs(self, logs, index):
        """Compatibility no-op; ``update_stats`` does the actual writing."""
        pass
                
# AttributeError: 'ModifiedTensorBoard' object has no attribute '_train_dir'

                
                
                
                
# class ModifiedTensorBoard(TensorBoard):

# def __init__(self, **kwargs):
#     super().__init__(**kwargs)
#     self.step = 1
#     self.writer = tf.summary.create_file_writer(self.log_dir)
#     self._log_write_dir = self.log_dir

# def set_model(self, model):
#     self.model = model

#     self._train_dir = os.path.join(self._log_write_dir, 'train')
#     self._train_step = self.model._train_counter

#     self._val_dir = os.path.join(self._log_write_dir, 'validation')
#     self._val_step = self.model._test_counter

#     self._should_write_train_graph = False

# def on_epoch_end(self, epoch, logs=None):
#     self.update_stats(**logs)

# def on_batch_end(self, batch, logs=None):
#     pass

# def on_train_end(self, _):
#     pass

# def update_stats(self, **stats):
#     with self.writer.as_default():
#         for key, value in stats.items():
#             tf.summary.scalar(key, value, step = self.step)
#             self.writer.flush()

In [23]:
class DQNAgent:
    """DQN agent: a main network, a target network and a replay memory.

    Transitions are stored as ``(current_state, action, reward, new_state, done)``
    tuples. ``self.model`` is the network that gets fitted; ``self.target_model``
    supplies the future Q-values for the training targets and is periodically
    overwritten with the main network's weights.
    """

    STATE_SIZE = 4
    MODEL_INPUT_SIZE = (-1, 1, STATE_SIZE)  # -1 for number of samples

    def __init__(self):
        """Build both networks, the replay memory and the TensorBoard callback."""
        # Main model: fitted in train() and queried in get_qs().
        self.model = self.create_model()

        # Target model: starts as a copy of the main model, queried in train().
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        self.tensorboard = ModifiedTensorBoard(log_dir=f'logs/{MODEL_NAME}-{int(time.time())}')

        # Counts train() calls that actually fitted since the last target sync.
        self.target_update_counter = 0

    def create_model(self):
        """Return a compiled two-layer dense network.

        Reads ``env.ACTION_SPACE_SIZE`` from the module-level ``env``, so ``env``
        must exist before an agent is constructed.
        """
        model = Sequential()
        model.add(Dense(4, input_shape=(1, self.STATE_SIZE), activation='relu'))
        model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear'))

        # NOTE(review): 'accuracy' is kept so the logged metrics stay unchanged;
        # it is probably not a meaningful metric for an MSE regression target.
        model.compile(loss='mse', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
        return model

    def update_replay_memory(self, transition):
        """Append one transition tuple to the replay memory."""
        self.replay_memory.append(transition)

    def get_qs(self, state):
        """Return the main network's prediction for a single ``state``.

        ``predict`` returns one entry per sample, hence the ``[0]``.
        """
        single_sample = np.array(state).reshape(self.MODEL_INPUT_SIZE)
        return self.model.predict(single_sample, verbose=0)[0]

    def train(self):
        """Fit the main network on one random minibatch from replay memory.

        Does nothing until the memory holds at least ``MIN_REPLAY_MEMORY_SIZE``
        transitions. It also requires at least ``MINIBATCH_SIZE`` of them,
        because ``random.sample`` raises ``ValueError`` when asked for more
        items than are available.
        """
        if len(self.replay_memory) < max(MIN_REPLAY_MEMORY_SIZE, MINIBATCH_SIZE):
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Current states go through the main network, next states through the
        # target network.
        current_states = np.array([transition[0] for transition in minibatch]).reshape(self.MODEL_INPUT_SIZE)
        new_current_states = np.array([transition[3] for transition in minibatch]).reshape(self.MODEL_INPUT_SIZE)

        current_qs_list = self.model.predict(current_states, verbose=0)
        future_qs_list = self.target_model.predict(new_current_states, verbose=0)

        # Turn the predicted Q-values into training targets in place: only the
        # entry of the action actually taken is replaced, so every other action
        # contributes zero error.
        for index, (_state, action, reward, _new_state, done) in enumerate(minibatch):
            if done:
                # Terminal transition: no future reward to bootstrap from.
                new_q = reward
            else:
                new_q = reward + DISCOUNT * np.max(future_qs_list[index])
            current_qs_list[index, 0, action] = new_q

        self.target_update_counter += 1

        # Fit on all samples as one batch; attach the TensorBoard callback only
        # on calls where the counter is a multiple of UPDATE_TARGET_EVERY.
        log_this_fit = self.target_update_counter % UPDATE_TARGET_EVERY == 0
        self.model.fit(current_states, current_qs_list,
                       batch_size=MINIBATCH_SIZE,
                       verbose=0, shuffle=False,
                       callbacks=[self.tensorboard] if log_this_fit else None)

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
            
            


In [24]:
# Create the environment before the agent: DQNAgent.create_model() reads
# env.ACTION_SPACE_SIZE from this module-level name while building its networks.
env = BlobEnv()
agent = DQNAgent()

In [8]:
# most_recent_model = 'models/v2____24.00max_-199.32avg_-486.00min__1667200184.model'
# agent.model.set_weights(keras.models.load_model(most_recent_model).get_weights())

In [9]:
# different model structure using 6 states
# most_recent_model = 'models/simple____25.00max_-104.08avg_-478.00min__1667070188.model'
# agent.model.set_weights(keras.models.load_model(most_recent_model).get_weights())
# agent.target_model.set_weights((keras.models.load_model(most_recent_model).get_weights()))

In [10]:
# Image.fromarray(board, 'RGB') 
# cv2.cvtColor(img, cv2.  COLOR_BGR2RGB)

In [25]:

# Training run.
# Exploration restarts at 1 on every run of this cell and is decayed once per
# episode at the bottom of the loop.
epsilon = 1
# Total reward of each finished episode, used for the windowed stats below.
ep_rewards = []

# Seed every RNG in play for more repeatable results
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Saved models go into ./models
if not os.path.isdir('models'):
    os.makedirs('models')

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    # TensorBoard scalars written during this episode are tagged with the episode number
    agent.tensorboard.step = episode
    episode_reward = 0

    # The environment keeps track of the step count itself.
    current_state = env.reset()

    # Preview only on every AGGREGATE_STATS_EVERY-th episode, and only if enabled
    render_this_episode = SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY
    if render_this_episode:
        env.render()

    done = False
    while not done:
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)
        else:
            action = np.argmax(agent.get_qs(current_state))

        new_state, reward, done = env.step(action)
        episode_reward += reward

        # Every step is stored in replay memory
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        current_state = new_state

        if render_this_episode:
            env.render()

    # The network is trained once per episode, after the episode has finished.
    # NOTE(review): that is a single minibatch fit per episode -- confirm this
    # is intended rather than training on every step.
    agent.train()

    ep_rewards.append(episode_reward)

    # Log stats on the first episode and then on every AGGREGATE_STATS_EVERY-th one
    if episode == 1 or (episode % AGGREGATE_STATS_EVERY) == 0:
        recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
        average_reward = sum(recent_rewards)/len(recent_rewards)
        min_reward = min(recent_rewards)
        max_reward = max(recent_rewards)
        agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)
        print(f'episode = {episode}: (min,avg,best) = ({min_reward},{average_reward}, {max_reward}).')
        print(f'\tepsilon = {epsilon}')

        # Save the model whenever the window average ties or beats the best so far
        if average_reward >= best_avg_award:
            print(f'\t!!! beat old avg award of {best_avg_award}')
            best_avg_award = average_reward
            agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    # Decay epsilon, never letting it drop below MIN_EPSILON
    if epsilon > MIN_EPSILON:
        epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)


  1%|2                                   | 7/1000 [00:00<00:17, 57.14episodes/s]

episode = 1: (min,avg,best) = (-376,-376.0, -376).
	epsilon = 1


 10%|###4                              | 100/1000 [00:32<10:04,  1.49episodes/s]

episode = 100: (min,avg,best) = (-574,-235.23, 25).
	epsilon = 0.7866643579629186


 20%|######8                           | 200/1000 [01:43<15:34,  1.17s/episodes]

episode = 200: (min,avg,best) = (-582,-271.88, 23).
	epsilon = 0.6173426982315325


 30%|##########2                       | 300/1000 [04:00<09:05,  1.28episodes/s]

episode = 300: (min,avg,best) = (-492,-258.17, 24).
	epsilon = 0.48446583756087014


 40%|#############6                    | 400/1000 [06:35<11:55,  1.19s/episodes]

episode = 400: (min,avg,best) = (-528,-238.01, 23).
	epsilon = 0.38018939632056514


 50%|#################                 | 500/1000 [09:07<10:02,  1.21s/episodes]

episode = 500: (min,avg,best) = (-553,-246.06, 25).
	epsilon = 0.2983574193844677


 60%|####################4             | 600/1000 [11:53<11:58,  1.80s/episodes]

episode = 600: (min,avg,best) = (-564,-237.09, 25).
	epsilon = 0.2341389595902939


 70%|#######################7          | 700/1000 [15:26<06:39,  1.33s/episodes]

episode = 700: (min,avg,best) = (-512,-245.47, 24).
	epsilon = 0.18374288298620148


 80%|###########################2      | 800/1000 [19:29<08:27,  2.54s/episodes]

episode = 800: (min,avg,best) = (-510,-248.41, 25).
	epsilon = 0.144194059404544


 90%|##############################6   | 900/1000 [23:53<04:02,  2.42s/episodes]

episode = 900: (min,avg,best) = (-564,-258.95, 24).
	epsilon = 0.11315772578316718


100%|#################################| 1000/1000 [27:59<00:00,  1.68s/episodes]

episode = 1000: (min,avg,best) = (-544,-261.17, 25).
	epsilon = 0.1





In [None]:
# Save the current main network unconditionally. The filename is built from
# max_reward / average_reward / min_reward, which only exist after the training
# cell has logged stats at least once -- run that cell first.
agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

In [None]:
# Rollout without learning: play EPISODES episodes with the current agent and
# record each episode's total reward. Nothing is stored in replay memory and
# agent.train() is never called here.
# NOTE(review): actions are still epsilon-greedy using whatever value the global
# `epsilon` currently holds -- confirm that is the intended evaluation policy.
ep_rewards = []

# Seed every RNG in play for more repeatable results
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Create models folder
if not os.path.isdir('models'):
    os.makedirs('models')

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    agent.tensorboard.step = episode
    episode_reward = 0

    # The environment keeps track of the step count itself.
    current_state = env.reset()
    print(f'current_state = {current_state}')

    # Preview only on every AGGREGATE_STATS_EVERY-th episode, and only if enabled
    preview_this_episode = SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY
    if preview_this_episode:
        env.render()

    done = False
    while not done:
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)
        else:
            action = np.argmax(agent.get_qs(current_state))

        new_state, reward, done = env.step(action)
        episode_reward += reward

        if preview_this_episode:
            # NOTE(review): render is called with `episode` here but without
            # arguments before the loop -- confirm which signature BlobEnv expects.
            env.render(episode)

        current_state = new_state

    ep_rewards.append(episode_reward)

In [None]:
# Display the per-episode total rewards collected by the most recently run loop
# (both the training cell and the rollout cell assign ep_rewards).
ep_rewards
