In [1]:
# if colab

# !pip install pybullet
# !pip install gym
# !apt-get install python-opengl -y
# !apt install xvfb -y
# !pip install gym pyvirtualdisplay > /dev/null 2>&1
# !pip install -q git+https://github.com/tensorflow/examples.git

In [2]:
import os
import cv2
import tensorflow as tf 
from tensorflow.keras import layers, models
import numpy as np 
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
import pybullet_envs
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

In [3]:
np_seed = 654765645
tf_seed = 776644345
np.random.seed(np_seed)
tf.random.set_seed(tf_seed)

# check if GPU
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# colab

# from google.colab import drive
# drive.mount('/content/drive')

# root_dir = "drive/My Drive/"
# base_dir = root_dir + 'CPCtesting'
# os.makedirs(base_dir,exist_ok=True)

# train_dir = base_dir + '/train'
# os.makedirs(train_dir,exist_ok=True)

# model_dir = base_dir + '/model'
# os.makedirs(model_dir,exist_ok=True)

# if local machine
base_dir = "."

train_dir = base_dir + '/train'
os.makedirs(train_dir,exist_ok=True)

model_dir = base_dir + '/model'
os.makedirs(model_dir,exist_ok=True)

logs_base_dir = base_dir + '/logs'

log_dir = base_dir + '/training_logs_save'

reward_dir = base_dir + '/training_rewards_save'




# tensorboard
%load_ext tensorboard
os.makedirs(logs_base_dir, exist_ok=True)
%tensorboard --logdir {logs_base_dir}

ERROR: Could not find `tensorboard`. Please ensure that your PATH
contains an executable `tensorboard` program, or explicitly specify
the path to a TensorBoard binary by setting the `TENSORBOARD_BINARY`
environment variable.

In [5]:
# get data
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

In [6]:
class CPCModel(tf.keras.Model):
    def __init__(self,code_size, predict_terms, terms=4, units=256, image_size=64, channels=3):
        super(CPCModel, self).__init__()
        self.code_size = code_size
        self.predict_terms = predict_terms
        self.terms = terms
        self.units = units
        self.image_size = image_size
        self.channels = channels

        self.conv1 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=2, activation='linear')
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.lrelu1 = tf.keras.layers.LeakyReLU()
        self.conv2 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=2, activation='linear')
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.lrelu2 = tf.keras.layers.LeakyReLU()
        self.conv3 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=2, activation='linear')
        self.bn3 = tf.keras.layers.BatchNormalization()
        self.lrelu3 = tf.keras.layers.LeakyReLU()
        self.conv4 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=2, activation='linear')
        self.bn4 = tf.keras.layers.BatchNormalization()
        self.lrelu4 = tf.keras.layers.LeakyReLU()
        self.flatten = tf.keras.layers.Flatten()
        self.dense5 = tf.keras.layers.Dense(units=256, activation='linear')
        self.bn5 = tf.keras.layers.BatchNormalization()
        self.lrelu5 = tf.keras.layers.LeakyReLU()
        self.dense6 = tf.keras.layers.Dense(units=code_size, activation='linear', name='encoder_embedding')

        self.gru = tf.keras.layers.GRU(units, return_sequences=False, name='ar_context')
        self.linear = tf.keras.layers.Dense(predict_terms*code_size, activation='linear')    
   
    def encoding(self,x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.lrelu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.lrelu2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.lrelu3(x)
        x = self.conv4(x)
        x = self.bn4(x)
        x = self.lrelu4(x)
        x = self.flatten(x)
        x = self.dense5(x)
        x = self.bn5(x)
        x = self.lrelu5(x)
        z = self.dense6(x)
        return z
  
    def get_context(self, x):
        z = self.encoding(x)
        z = tf.reshape(z, [-1, self.terms, self.code_size])
        c = self.gru(z)
        return c
    def get_prediction(self, x):
        c = self.get_context(x)
        z_hats = self.linear(c)
        z_hat = tf.reshape(z_hats, [-1, self.predict_terms, self.code_size])
        return z_hat

    def optimizer(self):
        pass

    def loss(self,weights,biases,labels,inputs,num_samples,num_classes): 
        loss = tf.nn.nce_loss(
        weights, biases, labels, inputs, num_sampled, num_classes, num_true=1,
        sampled_values=None, remove_accidental_hits=False, name='nce_loss')
        return loss
  
    def call(self,inputs):
        x_tm, x_tp = inputs
        x_tm = tf.reshape(x_tm, [-1, self.image_size, self.image_size, self.channels])
        x_tp = tf.reshape(x_tp, [-1, self.image_size, self.image_size, self.channels])
        z_hat = self.get_prediction(x_tm)
        z_tp = self.encoding(x_tp)
        z_tp = tf.reshape(z_tp, [-1, self.predict_terms, self.code_size])
        dot_prods = tf.reduce_mean(tf.reduce_mean(z_hat*z_tp, axis=-1), axis=-1, keepdims=True)
        probs = tf.sigmoid(dot_prods)
        return probs


  # def save(self):
  #       f1 = os.path.join(folder,'target_actor')
  #       f2 = os.path.join(folder, 'target_critic')
  #       f3 = os.path.join(folder, 'actor')
  #       f4 = os.path.join(folder, 'critic')
  #       self.target_actor.save(f1)
  #       self.target_critic.save(f2)
  #       self.actor.save(f3)
  #       self.critic.save(f4)


  # def load(self):
  #   pass

In [7]:
class ReplayBuffer():
    def __init__(self,state_space,action_space,capacity,batch):
        self.capacity = capacity
        self.batch = batch
        
        self.avaliable_batch = 0
        self.idx = 0
        self.entries = 0 
        
        self.states = np.empty((self.capacity,state_space),dtype = np.float32)
        self.next_states = np.empty((self.capacity,state_space),dtype = np.float32)
        self.actions = np.empty((self.capacity,action_space),dtype = np.float32)
        self.rewards = np.empty((self.capacity,1),dtype = np.float32)
        self.not_dones = np.empty((self.capacity, 1), dtype=np.float32)
        
    def add(self,state,next_state,action,reward,done):
        np.copyto(self.states[self.idx], state)
        np.copyto(self.actions[self.idx], action)
        np.copyto(self.rewards[self.idx], reward)
        np.copyto(self.next_states[self.idx], next_state)
        np.copyto(self.not_dones[self.idx], not done)
        #self.avaliable_batch= (self.avaliable_batch + 1) if self.avaliable_batch < self.batch else self.batch
        #self.entries = (self.entries + 1) if self.entries < self.capacity else self.capacity
        self.idx = (self.idx + 1) % self.capacity
        
    def sample(self):
        #num = self.avaliable_batch
        #if(num > self.batch):
        #    num = self.batch
        #print('avaliable_batch: ',self.avaliable_batch, "entries: ", self.entries,'capacity: ', self.capacity)
        idx = np.random.choice(self.capacity,size = self.batch,replace=False)
        #print('test idx: ', idx)
        
        states = tf.convert_to_tensor(self.states[idx])
        next_states = tf.convert_to_tensor(self.next_states[idx])
        actions = tf.convert_to_tensor(self.actions[idx])
        rewards = tf.convert_to_tensor(self.rewards[idx])
        not_dones = tf.convert_to_tensor(self.not_dones[idx])
        
        return states,next_states,actions,rewards,not_dones
    
    def fill_buffer(self,timesteps,state,prev_timesteps):
        print('sim test: ',env._max_episode_steps,":",timesteps)
        for step in range(timesteps):
            action = env.action_space.sample()
            next_state, reward, done, info = env.step(action)
            np.copyto(self.states[step], state)
            np.copyto(self.actions[step], action)
            np.copyto(self.rewards[step], reward)
            np.copyto(self.next_states[step], next_state)
            np.copyto(self.not_dones[step], not done)
            state = next_state
            if(done):
                print("step: ", step)
                state = env.reset()
                print('done seeding replay buffer')            
            
        

class Actor(tf.keras.Model):
    def __init__(self,action_space,critic,actor_lr = 0.001,variance = 0.2):
        super(Actor,self).__init__()
        
        #params
        self.std = np.sqrt(variance)
        self.noise_flag = 1.0
        self.action_space = action_space
        
        #optimizer
        self.opt = tf.keras.optimizers.Adam(actor_lr)
        self.critic = critic
       
        #model
        self.dense1 = tf.keras.layers.Dense(400,activation = 'relu',dtype='float32')
        self.dense2 = tf.keras.layers.Dense(300,activation='relu',dtype='float32')
        self.dense3 = tf.keras.layers.Dense(action_space,activation = 'tanh',dtype='float32')    
        
    def loss(self,states,actions):
        actions = self(states)
        #stateactions = tf.concat([states,actions],-1)
        Q = self.critic(states,actions)
        loss = - tf.reduce_mean(Q)
        return loss
    
    def update(self,states,actions):
        with tf.GradientTape() as tape:
            loss = self.loss(states,actions)

        grad = tape.gradient(loss,self.trainable_variables)
        self.opt.apply_gradients(zip(grad, self.trainable_variables))
        #print('actor loss: ', loss ,"\n" )
        return loss
    
    def set_noise_flag(self,num):
        self.noise_flag = np.float32(not not num)
    
    def continous_noise(self):
        result = np.random.normal(0,self.std,size=(self.action_space,))
        return self.noise_flag *result
    
    def call(self,x):
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        return x

class Critic(tf.keras.Model):
    def __init__(self,critic_lr = 0.001):
        super(Critic,self).__init__()
        
        # optimizer
        self.opt = tf.keras.optimizers.Adam(critic_lr)
        
        # loss
        #self.loss = tf.keras.losses.MSE
        
        
        # layers
        self.dense1 = tf.keras.layers.Dense(400,activation = 'relu',dtype='float32')
        self.dense2 = tf.keras.layers.Dense(300,activation='relu',dtype='float32')
        self.dense3 = tf.keras.layers.Dense(1,dtype='float32') 
        
    #loss
    def loss(self,actual,pred):
        result = tf.keras.losses.MSE(actual,pred)
        #print('result: ', result)
        #print('actual: ', actual.shape) # shape (16,1)
        #print('pred: ',pred.shape) # shape (16,1,1,1)
        return result
    
    def update(self,states,actions,Q_h):
        match = Q_h.shape[0]
        with tf.GradientTape() as tape:
            Q = self.call(states,actions)
            Q = tf.reshape(Q,(1,1,1,match))
            Q_h = tf.reshape(Q_h,(1,1,1,match))
            loss = self.loss(Q,Q_h)

        grad = tape.gradient(loss,self.trainable_variables)
        self.opt.apply_gradients(zip(grad, self.trainable_variables))
        #print('critic loss: ', loss ,"\n" )
        return loss
    
    #predict
    def call(self,states,actions):
        x = tf.concat([states,actions],-1)
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        return x
    
    
class SAC(tf.keras.Model):
    def __init__(self,
                 state_space,
                 action_space,
                 capacity = 1000,
                 batch = 1, 
                 tau=0.999,
                 gamma=0.9,
                 actor_lr = 0.001, 
                 critic_lr = 0.0001,
                 variance = 1.0):
        super(SAC,self).__init__()
        # tensorboard callbacks
        self.cb = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=1/3, patience=2, min_lr=1e-4),
                   tf.keras.callbacks.ModelCheckpoint('weights/weights.{epoch:02d}-{val_binary_accuracy:.2f}.cpkt',
                                          monitor='val_binary_accuracy', save_best_only=True, save_weights_only=True),
                   tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=3),
                   tf.keras.callbacks.TensorBoard()]
        
        
        #hyperparameters
        self.batch = batch
        self.tau = tau
        self.gamma = gamma
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.noise_flag = 1
        self.std = np.sqrt(variance)
        
        
        #spaces
        self.action_space = action_space
        self.state_space = state_space
        self.state_action_space = self.action_space + self.state_space
        
        # replay buffer
        self.replay_buffer = ReplayBuffer(self.state_space,self.action_space,capacity,self.batch)
        
        # models
        self.critic = Critic(critic_lr = critic_lr)        
        self.actor = Actor(self.action_space,actor_lr=actor_lr,critic = self.critic)
        #self.critic.compile(optimizer = self.critic.opt,loss = self.critic.loss)
        #self.actor.compile(optimizer = self.actor.opt,loss = self.actor.loss)
        
        # target models
        self.target_actor = Actor(self.action_space,actor_lr=actor_lr,critic = self.critic)
        self.target_critic = Critic(critic_lr = critic_lr)  
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        
        #cpc
        #self.cpc = CPC(code_size=128, predict_terms=4, terms=4, units=256, image_size=64, channels=3)
    
    def store_replay(self,state,next_state,action,reward,done):
        self.replay_buffer.add(state,next_state,action,reward,done)
    
    def set_labels(self,states,new_states,actions,rewards):
        mu = self.target_actor(new_states)
        #print(mu,states)
#         stateactions = tf.concat([states,mu],1)
        Q_h = self.target_critic(new_states,mu)
        y = rewards + self.gamma * Q_h
        #y = np.concatenate(self.y,0).astype('float32') #.reshape((self.minibatch_size,1,1,1))
        #print('y: ',self.y)
        y = tf.reshape(y,(self.replay_buffer.batch,1,1,1))
        return y 
    
        
    def discrete_random_noise(self):
        pass
    
    def update_target_weights(self):   
        tgt_critic_weight = self.target_critic.get_weights()
        tgt_actor_weight = self.target_actor.get_weights()
        actor_weight = self.actor.get_weights()
        critic_weight = self.target_actor.get_weights()
        
        for idx,(part_tgt,part_net) in enumerate(zip(tgt_critic_weight,critic_weight)):
            tgt_critic_weight[idx] = self.tau*part_tgt + (1-self.tau)*part_net
        
        for idx,(part_tgt,part_net) in enumerate(zip(tgt_actor_weight,actor_weight)):
            tgt_actor_weight[idx] = self.tau*part_tgt + (1-self.tau)*part_net
            
        self.target_actor.set_weights(tgt_actor_weight)
        self.target_critic.set_weights(tgt_critic_weight)
            
    def save(self,filename):
        self.actor.save_weights(filename)
        self.critic.save_weights(filename)
        self.target_actor.save_weights(filename)
        self.target_critic.save_weights(filename)
    
    def load(self,filename):
        self.actor.load_weights(filename)
        self.critic.load_weights(filename)
        self.target_actor.load_weights(filename)
        self.target_critic.load_weights(filename)
            

In [8]:
class DataHandler:
    def __init__(self, batch_size, terms, predict_terms=1, image_size=64, color=False, rescale=True, aug=True, is_training=True, method='cpc'):
        self.batch_size = batch_size
        self.terms = terms
        self.predict_terms = predict_terms
        self.image_size = image_size
        self.color = color
        self.rescale = rescale
        self.aug = aug
        self.is_training = is_training
        self.method = method
        self.lena = cv2.imread(os.path.join(base_dir,'lena.jpg'))
        (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
        if self.is_training:
            self.x = x_train/255.0
            self.y = y_train
        else:
            self.x = x_test/255.0
            self.y = y_test
        self.idxs = []
        for i in range(10):
            y = y_train if self.is_training else y_test
            self.idxs.append(np.where(y == i)[0])
        self.n_samples = len(self.y)//terms if self.method == 'cpc' else len(self.y)
        self.shape = self.x.shape
        self.n_batches = self.n_samples//batch_size

    def __iter__(self):
        return self

    def __next__(self):
        return self.cpc_batch() if self.method == 'cpc' else self.benchmark_batch()

    def __len__(self):
        return self.n_batches

    def cpc_batch(self):
        img_labels = np.zeros((self.batch_size, self.terms + self.predict_terms))
        sentence_labels = np.ones((self.batch_size, 1)).astype('int32')
        for bi in range(self.batch_size):
            seed = np.random.randint(10)
            sentence = np.arange(seed, seed + self.terms + self.predict_terms) % 10
            if bi < self.batch_size//2:
                num = np.arange(10)
                predicted = sentence[-self.predict_terms:]
                for i, p in enumerate(predicted):
                    predicted[i] = np.random.choice(num[num != p], 1)
                sentence[-self.predict_terms:] = predicted % 10
                sentence_labels[bi, :] = 0
            img_labels[bi, :] = sentence
        images = self.get_samples(img_labels).reshape((self.batch_size, self.terms+self.predict_terms, self.image_size, self.image_size, 3))
        x_images = images[:, :-self.predict_terms, ...]
        y_images = images[:, -self.predict_terms:, ...]
        idx = np.random.choice(self.batch_size, self.batch_size, replace=False)
        return [x_images[idx], y_images[idx]], sentence_labels[idx]

    def get_samples(self, img_labels):
        idx = []
        for label in img_labels.flatten():
            idx.append(np.random.choice(self.idxs[int(label)], 1)[0])
        img_batch = self.x[idx, :, :]
        if self.aug:
            img_batch = self._aug_batch(img_batch)
        return img_batch

    def _aug_batch(self, img_batch):
        if self.image_size != 28:
            resized = []
            for i in range(img_batch.shape[0]):
                resized.append(cv2.resize(img_batch[i], (self.image_size, self.image_size)))
            img_batch = np.stack(resized)
        img_batch = img_batch.reshape((img_batch.shape[0], 1, self.image_size, self.image_size))
        img_batch = np.concatenate([img_batch, img_batch, img_batch], axis=1)

        if self.color:
            img_batch[img_batch >= 0.5] = 1
            img_batch[img_batch < 0.5] = 0
            for i in range(img_batch.shape[0]):
                x_c = np.random.randint(0, self.lena.shape[0] - self.image_size)
                y_c = np.random.randint(0, self.lena.shape[1] - self.image_size)
                img = self.lena[x_c:x_c+self.image_size, y_c:y_c+self.image_size]
                img = np.array(img).transpose((2, 0, 1))/255.0
                for j in range(3):
                    img[j, :, :] = (img[j, :, :] + np.random.uniform(0, 1))/2.0
                img[img_batch[i, :, :, :] == 1] = 1 - img[img_batch[i, :, :, :] == 1]
                img_batch[i, :, :, :] = img

        if self.rescale:
            img_batch = img_batch * 2 - 1
        img_batch = img_batch.transpose((0, 2, 3, 1))
        return img_batch

    def benchmark_batch(self):
        idx = np.random.choice(len(self.x), self.batch_size, replace=False)
        img_batch = self.x[idx]
        label_batch = self.y[idx]
        if self.aug:
            img_batch = self._aug_batch(img_batch)
        label_batch = label_batch.reshape((-1, 1))
        return img_batch, label_batch

In [9]:
# #train loop
# dh_train = DataHandler(64, 4, predict_terms=4, image_size=64, color=True, rescale=True, aug=True, is_training=True, method='cpc')
# dh_test = DataHandler(64, 4, predict_terms=4, image_size=64, color=True, rescale=True, aug=True, is_training=False, method='cpc')
# accuracy_metric_train = tf.keras.metrics.BinaryAccuracy()
# loss_metric_train = tf.keras.metrics.BinaryCrossentropy()
# accuracy_metric_test = tf.keras.metrics.BinaryAccuracy()
# loss_metric_test = tf.keras.metrics.BinaryCrossentropy()
# cpc = CPCModel(code_size=128, predict_terms=4, terms=4, units=256, image_size=64, channels=3)
# optim = tf.keras.optimizers.Adam(1e-3)
# cb = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=1/3, patience=2, min_lr=1e-4),
#       tf.keras.callbacks.ModelCheckpoint('weights/weights.{epoch:02d}-{val_binary_accuracy:.2f}.cpkt',
#                                           monitor='val_binary_accuracy', save_best_only=True, save_weights_only=True),
#       tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=3),
#       tf.keras.callbacks.TensorBoard()]
# cpc.compile(optimizer=optim, loss='binary_crossentropy', metrics=['binary_accuracy'])
# cpc.fit(x=dh_train, epochs=10, validation_data=dh_test, steps_per_epoch=60000//64, validation_steps=10000//64, callbacks=cb)


In [10]:
%tensorboard --logdir {logs_base_dir}

ERROR: Could not find `tensorboard`. Please ensure that your PATH
contains an executable `tensorboard` program, or explicitly specify
the path to a TensorBoard binary by setting the `TENSORBOARD_BINARY`
environment variable.

In [11]:
# train loop params


episodes = 200
episode_steps = 1000
buffer_size = 10000
batch_size = 16

# pybullet setup
env = gym.make('HalfCheetahBulletEnv-v0')
env.render(mode = 'human')
env._max_episode_steps = episode_steps





In [12]:
writer = tf.summary.create_file_writer(log_dir)
writer_reward = tf.summary.create_file_writer(reward_dir)

#get spaces
state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]
print(state_space,action_space)

26 6


In [13]:
state = env.reset()
sac = SAC(action_space=action_space,
          state_space=state_space,
          capacity = buffer_size,
          batch = batch_size,
          tau = 0.999,
          gamma = 0.99,
          actor_lr = 0.0001,
          critic_lr = 0.001,
          variance = 0.3)

#fill replay buffer
env._max_episode_steps = buffer_size
sac.replay_buffer.fill_buffer(buffer_size, state, episode_steps) # self,timesteps,state,prev_timesteps
env._max_episode_steps = episode_steps


env = gym.wrappers.Monitor(env, "baseline_training", video_callable=lambda episode: True, force="true")
state = env.reset()

for episode in range(episodes):
    sumreward = 0
    for step in range(episode_steps):
        #print(observation)
        print('t: ',step, ' :episode: ',episode)
        #print('state: ',state)
        
        # get action
        state = tf.cast(tf.reshape(state,(1,1,state_space)),dtype='float32')
        #print(state)
        tensor_action = sac.actor(state)+sac.actor.continous_noise()
        action = tensor_action[0][0]
        #print('action: ',action)
        
        #get loss
        #q_loss = sac.critic(state,tensor_action)
        
        
        # execute action
        next_state, reward, done, info = env.step(action)
        sumreward += reward

        # store transitions
        sac.store_replay(state,next_state,action,reward,done)
        
        #print('state: ',state)
        #print('next_state: ',next_state)
        #print('action: ',action)
        #print('reward: ',reward)

        #sample minibatch from data
        states,next_states,actions,rewards,not_done = sac.replay_buffer.sample()
        
        #set labels y_i
        y = sac.set_labels(states,next_states,actions,rewards)
        
        # update critic net
        q_loss = sac.critic.update(states, actions, y)

        print('q_loss: ', q_loss[0][0].numpy())
        with writer.as_default():
            tf.summary.scalar('Squared QLosses (qtarget - qval)^2', q_loss[0][0][0].numpy(),
                              step=episode * episode_steps + step + 1)
        
        #losses[episode*timesteps + t] = loss
        #losses[i_episode*timesteps+] = history.history
        
        #update actor net
        sac.actor.update(states,actions)
        #print('weight check: ',rl.actor.get_weights(),'\n')
        
        #update target nets
        sac.update_target_weights()
        
        state = next_state
        if done:
            state = env.reset()
            #rewards[episode] = sumreward
            #sac.save(base_dir+'/baseline_model')
            print("Episode {} finished after {} timesteps with reward {}".format(episode,step+1,sumreward))
            with writer_reward.as_default():
                tf.summary.scalar('Episode sum reward', sumreward,step=episode)
            break
print('done') 
sac.save(base_dir+'/baseline_model')

sim test:  10000 : 10000
step:  9999
done seeding replay buffer
t:  0  :episode:  0
q_loss:  [1.6673005]
t:  1  :episode:  0
q_loss:  [0.99453485]
t:  2  :episode:  0
q_loss:  [0.3904292]
t:  3  :episode:  0
q_loss:  [1.1167037]
t:  4  :episode:  0
q_loss:  [0.7583326]
t:  5  :episode:  0
q_loss:  [0.56657004]
t:  6  :episode:  0
q_loss:  [0.11957851]
t:  7  :episode:  0
q_loss:  [0.73382115]
t:  8  :episode:  0
q_loss:  [0.37331802]
t:  9  :episode:  0
q_loss:  [0.44329017]
t:  10  :episode:  0
q_loss:  [0.22944075]
t:  11  :episode:  0
q_loss:  [0.4245382]
t:  12  :episode:  0
q_loss:  [0.5026316]
t:  13  :episode:  0
q_loss:  [0.2762586]
t:  14  :episode:  0
q_loss:  [0.6947439]
t:  15  :episode:  0
q_loss:  [0.24559486]
t:  16  :episode:  0
q_loss:  [0.45345622]
t:  17  :episode:  0
q_loss:  [0.55860496]
t:  18  :episode:  0
q_loss:  [0.4149734]
t:  19  :episode:  0
q_loss:  [0.36037403]
t:  20  :episode:  0
q_loss:  [0.43833846]
t:  21  :episode:  0
q_loss:  [0.32847083]
t:  22  :

t:  190  :episode:  0
q_loss:  [0.10827016]
t:  191  :episode:  0
q_loss:  [0.4491059]
t:  192  :episode:  0
q_loss:  [0.21075132]
t:  193  :episode:  0
q_loss:  [0.27546114]
t:  194  :episode:  0
q_loss:  [0.33728853]
t:  195  :episode:  0
q_loss:  [0.5383558]
t:  196  :episode:  0
q_loss:  [0.44953692]
t:  197  :episode:  0
q_loss:  [0.43449247]
t:  198  :episode:  0
q_loss:  [0.2692543]
t:  199  :episode:  0
q_loss:  [0.38534868]
t:  200  :episode:  0
q_loss:  [0.38211733]
t:  201  :episode:  0
q_loss:  [0.25014293]
t:  202  :episode:  0
q_loss:  [0.14787029]
t:  203  :episode:  0
q_loss:  [0.25517994]
t:  204  :episode:  0
q_loss:  [0.36086458]
t:  205  :episode:  0
q_loss:  [0.31397307]
t:  206  :episode:  0
q_loss:  [0.67410994]
t:  207  :episode:  0
q_loss:  [0.21997803]
t:  208  :episode:  0
q_loss:  [0.573527]
t:  209  :episode:  0
q_loss:  [0.15335113]
t:  210  :episode:  0
q_loss:  [0.32916573]
t:  211  :episode:  0
q_loss:  [0.11907011]
t:  212  :episode:  0
q_loss:  [0.550

q_loss:  [0.3482115]
t:  378  :episode:  0
q_loss:  [0.3492006]
t:  379  :episode:  0
q_loss:  [0.44797805]
t:  380  :episode:  0
q_loss:  [0.36175147]
t:  381  :episode:  0
q_loss:  [0.11595603]
t:  382  :episode:  0
q_loss:  [0.19441657]
t:  383  :episode:  0
q_loss:  [0.14934915]
t:  384  :episode:  0
q_loss:  [0.22876292]
t:  385  :episode:  0
q_loss:  [0.58408237]
t:  386  :episode:  0
q_loss:  [0.1986756]
t:  387  :episode:  0
q_loss:  [0.22035907]
t:  388  :episode:  0
q_loss:  [0.5010184]
t:  389  :episode:  0
q_loss:  [0.53008395]
t:  390  :episode:  0
q_loss:  [0.66233754]
t:  391  :episode:  0
q_loss:  [0.7222476]
t:  392  :episode:  0
q_loss:  [0.18676925]
t:  393  :episode:  0
q_loss:  [0.13378447]
t:  394  :episode:  0
q_loss:  [0.39999747]
t:  395  :episode:  0
q_loss:  [0.43770325]
t:  396  :episode:  0
q_loss:  [0.17864856]
t:  397  :episode:  0
q_loss:  [0.82682514]
t:  398  :episode:  0
q_loss:  [0.25096035]
t:  399  :episode:  0
q_loss:  [0.28299823]
t:  400  :episo

t:  566  :episode:  0
q_loss:  [0.34399527]
t:  567  :episode:  0
q_loss:  [0.14945099]
t:  568  :episode:  0
q_loss:  [0.11456781]
t:  569  :episode:  0
q_loss:  [0.52204216]
t:  570  :episode:  0
q_loss:  [0.46909934]
t:  571  :episode:  0
q_loss:  [0.41130814]
t:  572  :episode:  0
q_loss:  [0.23905984]
t:  573  :episode:  0
q_loss:  [0.36889422]
t:  574  :episode:  0
q_loss:  [0.43285635]
t:  575  :episode:  0
q_loss:  [0.07125936]
t:  576  :episode:  0
q_loss:  [0.25652653]
t:  577  :episode:  0
q_loss:  [0.2787871]
t:  578  :episode:  0
q_loss:  [0.16910309]
t:  579  :episode:  0
q_loss:  [0.20832613]
t:  580  :episode:  0
q_loss:  [0.07891693]
t:  581  :episode:  0
q_loss:  [0.08362661]
t:  582  :episode:  0
q_loss:  [0.13297752]
t:  583  :episode:  0
q_loss:  [0.1784291]
t:  584  :episode:  0
q_loss:  [0.2859247]
t:  585  :episode:  0
q_loss:  [0.54208]
t:  586  :episode:  0
q_loss:  [0.55445486]
t:  587  :episode:  0
q_loss:  [0.3656098]
t:  588  :episode:  0
q_loss:  [0.44416

q_loss:  [0.2783656]
t:  755  :episode:  0
q_loss:  [0.26083264]
t:  756  :episode:  0
q_loss:  [0.4933945]
t:  757  :episode:  0
q_loss:  [0.17241612]
t:  758  :episode:  0
q_loss:  [0.31734514]
t:  759  :episode:  0
q_loss:  [0.12068719]
t:  760  :episode:  0
q_loss:  [0.09529025]
t:  761  :episode:  0
q_loss:  [0.37531328]
t:  762  :episode:  0
q_loss:  [0.19405104]
t:  763  :episode:  0
q_loss:  [0.8556369]
t:  764  :episode:  0
q_loss:  [0.23385231]
t:  765  :episode:  0
q_loss:  [0.6498941]
t:  766  :episode:  0
q_loss:  [0.20667768]
t:  767  :episode:  0
q_loss:  [0.31087857]
t:  768  :episode:  0
q_loss:  [0.42010856]
t:  769  :episode:  0
q_loss:  [0.26929468]
t:  770  :episode:  0
q_loss:  [0.9569495]
t:  771  :episode:  0
q_loss:  [0.44066507]
t:  772  :episode:  0
q_loss:  [0.447546]
t:  773  :episode:  0
q_loss:  [0.66086304]
t:  774  :episode:  0
q_loss:  [0.24704653]
t:  775  :episode:  0
q_loss:  [0.24218136]
t:  776  :episode:  0
q_loss:  [0.4513635]
t:  777  :episode:

t:  943  :episode:  0
q_loss:  [0.985991]
t:  944  :episode:  0
q_loss:  [0.15569048]
t:  945  :episode:  0
q_loss:  [0.27025747]
t:  946  :episode:  0
q_loss:  [0.20685616]
t:  947  :episode:  0
q_loss:  [0.3228577]
t:  948  :episode:  0
q_loss:  [0.3228972]
t:  949  :episode:  0
q_loss:  [0.6637796]
t:  950  :episode:  0
q_loss:  [0.5454177]
t:  951  :episode:  0
q_loss:  [0.42882365]
t:  952  :episode:  0
q_loss:  [0.10533044]
t:  953  :episode:  0
q_loss:  [0.41614065]
t:  954  :episode:  0
q_loss:  [0.35339487]
t:  955  :episode:  0
q_loss:  [0.2417004]
t:  956  :episode:  0
q_loss:  [0.34362125]
t:  957  :episode:  0
q_loss:  [0.51961005]
t:  958  :episode:  0
q_loss:  [0.1848276]
t:  959  :episode:  0
q_loss:  [0.4304642]
t:  960  :episode:  0
q_loss:  [0.22080562]
t:  961  :episode:  0
q_loss:  [0.15128137]
t:  962  :episode:  0
q_loss:  [0.20165244]
t:  963  :episode:  0
q_loss:  [0.09778753]
t:  964  :episode:  0
q_loss:  [0.36964193]
t:  965  :episode:  0
q_loss:  [0.3788714

q_loss:  [0.13579398]
t:  133  :episode:  1
q_loss:  [0.60538244]
t:  134  :episode:  1
q_loss:  [0.51041114]
t:  135  :episode:  1
q_loss:  [0.26611742]
t:  136  :episode:  1
q_loss:  [0.3463731]
t:  137  :episode:  1
q_loss:  [0.18610097]
t:  138  :episode:  1
q_loss:  [1.1074718]
t:  139  :episode:  1
q_loss:  [0.61795425]
t:  140  :episode:  1
q_loss:  [0.46850124]
t:  141  :episode:  1
q_loss:  [0.13682176]
t:  142  :episode:  1
q_loss:  [0.18108419]
t:  143  :episode:  1
q_loss:  [0.24007142]
t:  144  :episode:  1
q_loss:  [0.17772272]
t:  145  :episode:  1
q_loss:  [0.20282932]
t:  146  :episode:  1
q_loss:  [0.16051169]
t:  147  :episode:  1
q_loss:  [0.8172749]
t:  148  :episode:  1
q_loss:  [0.07862176]
t:  149  :episode:  1
q_loss:  [0.52075195]
t:  150  :episode:  1
q_loss:  [0.3820443]
t:  151  :episode:  1
q_loss:  [0.4118479]
t:  152  :episode:  1
q_loss:  [0.13151468]
t:  153  :episode:  1
q_loss:  [0.61805016]
t:  154  :episode:  1
q_loss:  [0.5197213]
t:  155  :episod

t:  321  :episode:  1
q_loss:  [0.22546533]
t:  322  :episode:  1
q_loss:  [0.07973889]
t:  323  :episode:  1
q_loss:  [0.61611754]
t:  324  :episode:  1
q_loss:  [0.07907166]
t:  325  :episode:  1
q_loss:  [0.08194745]
t:  326  :episode:  1
q_loss:  [0.36794034]
t:  327  :episode:  1
q_loss:  [0.23961729]
t:  328  :episode:  1
q_loss:  [0.3833275]
t:  329  :episode:  1
q_loss:  [0.6760115]
t:  330  :episode:  1
q_loss:  [0.27335662]
t:  331  :episode:  1
q_loss:  [0.41202325]
t:  332  :episode:  1
q_loss:  [0.2569252]
t:  333  :episode:  1
q_loss:  [0.14954343]
t:  334  :episode:  1
q_loss:  [0.9347825]
t:  335  :episode:  1
q_loss:  [0.55682194]
t:  336  :episode:  1
q_loss:  [0.4666738]
t:  337  :episode:  1
q_loss:  [0.34631392]
t:  338  :episode:  1
q_loss:  [0.38662428]
t:  339  :episode:  1
q_loss:  [0.45384267]
t:  340  :episode:  1
q_loss:  [0.7072009]
t:  341  :episode:  1
q_loss:  [0.2177468]
t:  342  :episode:  1
q_loss:  [0.27221435]
t:  343  :episode:  1
q_loss:  [0.71843

t:  511  :episode:  1
q_loss:  [0.25367188]
t:  512  :episode:  1
q_loss:  [0.39077935]
t:  513  :episode:  1
q_loss:  [0.31286287]
t:  514  :episode:  1
q_loss:  [0.81115115]
t:  515  :episode:  1
q_loss:  [0.10181616]
t:  516  :episode:  1
q_loss:  [0.55512816]
t:  517  :episode:  1
q_loss:  [0.80285555]
t:  518  :episode:  1
q_loss:  [0.4079432]
t:  519  :episode:  1
q_loss:  [0.5706248]
t:  520  :episode:  1
q_loss:  [0.18004008]
t:  521  :episode:  1
q_loss:  [0.0852764]
t:  522  :episode:  1
q_loss:  [0.36275542]
t:  523  :episode:  1
q_loss:  [0.32539034]
t:  524  :episode:  1
q_loss:  [0.3497417]
t:  525  :episode:  1
q_loss:  [0.5181731]
t:  526  :episode:  1
q_loss:  [0.27144665]
t:  527  :episode:  1
q_loss:  [0.473222]
t:  528  :episode:  1
q_loss:  [0.09718169]
t:  529  :episode:  1
q_loss:  [0.1721015]
t:  530  :episode:  1
q_loss:  [0.42503732]
t:  531  :episode:  1
q_loss:  [0.70542085]
t:  532  :episode:  1
q_loss:  [0.14951164]
t:  533  :episode:  1
q_loss:  [0.537266

q_loss:  [0.33828676]
t:  700  :episode:  1
q_loss:  [0.76869154]
t:  701  :episode:  1
q_loss:  [0.74844706]
t:  702  :episode:  1
q_loss:  [0.26945886]
t:  703  :episode:  1
q_loss:  [0.6340186]
t:  704  :episode:  1
q_loss:  [0.35649568]
t:  705  :episode:  1
q_loss:  [0.19860683]
t:  706  :episode:  1
q_loss:  [0.16239345]
t:  707  :episode:  1
q_loss:  [0.34829617]
t:  708  :episode:  1
q_loss:  [0.26312923]
t:  709  :episode:  1
q_loss:  [0.32471576]
t:  710  :episode:  1
q_loss:  [0.4265706]
t:  711  :episode:  1
q_loss:  [0.93304473]
t:  712  :episode:  1
q_loss:  [0.2810093]
t:  713  :episode:  1
q_loss:  [0.3364042]
t:  714  :episode:  1
q_loss:  [0.91521776]
t:  715  :episode:  1
q_loss:  [0.3881004]
t:  716  :episode:  1
q_loss:  [0.07619562]
t:  717  :episode:  1
q_loss:  [0.47402877]
t:  718  :episode:  1
q_loss:  [0.4159404]
t:  719  :episode:  1
q_loss:  [1.0440333]
t:  720  :episode:  1
q_loss:  [0.41988936]
t:  721  :episode:  1
q_loss:  [0.20240188]
t:  722  :episode

t:  888  :episode:  1
q_loss:  [0.25549787]
t:  889  :episode:  1
q_loss:  [0.1971623]
t:  890  :episode:  1
q_loss:  [0.45550364]
t:  891  :episode:  1
q_loss:  [1.7693313]
t:  892  :episode:  1
q_loss:  [0.36417282]
t:  893  :episode:  1
q_loss:  [1.1021805]
t:  894  :episode:  1
q_loss:  [1.1572198]
t:  895  :episode:  1
q_loss:  [0.92263556]
t:  896  :episode:  1
q_loss:  [0.04970239]
t:  897  :episode:  1
q_loss:  [0.7968219]
t:  898  :episode:  1
q_loss:  [0.16236791]
t:  899  :episode:  1
q_loss:  [0.17300555]
t:  900  :episode:  1
q_loss:  [0.20632024]
t:  901  :episode:  1
q_loss:  [0.32507306]
t:  902  :episode:  1
q_loss:  [1.9933193]
t:  903  :episode:  1
q_loss:  [0.22402443]
t:  904  :episode:  1
q_loss:  [0.406246]
t:  905  :episode:  1
q_loss:  [1.2910872]
t:  906  :episode:  1
q_loss:  [0.34480605]
t:  907  :episode:  1
q_loss:  [1.233081]
t:  908  :episode:  1
q_loss:  [0.32900137]
t:  909  :episode:  1
q_loss:  [0.16854987]
t:  910  :episode:  1
q_loss:  [0.46953112]

UnknownError: Failed to rename: ./baseline_model_temp_92295b6cabce4921b49e049ecac00827/part-00001-of-00002.data-00000-of-00001 to: ./baseline_model.data-00001-of-00002 : Access is denied.
; Input/output error [Op:MergeV2Checkpoints]

 https://datascience.stackexchange.com/questions/13216/intuitive-explanation-of-noise-contrastive-estimation-nce-loss(InfoNCE Loss )
<br>
Representation Learning with Contrastive Predictive Coding
<br>
https://github.com/gdao-research/cpc/blob/master/cpc/data_handler.py (CPC)
<br>
https://github.com/davidtellez/contrastive-predictive-coding/blob/master/train_model.py (CPC)
<br>
https://github.com/MishaLaskin/curl/blob/23b0880708c29b078b0a25e62ff31fb587587b18/utils.py#L123 (replay buffer and SAC)
<br>
https://github.com/marload/DeepRL-TensorFlow2/blob/master/A2C/A2C_Discrete.py (A2C)
<br>
https://github.com/germain-hug/Deep-RL-Keras/blob/master/A3C/a3c.py (A3C)
<br>
https://github.com/tensorflow/agents/blob/v0.5.0/tf_agents/agents/sac/sac_agent.py (SAC)