In [1]:
# if colab

# !pip install pybullet
# !pip install gym
# !apt-get install python-opengl -y
# !apt install xvfb -y
# !pip install gym pyvirtualdisplay > /dev/null 2>&1
# !pip install -q git+https://github.com/tensorflow/examples.git

In [2]:
import os
import glob
import cv2
import tensorflow as tf 
from tensorflow.keras import layers, models
import numpy as np 
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
import pybullet_envs
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

In [3]:
seed = 654765645
np.random.seed(seed)
tf.random.set_seed(seed)

# check if GPU
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# colab

# from google.colab import drive
# drive.mount('/content/drive')

# root_dir = "drive/My Drive/"
# base_dir = root_dir + 'CPCtesting'
# os.makedirs(base_dir,exist_ok=True)

# train_dir = base_dir + '/train'
# os.makedirs(train_dir,exist_ok=True)

# model_dir = base_dir + '/model'
# os.makedirs(model_dir,exist_ok=True)

# if local machine
base_dir = os.getcwd()

train_dir = os.path.join(base_dir , 'train')
os.makedirs(train_dir,exist_ok=True)

model_dir = os.path.join(base_dir , 'model')
os.makedirs(model_dir,exist_ok=True)

# logs_base_dir = os.path.join(base_dir , 'logs')

log_dir = os.path.join(base_dir , 'training_logs_save')
reward_dir = os.path.join(base_dir , 'training_rewards_save')

#remove old logs
fileList1 = glob.glob(os.path.join(log_dir , "events.*"))
fileList2 = glob.glob(os.path.join(reward_dir , "events.*"))

for filePath in fileList1:
    try:
        os.remove(filePath)
    except:
        print("Error while deleting file : ", filePath)
        
for filePath in fileList2:
    try:
        os.remove(filePath)
    except:
        print("Error while deleting file : ", filePath)


# tensorboard directories
# %load_ext tensorboard
os.makedirs(log_dir, exist_ok=True)
os.makedirs(reward_dir,exist_ok=True)
# %tensorboard --logdir {logs_base_dir}

In [5]:
# get data
# (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

In [6]:
class CPCModel(tf.keras.Model):
    def __init__(self,code_size, predict_terms, terms=4, units=256, image_size=64, channels=3):
        super(CPCModel, self).__init__()
        self.code_size = code_size
        self.predict_terms = predict_terms
        self.terms = terms
        self.units = units
        self.image_size = image_size
        self.channels = channels

        self.conv1 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=2, activation='linear')
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.lrelu1 = tf.keras.layers.LeakyReLU()
        self.conv2 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=2, activation='linear')
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.lrelu2 = tf.keras.layers.LeakyReLU()
        self.conv3 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=2, activation='linear')
        self.bn3 = tf.keras.layers.BatchNormalization()
        self.lrelu3 = tf.keras.layers.LeakyReLU()
        self.conv4 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=2, activation='linear')
        self.bn4 = tf.keras.layers.BatchNormalization()
        self.lrelu4 = tf.keras.layers.LeakyReLU()
        self.flatten = tf.keras.layers.Flatten()
        self.dense5 = tf.keras.layers.Dense(units=256, activation='linear')
        self.bn5 = tf.keras.layers.BatchNormalization()
        self.lrelu5 = tf.keras.layers.LeakyReLU()
        self.dense6 = tf.keras.layers.Dense(units=code_size, activation='linear', name='encoder_embedding')

        self.gru = tf.keras.layers.GRU(units, return_sequences=False, name='ar_context')
        self.linear = tf.keras.layers.Dense(predict_terms*code_size, activation='linear')    
   
    def encoding(self,x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.lrelu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.lrelu2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.lrelu3(x)
        x = self.conv4(x)
        x = self.bn4(x)
        x = self.lrelu4(x)
        x = self.flatten(x)
        x = self.dense5(x)
        x = self.bn5(x)
        x = self.lrelu5(x)
        z = self.dense6(x)
        return z
  
    def get_context(self, x):
        z = self.encoding(x)
        z = tf.reshape(z, [-1, self.terms, self.code_size])
        c = self.gru(z)
        return c
    def get_prediction(self, x):
        c = self.get_context(x)
        z_hats = self.linear(c)
        z_hat = tf.reshape(z_hats, [-1, self.predict_terms, self.code_size])
        return z_hat

    def optimizer(self):
        pass

    def loss(self,weights,biases,labels,inputs,num_samples,num_classes): 
        loss = tf.nn.nce_loss(
        weights, biases, labels, inputs, num_sampled, num_classes, num_true=1,
        sampled_values=None, remove_accidental_hits=False, name='nce_loss')
        return loss
  
    def call(self,inputs):
        x_tm, x_tp = inputs
        x_tm = tf.reshape(x_tm, [-1, self.image_size, self.image_size, self.channels])
        x_tp = tf.reshape(x_tp, [-1, self.image_size, self.image_size, self.channels])
        z_hat = self.get_prediction(x_tm)
        z_tp = self.encoding(x_tp)
        z_tp = tf.reshape(z_tp, [-1, self.predict_terms, self.code_size])
        dot_prods = tf.reduce_mean(tf.reduce_mean(z_hat*z_tp, axis=-1), axis=-1, keepdims=True)
        probs = tf.sigmoid(dot_prods)
        return probs


  # def save(self):
  #       f1 = os.path.join(folder,'target_actor')
  #       f2 = os.path.join(folder, 'target_critic')
  #       f3 = os.path.join(folder, 'actor')
  #       f4 = os.path.join(folder, 'critic')
  #       self.target_actor.save(f1)
  #       self.target_critic.save(f2)
  #       self.actor.save(f3)
  #       self.critic.save(f4)


  # def load(self):
  #   pass

In [7]:
class ReplayBuffer():
    def __init__(self,state_space,action_space,capacity,batch):
        self.capacity = capacity
        self.batch = batch
        self.elements = 0
        
        self.avaliable_batch = 0
        self.idx = 0
        self.entries = 0 
        
        self.states = np.empty((self.capacity,state_space),dtype = np.float32)
        self.next_states = np.empty((self.capacity,state_space),dtype = np.float32)
        self.actions = np.empty((self.capacity,action_space),dtype = np.float32)
        self.rewards = np.empty((self.capacity,1),dtype = np.float32)
        self.not_dones = np.empty((self.capacity, 1), dtype=np.float32)
        
    def add(self,state,next_state,action,reward,done):
        np.copyto(self.states[self.idx], state)
        np.copyto(self.actions[self.idx], action)
        np.copyto(self.rewards[self.idx], reward)
        np.copyto(self.next_states[self.idx], next_state)
        np.copyto(self.not_dones[self.idx], not done)
        #self.avaliable_batch= (self.avaliable_batch + 1) if self.avaliable_batch < self.batch else self.batch
        #self.entries = (self.entries + 1) if self.entries < self.capacity else self.capacity
        self.idx = (self.idx + 1) % self.capacity
        self.entries = np.minimum(self.entries + 1, self.capacity)
        
    def sample(self):
        num = self.entries
        if(num > self.batch):
            num = self.batch
        #print('avaliable_batch: ',self.avaliable_batch, "entries: ", self.entries,'capacity: ', self.capacity)
        idx = np.random.choice(self.entries,size = num,replace=False)
        #print('test idx: ', idx)
        
        states = tf.convert_to_tensor(self.states[idx])
        next_states = tf.convert_to_tensor(self.next_states[idx])
        actions = tf.convert_to_tensor(self.actions[idx])
        rewards = tf.convert_to_tensor(self.rewards[idx])
        not_dones = tf.convert_to_tensor(self.not_dones[idx])
        
        return states,next_states,actions,rewards,not_dones            
            
        

class Actor(tf.keras.Model):
    def __init__(self,action_space):
        super(Actor,self).__init__()
        
        #params
        self.action_space = action_space
       
        #model
        self.dense1 = tf.keras.layers.Dense(400,
                                            #input_shape = (None,1,1,state_space),
                                            activation = 'relu',
                                            #bias_initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003),
                                            bias_initializer = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                                                                     mode='fan_in', 
                                                                                                     distribution='uniform', 
                                                                                                     seed=seed
                                                                                                    ),
                                            kernel_initializer = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                                                                       mode='fan_in', 
                                                                                                       distribution='uniform', 
                                                                                                       seed=seed)
                                           )
        self.dense2 = tf.keras.layers.Dense(300,
                                            activation='relu',
                                            #bias_initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003),
                                            bias_initializer = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                                                                       mode='fan_in', 
                                                                                                       distribution='uniform', 
                                                                                                       seed=seed),
                                            kernel_initializer = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                                                                       mode='fan_in', 
                                                                                                       distribution='uniform', 
                                                                                                       seed=seed)
                                            )
        self.dense3 = tf.keras.layers.Dense(self.action_space,
                                            bias_initializer = tf.random_uniform_initializer(minval=-0.003, 
                                                                                             maxval=0.003,
                                                                                             seed = seed
                                                                                            ),
                                            kernel_initializer = tf.random_uniform_initializer(minval=-0.003, 
                                                                                               maxval=0.003,
                                                                                               seed = seed
                                                                                              )
                                           )
        
    def call(self,x):
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        return x

class Critic(tf.keras.Model):
    def __init__(self):
        super(Critic,self).__init__()
        # layers
        self.dense1 = tf.keras.layers.Dense(400,
                                            #input_shape=(1,1,combined_space),
                                            activation = 'relu',
                                            #bias_initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003),
                                            bias_initializer = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                                                                     mode='fan_in', 
                                                                                                     distribution='uniform', 
                                                                                                     seed=seed
                                                                                                    ),
                                            kernel_initializer = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                                                                       mode='fan_in', 
                                                                                                       distribution='uniform', 
                                                                                                       seed=seed),
                                            kernel_regularizer=tf.keras.regularizers.l2(0.01)
                                           )
        self.concat1 = tf.keras.layers.Concatenate(axis=-1)
        self.dense2 = tf.keras.layers.Dense(300,
                                            activation='relu',
                                            #bias_initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003),
                                            bias_initializer = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                                                                     mode='fan_in', 
                                                                                                     distribution='uniform', 
                                                                                                     seed=seed
                                                                                                    ),
                                            kernel_initializer = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                                                                       mode='fan_in', 
                                                                                                       distribution='uniform', 
                                                                                                       seed=seed
                                                                                                      ),
                                            kernel_regularizer=tf.keras.regularizers.l2(0.01)
                                            )
        self.dense3 = tf.keras.layers.Dense(1,
                                            bias_initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003),
                                            kernel_initializer = tf.random_uniform_initializer(minval=-0.003, 
                                                                                               maxval=0.003,
                                                                                               seed = seed
                                                                                              ),
                                            kernel_regularizer=tf.keras.regularizers.l2(0.01)
                                            ) 
    #predict
    def call(self,data): #states,actions):
        [states,actions] = data
        #x = tf.concat([states,actions],-1)
        y = self.dense1(states)
        x = self.concat1([y,actions])
        x = self.dense2(x)
        x = self.dense3(x)
        return x
    
    
class SAC():
    def __init__(self,
                 state_space,
                 action_space,
                 capacity = 1000,
                 batch = 1, 
                 tau=0.999,
                 gamma=0.99,
                 actor_lr = 0.001, 
                 critic_lr = 0.0001,
                 variance = 1.0):
        super(SAC,self).__init__()
        
        #hyperparameters
        self.batch = batch
        self.tau = tau
        self.gamma = gamma
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.noise_flag = 1.0
        self.std = np.sqrt(variance)
        
        
        #spaces
        self.action_space = action_space
        self.state_space = state_space
        self.combined_space = self.action_space + self.state_space
        
        # replay buffer
        self.replay_buffer = ReplayBuffer(self.state_space,self.action_space,capacity,self.batch)
        
        # optimizers
        self.opt_actor = tf.keras.optimizers.Adam(actor_lr)
        self.opt_critic = tf.keras.optimizers.Adam(critic_lr)
        
        #losses
        self.loss_actor = self.loss_actor_func
        self.loss_critic = tf.keras.losses.MSE
        
        # models
        self.critic = Critic()        
        self.actor = Actor(self.action_space)
        #self.critic.compile(optimizer = self.opt_critic,loss = self.loss_critic)
        #self.actor.compile(optimizer = self.opt_actor,loss = self.loss_actor)
        
        
        #print('model: ',self.critic.summary())
        # target models
        self.target_actor = Actor(self.action_space)
        self.target_critic = Critic()         
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        
        #cpc
        #self.cpc = CPC(code_size=128, predict_terms=4, terms=4, units=256, image_size=64, channels=3)
    
    def loss_actor_func(self,states,actions):
        actions = self.actor(states)
        #stateactions = tf.concat([states,actions],-1)
        #print("state,action shape: ",states.shape,actions.shape)
        Q = self.critic([states,actions])
        loss = - tf.reduce_mean(Q)
        return loss
        
    def update_actor(self,states,actions):
        with tf.GradientTape() as tape:
            loss = self.loss_actor(states,actions)

        grad = tape.gradient(loss,self.actor.trainable_variables)
        self.opt_actor.apply_gradients(zip(grad, self.actor.trainable_variables))
        #print('actor loss: ', loss ,"\n" )
        return loss
    
    def set_noise_flag(self,num):
        self.noise_flag = np.float32(not not num)
    
    def continous_noise(self):
        #num = np.random.normal(0,self.std)
        #result = np.full((self.action_space,),num)
        result = np.random.normal(0,self.std,size=(self.action_space,))
        return self.noise_flag * np.clip(result,a_min = -1.0, a_max = 1.0)
    
    def update_critic(self,states_i,actions_i,Q_h):
        match = Q_h.shape[0]
        with tf.GradientTape() as tape:
            Q = self.critic([states_i,actions_i])
            Q = tf.reshape(Q,(1,1,1,match))
            Q_h = tf.reshape(Q_h,(1,1,1,match))
            loss = self.loss_critic(Q,Q_h)

        grad = tape.gradient(loss,self.critic.trainable_variables)
        #grad_magnitude = tf.reduce_sum(grad)
        self.opt_critic.apply_gradients(zip(grad, self.critic.trainable_variables))
        #print('critic loss: ', loss ,"\n" )
        #print("check exploding gradient: ", grad)
        return loss
    
    
    def store_replay(self,state,next_state,action,reward,done):
        self.replay_buffer.add(state,next_state,action,reward,done)
    
    def set_labels(self,states_i,next_states_i,actions_i,rewards_i,terminal_i):
        mu = self.target_actor(next_states_i)
        #print('ends: ', terminal)
        #print(mu,states)
#         stateactions = tf.concat([states,mu],1)
        Q_h = self.target_critic([next_states_i,mu])
        y = rewards_i + terminal_i*self.gamma * Q_h
        #y = np.concatenate(self.y,0).astype('float32') #.reshape((self.minibatch_size,1,1,1))
        #print('y: ',self.y)
        #y = tf.reshape(y,(self.replay_buffer.batch,1,1,1))
        return y 

    
    def update_target_weights(self):   
        tgt_critic_weight = self.target_critic.get_weights()
        tgt_actor_weight = self.target_actor.get_weights()
        actor_weight = self.actor.get_weights()
        critic_weight = self.critic.get_weights()
        
        
        for idx,(part_tgt,part_net) in enumerate(zip(tgt_actor_weight,actor_weight)):
            tgt_actor_weight[idx] = self.tau*part_tgt + (1.0-self.tau)*part_net
            
        for idx,(part_tgt,part_net) in enumerate(zip(tgt_critic_weight,critic_weight)):
            tgt_critic_weight[idx] = self.tau*part_tgt + (1.0-self.tau)*part_net
        

            
        self.target_actor.set_weights(tgt_actor_weight)
        self.target_critic.set_weights(tgt_critic_weight)
            
    def save(self,filename):
        self.actor.save_weights(filename)
        self.critic.save_weights(filename)
        self.target_actor.save_weights(filename)
        self.target_critic.save_weights(filename)
    
    def load(self,filename):
        self.actor.load_weights(filename)
        self.critic.load_weights(filename)
        self.target_actor.load_weights(filename)
        self.target_critic.load_weights(filename)
            

In [8]:
class DataHandler:
    def __init__(self, batch_size, terms, predict_terms=1, image_size=64, color=False, rescale=True, aug=True, is_training=True, method='cpc'):
        self.batch_size = batch_size
        self.terms = terms
        self.predict_terms = predict_terms
        self.image_size = image_size
        self.color = color
        self.rescale = rescale
        self.aug = aug
        self.is_training = is_training
        self.method = method
        self.lena = cv2.imread(os.path.join(base_dir,'lena.jpg'))
        (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
        if self.is_training:
            self.x = x_train/255.0
            self.y = y_train
        else:
            self.x = x_test/255.0
            self.y = y_test
        self.idxs = []
        for i in range(10):
            y = y_train if self.is_training else y_test
            self.idxs.append(np.where(y == i)[0])
        self.n_samples = len(self.y)//terms if self.method == 'cpc' else len(self.y)
        self.shape = self.x.shape
        self.n_batches = self.n_samples//batch_size

    def __iter__(self):
        return self

    def __next__(self):
        return self.cpc_batch() if self.method == 'cpc' else self.benchmark_batch()

    def __len__(self):
        return self.n_batches

    def cpc_batch(self):
        img_labels = np.zeros((self.batch_size, self.terms + self.predict_terms))
        sentence_labels = np.ones((self.batch_size, 1)).astype('int32')
        for bi in range(self.batch_size):
            seed = np.random.randint(10)
            sentence = np.arange(seed, seed + self.terms + self.predict_terms) % 10
            if bi < self.batch_size//2:
                num = np.arange(10)
                predicted = sentence[-self.predict_terms:]
                for i, p in enumerate(predicted):
                    predicted[i] = np.random.choice(num[num != p], 1)
                sentence[-self.predict_terms:] = predicted % 10
                sentence_labels[bi, :] = 0
            img_labels[bi, :] = sentence
        images = self.get_samples(img_labels).reshape((self.batch_size, self.terms+self.predict_terms, self.image_size, self.image_size, 3))
        x_images = images[:, :-self.predict_terms, ...]
        y_images = images[:, -self.predict_terms:, ...]
        idx = np.random.choice(self.batch_size, self.batch_size, replace=False)
        return [x_images[idx], y_images[idx]], sentence_labels[idx]

    def get_samples(self, img_labels):
        idx = []
        for label in img_labels.flatten():
            idx.append(np.random.choice(self.idxs[int(label)], 1)[0])
        img_batch = self.x[idx, :, :]
        if self.aug:
            img_batch = self._aug_batch(img_batch)
        return img_batch

    def _aug_batch(self, img_batch):
        if self.image_size != 28:
            resized = []
            for i in range(img_batch.shape[0]):
                resized.append(cv2.resize(img_batch[i], (self.image_size, self.image_size)))
            img_batch = np.stack(resized)
        img_batch = img_batch.reshape((img_batch.shape[0], 1, self.image_size, self.image_size))
        img_batch = np.concatenate([img_batch, img_batch, img_batch], axis=1)

        if self.color:
            img_batch[img_batch >= 0.5] = 1
            img_batch[img_batch < 0.5] = 0
            for i in range(img_batch.shape[0]):
                x_c = np.random.randint(0, self.lena.shape[0] - self.image_size)
                y_c = np.random.randint(0, self.lena.shape[1] - self.image_size)
                img = self.lena[x_c:x_c+self.image_size, y_c:y_c+self.image_size]
                img = np.array(img).transpose((2, 0, 1))/255.0
                for j in range(3):
                    img[j, :, :] = (img[j, :, :] + np.random.uniform(0, 1))/2.0
                img[img_batch[i, :, :, :] == 1] = 1 - img[img_batch[i, :, :, :] == 1]
                img_batch[i, :, :, :] = img

        if self.rescale:
            img_batch = img_batch * 2 - 1
        img_batch = img_batch.transpose((0, 2, 3, 1))
        return img_batch

    def benchmark_batch(self):
        idx = np.random.choice(len(self.x), self.batch_size, replace=False)
        img_batch = self.x[idx]
        label_batch = self.y[idx]
        if self.aug:
            img_batch = self._aug_batch(img_batch)
        label_batch = label_batch.reshape((-1, 1))
        return img_batch, label_batch

In [9]:
# #train loop
# dh_train = DataHandler(64, 4, predict_terms=4, image_size=64, color=True, rescale=True, aug=True, is_training=True, method='cpc')
# dh_test = DataHandler(64, 4, predict_terms=4, image_size=64, color=True, rescale=True, aug=True, is_training=False, method='cpc')
# accuracy_metric_train = tf.keras.metrics.BinaryAccuracy()
# loss_metric_train = tf.keras.metrics.BinaryCrossentropy()
# accuracy_metric_test = tf.keras.metrics.BinaryAccuracy()
# loss_metric_test = tf.keras.metrics.BinaryCrossentropy()
# cpc = CPCModel(code_size=128, predict_terms=4, terms=4, units=256, image_size=64, channels=3)
# optim = tf.keras.optimizers.Adam(1e-3)
# cb = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=1/3, patience=2, min_lr=1e-4),
#       tf.keras.callbacks.ModelCheckpoint('weights/weights.{epoch:02d}-{val_binary_accuracy:.2f}.cpkt',
#                                           monitor='val_binary_accuracy', save_best_only=True, save_weights_only=True),
#       tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=3),
#       tf.keras.callbacks.TensorBoard()]
# cpc.compile(optimizer=optim, loss='binary_crossentropy', metrics=['binary_accuracy'])
# cpc.fit(x=dh_train, epochs=10, validation_data=dh_test, steps_per_epoch=60000//64, validation_steps=10000//64, callbacks=cb)


In [10]:
#%tensorboard --logdir {logs_base_dir}

In [11]:
# train loop params


episodes = 200
episode_steps = 1000
buffer_size = 100000
batch_size = 16

# pybullet setup
env = gym.make('HalfCheetahBulletEnv-v0')
env.seed(seed)
#env.render() #mode = 'human')
env._max_episode_steps = episode_steps





In [12]:
writer = tf.summary.create_file_writer(log_dir)
writer_reward = tf.summary.create_file_writer(reward_dir)

#get spaces
state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]
print(state_space,action_space)

26 6


In [None]:
state = env.reset()
sac = SAC(action_space=action_space,
          state_space=state_space,
          capacity = buffer_size,
          batch = batch_size,
          tau = 0.999,
          gamma = 0.99,
          actor_lr = 0.0001,
          critic_lr = 0.001,
          variance = 0.2)

#fill replay buffer
#env._max_episode_steps = buffer_size
#sac.replay_buffer.fill_buffer(buffer_size, state, episode_steps) # self,timesteps,state,prev_timesteps
#env._max_episode_steps = episode_steps


env = gym.wrappers.Monitor(env, "baseline_training", video_callable=lambda episode: True, force="true")
state = env.reset()

for episode in range(episodes):
    sumreward = 0
    for step in range(episode_steps):
        #print(observation)
        print('t: ',step, ' :episode: ',episode)
        #print('state: ',state)
        
        # get action
        state = tf.reshape(state,(1,1,state_space)) #,dtype='float32')
        #print(state)
        tensor_noisy_action = sac.actor(state)+sac.continous_noise()
        #tensor_action = tf.clip_by_value(tensor_action, clip_value_min=-1.0, clip_value_max=1.0)

        noisy_action = tensor_noisy_action[0][0]
        #print('action: ',action)
        
        #get loss
        #q_loss = sac.critic(state,tensor_action)
        
        
        # execute action
        next_state, reward, done, info = env.step(noisy_action)
        sumreward += reward

        # store transitions
        sac.store_replay(state,next_state,noisy_action,reward,done)
        
        #print('state: ',state)
        #print('next_state: ',next_state)
        #print('action: ',action)
        #print('reward: ',reward)

        #sample minibatch from data
        states_i,next_states_i,actions_i,rewards_i,terminal_i = sac.replay_buffer.sample()
        
        #set labels y_i
        y = sac.set_labels(states_i,next_states_i,actions_i,rewards_i,terminal_i)
        #print('y: ',y)
        
        # update critic net
        q_loss = sac.update_critic(states_i, actions_i, y)

        print('q_loss: ', q_loss.numpy())
        with writer.as_default():
            tf.summary.scalar('Squared QLosses (qtarget - qval)^2', q_loss[0][0][0].numpy(),
                              step=episode * episode_steps + step + 1)
        
        #losses[episode*timesteps + t] = loss
        #losses[i_episode*timesteps+] = history.history
        
        #update actor net
        sac.update_actor(states_i, sac.actor(states_i)) #actions)
        #print('weight check: ',rl.actor.get_weights(),'\n')
        
        #update target nets
        sac.update_target_weights()
        
        state = next_state
        if done:
            state = env.reset()
            #rewards[episode] = sumreward
            #sac.save(base_dir+'/baseline_model')
            print("Episode {} finished after {} timesteps with average reward {}".format(episode,step+1,sumreward))
            with writer_reward.as_default():
                tf.summary.scalar('Episode sum reward', sumreward,step=episode)
            break
print('done') 
sac.save(base_dir+'/baseline_model')

t:  0  :episode:  0
q_loss:  [[[0.50974935]]]
t:  1  :episode:  0
q_loss:  [[[0.3275066]]]
t:  2  :episode:  0
q_loss:  [[[0.19963235]]]
t:  3  :episode:  0
q_loss:  [[[0.25715858]]]
t:  4  :episode:  0
q_loss:  [[[0.188266]]]
t:  5  :episode:  0
q_loss:  [[[0.12147459]]]
t:  6  :episode:  0
q_loss:  [[[0.16471402]]]
t:  7  :episode:  0
q_loss:  [[[0.1738387]]]
t:  8  :episode:  0
q_loss:  [[[0.17829093]]]
t:  9  :episode:  0
q_loss:  [[[0.171621]]]
t:  10  :episode:  0
q_loss:  [[[0.15258278]]]
t:  11  :episode:  0
q_loss:  [[[0.12929249]]]
t:  12  :episode:  0
q_loss:  [[[0.10955252]]]
t:  13  :episode:  0
q_loss:  [[[0.09504943]]]
t:  14  :episode:  0
q_loss:  [[[0.08506015]]]
t:  15  :episode:  0
q_loss:  [[[0.10853047]]]
t:  16  :episode:  0
q_loss:  [[[0.10576446]]]
t:  17  :episode:  0
q_loss:  [[[0.540135]]]
t:  18  :episode:  0
q_loss:  [[[0.6476035]]]
t:  19  :episode:  0
q_loss:  [[[0.9277204]]]
t:  20  :episode:  0
q_loss:  [[[0.9831829]]]
t:  21  :episode:  0
q_loss:  [[[0

t:  176  :episode:  0
q_loss:  [[[0.09260025]]]
t:  177  :episode:  0
q_loss:  [[[0.05569023]]]
t:  178  :episode:  0
q_loss:  [[[0.04756865]]]
t:  179  :episode:  0
q_loss:  [[[0.06911379]]]
t:  180  :episode:  0
q_loss:  [[[0.04098647]]]
t:  181  :episode:  0
q_loss:  [[[0.04443201]]]
t:  182  :episode:  0
q_loss:  [[[0.04779094]]]
t:  183  :episode:  0
q_loss:  [[[0.04019135]]]
t:  184  :episode:  0
q_loss:  [[[0.06879655]]]
t:  185  :episode:  0
q_loss:  [[[0.04045025]]]
t:  186  :episode:  0
q_loss:  [[[0.07505585]]]
t:  187  :episode:  0
q_loss:  [[[0.06871314]]]
t:  188  :episode:  0
q_loss:  [[[0.07885274]]]
t:  189  :episode:  0
q_loss:  [[[0.0464015]]]
t:  190  :episode:  0
q_loss:  [[[0.02754547]]]
t:  191  :episode:  0
q_loss:  [[[0.02725924]]]
t:  192  :episode:  0
q_loss:  [[[0.02077844]]]
t:  193  :episode:  0
q_loss:  [[[0.04029897]]]
t:  194  :episode:  0
q_loss:  [[[0.06034178]]]
t:  195  :episode:  0
q_loss:  [[[0.05176857]]]
t:  196  :episode:  0
q_loss:  [[[0.03773

t:  348  :episode:  0
q_loss:  [[[0.043363]]]
t:  349  :episode:  0
q_loss:  [[[0.48233765]]]
t:  350  :episode:  0
q_loss:  [[[0.05089519]]]
t:  351  :episode:  0
q_loss:  [[[0.03277501]]]
t:  352  :episode:  0
q_loss:  [[[0.05811514]]]
t:  353  :episode:  0
q_loss:  [[[0.23487635]]]
t:  354  :episode:  0
q_loss:  [[[0.22381446]]]
t:  355  :episode:  0
q_loss:  [[[0.08491471]]]
t:  356  :episode:  0
q_loss:  [[[0.08409361]]]
t:  357  :episode:  0
q_loss:  [[[0.03051794]]]
t:  358  :episode:  0
q_loss:  [[[0.3633572]]]
t:  359  :episode:  0
q_loss:  [[[0.30750087]]]
t:  360  :episode:  0
q_loss:  [[[0.12651369]]]
t:  361  :episode:  0
q_loss:  [[[0.07123142]]]
t:  362  :episode:  0
q_loss:  [[[0.07525507]]]
t:  363  :episode:  0
q_loss:  [[[0.06668624]]]
t:  364  :episode:  0
q_loss:  [[[0.10176645]]]
t:  365  :episode:  0
q_loss:  [[[0.05181476]]]
t:  366  :episode:  0
q_loss:  [[[0.03801646]]]
t:  367  :episode:  0
q_loss:  [[[0.10997147]]]
t:  368  :episode:  0
q_loss:  [[[0.1116274

t:  521  :episode:  0
q_loss:  [[[0.18319312]]]
t:  522  :episode:  0
q_loss:  [[[0.23560877]]]
t:  523  :episode:  0
q_loss:  [[[0.21619281]]]
t:  524  :episode:  0
q_loss:  [[[0.3587493]]]
t:  525  :episode:  0
q_loss:  [[[0.11424714]]]
t:  526  :episode:  0
q_loss:  [[[0.17668329]]]
t:  527  :episode:  0
q_loss:  [[[0.18550766]]]
t:  528  :episode:  0
q_loss:  [[[0.06009182]]]
t:  529  :episode:  0
q_loss:  [[[0.17302445]]]
t:  530  :episode:  0
q_loss:  [[[0.11290458]]]
t:  531  :episode:  0
q_loss:  [[[0.4919415]]]
t:  532  :episode:  0
q_loss:  [[[0.2597009]]]
t:  533  :episode:  0
q_loss:  [[[0.08205715]]]
t:  534  :episode:  0
q_loss:  [[[0.43305326]]]
t:  535  :episode:  0
q_loss:  [[[0.03937828]]]
t:  536  :episode:  0
q_loss:  [[[0.04105221]]]
t:  537  :episode:  0
q_loss:  [[[0.20462804]]]
t:  538  :episode:  0
q_loss:  [[[0.14039673]]]
t:  539  :episode:  0
q_loss:  [[[0.07407589]]]
t:  540  :episode:  0
q_loss:  [[[0.08467105]]]
t:  541  :episode:  0
q_loss:  [[[0.0724919

t:  694  :episode:  0
q_loss:  [[[0.05960058]]]
t:  695  :episode:  0
q_loss:  [[[0.12206917]]]
t:  696  :episode:  0
q_loss:  [[[0.14550862]]]
t:  697  :episode:  0
q_loss:  [[[0.07335831]]]
t:  698  :episode:  0
q_loss:  [[[0.02446941]]]
t:  699  :episode:  0
q_loss:  [[[0.24722108]]]
t:  700  :episode:  0
q_loss:  [[[0.28844023]]]
t:  701  :episode:  0
q_loss:  [[[0.08374253]]]
t:  702  :episode:  0
q_loss:  [[[0.1084073]]]
t:  703  :episode:  0
q_loss:  [[[0.02388151]]]
t:  704  :episode:  0
q_loss:  [[[0.04667604]]]
t:  705  :episode:  0
q_loss:  [[[0.05099162]]]
t:  706  :episode:  0
q_loss:  [[[0.03100695]]]
t:  707  :episode:  0
q_loss:  [[[0.09638376]]]
t:  708  :episode:  0
q_loss:  [[[0.0558794]]]
t:  709  :episode:  0
q_loss:  [[[0.02287395]]]
t:  710  :episode:  0
q_loss:  [[[0.18596636]]]
t:  711  :episode:  0
q_loss:  [[[0.07127739]]]
t:  712  :episode:  0
q_loss:  [[[0.06169031]]]
t:  713  :episode:  0
q_loss:  [[[0.38861954]]]
t:  714  :episode:  0
q_loss:  [[[0.066026

t:  867  :episode:  0
q_loss:  [[[0.03743747]]]
t:  868  :episode:  0
q_loss:  [[[0.12955117]]]
t:  869  :episode:  0
q_loss:  [[[0.15013073]]]
t:  870  :episode:  0
q_loss:  [[[0.10133869]]]
t:  871  :episode:  0
q_loss:  [[[0.23323709]]]
t:  872  :episode:  0
q_loss:  [[[0.16437393]]]
t:  873  :episode:  0
q_loss:  [[[0.05815717]]]
t:  874  :episode:  0
q_loss:  [[[0.03973888]]]
t:  875  :episode:  0
q_loss:  [[[0.15400068]]]
t:  876  :episode:  0
q_loss:  [[[0.03704668]]]
t:  877  :episode:  0
q_loss:  [[[0.11388537]]]
t:  878  :episode:  0
q_loss:  [[[0.2594646]]]
t:  879  :episode:  0
q_loss:  [[[0.04729059]]]
t:  880  :episode:  0
q_loss:  [[[0.06430289]]]
t:  881  :episode:  0
q_loss:  [[[0.08209636]]]
t:  882  :episode:  0
q_loss:  [[[0.03276595]]]
t:  883  :episode:  0
q_loss:  [[[0.2956263]]]
t:  884  :episode:  0
q_loss:  [[[0.20723958]]]
t:  885  :episode:  0
q_loss:  [[[0.1862879]]]
t:  886  :episode:  0
q_loss:  [[[0.06042961]]]
t:  887  :episode:  0
q_loss:  [[[0.3843254

t:  38  :episode:  1
q_loss:  [[[0.02512078]]]
t:  39  :episode:  1
q_loss:  [[[0.07009915]]]
t:  40  :episode:  1
q_loss:  [[[0.04332705]]]
t:  41  :episode:  1
q_loss:  [[[0.21163125]]]
t:  42  :episode:  1
q_loss:  [[[0.08332346]]]
t:  43  :episode:  1
q_loss:  [[[0.18100388]]]
t:  44  :episode:  1
q_loss:  [[[0.17856987]]]
t:  45  :episode:  1
q_loss:  [[[0.14476463]]]
t:  46  :episode:  1
q_loss:  [[[0.11172438]]]
t:  47  :episode:  1
q_loss:  [[[0.32174855]]]
t:  48  :episode:  1
q_loss:  [[[0.17865172]]]
t:  49  :episode:  1
q_loss:  [[[0.24308185]]]
t:  50  :episode:  1
q_loss:  [[[0.03761684]]]
t:  51  :episode:  1
q_loss:  [[[0.04865675]]]
t:  52  :episode:  1
q_loss:  [[[0.1972351]]]
t:  53  :episode:  1
q_loss:  [[[0.06443627]]]
t:  54  :episode:  1
q_loss:  [[[0.04742163]]]
t:  55  :episode:  1
q_loss:  [[[0.19538288]]]
t:  56  :episode:  1
q_loss:  [[[0.13004932]]]
t:  57  :episode:  1
q_loss:  [[[0.3617233]]]
t:  58  :episode:  1
q_loss:  [[[0.0980775]]]
t:  59  :episode

t:  211  :episode:  1
q_loss:  [[[0.0952142]]]
t:  212  :episode:  1
q_loss:  [[[0.04561901]]]
t:  213  :episode:  1
q_loss:  [[[0.03710119]]]
t:  214  :episode:  1
q_loss:  [[[0.29950407]]]
t:  215  :episode:  1
q_loss:  [[[0.08501016]]]
t:  216  :episode:  1
q_loss:  [[[0.1808235]]]
t:  217  :episode:  1
q_loss:  [[[0.22494511]]]
t:  218  :episode:  1
q_loss:  [[[0.06127415]]]
t:  219  :episode:  1
q_loss:  [[[0.04006342]]]
t:  220  :episode:  1
q_loss:  [[[0.3959609]]]
t:  221  :episode:  1
q_loss:  [[[0.05739001]]]
t:  222  :episode:  1
q_loss:  [[[0.1007362]]]
t:  223  :episode:  1
q_loss:  [[[0.45398647]]]
t:  224  :episode:  1
q_loss:  [[[0.04940389]]]
t:  225  :episode:  1
q_loss:  [[[0.08994856]]]
t:  226  :episode:  1
q_loss:  [[[0.17763135]]]
t:  227  :episode:  1
q_loss:  [[[0.08157285]]]
t:  228  :episode:  1
q_loss:  [[[0.04097654]]]
t:  229  :episode:  1
q_loss:  [[[0.08241757]]]
t:  230  :episode:  1
q_loss:  [[[0.03661242]]]
t:  231  :episode:  1
q_loss:  [[[0.0337669]

q_loss:  [[[0.0268717]]]
t:  384  :episode:  1
q_loss:  [[[0.3455724]]]
t:  385  :episode:  1
q_loss:  [[[0.11309518]]]
t:  386  :episode:  1
q_loss:  [[[0.05620395]]]
t:  387  :episode:  1
q_loss:  [[[0.03292773]]]
t:  388  :episode:  1
q_loss:  [[[0.0601711]]]
t:  389  :episode:  1
q_loss:  [[[0.03942811]]]
t:  390  :episode:  1
q_loss:  [[[0.4108658]]]
t:  391  :episode:  1
q_loss:  [[[0.11650716]]]
t:  392  :episode:  1
q_loss:  [[[0.05291215]]]
t:  393  :episode:  1
q_loss:  [[[0.12018526]]]
t:  394  :episode:  1
q_loss:  [[[0.2587667]]]
t:  395  :episode:  1
q_loss:  [[[0.0686664]]]
t:  396  :episode:  1
q_loss:  [[[0.07100989]]]
t:  397  :episode:  1
q_loss:  [[[0.21471481]]]
t:  398  :episode:  1
q_loss:  [[[0.04912872]]]
t:  399  :episode:  1
q_loss:  [[[0.21610203]]]
t:  400  :episode:  1
q_loss:  [[[0.15639463]]]
t:  401  :episode:  1
q_loss:  [[[0.10920077]]]
t:  402  :episode:  1
q_loss:  [[[0.03851668]]]
t:  403  :episode:  1
q_loss:  [[[0.03891402]]]
t:  404  :episode:  

t:  555  :episode:  1
q_loss:  [[[0.11426126]]]
t:  556  :episode:  1
q_loss:  [[[0.18799466]]]
t:  557  :episode:  1
q_loss:  [[[0.12450875]]]
t:  558  :episode:  1
q_loss:  [[[0.2166962]]]
t:  559  :episode:  1
q_loss:  [[[0.1443517]]]
t:  560  :episode:  1
q_loss:  [[[0.13199827]]]
t:  561  :episode:  1
q_loss:  [[[0.1226914]]]
t:  562  :episode:  1
q_loss:  [[[0.05023941]]]
t:  563  :episode:  1
q_loss:  [[[0.44316065]]]
t:  564  :episode:  1
q_loss:  [[[0.6148188]]]
t:  565  :episode:  1
q_loss:  [[[0.23211145]]]
t:  566  :episode:  1
q_loss:  [[[0.07003078]]]
t:  567  :episode:  1
q_loss:  [[[0.48599377]]]
t:  568  :episode:  1
q_loss:  [[[0.08996183]]]
t:  569  :episode:  1
q_loss:  [[[0.12973762]]]
t:  570  :episode:  1
q_loss:  [[[0.33110762]]]
t:  571  :episode:  1
q_loss:  [[[0.07006624]]]
t:  572  :episode:  1
q_loss:  [[[0.04385652]]]
t:  573  :episode:  1
q_loss:  [[[0.06524836]]]
t:  574  :episode:  1
q_loss:  [[[0.32321113]]]
t:  575  :episode:  1
q_loss:  [[[0.04747206

t:  727  :episode:  1
q_loss:  [[[0.04674771]]]
t:  728  :episode:  1
q_loss:  [[[0.18281308]]]
t:  729  :episode:  1
q_loss:  [[[0.08356816]]]
t:  730  :episode:  1
q_loss:  [[[0.0602037]]]
t:  731  :episode:  1
q_loss:  [[[0.09608081]]]
t:  732  :episode:  1
q_loss:  [[[0.49135828]]]
t:  733  :episode:  1
q_loss:  [[[0.2159569]]]
t:  734  :episode:  1
q_loss:  [[[0.10337204]]]
t:  735  :episode:  1
q_loss:  [[[0.3514685]]]
t:  736  :episode:  1
q_loss:  [[[0.09552333]]]
t:  737  :episode:  1
q_loss:  [[[0.38144463]]]
t:  738  :episode:  1
q_loss:  [[[0.04990377]]]
t:  739  :episode:  1
q_loss:  [[[0.28091693]]]
t:  740  :episode:  1
q_loss:  [[[0.11698918]]]
t:  741  :episode:  1
q_loss:  [[[0.21171483]]]
t:  742  :episode:  1
q_loss:  [[[0.19545761]]]
t:  743  :episode:  1
q_loss:  [[[0.17280069]]]
t:  744  :episode:  1
q_loss:  [[[0.1665256]]]
t:  745  :episode:  1
q_loss:  [[[0.23486681]]]
t:  746  :episode:  1
q_loss:  [[[0.13198976]]]
t:  747  :episode:  1
q_loss:  [[[0.18953606

q_loss:  [[[0.12539737]]]
t:  901  :episode:  1
q_loss:  [[[0.57048607]]]
t:  902  :episode:  1
q_loss:  [[[0.11023252]]]
t:  903  :episode:  1
q_loss:  [[[0.05804791]]]
t:  904  :episode:  1
q_loss:  [[[0.23806791]]]
t:  905  :episode:  1
q_loss:  [[[0.2401965]]]
t:  906  :episode:  1
q_loss:  [[[0.25924653]]]
t:  907  :episode:  1
q_loss:  [[[0.1405898]]]
t:  908  :episode:  1
q_loss:  [[[0.6675527]]]
t:  909  :episode:  1
q_loss:  [[[0.38460505]]]
t:  910  :episode:  1
q_loss:  [[[0.07238393]]]
t:  911  :episode:  1
q_loss:  [[[0.42530936]]]
t:  912  :episode:  1
q_loss:  [[[0.20254056]]]
t:  913  :episode:  1
q_loss:  [[[0.63212097]]]
t:  914  :episode:  1
q_loss:  [[[0.08635767]]]
t:  915  :episode:  1
q_loss:  [[[0.27937424]]]
t:  916  :episode:  1
q_loss:  [[[0.09088489]]]
t:  917  :episode:  1
q_loss:  [[[0.47804558]]]
t:  918  :episode:  1
q_loss:  [[[0.07851011]]]
t:  919  :episode:  1
q_loss:  [[[0.16594774]]]
t:  920  :episode:  1
q_loss:  [[[0.2978337]]]
t:  921  :episode:

t:  74  :episode:  2
q_loss:  [[[0.11042008]]]
t:  75  :episode:  2
q_loss:  [[[0.03550792]]]
t:  76  :episode:  2
q_loss:  [[[0.390964]]]
t:  77  :episode:  2
q_loss:  [[[0.14993948]]]
t:  78  :episode:  2
q_loss:  [[[0.37937877]]]
t:  79  :episode:  2
q_loss:  [[[0.10540842]]]
t:  80  :episode:  2
q_loss:  [[[0.25488672]]]
t:  81  :episode:  2
q_loss:  [[[0.11371197]]]
t:  82  :episode:  2
q_loss:  [[[0.10075836]]]
t:  83  :episode:  2
q_loss:  [[[0.0992604]]]
t:  84  :episode:  2
q_loss:  [[[0.085694]]]
t:  85  :episode:  2
q_loss:  [[[0.35985768]]]
t:  86  :episode:  2
q_loss:  [[[0.2611121]]]
t:  87  :episode:  2
q_loss:  [[[0.06706686]]]
t:  88  :episode:  2
q_loss:  [[[0.43323052]]]
t:  89  :episode:  2
q_loss:  [[[0.08579457]]]
t:  90  :episode:  2
q_loss:  [[[0.08923865]]]
t:  91  :episode:  2
q_loss:  [[[0.30763954]]]
t:  92  :episode:  2
q_loss:  [[[0.07969362]]]
t:  93  :episode:  2
q_loss:  [[[0.38242358]]]
t:  94  :episode:  2
q_loss:  [[[0.20661764]]]
t:  95  :episode:  

q_loss:  [[[0.7444746]]]
t:  247  :episode:  2
q_loss:  [[[0.47429264]]]
t:  248  :episode:  2
q_loss:  [[[0.4508411]]]
t:  249  :episode:  2
q_loss:  [[[0.33327097]]]
t:  250  :episode:  2
q_loss:  [[[0.19079608]]]
t:  251  :episode:  2
q_loss:  [[[0.16467443]]]
t:  252  :episode:  2
q_loss:  [[[0.122224]]]
t:  253  :episode:  2
q_loss:  [[[0.2311329]]]
t:  254  :episode:  2
q_loss:  [[[0.5279424]]]
t:  255  :episode:  2
q_loss:  [[[0.11772503]]]
t:  256  :episode:  2
q_loss:  [[[0.5849858]]]
t:  257  :episode:  2
q_loss:  [[[0.11242859]]]
t:  258  :episode:  2
q_loss:  [[[0.07780886]]]
t:  259  :episode:  2
q_loss:  [[[0.07517416]]]
t:  260  :episode:  2
q_loss:  [[[0.07095466]]]
t:  261  :episode:  2
q_loss:  [[[0.03892666]]]
t:  262  :episode:  2
q_loss:  [[[0.04265578]]]
t:  263  :episode:  2
q_loss:  [[[0.21909153]]]
t:  264  :episode:  2
q_loss:  [[[0.08278289]]]
t:  265  :episode:  2
q_loss:  [[[0.09919757]]]
t:  266  :episode:  2
q_loss:  [[[0.35041142]]]
t:  267  :episode:  2

q_loss:  [[[0.35397142]]]
t:  419  :episode:  2
q_loss:  [[[0.06073518]]]
t:  420  :episode:  2
q_loss:  [[[0.06048341]]]
t:  421  :episode:  2
q_loss:  [[[0.752331]]]
t:  422  :episode:  2
q_loss:  [[[0.6222638]]]
t:  423  :episode:  2
q_loss:  [[[0.33983907]]]
t:  424  :episode:  2
q_loss:  [[[0.08036845]]]
t:  425  :episode:  2
q_loss:  [[[0.3998016]]]
t:  426  :episode:  2
q_loss:  [[[0.06544723]]]
t:  427  :episode:  2
q_loss:  [[[0.3111324]]]
t:  428  :episode:  2
q_loss:  [[[0.15566164]]]
t:  429  :episode:  2
q_loss:  [[[0.6969388]]]
t:  430  :episode:  2
q_loss:  [[[0.05588672]]]
t:  431  :episode:  2
q_loss:  [[[0.16917022]]]
t:  432  :episode:  2
q_loss:  [[[0.28548402]]]
t:  433  :episode:  2
q_loss:  [[[0.09945652]]]
t:  434  :episode:  2
q_loss:  [[[0.08423053]]]
t:  435  :episode:  2
q_loss:  [[[0.35495228]]]
t:  436  :episode:  2
q_loss:  [[[0.30765927]]]
t:  437  :episode:  2
q_loss:  [[[0.7130375]]]
t:  438  :episode:  2
q_loss:  [[[0.24022382]]]
t:  439  :episode:  2

t:  590  :episode:  2
q_loss:  [[[0.26304576]]]
t:  591  :episode:  2
q_loss:  [[[0.23216593]]]
t:  592  :episode:  2
q_loss:  [[[0.09582324]]]
t:  593  :episode:  2
q_loss:  [[[0.08708686]]]
t:  594  :episode:  2
q_loss:  [[[0.04373055]]]
t:  595  :episode:  2
q_loss:  [[[0.06846792]]]
t:  596  :episode:  2
q_loss:  [[[0.20298773]]]
t:  597  :episode:  2
q_loss:  [[[0.721486]]]
t:  598  :episode:  2
q_loss:  [[[0.16081329]]]
t:  599  :episode:  2
q_loss:  [[[0.06250651]]]
t:  600  :episode:  2
q_loss:  [[[0.053427]]]
t:  601  :episode:  2
q_loss:  [[[0.04313203]]]
t:  602  :episode:  2
q_loss:  [[[0.22729868]]]
t:  603  :episode:  2
q_loss:  [[[0.09748521]]]
t:  604  :episode:  2
q_loss:  [[[0.08136372]]]
t:  605  :episode:  2
q_loss:  [[[0.04395698]]]
t:  606  :episode:  2
q_loss:  [[[0.11488876]]]
t:  607  :episode:  2
q_loss:  [[[0.2516767]]]
t:  608  :episode:  2
q_loss:  [[[0.1700004]]]
t:  609  :episode:  2
q_loss:  [[[0.13773675]]]
t:  610  :episode:  2
q_loss:  [[[0.04517661]]

t:  763  :episode:  2
q_loss:  [[[0.47827977]]]
t:  764  :episode:  2
q_loss:  [[[0.12190488]]]
t:  765  :episode:  2
q_loss:  [[[0.17966938]]]
t:  766  :episode:  2
q_loss:  [[[0.0721545]]]
t:  767  :episode:  2
q_loss:  [[[0.17898858]]]
t:  768  :episode:  2
q_loss:  [[[0.21171744]]]
t:  769  :episode:  2
q_loss:  [[[0.60324097]]]
t:  770  :episode:  2
q_loss:  [[[0.16566163]]]
t:  771  :episode:  2
q_loss:  [[[0.22319162]]]
t:  772  :episode:  2
q_loss:  [[[0.14418754]]]
t:  773  :episode:  2
q_loss:  [[[0.11650832]]]
t:  774  :episode:  2
q_loss:  [[[0.06495462]]]
t:  775  :episode:  2
q_loss:  [[[0.14413884]]]
t:  776  :episode:  2
q_loss:  [[[0.19355264]]]
t:  777  :episode:  2
q_loss:  [[[0.08748957]]]
t:  778  :episode:  2
q_loss:  [[[0.10942352]]]
t:  779  :episode:  2
q_loss:  [[[0.11232562]]]
t:  780  :episode:  2
q_loss:  [[[0.6474611]]]
t:  781  :episode:  2
q_loss:  [[[0.04151548]]]
t:  782  :episode:  2
q_loss:  [[[0.02723862]]]
t:  783  :episode:  2
q_loss:  [[[0.131008

q_loss:  [[[0.07864763]]]
t:  935  :episode:  2
q_loss:  [[[0.04707396]]]
t:  936  :episode:  2
q_loss:  [[[0.04386039]]]
t:  937  :episode:  2
q_loss:  [[[0.45767197]]]
t:  938  :episode:  2
q_loss:  [[[0.07302347]]]
t:  939  :episode:  2
q_loss:  [[[0.08767857]]]
t:  940  :episode:  2
q_loss:  [[[0.5885518]]]
t:  941  :episode:  2
q_loss:  [[[0.08415987]]]
t:  942  :episode:  2
q_loss:  [[[1.2202482]]]
t:  943  :episode:  2
q_loss:  [[[0.09327655]]]
t:  944  :episode:  2
q_loss:  [[[0.38900974]]]
t:  945  :episode:  2
q_loss:  [[[0.04374433]]]
t:  946  :episode:  2
q_loss:  [[[0.04745739]]]
t:  947  :episode:  2
q_loss:  [[[0.12986997]]]
t:  948  :episode:  2
q_loss:  [[[0.80354834]]]
t:  949  :episode:  2
q_loss:  [[[0.1098973]]]
t:  950  :episode:  2
q_loss:  [[[0.24130219]]]
t:  951  :episode:  2
q_loss:  [[[0.05006173]]]
t:  952  :episode:  2
q_loss:  [[[0.2443695]]]
t:  953  :episode:  2
q_loss:  [[[0.11632546]]]
t:  954  :episode:  2
q_loss:  [[[0.14717211]]]
t:  955  :episode:

t:  107  :episode:  3
q_loss:  [[[0.05774361]]]
t:  108  :episode:  3
q_loss:  [[[0.5321566]]]
t:  109  :episode:  3
q_loss:  [[[0.49737307]]]
t:  110  :episode:  3
q_loss:  [[[0.1787025]]]
t:  111  :episode:  3
q_loss:  [[[0.14265013]]]
t:  112  :episode:  3
q_loss:  [[[1.574816]]]
t:  113  :episode:  3
q_loss:  [[[0.2783447]]]
t:  114  :episode:  3
q_loss:  [[[0.61329496]]]
t:  115  :episode:  3
q_loss:  [[[0.25272638]]]
t:  116  :episode:  3
q_loss:  [[[0.3393594]]]
t:  117  :episode:  3
q_loss:  [[[0.05816036]]]
t:  118  :episode:  3
q_loss:  [[[0.07458735]]]
t:  119  :episode:  3
q_loss:  [[[0.16717958]]]
t:  120  :episode:  3
q_loss:  [[[0.10773207]]]
t:  121  :episode:  3
q_loss:  [[[0.3743507]]]
t:  122  :episode:  3
q_loss:  [[[0.9942584]]]
t:  123  :episode:  3
q_loss:  [[[0.5765407]]]
t:  124  :episode:  3
q_loss:  [[[0.4787537]]]
t:  125  :episode:  3
q_loss:  [[[0.07394114]]]
t:  126  :episode:  3
q_loss:  [[[0.21292803]]]
t:  127  :episode:  3
q_loss:  [[[0.0709858]]]
t: 

t:  280  :episode:  3
q_loss:  [[[0.15514119]]]
t:  281  :episode:  3
q_loss:  [[[0.11360639]]]
t:  282  :episode:  3
q_loss:  [[[0.10359202]]]
t:  283  :episode:  3
q_loss:  [[[0.655414]]]
t:  284  :episode:  3
q_loss:  [[[0.12316108]]]
t:  285  :episode:  3
q_loss:  [[[0.17264323]]]
t:  286  :episode:  3
q_loss:  [[[1.0194395]]]
t:  287  :episode:  3
q_loss:  [[[0.5048103]]]
t:  288  :episode:  3
q_loss:  [[[0.3758909]]]
t:  289  :episode:  3
q_loss:  [[[0.95099527]]]
t:  290  :episode:  3
q_loss:  [[[0.17417255]]]
t:  291  :episode:  3
q_loss:  [[[0.25148484]]]
t:  292  :episode:  3
q_loss:  [[[0.7066885]]]
t:  293  :episode:  3
q_loss:  [[[0.1598974]]]
t:  294  :episode:  3
q_loss:  [[[0.7458148]]]
t:  295  :episode:  3
q_loss:  [[[1.3077893]]]
t:  296  :episode:  3
q_loss:  [[[0.0643726]]]
t:  297  :episode:  3
q_loss:  [[[0.44974312]]]
t:  298  :episode:  3
q_loss:  [[[0.48786506]]]
t:  299  :episode:  3
q_loss:  [[[0.11688472]]]
t:  300  :episode:  3
q_loss:  [[[0.19244555]]]
t:

t:  454  :episode:  3
q_loss:  [[[0.12862346]]]
t:  455  :episode:  3
q_loss:  [[[0.33575335]]]
t:  456  :episode:  3
q_loss:  [[[0.26834702]]]
t:  457  :episode:  3
q_loss:  [[[0.1603898]]]
t:  458  :episode:  3
q_loss:  [[[0.13136086]]]
t:  459  :episode:  3
q_loss:  [[[0.09244531]]]
t:  460  :episode:  3
q_loss:  [[[0.06620903]]]
t:  461  :episode:  3
q_loss:  [[[0.13463481]]]
t:  462  :episode:  3
q_loss:  [[[0.22678928]]]
t:  463  :episode:  3
q_loss:  [[[0.40361074]]]
t:  464  :episode:  3
q_loss:  [[[0.227795]]]
t:  465  :episode:  3
q_loss:  [[[0.18862529]]]
t:  466  :episode:  3
q_loss:  [[[0.42365685]]]
t:  467  :episode:  3
q_loss:  [[[0.18293454]]]
t:  468  :episode:  3
q_loss:  [[[0.13159244]]]
t:  469  :episode:  3
q_loss:  [[[1.372803]]]
t:  470  :episode:  3
q_loss:  [[[0.29800406]]]
t:  471  :episode:  3
q_loss:  [[[0.05866868]]]
t:  472  :episode:  3
q_loss:  [[[0.3468298]]]
t:  473  :episode:  3
q_loss:  [[[0.19149624]]]
t:  474  :episode:  3
q_loss:  [[[0.29001224]]

q_loss:  [[[1.4333339]]]
t:  627  :episode:  3
q_loss:  [[[0.5783332]]]
t:  628  :episode:  3
q_loss:  [[[0.02120267]]]
t:  629  :episode:  3
q_loss:  [[[0.18725649]]]
t:  630  :episode:  3
q_loss:  [[[0.91316307]]]
t:  631  :episode:  3
q_loss:  [[[0.73233205]]]
t:  632  :episode:  3
q_loss:  [[[0.22048134]]]
t:  633  :episode:  3
q_loss:  [[[0.11416554]]]
t:  634  :episode:  3
q_loss:  [[[0.11587679]]]
t:  635  :episode:  3
q_loss:  [[[0.13953172]]]
t:  636  :episode:  3
q_loss:  [[[0.03671307]]]
t:  637  :episode:  3
q_loss:  [[[0.10927959]]]
t:  638  :episode:  3
q_loss:  [[[0.28242442]]]
t:  639  :episode:  3
q_loss:  [[[0.81211793]]]
t:  640  :episode:  3
q_loss:  [[[0.22386983]]]
t:  641  :episode:  3
q_loss:  [[[1.0484396]]]
t:  642  :episode:  3
q_loss:  [[[0.12991586]]]
t:  643  :episode:  3
q_loss:  [[[1.1058048]]]
t:  644  :episode:  3
q_loss:  [[[0.8706653]]]
t:  645  :episode:  3
q_loss:  [[[0.03698923]]]
t:  646  :episode:  3
q_loss:  [[[0.24824038]]]
t:  647  :episode: 

t:  800  :episode:  3
q_loss:  [[[0.20441037]]]
t:  801  :episode:  3
q_loss:  [[[0.08793637]]]
t:  802  :episode:  3
q_loss:  [[[0.12387013]]]
t:  803  :episode:  3
q_loss:  [[[0.29774043]]]
t:  804  :episode:  3
q_loss:  [[[0.10833889]]]
t:  805  :episode:  3
q_loss:  [[[1.3679495]]]
t:  806  :episode:  3
q_loss:  [[[0.13205075]]]
t:  807  :episode:  3
q_loss:  [[[0.11599671]]]
t:  808  :episode:  3
q_loss:  [[[0.45432755]]]
t:  809  :episode:  3
q_loss:  [[[0.7656045]]]
t:  810  :episode:  3
q_loss:  [[[0.04355312]]]
t:  811  :episode:  3
q_loss:  [[[0.8881607]]]
t:  812  :episode:  3
q_loss:  [[[0.4136072]]]
t:  813  :episode:  3
q_loss:  [[[1.3692715]]]
t:  814  :episode:  3
q_loss:  [[[1.3766444]]]
t:  815  :episode:  3
q_loss:  [[[0.5939263]]]
t:  816  :episode:  3
q_loss:  [[[0.30321035]]]
t:  817  :episode:  3
q_loss:  [[[0.3122474]]]
t:  818  :episode:  3
q_loss:  [[[0.08737241]]]
t:  819  :episode:  3
q_loss:  [[[0.08773649]]]
t:  820  :episode:  3
q_loss:  [[[0.6699009]]]
t

t:  974  :episode:  3
q_loss:  [[[0.44253132]]]
t:  975  :episode:  3
q_loss:  [[[1.1171341]]]
t:  976  :episode:  3
q_loss:  [[[0.05059629]]]
t:  977  :episode:  3
q_loss:  [[[0.10175278]]]
t:  978  :episode:  3
q_loss:  [[[0.1204827]]]
t:  979  :episode:  3
q_loss:  [[[5.206631]]]
t:  980  :episode:  3
q_loss:  [[[0.32577443]]]
t:  981  :episode:  3
q_loss:  [[[0.21444821]]]
t:  982  :episode:  3
q_loss:  [[[0.63110673]]]
t:  983  :episode:  3
q_loss:  [[[0.18671738]]]
t:  984  :episode:  3
q_loss:  [[[0.93158126]]]
t:  985  :episode:  3
q_loss:  [[[1.5469754]]]
t:  986  :episode:  3
q_loss:  [[[0.1213801]]]
t:  987  :episode:  3
q_loss:  [[[0.09732978]]]
t:  988  :episode:  3
q_loss:  [[[1.0488427]]]
t:  989  :episode:  3
q_loss:  [[[0.08733943]]]
t:  990  :episode:  3
q_loss:  [[[0.46939033]]]
t:  991  :episode:  3
q_loss:  [[[0.24792904]]]
t:  992  :episode:  3
q_loss:  [[[0.39386684]]]
t:  993  :episode:  3
q_loss:  [[[0.18176332]]]
t:  994  :episode:  3
q_loss:  [[[0.9607879]]]


 https://datascience.stackexchange.com/questions/13216/intuitive-explanation-of-noise-contrastive-estimation-nce-loss(InfoNCE Loss )
<br>
Representation Learning with Contrastive Predictive Coding
<br>
https://github.com/gdao-research/cpc/blob/master/cpc/data_handler.py (CPC)
<br>
https://github.com/davidtellez/contrastive-predictive-coding/blob/master/train_model.py (CPC)
<br>
https://github.com/MishaLaskin/curl/blob/23b0880708c29b078b0a25e62ff31fb587587b18/utils.py#L123 (replay buffer and SAC)
<br>
https://github.com/marload/DeepRL-TensorFlow2/blob/master/A2C/A2C_Discrete.py (A2C)
<br>
https://github.com/germain-hug/Deep-RL-Keras/blob/master/A3C/a3c.py (A3C)
<br>
https://github.com/tensorflow/agents/blob/v0.5.0/tf_agents/agents/sac/sac_agent.py (SAC)
<br>
https://github.com/cookbenjamin/DDPG/blob/master/networks/critic.py (transfer the action state merge to second layer)
<br>
https://github.com/georgesung/TD3 (check expected results)
<br>
https://github.com/georgesung/TD3/blob/master/DDPG.py (param mistake)