In [239]:
from collections import deque
import random
import numpy as np
import queue
import heapq


class ReplayBuffer(object):
    def __init__(self, env_shape, buffer_size, random_seed=123):
        """
        The right side of the deque contains the most recent experiences
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = []
        heapq.heapify(self.buffer)
        random.seed(random_seed)
        self.env_shape = env_shape
        self.c = 0
        self.alpha = 0.5

    def collect(self, s, a, r, s1, t):

        # collecting trajectory
        s = np.reshape(s, (-1, self.env_shape[0]))
        a = np.reshape(a, (-1, self.env_shape[1]))
        s1 = np.reshape(s1, (-1, self.env_shape[0]))

        entry = (s, a, r, s1, t)
        priority = 1
        self.c += 1
        # in theory break heap but we rebuild when update
        self.buffer.append((priority, self.c, entry))
        print(len(self.buffer))


    # def add(self, exp):
    #
    #     if self.count < self.buffer_size:
    #         self.buffer.append(exp)
    #         self.count += 1
    #     else:
    #         self.buffer.popleft()
    #         self.buffer.append(exp)

    def get_size(self):
        return len(self.buffer)

    def update(self, error, sample):

        values = heapq.nlargest((self.buffer_size - 100), self.buffer)

        priority = list(map(lambda x: x[0], values))
        entry = list(map(lambda x: x[2], values))

        priority.extend(error)
        priority = np.array(priority)
        priority = priority/sum(priority)

        entry.extend(sample)
        
        count = [i for i in range(len(entry))]
        if self.c < len(count):
            self.c = len(count) + 1
        else:
            self.c += 1

        self.buffer = list(zip(priority, count, entry))
        print(self.buffer)

        heapq.heapify(self.buffer)


    def get_sample(self, batch_size):

        priority = list(map(lambda x: x[0], self.buffer))
        priority = np.array(priority)
        priority = priority / sum(priority)
        entry = np.array(list(map(lambda x: x[2], self.buffer)))

        if len(entry) < batch_size:
            ix = np.random.choice(a=len(entry), size=len(entry), replace=True, p=priority)
        else:
            ix = np.random.choice(a=len(entry), size=batch_size, replace=True, p=priority)

        batch = entry[ix,:]
        print(batch)
        print(batch[:,0])
        print(batch[:,1])
        print(batch[:,2])
        print(batch[:,3])
        print(batch[:,4])
        s1_batch = np.vstack(batch[:,0])#np.vstack(list(map(lambda x: x[0], batch)))
        a_batch = np.vstack(batch[:,1])#np.vstack(np.array(list(map(lambda x: x[1], batch))))
        r_batch = np.vstack(batch[:,2]) #np.vstack(np.array(list(map(lambda x: x[2], batch))))
        s2_batch = np.vstack(batch[:,3])#np.vstack(np.array(list(map(lambda x: x[3], batch))))
        t_batch = np.vstack(batch[:,4])#np.vstack(np.array(list(map(lambda x: x[4], batch))))

        return s1_batch, a_batch, r_batch, s2_batch, t_batch

    def clear(self):
        self.buffer.clear()
        self.count = 0


In [240]:
replay = ReplayBuffer((1,1),1000)

In [244]:
replay.collect(1,2,3,4,7)

4


In [247]:
replay.buffer

[(0.125, 0, (array([[1]]), array([[2]]), 3, array([[4]]), 7)),
 (0.125, 1, (array([[1]]), array([[2]]), 3, array([[4]]), 7)),
 (0.125, 2, (array([[1]]), array([[2]]), 3, array([[4]]), 7)),
 (0.125, 3, (array([[1]]), array([[2]]), 3, array([[4]]), 7)),
 (0.25, 4, (1, 1, 1, 1, 1)),
 (0.25, 5, (2, 2, 2, 2, 2))]

In [246]:
replay.update([2,2], [(1,1,1,1,1), (2,2,2,2,2)])

[(0.125, 0, (array([[1]]), array([[2]]), 3, array([[4]]), 7)), (0.125, 1, (array([[1]]), array([[2]]), 3, array([[4]]), 7)), (0.125, 2, (array([[1]]), array([[2]]), 3, array([[4]]), 7)), (0.125, 3, (array([[1]]), array([[2]]), 3, array([[4]]), 7)), (0.25, 4, (1, 1, 1, 1, 1)), (0.25, 5, (2, 2, 2, 2, 2))]


In [249]:
x = replay.get_sample(2)

[[1 1 1 1 1]
 [2 2 2 2 2]]
[1 2]
[1 2]
[1 2]
[1 2]
[1 2]


In [254]:
x[0]

array([[1],
       [2]])

In [164]:
replay.buffer

[(0.125, 0, (array([[1]]), array([[2]]), 3, array([[4]]), 7)),
 (0.125, 1, (array([[1]]), array([[2]]), 3, array([[4]]), 7)),
 (0.125, 2, (array([[1]]), array([[2]]), 3, array([[4]]), 7)),
 (0.125, 3, (array([[1]]), array([[2]]), 3, array([[4]]), 7)),
 (0.25, 4, (1, 1, 1, 1, 1)),
 (0.25, 5, (2, 2, 2, 2, 2))]

In [3]:
import tensorflow as tf

from datetime import datetime
from agent import Agent
from ou_noise import OUNoise
# from daddy import Daddy, augment_state
import os
import gym

# from osim.env import RunEnv
import numpy as np

tf.logging.set_verbosity( tf.logging.INFO )

# 'Walker2d-v1'
# 'Pendulum-v0'

ENV_NAME = 'Walker2d-v1'

MEMORY_SIZE = 1e6
BATCH_SIZE = 32
GAMMA = 0.99
NUM_EP = 5000
SAVE_EVERY = 100
H_SIZE = [ 128 , 128 ]
PRE_TRAIN = 100
IS_STOCHASTIC = True

def main():
    now = datetime.utcnow().strftime( "%b-%d_%H_%M" )  # create unique dir

    full_path = os.path.join( os.getcwd() , 'logs' , now )

    env = gym.make( ENV_NAME )

    # env = RunEnv( visualize=False )
    # 5 is the number of velocities for head and other parts
    env_dims = (env.observation_space.shape[ 0 ], env.action_space.shape[ 0 ] , (env.action_space.low, env.action_space.high))
    ou = OUNoise( action_dimension=env_dims[ 1 ] )

    # tf.reset_default_graph ()

    target = Agent( name='target' , env_dim=env_dims , h_size=H_SIZE , stochastic=IS_STOCHASTIC )

    global_step = tf.Variable( 0 , trainable=False , name='global_step' )
    writer = tf.summary.FileWriter( full_path )
    saver = tf.train.Saver( tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES , scope='target' ) , max_to_keep=2 )
    ckpt = tf.train.latest_checkpoint( full_path )

    agent = Agent( name='local' , env_dim=env_dims , target=target , writer=writer , h_size=H_SIZE , stochastic=IS_STOCHASTIC )

    # daddy = Daddy( target=agent , env_dim=env_dims )

    with tf.Session() as sess:
        if ckpt:
            tf.logging.info('Restore model {}'.format(ckpt))
            saver.restore(sess=sess,  save_path=ckpt)

        sess.run( tf.global_variables_initializer() )

        summarize = False
        # load pre trained model
        #
        # for _ in range(PRE_TRAIN):
        #     l, tot_rw, timesteps = daddy.sample(env= env, w = daddy.w)
        #
        #     if _ % 5 == 0:
        #         ep_summary = tf.Summary()
        #
        #         ep_summary.value.add( simple_value=tot_rw , tag='daddy/total_rw' )
        #         ep_summary.value.add( simple_value=timesteps , tag='daddy/timesteps' )
        #         ep_summary.value.add( simple_value=l , tag='daddy/loss' )
        #
        #         agent.writer.add_summary( ep_summary , _ )
        #         agent.writer.flush()
        #
        #         tf.logging.info(
        #             'Master ep  {}, latest ep reward {}, of steps {}'.format( _ , tot_rw , timesteps ) )
        #
        # tf.logging.info('Pre-train ended, starting training now...')
        # # save memory
        # daddy.save_memory('memory.pkl')
        # # transfer memory
        # agent.memory.buffer = copy.deepcopy(daddy.memory.buffer)
        #
        # saver.save( sess , os.path.join( full_path , 'model.ckpt' ) , global_step=PRE_TRAIN )

        for ep in range( NUM_EP ):

            agent.sync()
            state = env.reset()
            ou.reset()

            terminal = False

            timesteps , tot_rw = 0 , 0


            # activate if osim_rl
            # state = augment_state( state , state )

            while not terminal:
                # if stochastic remove exploration noise
                action = agent.get_action( state )  # + ou.noise()

                # if determinstic clip here
                # action = np.clip(action, 0, 1)

                next_state , reward , terminal , _ = env.step( action.flatten() )

                # Activate if osim-rl
                # rw = surr_rw( state , action ) + reward
                # next_state = augment_state( state , next_state )

                agent.memory.collect( state , action , reward , next_state , terminal )
                agent.think( batch_size=BATCH_SIZE , gamma=GAMMA , summarize=summarize )

                state = next_state
                summarize = False

                timesteps += 1
                tot_rw += reward

            if ep % 5 == 0:
                summarize = True
                ep_summary = tf.Summary()

                ep_summary.value.add( simple_value=tot_rw , tag='eval/total_rw' )
                ep_summary.value.add( simple_value=timesteps , tag='eval/ep_length' )

                agent.writer.add_summary( ep_summary , ep )
                agent.writer.flush()

                tf.logging.info(
                    'Master ep  {}, latest ep reward {}, of steps {}'.format( ep , tot_rw , timesteps ) )

            if ep % SAVE_EVERY == 0:
                gs = tf.train.global_step( sess , global_step )
                saver.save( sess , os.path.join( full_path , 'model.ckpt' ) , global_step=gs )
                tf.logging.info( 'Model saved at ep {}'.format( gs ) )


def augment_state(s , s1):

    s = np.reshape( np.array( s ) , (1 , -1) )
    s1 = np.reshape( np.array( s1 ) , (1 , -1) )

    idxs = [ 22 , 24 , 26 , 28 , 30 ]

    vel = (s1[ : , idxs ] - s[ : , idxs ]) / (0.01)
    # keep information of the environment like difficulty
    return np.reshape( np.append( s1[:,:38] , np.append(vel, s1[:,38:])) , (1 , -1) )


def surr_rw(state , action):
    # state = np.array(state)
    delta_h = state[ :,27 ] - state[ :,35 ]
    rw = 10 * state[ :,20 ] - abs( delta_h - 1.2 ) - 0.1 * np.linalg.norm( action ) - 10 * (state[ :,27 ] < 0.8)
    return np.asscalar( rw )


if __name__ == '__main__':
    main()


[2017-08-23 21:38:05,494] Making new env: Walker2d-v1


INFO:tensorflow:Worker target ready to go ...


[2017-08-23 21:38:06,690] Worker target ready to go ...


INFO:tensorflow:Worker local ready to go ...


[2017-08-23 21:38:07,591] Worker local ready to go ...


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


ValueError: a must be 1-dimensional