In [2]:

from ddpg import *
import time
import numpy as np


# ---------------------------------------------------------------------------
# Environment selection: keep exactly one assignment active; the remaining
# names are the other environments this notebook has been tried on.
# ---------------------------------------------------------------------------
# env_name = 'InvertedPendulum-v1'
# env_name = 'Pendulum-v0'
# env_name = 'HalfCheetah-v1'
# env_name = "Hopper-v1"
# env_name = "Walker2d-v1"
env_name = 'Ant-v1'

# ---------------------------------------------------------------------------
# Hyperparameters and run options
# ---------------------------------------------------------------------------
GAMMA, BATCH_SIZE = 0.99, 256          # discount factor / minibatch size handed to DDPG
MAX_STEPS, NUM_EPISODES = 1000, 1200   # nominal per-episode step budget / number of episodes

VISUALIZE = False                      # master switch for rendering
logging_interval = 100
animate_interval = 5 * logging_interval  # render every N-th episode (only if VISUALIZE)

dropout_rate = 0.05  # fraction for the replay-buffer dropout experiment (disabled in the loop)

# Early-stopping threshold compared against the smoothed return `avg_val`.
# NOTE(review): this value was originally annotated "Pendulum" although
# Ant-v1 is the selected environment -- confirm it suits the active env.
term_condition = 1500

# ---------------------------------------------------------------------------
# Bookkeeping containers and counters
# ---------------------------------------------------------------------------
Records = []               # (avg_val, episode index, cumulative step count) tuples
running_rewards_ddpg = []  # per-episode return, for plotting
step_list_ddpg = []        # cumulative step count after each episode, for plotting
step_counter = 0
avg_val = 0

# ---------------------------------------------------------------------------
# Environment and agent
# ---------------------------------------------------------------------------
env = NormalizeAction(gym.make(env_name))  # wrapper remaps action values for the environment

ddpg = DDPG(
    act_dim=env.action_space.shape[0],
    obs_dim=env.observation_space.shape[0],
    critic_lr=1e-3,
    actor_lr=1e-4,
    gamma=GAMMA,
    batch_size=BATCH_SIZE,
)

# Main DDPG training loop: one iteration per episode.
#
# Reads:   env, ddpg, NUM_EPISODES, MAX_STEPS, BATCH_SIZE, animate_interval,
#          VISUALIZE, term_condition
# Updates: step_counter, avg_val, running_rewards_ddpg, step_list_ddpg, Records
for itr in range(NUM_EPISODES):
    state = env.reset()  # get initial state
    animate_this_episode = (itr % animate_interval == 0) and VISUALIZE
    total_reward = 0

    # Restart the exploration-noise process once per episode. Resetting it on
    # every step (as before) re-initialises the process ahead of each action,
    # so a stateful noise process never evolves within an episode.
    ddpg.noise.reset()

    # Bound the episode by MAX_STEPS so it cannot run forever if the
    # environment never reports `done`.
    for step in range(MAX_STEPS):
        if animate_this_episode:
            env.render()
            time.sleep(0.05)

        # NOTE(review): get_action_with_noise is assumed to handle any
        # eval/train mode switching the actor network needs -- confirm in ddpg.
        action = ddpg.get_action_with_noise(state)
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        ddpg.replayBuffer.remember(state, action, reward, next_state, done)
        state = next_state
        step_counter += 1

        # Start learning only once the buffer holds more than one minibatch.
        if ddpg.replayBuffer.size > BATCH_SIZE:
            ddpg.train()

        if done:
            break

    running_rewards_ddpg.append(total_reward)  # return of this episode
    step_list_ddpg.append(step_counter)

    # Replay-buffer dropout experiment (disabled):
    # number_to_drop = int(ddpg.replayBuffer.size * dropout_rate)
    # for i in range(number_to_drop):
    #     del ddpg.replayBuffer.buffer[np.random.choice(len(ddpg.replayBuffer.buffer),1)[0]]
    #     ddpg.replayBuffer.size -=1

    # Exponentially smoothed return (weight 0.05 on the newest episode).
    avg_val = avg_val * 0.95 + 0.05 * total_reward
    print("Average value: {} for episode: {} and reward: {}".format(avg_val, itr,total_reward))
    Records.append((avg_val,itr,step_counter))

    # Early stopping is checked after the bookkeeping above, so the decision
    # uses the average that includes this episode and the final episode is
    # still present in the recorded lists.
    if avg_val > term_condition and itr > 100:
        #save(ddpg)
        break

[2018-06-12 21:55:58,285] Making new env: Ant-v1


Average value: 4.545334453565605 for episode: 0 and reward: 90.9066890713121
Average value: 7.62156236266325 for episode: 1 and reward: 66.06989263551849
Average value: 39.82448811791512 for episode: 2 and reward: 651.6800774677006
Average value: 71.4703601339184 for episode: 3 and reward: 672.7419284379807
Average value: 108.93263189181697 for episode: 4 and reward: 820.7157952918895
Average value: 145.12788295624648 for episode: 5 and reward: 832.837653180407
Average value: 180.09976662838253 for episode: 6 and reward: 844.5655563989675
Average value: 211.15591480354198 for episode: 7 and reward: 801.2227301315717
Average value: 243.16232911357963 for episode: 8 and reward: 851.2842010042951
Average value: 271.6039909475042 for episode: 9 and reward: 811.9955657920716
Average value: 298.46236648134584 for episode: 10 and reward: 808.7715016243371
Average value: 326.3956307819026 for episode: 11 and reward: 857.127652492481
Average value: 346.5022835766027 for episode: 12 and reward: 

Average value: 810.7373996339226 for episode: 104 and reward: 770.9285745063564
Average value: 804.408861394853 for episode: 105 and reward: 684.1666348525346
Average value: 805.5079174747749 for episode: 106 and reward: 826.3899829932923
Average value: 809.9155784592241 for episode: 107 and reward: 893.6611371637597
Average value: 814.1850037932121 for episode: 108 and reward: 895.3040851389852
Average value: 816.0016140524036 for episode: 109 and reward: 850.5172089770443
Average value: 816.9854160976804 for episode: 110 and reward: 835.6776549579395
Average value: 797.9435918350823 for episode: 111 and reward: 436.1489308457202
Average value: 796.208311930905 for episode: 112 and reward: 763.2379937515369
Average value: 760.9221445449258 for episode: 113 and reward: 90.48496421132204
Average value: 767.9084322414899 for episode: 114 and reward: 900.6478984762094
Average value: 774.7777015314616 for episode: 115 and reward: 905.2938180409247
Average value: 756.34866004109 for episode

Average value: 780.177040126587 for episode: 207 and reward: 947.873466716456
Average value: 787.3262955082384 for episode: 208 and reward: 923.1621477596152
Average value: 791.2000961240246 for episode: 209 and reward: 864.8023078239632
Average value: 798.9970685746943 for episode: 210 and reward: 947.139545137418
Average value: 805.2858354252089 for episode: 211 and reward: 924.7724055849886
Average value: 811.0853178013323 for episode: 212 and reward: 921.2754829476761
Average value: 816.4124110083762 for episode: 213 and reward: 917.6271819422122
Average value: 819.4771536044898 for episode: 214 and reward: 877.7072629306481
Average value: 822.6298612060917 for episode: 215 and reward: 882.5313056365304
Average value: 822.8914689271337 for episode: 216 and reward: 827.8620156269318
Average value: 824.3612601685035 for episode: 217 and reward: 852.287293754529
Average value: 829.7109675091646 for episode: 218 and reward: 931.3554069817266
Average value: 833.885947799228 for episode:

Average value: 883.4624427617407 for episode: 310 and reward: 925.5238807478064
Average value: 871.4351382329783 for episode: 311 and reward: 642.9163521864921
Average value: 872.5750380746779 for episode: 312 and reward: 894.2331350669712
Average value: 875.4345593892291 for episode: 313 and reward: 929.7654643657038
Average value: 878.7411411198752 for episode: 314 and reward: 941.5661940021531
Average value: 851.862191569894 for episode: 315 and reward: 341.16215012025083
Average value: 810.1205232382085 for episode: 316 and reward: 17.028824936185824
Average value: 790.2201813318261 for episode: 317 and reward: 412.11368511055963
Average value: 789.0720530628735 for episode: 318 and reward: 767.2576159527744
Average value: 784.9959564013728 for episode: 319 and reward: 707.5501198328609
Average value: 791.2682139874926 for episode: 320 and reward: 910.4411081237686
Average value: 796.7160339699947 for episode: 321 and reward: 900.2246136375367
Average value: 802.7245013500135 for e

KeyboardInterrupt: 

In [5]:
import pickle

# Persist the run: first the training curves, then the whole agent object.
# Each (file name, object) pair is written with pickle in binary mode.
for target, payload in (
    ("ant.pkl", (Records, running_rewards_ddpg)),
    ("ant_model.pkl", ddpg),
):
    with open(target, "wb") as sink:
        pickle.dump(payload, sink)