In [14]:
# Setup note: use `activate tensorforceEnv` before running this notebook
# (presumably the conda/virtualenv that provides tensorforce, gym and ale_py — TODO confirm).
from ale_py import ALEInterface

# Instantiate the Arcade Learning Environment interface.
ale = ALEInterface()

from ale_py.roms import Breakout

# Load the Breakout ROM into the ALE instance.
# NOTE(review): `ale` is not referenced again in the visible cells, which train on
# CartPole-v1; this looks like leftover setup — confirm before removing.
ale.loadROM(Breakout)
from collections import deque

from tensorforce import Agent, Environment

import gym
from gym import wrappers

import numpy as np

# Double-DQN ('ddqn') agent specification for CartPole-v1.
# The raw gym environment is wrapped in a Monitor (output directory 'tmp',
# force=True) and then handed to Tensorforce with a 500-timestep episode cap.
env = wrappers.Monitor(gym.make('CartPole-v1'), 'tmp', force=True)
environment = Environment.create(environment=env, max_episode_timesteps=500)

# Shared constant: used as the exploration schedule's `num_steps` below and,
# in the training cell, as the number of episodes to run.
nbr_step = 1500

# Exponential exploration schedule, measured in update units, starting at 1.0.
exploration_schedule = dict(
    type='exponential',
    unit='updates',
    num_steps=nbr_step,
    initial_value=1.0,
    decay_rate=0.1,
)

# Two fully connected ReLU layers (30 then 24 units).
hidden_layers = [
    dict(type='dense', size=30, activation='relu'),
    dict(type='dense', size=24, activation='relu'),
]

agent = Agent.create(
    agent='ddqn',
    environment=environment,
    memory=50000,
    batch_size=32,
    update_frequency=0.5,
    start_updating=100,
    learning_rate=0.001,
    target_sync_frequency=4,
    discount=0.95,
    exploration=exploration_schedule,
    network=hidden_layers,
)

# Show the resulting policy/baseline network layout.
print(agent.get_architecture())

# Rolling window holding the returns of the 50 most recent episodes.
score_mean = deque(maxlen=50)

# Train for `nbr_step` episodes with the standard act/observe loop.
# NOTE: `nbr_step` is the same value passed as `num_steps` to the exploration
# schedule in Agent.create, so here it doubles as the episode count.
for episode in range(nbr_step):

    # Start a fresh episode.
    states = environment.reset()
    terminal = False
    sum_rewards = 0.0

    while not terminal:
        # Regular (non-independent) act: every act() call must be paired with
        # the observe() call below so the agent can record the transition.
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        sum_rewards += reward

    # Report this episode's return followed by the rolling mean.
    print('Episode {}: {}'.format(episode, sum_rewards))
    score_mean.append(sum_rewards)
    print(np.mean(score_mean))




    
    
    
    





Policy:
    Network:  
        Dense(name=dense0, size=30, bias=True, activation=relu)
        Dense(name=dense1, size=24, bias=True, activation=relu)Action-value:  Linear(name=action_value, size=2, bias=True)
Baseline:
    Network:  
        Dense(name=dense0, size=30, bias=True, activation=relu)
        Dense(name=dense1, size=24, bias=True, activation=relu)Action-value:  Linear(name=action_value, size=2, bias=True)
Episode 0: 12.0
12.0
Episode 1: 14.0
13.0
Episode 2: 20.0
15.333333333333334
Episode 3: 18.0
16.0
Episode 4: 12.0
15.2
Episode 5: 21.0
16.166666666666668
Episode 6: 10.0
15.285714285714286
Episode 7: 20.0
15.875
Episode 8: 23.0
16.666666666666668
Episode 9: 36.0
18.6
Episode 10: 13.0
18.09090909090909
Episode 11: 19.0
18.166666666666668
Episode 12: 26.0
18.76923076923077
Episode 13: 10.0
18.142857142857142
Episode 14: 20.0
18.266666666666666
Episode 15: 13.0
17.9375
Episode 16: 21.0
18.11764705882353
Episode 17: 16.0
18.0
Episode 18: 77.0
21.105263157894736
Episode 19: 31

Episode 313: 15.0
18.72
Episode 314: 61.0
19.66
Episode 315: 69.0
20.8
Episode 316: 55.0
21.32
Episode 317: 30.0
21.5
Episode 318: 83.0
22.8
Episode 319: 69.0
23.9
Episode 320: 57.0
24.54
Episode 321: 154.0
27.36
Episode 322: 117.0
29.5
Episode 323: 84.0
30.7
Episode 324: 86.0
32.1
Episode 325: 127.0
34.34
Episode 326: 43.0
34.7
Episode 327: 27.0
34.96
Episode 328: 103.0
36.48
Episode 329: 25.0
36.74
Episode 330: 11.0
36.58
Episode 331: 139.0
39.1
Episode 332: 171.0
41.68
Episode 333: 29.0
41.92
Episode 334: 46.0
42.4
Episode 335: 146.0
44.62
Episode 336: 137.0
46.32
Episode 337: 57.0
46.9
Episode 338: 86.0
48.02
Episode 339: 60.0
48.88
Episode 340: 181.0
52.28
Episode 341: 148.0
54.9
Episode 342: 93.0
56.52
Episode 343: 146.0
59.08
Episode 344: 119.0
61.24
Episode 345: 141.0
63.74
Episode 346: 90.0
65.06
Episode 347: 45.0
65.66
Episode 348: 247.0
70.16
Episode 349: 45.0
70.24
Episode 350: 69.0
71.34
Episode 351: 90.0
72.92
Episode 352: 306.0
78.86
Episode 353: 165.0
81.94
Episode 354:

KeyboardInterrupt: 

In [15]:
# Evaluate the trained agent for `nbr_episode` episodes.
# act() is called with independent=True and deterministic=True and is never
# followed by observe(), so internal states are threaded through by hand.
nbr_episode = 30
sum_rewards = 0.0
for episode_idx in range(nbr_episode):
    print(episode_idx)

    # Fresh episode: initial observation plus the agent's initial internals.
    states, internals = environment.reset(), agent.initial_internals()
    terminal = False

    while True:
        actions, internals = agent.act(
            states=states,
            internals=internals,
            independent=True,
            deterministic=True,
        )
        states, terminal, reward = environment.execute(actions=actions)
        env.render()
        sum_rewards += reward
        if terminal:
            break

# Average return over all evaluation episodes.
print('Mean evaluation return:', sum_rewards / nbr_episode)



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
Mean evaluation return: 500.0


In [4]:
# Release resources: shut down the agent first, then the environment.
for resource in (agent, environment):
    resource.close()