In [1]:
!pip install keras-rl2

Collecting keras-rl2
  Downloading keras_rl2-1.0.5-py3-none-any.whl (52 kB)
[?25l[K     |██████▎                         | 10 kB 26.5 MB/s eta 0:00:01[K     |████████████▋                   | 20 kB 31.9 MB/s eta 0:00:01[K     |██████████████████▉             | 30 kB 37.4 MB/s eta 0:00:01[K     |█████████████████████████▏      | 40 kB 29.0 MB/s eta 0:00:01[K     |███████████████████████████████▍| 51 kB 32.4 MB/s eta 0:00:01[K     |████████████████████████████████| 52 kB 730 kB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 51.7 MB/s 
Installing collected packages: tf-estimator-nightly, keras-rl2
Successfully installed keras-rl2-1.0.5 tf-estimator-nightly-2.8.0.dev2021122109


In [3]:
import gym
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import CSVLogger
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, BoltzmannQPolicy
from rl.agents.dqn import DQNAgent

In [4]:
env = gym.make('CartPole-v0')

In [6]:
num_actions = env.action_space.n
state_space_shape = env.observation_space.shape
print(num_actions)
print(state_space_shape)

2
(4,)


In [7]:
model = Sequential()
model.add(Flatten(input_shape=(1,) + state_space_shape))
model.add(Dense(24, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(24, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(num_actions, activation='linear'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 24)                120       
                                                                 
 dropout (Dropout)           (None, 24)                0         
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dropout_1 (Dropout)         (None, 24)                0         
                                                                 
 dense_2 (Dense)             (None, 2)                 50        
                                                                 
Total params: 770
Trainable params: 770
Non-trainable pa

In [8]:
memory = SequentialMemory(limit=50000, window_length=1)

In [9]:
policy = EpsGreedyQPolicy(eps=0.1)

In [10]:
agent = DQNAgent(model=model, nb_actions=num_actions, memory=memory,
                 nb_steps_warmup=1000, target_model_update=1, policy=policy)

In [11]:
agent.compile(Adam(lr=1e-3), metrics=['mae'])

  super(Adam, self).__init__(name, **kwargs)


In [12]:
agent.fit(env, nb_steps=2000, visualize=False, verbose=2)

Training for 2000 steps ...


  updates=self.state_updates,


    9/2000: episode: 1, duration: 1.972s, episode steps:   9, steps per second:   5, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: --, mae: --, mean_q: --
   19/2000: episode: 2, duration: 0.040s, episode steps:  10, steps per second: 253, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.900 [0.000, 1.000],  loss: --, mae: --, mean_q: --
   36/2000: episode: 3, duration: 0.060s, episode steps:  17, steps per second: 281, episode reward: 17.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.706 [0.000, 1.000],  loss: --, mae: --, mean_q: --
   45/2000: episode: 4, duration: 0.037s, episode steps:   9, steps per second: 241, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: --, mae: --, mean_q: --
   54/2000: episode: 5, duration: 0.034s, episode steps:   9, steps per second: 261, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], m

  updates=self.state_updates,


 1008/2000: episode: 74, duration: 1.475s, episode steps:  16, steps per second:  11, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.812 [0.000, 1.000],  loss: 0.647233, mae: 0.690568, mean_q: -0.009927
 1017/2000: episode: 75, duration: 0.133s, episode steps:   9, steps per second:  68, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 1.000 [1.000, 1.000],  loss: 0.649041, mae: 0.660786, mean_q: 0.069060
 1027/2000: episode: 76, duration: 0.133s, episode steps:  10, steps per second:  75, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.900 [0.000, 1.000],  loss: 0.643765, mae: 0.654605, mean_q: 0.171786
 1039/2000: episode: 77, duration: 0.171s, episode steps:  12, steps per second:  70, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.833 [0.000, 1.000],  loss: 0.643834, mae: 0.660407, mean_q: 0.328332
 1048/2000: episode: 78, duration: 0.137s, episode steps:   9, steps pe

<keras.callbacks.History at 0x7fd3de39e090>

In [13]:
agent.test(env, nb_episodes=10, visualize=False)

Testing for 10 episodes ...
Episode 1: reward: 12.000, steps: 12
Episode 2: reward: 13.000, steps: 13
Episode 3: reward: 14.000, steps: 14
Episode 4: reward: 12.000, steps: 12
Episode 5: reward: 15.000, steps: 15
Episode 6: reward: 11.000, steps: 11
Episode 7: reward: 16.000, steps: 16
Episode 8: reward: 15.000, steps: 15
Episode 9: reward: 16.000, steps: 16
Episode 10: reward: 17.000, steps: 17


<keras.callbacks.History at 0x7fd3de098310>

In [None]:
env = gym.make('CartPole-v0')

num_actions = env.action_space.n
state_space_shape = env.observation_space.shape

model = Sequential()
model.add(Flatten(input_shape=(1,) + state_space_shape))
model.add(Dense(24, activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(24, activation='relu'))
# model.add(Dropout(0.25))
model.add(Dense(num_actions, activation='linear'))

memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()  # EpsGreedyQPolicy(eps=0.1)
agent = DQNAgent(model=model, nb_actions=num_actions, memory=memory,
                     nb_steps_warmup=1000, target_model_update=1, policy=policy)
agent.compile(Adam(lr=1e-3), metrics=['mae'])

logger = CSVLogger('dqn_cartpole.log')
agent.fit(env, nb_steps=2000, visualize=False, verbose=2)
agent.save_weights('dqn_cartpole.h5', overwrite=True)

agent.load_weights('dqn_cartpole.h5')
agent.test(env, nb_episodes=10, visualize=False)

In [14]:
!pip install stable_baselines
!pip install tensorflow==1.14

Collecting stable_baselines
  Downloading stable_baselines-2.10.2-py3-none-any.whl (240 kB)
[?25l[K     |█▍                              | 10 kB 28.2 MB/s eta 0:00:01[K     |██▊                             | 20 kB 32.6 MB/s eta 0:00:01[K     |████                            | 30 kB 37.0 MB/s eta 0:00:01[K     |█████▍                          | 40 kB 40.6 MB/s eta 0:00:01[K     |██████▉                         | 51 kB 28.0 MB/s eta 0:00:01[K     |████████▏                       | 61 kB 30.9 MB/s eta 0:00:01[K     |█████████▌                      | 71 kB 24.5 MB/s eta 0:00:01[K     |██████████▉                     | 81 kB 26.1 MB/s eta 0:00:01[K     |████████████▎                   | 92 kB 28.2 MB/s eta 0:00:01[K     |█████████████▋                  | 102 kB 27.4 MB/s eta 0:00:01[K     |███████████████                 | 112 kB 27.4 MB/s eta 0:00:01[K     |████████████████▎               | 122 kB 27.4 MB/s eta 0:00:01[K     |█████████████████▊              | 133

In [1]:
import gym
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation."


In [2]:
env = gym.make('CartPole-v1')

In [3]:
model = DQN(MlpPolicy, env, verbose=1)







Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [4]:
model.learn(total_timesteps=2000)

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 100      |
| mean 100 episode reward | 10.7     |
| steps                   | 1058     |
--------------------------------------


<stable_baselines.deepq.dqn.DQN at 0x7fb683fd1cd0>

In [5]:
model.save('deepq_cartpole')

In [7]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    #env.render()



KeyboardInterrupt: ignored

In [None]:
env = gym.make('CartPole-v1')

model = DQN(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=2000)
model.save('deepq_cartpole')

del model # remove to demonstrate saving and loading

model = DQN.load('deepq_cartpole')

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
!pip install -U gym>=0.21.0
!pip install -U gym[atari,accept-rom-license]

In [None]:
!pip install stable_baselines3

In [None]:
from stable_baselines.common.atari_wrappers import make_atari
from stable_baselines.deepq.policies import MlpPolicy, CnnPolicy
from stable_baselines import DQN

In [None]:
env = make_atari('BreakoutNoFrameskip-v4')

model = DQN(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save('deepq_breakout')

del model # remove to demonstrate saving and loading

model = DQN.load('deepq_breakout')

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
!python dqn_cartpole.py

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
2022-05-12 14:55:12.009602: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2022-05-12 14:55:12.013036: I tensorflow/core/platform/profile_utils/cpu_utils.cc: