In [2]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy  # Probabilistic approach that also tries some new paths from time to time
from rl.memory import SequentialMemory

In [3]:
# Create the CartPole environment and read off the two sizes the network needs:
# the length of the observation vector and the number of discrete actions.
env = gym.make('CartPole-v0')
states, actions = env.observation_space.shape[0], env.action_space.n

In [4]:
# Baseline for visualisation: play 10 episodes with random actions and print each score.
for episode in range(1, 11):
    env.reset()  # a random policy never looks at the observation, so it is not stored
    done = False
    score = 0

    while not done:
        env.render()
        # Sample from the environment's own action space rather than hard-coding [0, 1].
        action = env.action_space.sample()
        _, reward, done, _ = env.step(action)
        score += reward
    print('Episodes:{} Score:{}'.format(episode, score))

# Close the render window now that the demo is over; training later runs with visualize=False.
env.close()

Episodes:1 Score:46.0
Episodes:2 Score:14.0
Episodes:3 Score:28.0
Episodes:4 Score:21.0
Episodes:5 Score:26.0
Episodes:6 Score:20.0
Episodes:7 Score:15.0
Episodes:8 Score:39.0
Episodes:9 Score:13.0
Episodes:10 Score:35.0


In [5]:
#build model: input layer -> 2x fully connected -> output layer
def build_model(states, actions):
    """Assemble the Q-network: flattened input -> two 24-unit ReLU layers -> linear output.

    Args:
        states: Size of the observation vector; the input shape is ``(1, states)``.
        actions: Number of units in the linear output layer.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    layers = [
        Flatten(input_shape=(1, states)),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        Dense(actions, activation='linear'),
    ]
    return Sequential(layers)
    

In [6]:
# Instantiate the Q-network sized for this environment.
model = build_model(states=states, actions=actions)

In [7]:
def build_agent(model, actions):
    """Wrap ``model`` in a DQN agent with a Boltzmann policy and sequential replay memory.

    Args:
        model: The Keras model the agent uses as its Q-network.
        actions: Number of actions, passed to the agent as ``nb_actions``.

    Returns:
        The configured (not yet compiled) ``DQNAgent``.
    """
    return DQNAgent(
        model=model,
        # Probabilistic policy, so new paths are still tried from time to time.
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=5000, window_length=1),
        nb_actions=actions,
        nb_steps_warmup=10,
        target_model_update=1e-2,
    )

In [8]:
import sys
# Diagnostic: report how many GPU devices TensorFlow detects
# (the author's GPU was not being found, and the cause is unknown).
print("Num GPUs Available: ", len(tf.config.list_physical_devices(device_type='GPU')))

Num GPUs Available:  0


In [36]:
# Wrap the Q-network in a DQN agent and train it for 50,000 environment steps.
# NOTE(review): `model` is shared notebook state, so re-running this cell continues
# training from its current weights; rebuild `model` with build_model(...) first
# if a fresh training run is intended.
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Keep the returned History object instead of leaving its repr as the cell output.
train_history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
56 episodes - episode_reward: 177.911 [30.000, 200.000] - loss: 7.864 - mae: 43.367 - mean_q: 87.101

Interval 2 (10000 steps performed)
50 episodes - episode_reward: 197.720 [86.000, 200.000] - loss: 13.471 - mae: 44.058 - mean_q: 88.082

Interval 3 (20000 steps performed)
53 episodes - episode_reward: 191.113 [39.000, 200.000] - loss: 18.558 - mae: 44.452 - mean_q: 88.705

Interval 4 (30000 steps performed)
50 episodes - episode_reward: 200.000 [200.000, 200.000] - loss: 18.796 - mae: 44.481 - mean_q: 88.678

Interval 5 (40000 steps performed)
done, took 421.072 seconds


<tensorflow.python.keras.callbacks.History at 0x258a313b610>

In [34]:
# Evaluate the trained agent over 100 episodes and print the mean episode reward.
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.asarray(scores.history['episode_reward']).mean())

Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 195.000, steps: 195
Episode 5: reward: 186.000, steps: 186
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 188.000, steps: 188
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 196.000, steps: 196
Episode 14: reward: 196.000, steps: 196
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 189.000, steps: 189
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 194.000, steps: 194
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 