In [None]:
import random
import gym
import math
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.backend import clear_session



In [None]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices reported by TensorFlow.

    Despite the function's name, no filtering by device type is done here:
    every entry returned by ``device_lib.list_local_devices()`` contributes
    its ``name`` attribute to the result, in the order it was reported.

    Returns:
        list: One device-name entry per local device.
    """
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names

In [None]:
# Show which devices TensorFlow can see. The helper returns the name of every
# local device it is given, not only GPUs.
get_available_gpus()

In [None]:
class DQNCartPoleSolver():
    """Deep Q-Network agent with experience replay.

    The agent stores transitions in a bounded replay memory, picks actions
    epsilon-greedily, and fits a dense Keras network that maps a state vector
    of length ``env_shape`` to one Q-value per action.
    """

    def __init__(self, gamma=1.0,
                 epsilon=1.0, epsilon_min=0.01, epsilon_log_decay=0.995,
                 lr=0.01, lr_decay=0.01, dense_layers = [24,48],
                 batch_size=64, monitor=False, quiet=False,
                 env_shape = 4, action_space_shape = 2):
        """Create the agent and build its Q-network.

        Args:
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Upper bound for the exploration rate; it is multiplied by
                ``epsilon_log_decay`` after every training pass in ``replay``.
            epsilon_min: Lower bound for the exploration rate.
            epsilon_log_decay: Factor used both inside the log schedule of
                ``get_epsilon`` and as the multiplicative decay in ``replay``.
            lr: Learning rate handed to the Adam optimizer.
            lr_decay: ``decay`` value handed to the Adam optimizer.
            dense_layers: Unit counts of the hidden layers (at least one entry).
                The default list is only read, never mutated.
            batch_size: Stored on the instance for callers to pass to ``replay``.
            monitor: Accepted for compatibility; not used by this class.
            quiet: Stored on the instance; callers use it to silence logging.
            env_shape: Length of the state vector (network input size).
            action_space_shape: Number of actions (network output size).
        """
        self.memory = deque(maxlen=1000000)

        self.gamma = gamma

        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_log_decay

        self.batch_size = batch_size
        self.quiet = quiet
        self.env_shape = env_shape
        self.action_space_shape = action_space_shape

        # Init model. ``build_model`` reads ``self.env_shape`` (set above) for
        # the input size, so the call stays compatible with overrides that
        # keep the original four-argument signature.
        with tf.device('/device:GPU:0'):
            self.model = self.build_model(dense_layers = dense_layers,
                                          output_shape = action_space_shape,
                                          lr = lr, lr_decay = lr_decay)

    def build_model(self, dense_layers = [24,48], output_shape = 2,
                    lr = 0.01, lr_decay = 0.01, input_dim = None):
        """Build and compile the Q-network.

        Args:
            dense_layers: Unit counts of the tanh hidden layers, in order.
            output_shape: Number of linear output units (one per action).
            lr: Learning rate for Adam.
            lr_decay: ``decay`` argument for Adam.
            input_dim: Length of the input state vector. When ``None`` the
                instance's ``env_shape`` is used, so the network matches the
                shape produced by ``preprocess_state``.

        Returns:
            The compiled Keras ``Sequential`` model (MSE loss, Adam optimizer).
        """
        if input_dim is None:
            input_dim = self.env_shape

        model = Sequential()

        model.add(Dense(dense_layers[0], input_shape=(input_dim,), activation = 'tanh'))

        for units in dense_layers[1:]:
            model.add(Dense(units, activation = 'tanh'))

        model.add(Dense(output_shape, activation = 'linear'))
        # ``learning_rate`` is the documented keyword; ``lr`` is a deprecated alias.
        model.compile(loss='mse', optimizer=Adam(learning_rate=lr, decay=lr_decay))
        return model

    def _predict(self, states):
        """Return the model's Q-values for ``states`` as a writable NumPy array.

        Depending on the TensorFlow version, ``predict_on_batch`` hands back
        either an eager tensor or a NumPy array; ``np.array`` accepts both and
        always yields an independent copy that is safe to modify in place.
        """
        return np.array(self.model.predict_on_batch(states))

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon, env):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a random action is sampled from
        ``env.action_space``; otherwise the index of the largest predicted
        Q-value for ``state`` is returned.
        """
        if np.random.random() <= epsilon:
            return env.action_space.sample()
        return np.argmax(self._predict(state))

    def get_epsilon(self, t):
        """Return the exploration rate for step/episode index ``t``.

        The value follows ``1 - log10((t + 1) * epsilon_decay)``, clipped from
        above by the current ``self.epsilon`` and from below by ``epsilon_min``.
        """
        return max(self.epsilon_min, min(self.epsilon, 1.0 - math.log10((t + 1) * self.epsilon_decay)))

    def preprocess_state(self, state):
        """Reshape a raw observation into a single-row batch of shape ``(1, env_shape)``."""
        return np.reshape(state, [1, self.env_shape])

    def replay(self, batch_size):
        """Train the network on a random minibatch drawn from replay memory.

        Args:
            batch_size: Number of transitions to sample.

        Returns:
            ``1`` when memory holds fewer than ``batch_size`` transitions (no
            training is done in that case); otherwise ``None``.
        """
        if len(self.memory) < batch_size:
            return 1

        # The guard above guarantees that at least ``batch_size`` items exist.
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stored states are single-row batches; take row 0 of each to stack them.
        state_batch = np.array([s[0] for s in states])
        action_batch = np.array(actions)
        reward_batch = np.array(rewards)
        next_state_batch = np.array([s[0] for s in next_states])
        done_batch = np.array(dones, dtype=bool)

        with tf.device('/device:GPU:0'):
            y_target = self._predict(state_batch)
            q_next = np.max(self._predict(next_state_batch), axis = 1)

        # Bellman target: the reward alone on terminal transitions, otherwise
        # reward plus the discounted best next-state value. Only the Q-value of
        # the action actually taken is overwritten in each row.
        targets = np.where(done_batch, reward_batch, reward_batch + self.gamma * q_next)
        y_target[np.arange(len(minibatch)), action_batch] = targets

        with tf.device('/device:GPU:0'):
            self.model.fit(state_batch, y_target, batch_size = 32, epochs = 2, verbose = 0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # NOTE(review): presumably here to limit memory growth across many
        # predict/fit calls; confirm it is still needed while the model is live.
        clear_session()


In [None]:

# Training configuration.
MAX_EPISODES = 15000       # hard cap on the number of training episodes
SCORE_WINDOW = 100         # number of recent episodes averaged for the solved check
SOLVED_MEAN_SCORE = 500    # mean survival time (ticks) over the window that ends training
# NOTE(review): CartPole-v1 is registered with a 500-step episode cap and a
# reward threshold of 475, so a mean of 500 demands a perfect window. Confirm
# that this strictness is intended.

agent = DQNCartPoleSolver(batch_size = 512, dense_layers = [16,32,64,128,256])
env = gym.make('CartPole-v1')

# Survival time of the most recent SCORE_WINDOW episodes.
scores = deque(maxlen=SCORE_WINDOW)

# This loop relies on the classic gym API as used below: reset() returns the
# observation and step() returns a 4-tuple.
try:
    for e in range(MAX_EPISODES):
        state = agent.preprocess_state(env.reset())
        # Render only after reset so the environment has a valid state to draw.
        env.render()
        done = False
        i = 0
        while not done:
            with tf.device('/device:GPU:0'):
                action = agent.choose_action(state, agent.get_epsilon(e), env)
            next_state, reward, done, _ = env.step(action)
            next_state = agent.preprocess_state(next_state)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            i += 1

        scores.append(i)
        mean_score = np.mean(scores)
        if mean_score >= SOLVED_MEAN_SCORE and e >= SCORE_WINDOW:
            if not agent.quiet: print('Ran {} episodes. Solved after {} trials ✔'.format(e, e - SCORE_WINDOW))
            print(e - SCORE_WINDOW)
            break

        if e % 100 == 0 and not agent.quiet:
            print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))

        # One training pass on a random minibatch after every episode.
        agent.replay(agent.batch_size)
finally:
    # Always release the environment, even when training is interrupted or
    # raises part-way through.
    env.close()

In [None]:
# Close the environment again on its own; presumably kept as a separate cell so
# the environment can be released by hand if the training cell is interrupted.
env.close()