In [39]:
import numpy as np
import gym
import keras
import copy
import os
import time

model_name = "trained_model.h5"
if os.path.exists(model_name):
    os.remove(model_name)


class Agent():
    def __init__(self, discount_factor=1, learning_rate=0.05, min_learning_rate=0.0001, learning_rate_reduction=0.999, exploration_rate=0.8, min_exploration_rate=0.01, exploration_rate_reduction=0.95, num_episodes=1, num_steps=400, batch_size=10, num_epochs=1, num_training_sessions=100):
        self.brain = self.make_model()
        self.memory = []
        self.env = gym.make('Acrobot-v1')
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.min_learning_rate = min_learning_rate
        self.learning_rate_reduction = learning_rate_reduction
        self.exploration_rate = exploration_rate
        self.min_exloration_rate = min_exploration_rate
        self.exploration_rate_reduction = exploration_rate_reduction
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.num_training_sessions = num_training_sessions

    def play(self):
        self.memory = []
        for episode in range(self.num_episodes):
            observation = self.env.reset()
            for step in range(self.num_steps):
                # self.env.render()
                initial_observation = observation.reshape(1, -1)
                initial_q_values = self.brain.predict(initial_observation).flatten()
                action = np.argmax(initial_q_values)
                observation, reward, done, _ = self.env.step(action)
                if done:
                    q_values = self.brain.predict(observation.reshape(1, -1)).flatten()
                    target = copy.copy(initial_q_values)
                    target[action] = initial_q_values[action] + self.learning_rate * (100 - initial_q_values[action])
                    self.memory.append((initial_observation, initial_q_values, observation, q_values, target, action, reward, done))
                else:
                    q_values = self.brain.predict(observation.reshape(1, -1)).flatten()
                    target = copy.copy(initial_q_values)
                    target[action] = initial_q_values[action] + self.learning_rate * (reward + self.discount_factor * np.max(q_values) - initial_q_values[action])
                    self.memory.append((initial_observation, initial_q_values, observation, q_values, target, action, reward, done))

    def learn_from_memory(self):
        self.memory = np.asarray(self.memory)
        x = self.memory[:, 0]
        y = self.memory[:, 4]
        x = np.concatenate(x)
        y = np.concatenate(y).reshape(-1, 2)
        for epoch in range(self.num_epochs):
            self.brain.fit(x, y, batch_size=self.batch_size, shuffle=True)

    def train(self):
        min_brain_size = self.num_episodes * self.num_steps
        for training_session in range(self.num_training_sessions):
            print("This is training session number %s" % (training_session))
            print("The current exploration rate is %s" % (self.exploration_rate))
            print("The current learning rate is %s" % (self.learning_rate))
            self.play()
            if np.asarray(self.memory).size < min_brain_size:
                self.brain.save(model_name)
                min_brain_size = np.asarray(self.memory).size
            self.learn_from_memory()
            self.exploration_rate = max(self.min_exloration_rate, self.exploration_rate * self.exploration_rate_reduction)
            self.learning_rate = max(self.min_learning_rate, self.learning_rate * self.learning_rate_reduction)

    def make_model(self):
        inputs = keras.layers.Input(shape=(6,))
        x = keras.layers.Dense(256, activation='linear')(inputs)
        x = keras.layers.Dropout(0.3)(x)
        x = keras.layers.LeakyReLU(alpha=0.3)(x)
        x = keras.layers.Dense(512, activation='linear')(x)
        x = keras.layers.Dropout(0.3)(x)
        x = keras.layers.LeakyReLU(alpha=0.3)(x)
        predictions = keras.layers.Dense(2, activation='linear')(x)

        model = keras.Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model


agent = Agent(discount_factor=1, learning_rate=0.05, min_learning_rate=0.001, learning_rate_reduction=0.995, exploration_rate=0.20, min_exploration_rate=0.01, exploration_rate_reduction=0.99, num_episodes=1, num_steps=400, batch_size=10, num_epochs=1, num_training_sessions=10000)

In [57]:
agent = Agent()
agent.play()

In [58]:
agent.memory[-1]

(array([[ 0.99369662,  0.11210279,  0.94713857, -0.32082478,  0.32431479,
         -0.72334788]]),
 array([ 0.15064   , -0.02651778], dtype=float32),
 array([ 0.98605619,  0.1664127 ,  0.89841971, -0.43913782,  0.21015677,
        -0.52929944]),
 array([ 0.12800154, -0.03011816], dtype=float32),
 array([ 0.09950808, -0.02651778], dtype=float32),
 0,
 -1.0,
 False)

In [59]:
agent.memory[0][1][agent.memory[0][5]] + agent.learning_rate * (agent.memory[0][6] + agent.discount_factor * np.max(agent.memory[0][3]) - agent.memory[0][1][agent.memory[0][5]])

0.0757245372980833

In [60]:
agent.memory[0][1][agent.memory[0][5]]

0.12447051

In [61]:
agent.learning_rate

0.05

In [53]:
agent.memory[0][6]

-1.0

In [54]:
agent.discount_factor

1

In [55]:
np.max(agent.memory[0][3])

0.13653791