In [4]:
import numpy as np
import gym
import keras
import copy
import os


class Agent():
    def __init__(self, discount_factor=1, learning_rate=0.0001, exploration_rate=0.3, min_exploration_rate=0.05, exploration_rate_reduction=0.1):
        self.brain = self.make_model()
        self.memory = []
        self.env = gym.make('Acrobot-v1')
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.exploration_rate = exploration_rate
        self.min_exloration_rate = min_exploration_rate
        self.exploration_rate_reduction = exploration_rate_reduction

    def play(self, num_episodes=10, num_steps=30):
        self.memory = []
        for episode in range(num_episodes):
            observation = self.env.reset()
            for step in range(num_steps):
                # self.env.render()
                initial_observation = observation.reshape(1, -1)
                initial_q_values = self.brain.predict(initial_observation).flatten()
                action = np.argmax(initial_q_values)
                observation, reward, done, _ = self.env.step(action)
                q_values = self.brain.predict(observation.reshape(1, -1)).flatten()
                target = copy.copy(initial_q_values)
                target[action] = initial_q_values[action] + self.learning_rate * (initial_q_values[action] - (reward + self.discount_factor * np.max(q_values)))
                self.memory.append((initial_observation, initial_q_values, observation, q_values, target, action, reward, done))
                if done:
                    break

    def learn_from_memory(self, batch_size=10, num_epochs=1):
        self.memory = np.asarray(self.memory)
        x = self.memory[:, 0]
        y = self.memory[:, 4]
        x = np.concatenate(x)
        y = np.concatenate(y).reshape(-1, 2)
        for epoch in range(num_epochs):
            self.brain.fit(x, y, batch_size=batch_size, shuffle=True)

    def train(self, num_training_sessions=2000):
        min_brain_size = np.asarray(self.memory).size
        for training_session in range(num_training_sessions):
            print("This is training session number %s" % (training_session))
            self.play()
            if np.asarray(self.memory).size < min_brain_size:
                self.brain.save(model_name)
                min_brain_size = np.asarray(self.memory).size
            self.learn_from_memory()
            self.exploration_rate = max(self.min_exloration_rate, self.exploration_rate * self.exploration_rate_reduction)

    def make_model(self):
        inputs = keras.layers.Input(shape=(6,))
        x = keras.layers.Dense(64, activation='linear')(inputs)
        x = keras.layers.LeakyReLU(alpha=0.3)(x)
        x = keras.layers.Dense(128, activation='linear')(x)
        x = keras.layers.LeakyReLU(alpha=0.3)(x)
        predictions = keras.layers.Dense(2, activation='linear')(x)

        model = keras.Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

In [12]:
agent = Agent(exploration_rate=0, min_exploration_rate=0)
agent.play(num_episodes=1, num_steps=1)

In [13]:
agent.memory

[(array([[ 9.99999996e-01, -8.91522615e-05,  9.96388998e-01,
           8.49056255e-02,  2.41670091e-02, -3.23217256e-02]]),
  array([-0.01841734, -0.14496998], dtype=float32),
  array([ 0.99981334,  0.01932064,  0.99935511,  0.03590777,  0.16433238,
         -0.44685902]),
  array([ 0.06218744, -0.12766051], dtype=float32),
  array([-0.0183254 , -0.14496998], dtype=float32),
  0,
  -1.0,
  False)]