In [97]:
class Agent():
    """Q-learning agent for the gym 'CartPole-v0' environment.

    The agent owns the environment (`env`), a Keras network that maps a
    4-value observation to 2 Q-values (`brain`), and a list of recorded
    transitions (`memory`) that `play` fills in.
    """

    def __init__(self):
        self.env = gym.make('CartPole-v0')
        self.brain = self.create_brain()
        self.memory = []
        # Probability of taking a random action instead of the greedy one.
        self.exploration_rate = 0.3
        # NOTE: attribute name is misspelled ("exloration"); kept unchanged so
        # any existing reference to it keeps working.
        self.min_exloration_rate = 0.01
        self.exploration_rate_reduction = 0.8
        # Step size used when moving a Q-value towards its TD target.
        self.learning_rate = 0.001
        self.discount_factor = 1
        # Value used as the TD target for the step that ends an episode.
        self.losing_penalty = -10

    # Create the model
    def create_brain(self):
        """Build and compile the Q-network.

        Architecture: Input(4) -> Dense(128) -> Dropout(0.3) -> LeakyReLU
        -> Dense(256) -> Dropout(0.3) -> LeakyReLU -> Dense(2, linear).
        Compiled with the Adam optimizer and mean-squared-error loss.

        Returns:
            The compiled `keras.Model`.
        """
        inputs = keras.layers.Input(shape=(4,))
        x = keras.layers.Dense(128, activation='linear')(inputs)
        x = keras.layers.Dropout(0.3)(x)
        x = keras.layers.LeakyReLU(alpha=0.3)(x)
        x = keras.layers.Dense(256, activation='linear')(x)
        x = keras.layers.Dropout(0.3)(x)
        x = keras.layers.LeakyReLU(alpha=0.3)(x)
        predictions = keras.layers.Dense(2, activation='linear')(x)
        model = keras.Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    def _td_targets(self, initial_q_values, q_values, action, reward, done):
        """Return the `(target, target_test)` Q-vectors for one transition.

        Both are copies of `initial_q_values` in which only the entry for
        `action` is changed:

        * `target` moves that entry towards the TD target by `learning_rate`.
        * `target_test` moves it by the same amount in the opposite direction,
          i.e. it is the mirror image of `target` around the current Q-value.

        The TD target is `losing_penalty` on the step that ends the episode,
        otherwise `reward + discount_factor * max(q_values)`.
        """
        current = initial_q_values[action]
        if done:
            td_target = self.losing_penalty
        else:
            td_target = reward + self.discount_factor * np.max(q_values)

        target = initial_q_values.copy()
        target_test = initial_q_values.copy()
        target[action] = current + self.learning_rate * (td_target - current)
        # Same magnitude, opposite sign. The terminal case previously used
        # (current + losing_penalty), which is not the mirror of `target`.
        target_test[action] = current + self.learning_rate * (current - td_target)
        return target, target_test

    # Navigate through the environment and collect training data in memory
    def play(self, num_episodes=10, num_time_steps=250):
        """Run episodes with an epsilon-greedy policy and record every step.

        `self.memory` is cleared first. For each step one tuple is appended:
        `(initial_observation, initial_q_values, q_values, target,
        target_test, action, reward, done)`, where `initial_*` refer to the
        state before the action and `q_values` to the state after it.

        Args:
            num_episodes: Number of episodes to run.
            num_time_steps: Maximum number of steps per episode; an episode
                also stops as soon as the environment reports `done`.
        """
        self.memory = []
        for _episode in range(num_episodes):
            observation = self.env.reset()
            for _step in range(num_time_steps):
                initial_observation = observation.reshape(1, -1)
                initial_q_values = self.brain.predict(initial_observation).flatten()

                # Epsilon-greedy action selection.
                if np.random.rand() < self.exploration_rate:
                    action = self.env.action_space.sample()
                else:
                    action = np.argmax(initial_q_values)

                observation, reward, done, _info = self.env.step(action)
                observation = observation.reshape(1, -1)
                q_values = self.brain.predict(observation).flatten()

                target, target_test = self._td_targets(
                    initial_q_values, q_values, action, reward, done
                )
                self.memory.append(
                    (initial_observation, initial_q_values, q_values,
                     target, target_test, action, reward, done)
                )
                if done:
                    break

In [98]:
# Build an agent: this creates the CartPole-v0 environment and a freshly
# compiled (not yet fitted) Q-network.
agent = Agent()

In [114]:
# Roll out one episode (this resets agent.memory), then show the first
# recorded transition. Tuple layout:
# (initial_observation, initial_q_values, q_values, target, target_test,
#  action, reward, done)
agent.play(num_episodes=1)
agent.memory[0]

(array([[-0.04805415, -0.03720153,  0.02675557,  0.01893391]]),
 array([0.00476557, 0.00448998], dtype=float32),
 array([0.00776992, 0.04238249], dtype=float32),
 array([0.00580319, 0.00448998], dtype=float32),
 array([0.00372795, 0.00448998], dtype=float32),
 0,
 1.0,
 False)

In [115]:
# Recompute by hand the `target` entry for the action taken in the first
# transition: Q + learning_rate * (TD target - Q). The literal 1 matches the
# reward shown for this step.
first_step = agent.memory[0]
q_before, q_after, taken = first_step[1], first_step[2], first_step[5]
x = q_before[taken] + agent.learning_rate * (1 + np.max(q_after) - q_before[taken])
x

0.005803188482765108

In [116]:
# Recompute by hand the `target_test` entry for the same transition:
# Q + learning_rate * (Q - TD target), i.e. the step with its sign flipped.
first_step = agent.memory[0]
q_before, q_after, taken = first_step[1], first_step[2], first_step[5]
y = q_before[taken] + agent.learning_rate * (q_before[taken] - (1 + np.max(q_after)))
y

0.003727954638656229

In [117]:
# Gap between the two hand-computed values (twice the size of one step).
x - y

0.0020752338441088793

In [118]:
# Signed step from the current Q-value of the taken action to `x`.
q_taken = agent.memory[0][1][agent.memory[0][5]]
x - q_taken

0.0010376169220544396

In [119]:
# Signed step from the current Q-value of the taken action to `y`.
q_taken = agent.memory[0][1][agent.memory[0][5]]
y - q_taken

-0.0010376169220544396