In [235]:
import gym
import keras
import numpy as np
import random
import copy


class Agent():
    def __init__(self):
        self.env = gym.make('CartPole-v0')
        self.brain = self.create_brain()
        self.memory = []
        self.exploration_rate = 0.05
        self.learning_rate = 1
        self.discount_factor = 0.9

    def create_brain(self):
        inputs = keras.layers.Input(shape=(4,))
        x = keras.layers.Dense(16, activation='relu')(inputs)
        x = keras.layers.Dense(32, activation='relu')(x)
        predictions = keras.layers.Dense(2, activation='linear')(x)
        model = keras.Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer='rmsprop', loss='mean_squared_error')
        return model

    def play(self, num_episodes=100, num_time_steps=100):
        self.memory = []
        for episode in range(num_episodes):
            observation = self.env.reset()
            for t in range(num_time_steps):
                initial_observation = observation.reshape(1, -1)
                initial_q_values = self.brain.predict(initial_observation).flatten()
                if np.random.rand() < self.exploration_rate:
                    action = self.env.action_space.sample()
                else:
                    action = np.argmax(initial_q_values)
                observation, reward, done, _ = self.env.step(action)
                observation = observation.reshape(1, -1)
                q_values = self.brain.predict(observation).flatten()
                target = copy.copy(initial_q_values)
                target[action] = initial_q_values[action] + self.learning_rate * (initial_q_values[action] - (reward + self.discount_factor * np.max(q_values)))
                self.memory.append((initial_observation, initial_q_values, observation, q_values, target, action, reward, done))
                if done:
                    break

    def learn(self, batch_size=10, num_epochs=10):
        self.memory = np.asarray(self.memory)
        x = self.memory[:, 0]
        y = self.memory[:, 4]

agent = Agent()
agent.play()
agent.learn()

In [236]:
agent.memory[0]

array([array([[ 0.00248855, -0.02750133, -0.03640334,  0.02987755]]),
       array([-0.0134726 , -0.01946324], dtype=float32),
       array([[ 0.00193852, -0.22208284, -0.03580579,  0.31085622]]),
       array([-0.10143878, -0.09219329], dtype=float32),
       array([-0.9439712 , -0.01946324], dtype=float32), 0, 1.0, False],
      dtype=object)

In [237]:
batch = np.asarray([random.choice(agent.memory) for i in range(10)])

In [238]:
batch

array([[array([[ 0.01204191,  0.04026624, -0.04698397, -0.06987237]]),
        array([-0.00095788,  0.01038291], dtype=float32),
        array([[ 0.01284724,  0.2360292 , -0.04838142, -0.37700096]]),
        array([0.02152566, 0.01209041], dtype=float32),
        array([-9.5788436e-04, -9.9860728e-01], dtype=float32), 1, 1.0,
        False],
       [array([[-0.0402312 ,  0.14414843,  0.04560947, -0.24532534]]),
        array([0.02269199, 0.00392988], dtype=float32),
        array([[-0.03734823, -0.05159427,  0.04070297,  0.06138791]]),
        array([-0.01060719, -0.00468279], dtype=float32),
        array([-0.95040154,  0.00392988], dtype=float32), 0, 1.0, False],
       [array([[ 0.06075134,  0.61397077, -0.02661067, -0.87533249]]),
        array([0.04002709, 0.0315402 ], dtype=float32),
        array([[ 0.07303075,  0.41922048, -0.04411732, -0.59113308]]),
        array([0.01568058, 0.03282335], dtype=float32),
        array([-0.94948685,  0.0315402 ], dtype=float32), 0, 1.0, False]

In [279]:
x = batch[:, 0]
y = batch[:, 4]

In [280]:
x

array([array([[ 0.01204191,  0.04026624, -0.04698397, -0.06987237]]),
       array([[-0.0402312 ,  0.14414843,  0.04560947, -0.24532534]]),
       array([[ 0.06075134,  0.61397077, -0.02661067, -0.87533249]]),
       array([[ 0.03579153,  1.11315913,  0.0043516 , -1.35314073]]),
       array([[-0.19357547, -0.06090832,  0.15923417,  0.40733878]]),
       array([[-0.03516774,  0.02438343,  0.04809613,  0.09028284]]),
       array([[ 0.03236193, -0.02054122,  0.01924193, -0.04819359]]),
       array([[-0.05022685, -0.15239663,  0.02672281,  0.34705042]]),
       array([[-0.03310266,  0.13969825,  0.03698646, -0.14697833]]),
       array([[ 0.04002864, -0.75914321, -0.09569033,  0.75912926]])],
      dtype=object)

In [281]:
x.shape

(10,)

In [282]:
y

array([array([-9.5788436e-04, -9.9860728e-01], dtype=float32),
       array([-0.95040154,  0.00392988], dtype=float32),
       array([-0.94948685,  0.0315402 ], dtype=float32),
       array([ 0.03227581, -0.9593405 ], dtype=float32),
       array([-0.98187417, -0.04504369], dtype=float32),
       array([-0.00456514, -1.0134563 ], dtype=float32),
       array([-0.9226288,  0.0020185], dtype=float32),
       array([-0.08040256, -1.141401  ], dtype=float32),
       array([-0.9588132 , -0.00313429], dtype=float32),
       array([-0.25801724, -1.3761474 ], dtype=float32)], dtype=object)

In [283]:
y.shape

(10,)

In [284]:
x = np.concatenate(x)
y = np.concatenate(y, axis=0)

AxisError: axis 1 is out of bounds for array of dimension 1

In [285]:
x.shape

(10, 4)

In [286]:
y.shape

(10,)

In [287]:
x[0].shape

(4,)

In [288]:
y.shape

(10,)

In [289]:
agent.brain.fit(x, y)

ValueError: Error when checking target: expected dense_81 to have shape (2,) but got array with shape (1,)