<a href="https://colab.research.google.com/github/dwdb/deep-learning/blob/master/reinforcement-learning/cart_pole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import gym
import tensorflow as tf
import numpy as np

!nvidia-smi

Wed Jul  1 11:35:42 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [36]:
class RLCartPole(object):
    """Policy-gradient (REINFORCE-style) agent for gym's ``CartPole-v0``.

    A small softmax policy network samples actions. After every episode the
    gradient of ``-log(p(action | state)) * discounted_return`` is added to a
    buffer, and the buffered gradients are applied with Adam once per batch
    of episodes.
    """

    def __init__(self):
        self.state_size = 4     # input width of the policy network
        self.hidden_size = 8    # units in each hidden layer
        self.action_size = 2    # number of discrete actions / softmax outputs
        self.agent = self.create_agent()

        self.env = gym.make('CartPole-v0')

        self.optimizer = tf.keras.optimizers.Adam(0.001)

    def discount_rewards(self, rewards, gamma=0.99):
        """Return the discounted return-to-go for every step of an episode.

        ``result[i] = rewards[i] + gamma * result[i + 1]`` with an implicit
        ``result[len(rewards)] = 0``.

        Args:
            rewards: 1-D sequence of per-step rewards (ints or floats).
            gamma: discount factor applied per step.

        Returns:
            A float64 ``np.ndarray`` with one entry per reward.
        """
        # Force a float buffer: ``np.zeros_like`` on integer rewards would
        # produce an integer array and silently truncate every return.
        rewards = np.asarray(rewards, dtype=np.float64)
        discounted = np.zeros_like(rewards)
        running_add = 0.
        for i in range(discounted.size - 1, -1, -1):
            running_add = running_add * gamma + rewards[i]
            discounted[i] = running_add
        return discounted

    def create_agent(self):
        """Build the policy network: two ReLU hidden layers and a softmax head."""
        agent = tf.keras.Sequential([
            tf.keras.layers.Dense(
                self.hidden_size, input_shape=(self.state_size,), activation='relu'),
            tf.keras.layers.Dense(self.hidden_size, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='softmax'),
        ])
        return agent

    def train(self, episodes=5000, max_ep=999, update_every=5):
        """Train the policy and return the per-episode total rewards.

        Every 100 episodes the mean total reward of the last (up to) 100
        episodes is printed.

        Args:
            episodes: number of episodes to run.
            max_ep: maximum number of environment steps per episode.
            update_every: number of episodes whose gradients are summed
                before one optimizer step. Gradients of a trailing partial
                batch (when ``episodes`` is not a multiple of
                ``update_every``) are discarded.

        Returns:
            A list with the undiscounted total reward of each episode.

        Raises:
            ValueError: if ``update_every`` is smaller than 1.
        """
        if update_every < 1:
            raise ValueError('update_every must be >= 1')

        total_reward = []
        grad_buf = self._zero_gradients()
        for episode in range(episodes):
            states, actions, rewards = self._run_episode(max_ep)

            # An episode with no steps (max_ep <= 0) has nothing to learn from.
            if states:
                returns = self.discount_rewards(rewards)
                grads = self.calculate_gradient(
                    np.array(states), np.array(actions), returns)
                grad_buf = [buf + grad for buf, grad in zip(grad_buf, grads)]

            # Count completed episodes (episode + 1) so that every batch,
            # including the first one, holds exactly ``update_every`` episodes.
            if (episode + 1) % update_every == 0:
                self.optimizer.apply_gradients(
                    zip(grad_buf, self.agent.trainable_variables))
                grad_buf = self._zero_gradients()

            total_reward.append(sum(rewards, 0.))
            if episode % 100 == 0 and episode:
                print(np.mean(total_reward[-100:]))
        return total_reward

    def calculate_gradient(self, state, action, reward):
        """Return policy-gradient gradients for one episode.

        Args:
            state: array of visited states, one row per step.
            action: integer array with the action taken at each step.
            reward: array with the discounted return of each step.

        Returns:
            A list of gradients, one per entry of
            ``self.agent.trainable_variables``.
        """
        with tf.GradientTape() as tape:
            action_dist = self.agent(state)
            # Flatten the per-step probabilities and pick, for each step,
            # the probability of the action that was actually taken.
            # The casts make the index / return dtypes match the tensors
            # they are combined with instead of relying on implicit
            # conversion of the numpy inputs.
            indices = (tf.range(tf.shape(state)[0]) * self.action_size
                       + tf.cast(action, tf.int32))
            chosen_prob = tf.gather(tf.reshape(action_dist, [-1]), indices)
            loss = -tf.math.log(chosen_prob) * tf.cast(reward, chosen_prob.dtype)
        # ``loss`` is a per-step vector; per the TensorFlow docs,
        # ``tape.gradient`` differentiates the sum of a non-scalar target.
        grads = tape.gradient(loss, self.agent.trainable_variables)
        return grads

    def _zero_gradients(self):
        """Return a fresh zero tensor for each trainable variable."""
        return [tf.zeros_like(var) for var in self.agent.trainable_variables]

    def _reset_env(self):
        """Reset the environment and return only the initial observation."""
        out = self.env.reset()
        # gym >= 0.26 returns (observation, info); older versions return the
        # observation alone.
        if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
            return out[0]
        return out

    def _step_env(self, action):
        """Step the environment and return ``(next_state, reward, done)``."""
        result = self.env.step(action)
        if len(result) == 5:
            # gym >= 0.26: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, terminated or truncated
        next_state, reward, done, _ = result
        return next_state, reward, done

    def _run_episode(self, max_ep):
        """Roll out one episode by sampling from the current policy.

        Returns three parallel lists: states, actions and rewards, with one
        entry per step taken (at most ``max_ep`` steps).
        """
        states, actions, rewards = [], [], []
        state = self._reset_env()
        for _ in range(max_ep):
            action_dist = self.agent(np.expand_dims(state, axis=0))
            action = np.random.choice(
                np.arange(self.action_size), p=action_dist.numpy()[0])
            next_state, reward, done = self._step_env(action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state
            if done:
                break
        return states, actions, rewards

# Build the agent (policy network, CartPole-v0 environment, Adam optimizer)
# and train it with the default settings. train() prints the mean total
# reward of the most recent 100 episodes every 100 episodes.
model = RLCartPole()
model.train()

24.87
28.82
27.17
28.52
32.72
33.67
39.5
40.65
47.31
52.67
67.76
81.69
103.96
121.27
154.53
166.3
175.29
186.08
185.47
191.43
191.33
178.27
185.62
184.68
183.05
189.64
197.32
195.95
194.93
196.38
190.04
196.3
197.32
193.51
198.21
199.36
197.14
199.92
199.74
197.85
198.74
197.93
198.22
196.73
197.47
199.7
198.89
200.0
199.3
