In [1]:
!pip install --upgrade pip
!pip install 'gym[atari]'

Collecting pip
  Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/43/84/23ed6a1796480a6f1a2d38f2802901d078266bda38388954d01d3f2e821d/pip-20.1.1-py2.py3-none-any.whl (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 66.3MB/s ta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 9.0.1
    Uninstalling pip-9.0.1:
      Successfully uninstalled pip-9.0.1
Successfully installed pip-20.1.1
[33mYou are using pip version 20.1.1, however version 20.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple
Collecting atari_py~=0.2.0
  Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/58/45/c2f6523aed89db6672b241fa1aafcfa54126c564be769c1360d298f03852/atari_py-0.2.6-cp36-cp36m-manylinux1_x86_64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 23.1 MB/s eta 0:00:01
Installing collected packages: at

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import gym

In [2]:
## config
# --- Environment -----------------------------------------------------------
env_name = 'SpaceInvaders-v0'
env = gym.make(env_name)
# The three observation axes are multiplied together to size the flattened
# network input.
# NOTE(review): image observations are presumably (height, width, channels),
# so the names `w` and `h` may be swapped — the product is unaffected.
w, h, d = env.observation_space.shape
state_num = w * h * d

# --- Training hyper-parameters ---------------------------------------------
end_game_reward = -100   # reward substituted when an episode ends before the step limit
hidden_layers = [12, 12]  # units of each hidden dense layer, in order
gamma = 0.99             # discount factor applied to future rewards
learning_rate = 0.0001   # step size handed to the Adam optimizer
internal = 100           # not referenced by the visible cells — presumably a logging interval; TODO confirm

In [3]:
class PolicyGradient:
    """TensorFlow 1.x graph for a feed-forward softmax policy.

    Attributes created on the instance:
        states:      float32 placeholder of shape (None, state_size).
        acc_r:       float32 placeholder for the per-step return weights.
        actions:     int32 placeholder for the indices of the actions taken.
        last_layer:  output of the final dense layer, used as logits.
        action_prob: softmax over ``last_layer``.
        log_policy:  sparse softmax cross-entropy between ``last_layer`` and
                     ``actions`` (i.e. the negative log-probability of each
                     taken action, despite the attribute name).
        cost:        mean of ``acc_r * log_policy``.
        optimizer:   Adam training op minimising ``cost``.
    """

    def __init__(self, state_size, num_of_actions, hidden_layers, learning_rate):
        """Build the graph.

        Args:
            state_size: length of one flattened state vector.
            num_of_actions: number of discrete actions (units of the output layer).
            hidden_layers: sequence with the unit count of each hidden layer.
            learning_rate: learning rate passed to ``tf.train.AdamOptimizer``.
        """
        # --- graph inputs ---
        self.states = tf.placeholder(tf.float32, shape=(None, state_size), name='input_states')
        self.acc_r = tf.placeholder(tf.float32, shape=None, name='accumalated_rewards')
        self.actions = tf.placeholder(tf.int32, shape=None, name='actions')

        # --- hidden stack: ReLU dense layers named hidden_layer_1..N ---
        net = self.states
        for index, width in enumerate(hidden_layers, start=1):
            net = tf.layers.dense(
                inputs=net,
                units=width,
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name='hidden_layer_{}'.format(index),
            )

        # --- output layer ---
        # NOTE(review): tanh bounds the logits to [-1, 1], which limits how
        # peaked the softmax below can become — confirm this is intended.
        self.last_layer = tf.layers.dense(
            inputs=net,
            units=num_of_actions,
            activation=tf.nn.tanh,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='output',
        )
        self.action_prob = tf.nn.softmax(self.last_layer)

        # --- loss and training op ---
        self.log_policy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.actions, logits=self.last_layer)
        weighted_terms = self.acc_r * self.log_policy
        self.cost = tf.reduce_mean(weighted_terms)
        adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.optimizer = adam.minimize(self.cost)


In [4]:
# Build the policy network: the input is sized by the flattened observation
# length and the output by the environment's discrete action count.
pg = PolicyGradient(
    state_size=state_num,
    num_of_actions=env.action_space.n,
    hidden_layers=hidden_layers,
    learning_rate=learning_rate,
)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [None]:
from scipy.stats import zscore

# `print_stuff` is not defined by any visible cell. Provide a plain-print
# fallback so this cell runs on a fresh kernel, while still honouring a helper
# that an earlier cell may already have defined.
if 'print_stuff' not in globals():
    def print_stuff(message):
        """Print ``message``; fallback used only when no ``print_stuff`` exists yet."""
        print(message)


def _discounted_returns(rewards, gamma):
    """Compute standardised discounted returns for one episode.

    Args:
        rewards: sequence with the reward received at each step.
        gamma: discount factor applied to later rewards.

    Returns:
        float64 array of the same length where element ``i`` is
        ``sum_k gamma**k * rewards[i + k]``, z-scored over the episode.
        When every return is identical (zero variance) the returns are only
        mean-centred, because z-scoring would divide by zero and yield NaN.
    """
    # Always accumulate in float64: inheriting the dtype of an all-integer
    # reward list would truncate the discounted values.
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = running * gamma + rewards[i]
        returns[i] = running
    if returns.std() > 0:
        return zscore(returns)
    return returns - returns.mean()


NUM_GAMES = 1500  # number of episodes to play and train on

sess = tf.Session()
sess.run(tf.global_variables_initializer())
history = []  # one record per finished game; converted to a DataFrame once at the end

try:
    for g in range(NUM_GAMES):
        game = g + 1
        done = False
        ## init env
        observation = env.reset()
        states = []
        rewards = []
        actions = []
        steps = 0
        print_stuff('Starting game {}'.format(game))
        while not done:
            steps += 1
            # Flatten the observation into a single-row batch for the network.
            observation = observation.flatten()[np.newaxis, :]

            probs = sess.run(pg.action_prob, feed_dict={pg.states: observation}).flatten()
            # Renormalise in float64: float32 softmax output can miss summing to 1
            # by more than np.random.choice tolerates.
            probs = probs.astype(np.float64)
            probs /= probs.sum()
            # choose the action
            action = np.random.choice(env.action_space.n, p=probs)
            ## According to the action, give the next state,reward and whether game over
            next_state, r, done, _ = env.step(action)
            # An episode that ends before the step limit gets the end-of-game reward instead.
            if done and steps < env._max_episode_steps:
                r = end_game_reward

            # Save to memory:
            states.append(observation)
            rewards.append(r)
            actions.append(action)
            observation = next_state
        print_stuff('Game {g} has ended after {s} steps.'.format(g=game, s=steps))

        discounted_acc_rewards = _discounted_returns(rewards, gamma)

        # Stack the (1, state_size) rows into (steps, state_size). Unlike
        # np.squeeze, this keeps the batch axis for a one-step episode.
        batch_states = np.vstack(states)
        batch_states, discounted_acc_rewards, actions = shuffle(
            batch_states, discounted_acc_rewards, actions)
        c, _ = sess.run([pg.cost, pg.optimizer], feed_dict={pg.states: batch_states,
                                                            pg.acc_r: discounted_acc_rewards,
                                                            pg.actions: actions})

        print_stuff('Cost: {}\n----------'.format(c))
        history.append({'game': game, 'steps': steps, 'cost': c})
finally:
    # Build the frame once (row-wise DataFrame.append is quadratic and no longer
    # exists in pandas 2.x); `finally` keeps partial results if training is interrupted.
    data = pd.DataFrame(history, columns=['game', 'steps', 'cost'])

In [None]:
from gym import wrappers

# Record one episode driven by actions sampled uniformly from the action space
# (this cell does not query the trained policy). Results go to ./gym-results;
# force=True is passed to the Monitor wrapper.
env = wrappers.Monitor(env, "./gym-results", force=True)
try:
    env.reset()
    for _ in range(5000):  # hard cap on the number of recorded steps
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            break
finally:
    # Always close the monitored env, even if stepping raised, so the
    # recording is not left open.
    env.close()

In [None]:
import io
import base64
from IPython.display import HTML

# Embed the recorded episode inline as a base64 data URI.
video_path = './gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix
# Read-only binary mode inside a context manager: the file is only read, and
# the handle is released deterministically instead of being left open.
with io.open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))