In [1]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import sys
import gym
import pylab
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

EPISODES = 10000


Using TensorFlow backend.


In [2]:
class A2CAgent:
    """One-step advantage actor-critic agent backed by two small Keras networks.

    The actor maps a state to a softmax distribution over ``action_size``
    discrete actions; the critic maps a state to a single scalar value
    estimate.  Both networks are updated from one transition at a time in
    ``train_model``.

    Args:
        state_size: Number of features in a state vector (network input width).
        action_size: Number of discrete actions (actor output width).
    """

    def __init__(self, state_size, action_size):
        # Rendering and checkpoint loading are switched off in this notebook;
        # uncomment the lines below (and the matching block further down) to
        # re-enable them.
        # self.render = False
        # self.load_model = False

        # Network dimensions.
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1  # the critic emits a single scalar V(s)

        # Hyper-parameters.
        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        # Policy (actor) and value (critic) networks.
        self.actor = self.build_actor()
        self.critic = self.build_critic()

        # if self.load_model:
        #     self.actor.load_weights("./save_model/cartpole_actor.h5")
        #     self.critic.load_weights("./save_model/cartpole_critic.h5")

    def build_actor(self):
        """Build and compile the policy network.

        Returns:
            A compiled Keras ``Sequential`` model: state -> action probabilities.
        """
        actor = Sequential()
        actor.add(Dense(32, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax',
                        kernel_initializer='he_uniform'))
        actor.summary()
        # train_model() fits the actor against a target row that is zero
        # everywhere except at the taken action, where it holds the advantage A.
        # Categorical crossentropy, -sum(y_true * log(y_pred)), then reduces to
        # -A * log pi(a|s), i.e. the policy-gradient loss.
        # NOTE(review): `lr` is the argument name of the Keras version this
        # notebook was run with; confirm before upgrading Keras.
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_lr))
        return actor

    def build_critic(self):
        """Build and compile the state-value network.

        Returns:
            A compiled Keras ``Sequential`` model: state -> ``value_size`` outputs.
        """
        critic = Sequential()
        critic.add(Dense(32, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        critic.summary()
        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))
        return critic

    def get_action(self, state):
        """Sample an action index from the actor's output distribution.

        Args:
            state: Batch of one state, as passed to ``self.actor.predict``
                (the training loop supplies shape ``(1, state_size)``).

        Returns:
            The sampled action index in ``[0, action_size)``.
        """
        policy = self.actor.predict(state, batch_size=1).flatten()
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def train_model(self, state, action, reward, next_state, done):
        """Update actor and critic from a single transition (called every step).

        The critic is regressed towards the one-step TD target
        ``reward + discount_factor * V(next_state)`` (just ``reward`` on a
        terminal step).  The actor is fitted against a row that holds the
        advantage ``target - V(state)`` at the taken action and zero elsewhere.

        Args:
            state: Batch of one state the action was taken in.
            action: Index of the action that was taken.
            reward: Scalar reward received for the transition.
            next_state: Batch of one successor state.
            done: True if the transition ended the episode.
        """
        # Pull V(s) out as a plain Python float.  predict() returns a 2-D
        # array; assigning a 1-element array into a single cell relies on an
        # implicit array-to-scalar conversion that recent NumPy deprecates.
        value = float(self.critic.predict(state)[0][0])

        if done:
            # Terminal step: nothing to bootstrap from, so the successor state
            # is not evaluated at all.
            td_target = reward
        else:
            next_value = float(self.critic.predict(next_state)[0][0])
            td_target = reward + self.discount_factor * next_value

        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))
        target[0][0] = td_target
        advantages[0][action] = td_target - value

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)

In [None]:
# Build the environment and read the problem dimensions from it.
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# A single actor-critic agent is trained across all episodes.
agent = A2CAgent(state_size, action_size)

# Total reward per finished episode, and the matching episode indices.
scores, episodes = [], []

for e in range(EPISODES):
    score = 0
    # The agent's networks take a batch of one state: shape (1, state_size).
    state = np.reshape(env.reset(), [1, state_size])

    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])

        # Online update from this single transition.
        agent.train_model(state, action, reward, next_state, done)

        score = score + reward
        state = next_state

    # The loop exits exactly when the episode ends, so the end-of-episode
    # bookkeeping lives here.  A final-step reward of exactly 100 is reported
    # as a landing.
    # NOTE(review): presumably 100 is the environment's landing bonus -- confirm
    # against the LunarLander reward definition.
    landed = (reward == 100)
    print("episode:", e, "  score:", score, " Land:", landed)
    scores.append(score)
    episodes.append(e)

[2017-10-10 22:50:35,213] Making new env: LunarLander-v2


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                288       
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 132       
Total params: 420
Trainable params: 420
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 32)                288       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 321
Trainable params: 321
Non-trainable params: 0
_________________________________________________________________
episode: 0   score: -521.734320134  Land: False
episode: 1   score: -151.2182860

episode: 143   score: -209.462643126  Land: False
episode: 144   score: -295.383854007  Land: False
episode: 145   score: -320.073113766  Land: False
episode: 146   score: -294.076451875  Land: False
episode: 147   score: -319.171187697  Land: False
episode: 148   score: -252.002791479  Land: False
episode: 149   score: -196.291586818  Land: False
episode: 150   score: -289.639555845  Land: False
episode: 151   score: -203.820176954  Land: False
episode: 152   score: -303.91413461  Land: False
episode: 153   score: -245.247882632  Land: False
episode: 154   score: -166.506501153  Land: False
episode: 155   score: -292.980720019  Land: False
episode: 156   score: -266.251687169  Land: False
episode: 157   score: -226.605889345  Land: False
episode: 158   score: -192.355204479  Land: False
episode: 159   score: -209.543278673  Land: False
episode: 160   score: -266.331337321  Land: False
episode: 161   score: -245.369115406  Land: False
episode: 162   score: -245.711052999  Land: False
e

episode: 308   score: -180.840345097  Land: False
episode: 309   score: -153.302580748  Land: False
episode: 310   score: -190.571751079  Land: False
episode: 311   score: -208.961145494  Land: False
episode: 312   score: -229.108198171  Land: False
episode: 313   score: -217.919994105  Land: False
episode: 314   score: -176.122777367  Land: False
episode: 315   score: -233.779128922  Land: False
episode: 316   score: -211.395754815  Land: False
episode: 317   score: -183.365740072  Land: False
episode: 318   score: -288.48306411  Land: False
episode: 319   score: -233.615757992  Land: False
episode: 320   score: -248.119748906  Land: False
episode: 321   score: -176.386648243  Land: False
episode: 322   score: -231.404599904  Land: False
episode: 323   score: -223.175305349  Land: False
episode: 324   score: -256.992508557  Land: False
episode: 325   score: -241.769277717  Land: False
episode: 326   score: -207.293171495  Land: False
episode: 327   score: -187.439945082  Land: False
e

episode: 473   score: -146.251668348  Land: False
episode: 474   score: -119.962440695  Land: False
episode: 475   score: -122.772409199  Land: False
episode: 476   score: -157.856833019  Land: False
episode: 477   score: -142.329256156  Land: False
episode: 478   score: -158.83525564  Land: False
episode: 479   score: -253.310755392  Land: False
episode: 480   score: -129.777656821  Land: False
episode: 481   score: -127.663569706  Land: False
episode: 482   score: -189.80100593  Land: False
episode: 483   score: -138.467075348  Land: False
episode: 484   score: -252.723552609  Land: False
episode: 485   score: -127.721391143  Land: False
episode: 486   score: -210.014090705  Land: False
episode: 487   score: -176.970791009  Land: False
episode: 488   score: -190.036362002  Land: False
episode: 489   score: -120.466929124  Land: False
episode: 490   score: -168.891999781  Land: False
episode: 491   score: -164.758943696  Land: False
episode: 492   score: -235.461147364  Land: False
ep

episode: 638   score: -158.823950442  Land: False
episode: 639   score: -254.41351212  Land: False
episode: 640   score: -184.999015358  Land: False
episode: 641   score: -141.618000661  Land: False
episode: 642   score: -257.959647423  Land: False
episode: 643   score: -172.98589865  Land: False
episode: 644   score: -132.888825221  Land: False
episode: 645   score: -220.067624778  Land: False
episode: 646   score: -131.956063821  Land: False
episode: 647   score: -137.320411421  Land: False
episode: 648   score: -206.186715965  Land: False
episode: 649   score: -168.642134663  Land: False
episode: 650   score: -172.660476985  Land: False
episode: 651   score: -241.888554811  Land: False
episode: 652   score: -324.760362105  Land: False
episode: 653   score: -136.352963938  Land: False
episode: 654   score: -207.669531026  Land: False
episode: 655   score: -82.6146560071  Land: False
episode: 656   score: -117.368729476  Land: False
episode: 657   score: -81.791908127  Land: False
epi

episode: 803   score: -130.627612312  Land: False
episode: 804   score: -246.659782705  Land: False
episode: 805   score: -185.731000008  Land: False
episode: 806   score: 42.3074416952  Land: False
episode: 807   score: 59.0959513719  Land: True
episode: 808   score: -107.627404616  Land: False
episode: 809   score: -78.1639232562  Land: False
episode: 810   score: -92.8569124373  Land: False
episode: 811   score: -93.6695656613  Land: False
episode: 812   score: -230.549121848  Land: False
episode: 813   score: -94.5807570106  Land: False
episode: 814   score: -136.434457076  Land: False
episode: 815   score: -149.189623114  Land: False
episode: 816   score: -120.139209932  Land: False
episode: 817   score: -95.1546847726  Land: False
episode: 818   score: -145.497595686  Land: False
episode: 819   score: -193.547010622  Land: False
episode: 820   score: -145.470939265  Land: False
episode: 821   score: -135.551589239  Land: False
episode: 822   score: -165.059196399  Land: False
epi

episode: 968   score: -148.885647259  Land: False
episode: 969   score: -166.774409277  Land: False
episode: 970   score: -108.651963227  Land: False
episode: 971   score: -151.3979682  Land: False
episode: 972   score: -164.414133667  Land: False
episode: 973   score: -175.01331689  Land: False
episode: 974   score: -210.29628633  Land: False
episode: 975   score: -149.029334453  Land: False
episode: 976   score: -151.023194758  Land: False
episode: 977   score: -172.248965981  Land: False
episode: 978   score: -100.218615234  Land: False
episode: 979   score: -130.633591811  Land: False
episode: 980   score: -136.090783149  Land: False
episode: 981   score: -91.4295043628  Land: False
episode: 982   score: -157.62166498  Land: False
episode: 983   score: -163.217143416  Land: False
episode: 984   score: -164.111596459  Land: False
episode: 985   score: -150.828724754  Land: False
episode: 986   score: -123.720939036  Land: False
episode: 987   score: -118.889410323  Land: False
episo

episode: 1130   score: -78.2227060562  Land: False
episode: 1131   score: -139.936818325  Land: False
episode: 1132   score: -98.530112139  Land: False
episode: 1133   score: -97.0092926911  Land: False
episode: 1134   score: -77.2468194381  Land: False
episode: 1135   score: -119.498431973  Land: False
episode: 1136   score: -128.2889564  Land: False
episode: 1137   score: -115.720778887  Land: False
episode: 1138   score: -130.530649347  Land: False
episode: 1139   score: -56.1724406798  Land: False
episode: 1140   score: -109.786258309  Land: False
episode: 1141   score: -73.9915599782  Land: False
episode: 1142   score: -106.726111456  Land: False
episode: 1143   score: -117.229628907  Land: False
episode: 1144   score: -179.718560255  Land: False
episode: 1145   score: -198.621952443  Land: False
episode: 1146   score: 9.31246800754  Land: True
episode: 1147   score: -13.6272151799  Land: True
episode: 1148   score: 77.6492346616  Land: True
episode: 1149   score: -138.847811601  

episode: 1292   score: -164.011504412  Land: False
episode: 1293   score: -144.143573678  Land: False
episode: 1294   score: -173.755594279  Land: False
episode: 1295   score: -122.490913807  Land: False
episode: 1296   score: -55.9903942258  Land: False
episode: 1297   score: -133.929386806  Land: False
episode: 1298   score: -97.8981319744  Land: False
episode: 1299   score: -97.4020449053  Land: False
episode: 1300   score: -110.679850638  Land: False
episode: 1301   score: -140.134475262  Land: False
episode: 1302   score: -110.103840116  Land: False
episode: 1303   score: -113.628705469  Land: False
episode: 1304   score: -110.737313025  Land: False
episode: 1305   score: -139.492668394  Land: False
episode: 1306   score: -88.9153316736  Land: False
episode: 1307   score: -106.743628973  Land: False
episode: 1308   score: -137.560453806  Land: False
episode: 1309   score: -105.873266165  Land: False
episode: 1310   score: -130.847709655  Land: False
episode: 1311   score: -115.514

episode: 1455   score: 120.321674901  Land: True
episode: 1456   score: 167.683689519  Land: True
episode: 1457   score: 81.8363054698  Land: True
episode: 1458   score: -144.556781254  Land: False
episode: 1459   score: -153.266157956  Land: False
episode: 1460   score: 93.9276189869  Land: True
episode: 1461   score: -116.841524355  Land: False
episode: 1462   score: -111.421918202  Land: False
episode: 1463   score: -93.5626836498  Land: False
episode: 1464   score: 4.08524661351  Land: True
episode: 1465   score: -131.870229901  Land: False
episode: 1466   score: 23.8357774693  Land: True
episode: 1467   score: -7.98381391312  Land: False
episode: 1468   score: 62.9360373919  Land: True
episode: 1469   score: 39.0207064588  Land: True
episode: 1470   score: -93.3157330254  Land: False
episode: 1471   score: 32.6365175182  Land: False
episode: 1472   score: 25.6157933628  Land: True
episode: 1473   score: -167.260347993  Land: False
episode: 1474   score: 131.449240318  Land: True
e

episode: 1618   score: -123.493409611  Land: False
episode: 1619   score: 143.372623103  Land: True
episode: 1620   score: -112.596238378  Land: False
episode: 1621   score: 172.757787994  Land: True
episode: 1622   score: -144.703999366  Land: False
episode: 1623   score: -150.533393547  Land: False
episode: 1624   score: -146.372044939  Land: False
episode: 1625   score: -87.4790881758  Land: False
episode: 1626   score: -64.5706720256  Land: False
episode: 1627   score: -131.887626135  Land: False
episode: 1628   score: -215.112682462  Land: False
episode: 1629   score: -177.856036517  Land: False
episode: 1630   score: -127.772786857  Land: False
episode: 1631   score: -105.937563271  Land: False
episode: 1632   score: -90.8415839871  Land: False
episode: 1633   score: 90.670509368  Land: True
episode: 1634   score: 146.655761471  Land: True
episode: 1635   score: -81.7018936777  Land: False
episode: 1636   score: 136.875050604  Land: True
episode: 1637   score: -68.7096561147  Lan

episode: 1784   score: 92.273171823  Land: True
episode: 1785   score: 120.770906953  Land: True
episode: 1786   score: 139.402271762  Land: True
episode: 1787   score: 126.741268728  Land: True
episode: 1788   score: -96.0716856758  Land: False
episode: 1789   score: 153.043515851  Land: True
episode: 1790   score: 117.611081784  Land: True
episode: 1791   score: 126.4511195  Land: True
episode: 1792   score: 180.477566954  Land: True
episode: 1793   score: 68.7677104325  Land: True
episode: 1794   score: -126.874159326  Land: False
episode: 1795   score: -150.414849622  Land: False
episode: 1796   score: 134.234860994  Land: True
episode: 1797   score: 143.110780658  Land: True
episode: 1798   score: 118.994111008  Land: True
episode: 1799   score: 158.167745695  Land: True
episode: 1800   score: 79.0612380693  Land: True
episode: 1801   score: -113.967123631  Land: False
episode: 1802   score: -109.241745515  Land: False
episode: 1803   score: 159.703442428  Land: True
episode: 1804