Reference: [Policy gradients (quant.am, 2017-08-07)](http://quant.am/cs/2017/08/07/policy-gradients/)

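### State

| Index | Observation          | Min     | Max    |
|-------|----------------------|---------|--------|
| 0     | Cart Position        | -4.8    | 4.8    |
| 1     | Cart Velocity        | -Inf    | Inf    |
| 2     | Pole Angle           | -24 deg | 24 deg |
| 3     | Pole Velocity At Tip | -Inf    | Inf    |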

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
import numpy as np
import scipy.stats
import gym
import time

In [2]:
class LinearSoftmaxAgent(object):
    """Monte-Carlo policy-gradient agent with a linear softmax policy.

    The policy is ``pi(a | s) ~ exp(theta . phi(s, a) / temperature)``, where
    ``phi(s, a)`` is a vector made of ``action_size`` blocks of length
    ``state_size``: block ``a`` holds the state ``s`` and every other block is
    zero.

    Transitions of one episode are buffered with :meth:`store`; :meth:`train`
    then applies one gradient-ascent step per buffered transition and clears
    the buffers.

    Args:
        state_size: Length of a state vector.
        action_size: Number of discrete actions.
        alpha: Learning rate of the gradient-ascent update.
        gamma: Discount factor used for the returns.
        temperature: Divisor applied to the linear scores before the softmax.
    """

    def __init__(self, state_size, action_size, alpha=.01, gamma=.99,
                 temperature=100.0):
        self.state_size = state_size
        self.action_size = action_size
        # Per-episode buffers, filled by store() and emptied by train().
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []
        # One weight per (action, state-component) pair, flattened.
        self.theta = np.random.random(state_size * action_size)
        self.alpha = alpha
        self.gamma = gamma
        self.temperature = temperature

    def store(self, state, action, prob, reward):
        """Append one transition to the episode buffers."""
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(prob)
        self.rewards.append(reward)

    def _phi(self, s, a):
        """Return the feature vector phi(s, a): ``s`` placed in block ``a``."""
        encoded = np.zeros([self.action_size, self.state_size])
        encoded[a] = s
        return encoded.flatten()

    def _softmax(self, s, a):
        """Return the unnormalized softmax weight of action ``a`` in ``s``.

        Kept for backward compatibility; :meth:`pi` normalizes the scores in
        a numerically stable way instead of calling this directly.
        """
        return np.exp(self.theta.dot(self._phi(s, a)) / self.temperature)

    def pi(self, s):
        """Return the array of action probabilities pi(a | s)."""
        logits = np.array(
            [self.theta.dot(self._phi(s, a)) for a in range(self.action_size)],
            dtype=float,
        ) / self.temperature
        # Subtracting the maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing to inf (which would give NaN probabilities).
        logits -= np.max(logits)
        weights = np.exp(logits)
        return weights / np.sum(weights)

    def act(self, state):
        """Sample an action from pi(. | state).

        Returns:
            Tuple ``(action, probability_of_that_action)``.
        """
        probs = self.pi(state)
        a = random.choices(range(0, self.action_size), weights=probs)[0]
        return (a, probs[a])

    def _gradient(self, s, a):
        """Return phi(s, a) minus the policy-weighted mean of phi(s, .).

        For a softmax over linear scores this is the gradient of
        ``log pi(a | s)`` with respect to ``theta``, up to the constant
        ``1 / temperature`` factor (which the original code also omits).
        """
        probs = self.pi(s)
        expected = np.zeros(self.state_size * self.action_size)
        for b in range(0, self.action_size):
            expected += probs[b] * self._phi(s, b)
        return self._phi(s, a) - expected

    def _R(self, t):
        """Return the discounted sum of buffered rewards from step ``t`` on."""
        total = 0
        for tau in range(t, len(self.rewards)):
            total += self.gamma**(tau - t) * self.rewards[tau]
        return total

    def _discounted_returns(self):
        """Return ``_R(t)`` for every buffered step using one reverse pass."""
        returns = np.zeros(len(self.rewards))
        running = 0.0
        for t in reversed(range(len(self.rewards))):
            running = self.rewards[t] + self.gamma * running
            returns[t] = running
        return returns

    def _reset_buffers(self):
        """Clear the per-episode buffers."""
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []

    def train(self):
        """Update ``theta`` from the buffered episode, then clear the buffers.

        Rewards are standardized (zero mean, unit variance) before the
        discounted returns are computed. ``theta`` is updated once per
        buffered step, so later gradients see the already-updated policy.
        """
        if not self.states:
            # Nothing was stored; avoid taking the mean of an empty array.
            self._reset_buffers()
            return

        rewards = np.asarray(self.rewards, dtype=float)
        rewards = rewards - np.mean(rewards)
        std = np.std(rewards)
        if std > 0:
            # A one-step or constant-reward episode has zero spread; dividing
            # by it would turn every reward (and then theta) into NaN.
            rewards = rewards / std
        self.rewards = rewards

        returns = self._discounted_returns()
        for s, a, ret in zip(self.states, self.actions, returns):
            grad = self._gradient(s, a)
            self.theta = self.theta + self.alpha * ret * grad
        self._reset_buffers()

    def getName(self):
        """Return the agent's display name."""
        return 'LinearSoftmaxAgent'

In [None]:
# Best-effort cleanup of an environment left over from an earlier run of this
# cell. On the first run `env` is not defined yet, which raises NameError.
# `except Exception` (not a bare except) lets KeyboardInterrupt through.
try:
    env.close()
except Exception:
    pass
env = gym.make('CartPole-v1')

state = env.reset()
SAVE_FREQUENCY = 10

score = 0
episode = 0
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
g = LinearSoftmaxAgent(state_size, action_size)

MAX_EPISODES = 1000
try:
    # Each iteration is one environment step; `episode` only advances when
    # the environment reports `done`.
    while episode < MAX_EPISODES:
        env.render()
        action, prob = g.act(state)  # sample an action from the current policy
        next_state, reward, done, info = env.step(action)
        if done:
            # Penalize the terminal step.
            # NOTE(review): this also penalizes episodes that end by reaching
            # the environment's step limit, if it has one - confirm intent.
            reward = -10
        score += reward
        # Store the state the action was chosen in (not the resulting state),
        # so the policy gradient is evaluated at the correct (s, a) pair.
        g.store(state, action, prob, reward)
        state = next_state

        if done:
            episode += 1
            g.train()
            print('Episode: {} Score: {}'.format(episode, score))
            score = 0
            state = env.reset()
            env.render()
        time.sleep(0.03)
finally:
    # Close the environment even if the loop is interrupted.
    env.close()

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode: 1 Score: 23.0
Episode: 2 Score: -2.0
Episode: 3 Score: 2.0
Episode: 4 Score: 37.0
Episode: 5 Score: 4.0
Episode: 6 Score: 4.0
Episode: 7 Score: 14.0
Episode: 8 Score: 2.0
Episode: 9 Score: 5.0
Episode: 10 Score: 29.0
Episode: 11 Score: 8.0
Episode: 12 Score: 4.0
Episode: 13 Score: 11.0
Episode: 14 Score: 1.0
Episode: 15 Score: 5.0
Episode: 16 Score: 8.0
Episode: 17 Score: 11.0
Episode: 18 Score: 27.0
Episode: 19 Score: 8.0
Episode: 20 Score: 6.0
Episode: 21 Score: 34.0
Episode: 22 Score: 13.0
Episode: 23 Score: 24.0
Episode: 24 Score: 5.0
Episode: 25 Score: -1.0
Episode: 26 Score: 24.0
Episode: 27 Score: 37.0
Episode: 28 Score: 0.0
Episode: 29 Score: 3.0
Episode: 30 Score: 7.0
Episode: 31 Score: 48.0
Episode: 32 Score: 11.0
Episode: 33 Score: 18.0
Episode: 34 Score: 29.0
Episode: 35 Score: 1.0
Episode: 36 Score: 9.0
Episode: 37 Score: 17.0
Episode: 38 Score: 39.0
Episode

Episode: 334 Score: 14.0
Episode: 335 Score: 7.0
Episode: 336 Score: 39.0
Episode: 337 Score: 20.0
Episode: 338 Score: 56.0
Episode: 339 Score: 30.0
Episode: 340 Score: 41.0
Episode: 341 Score: 11.0
Episode: 342 Score: 63.0
Episode: 343 Score: 23.0
Episode: 344 Score: 71.0
Episode: 345 Score: 44.0
Episode: 346 Score: 27.0
Episode: 347 Score: 27.0
Episode: 348 Score: 26.0
Episode: 349 Score: 34.0
Episode: 350 Score: 25.0
Episode: 351 Score: 41.0
Episode: 352 Score: 2.0
Episode: 353 Score: 20.0
Episode: 354 Score: 25.0
Episode: 355 Score: 12.0
Episode: 356 Score: 24.0
Episode: 357 Score: 1.0
Episode: 358 Score: 21.0
Episode: 359 Score: 66.0
Episode: 360 Score: 39.0
Episode: 361 Score: 30.0
Episode: 362 Score: 35.0
Episode: 363 Score: 10.0
Episode: 364 Score: 10.0
Episode: 365 Score: 12.0
Episode: 366 Score: 11.0
Episode: 367 Score: 18.0
Episode: 368 Score: 55.0
Episode: 369 Score: 7.0
Episode: 370 Score: 13.0
Episode: 371 Score: 20.0
Episode: 372 Score: 51.0
Episode: 373 Score: 59.0
Epis

Episode: 663 Score: 50.0
Episode: 664 Score: 23.0
Episode: 665 Score: 95.0
Episode: 666 Score: 18.0
Episode: 667 Score: 32.0
Episode: 668 Score: 14.0
Episode: 669 Score: 65.0
Episode: 670 Score: 14.0
Episode: 671 Score: 53.0
Episode: 672 Score: 9.0
Episode: 673 Score: 72.0
Episode: 674 Score: 42.0
Episode: 675 Score: 49.0
Episode: 676 Score: 67.0
Episode: 677 Score: 38.0
Episode: 678 Score: 60.0
Episode: 679 Score: 49.0
Episode: 680 Score: 51.0
Episode: 681 Score: 31.0
Episode: 682 Score: 76.0
Episode: 683 Score: 37.0
Episode: 684 Score: 38.0
Episode: 685 Score: 39.0
Episode: 686 Score: 40.0
Episode: 687 Score: 125.0
Episode: 688 Score: 44.0
Episode: 689 Score: 54.0
Episode: 690 Score: 47.0
Episode: 691 Score: 10.0
Episode: 692 Score: 21.0
Episode: 693 Score: 49.0
Episode: 694 Score: 87.0
Episode: 695 Score: 34.0
Episode: 696 Score: 56.0
Episode: 697 Score: 12.0
Episode: 698 Score: 20.0
Episode: 699 Score: 38.0
Episode: 700 Score: 38.0
Episode: 701 Score: 20.0
Episode: 702 Score: 37.0


In [9]:
# Close the environment created in the training cell.
# NOTE(review): presumably kept as a separate cell so it can be run by hand
# after the training cell is interrupted - confirm.
env.close()