[link](http://quant.am/cs/2017/08/07/policy-gradients/)

### State
Components of the observation vector (index, name, minimum, maximum):<br>
0	Cart Position             -4.8            4.8<br>
1	Cart Velocity             -Inf            Inf<br>
2	Pole Angle                 -24 deg        24 deg<br>
3	Pole Velocity At Tip      -Inf            Inf<br>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
import numpy as np
import scipy.stats
import gym
import time

In [2]:
class LinearSoftmaxAgent(object):
    """Policy-gradient agent with a linear softmax policy.

    The preference for action ``a`` in state ``s`` is ``theta . phi(s, a)``,
    where ``phi(s, a)`` is a vector of ``action_size`` slots of length
    ``state_size`` that is zero everywhere except slot ``a``, which holds
    ``s``.  Transitions of one episode are buffered with :meth:`store` and
    consumed (then discarded) by :meth:`train`.
    """

    # Divisor applied to the action preferences before exponentiation.
    temperature = 100

    def __init__(self, state_size, action_size):
        """Create an agent with randomly initialised weights.

        Args:
            state_size: length of a state vector.
            action_size: number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []
        self.theta = np.random.random(state_size * action_size)
        self.alpha = .01  # step size of the parameter update
        self.gamma = .99  # discount factor used for the returns

    def store(self, state, action, prob, reward):
        """Append one transition to the episode buffers."""
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(prob)
        self.rewards.append(reward)

    def _phi(self, s, a):
        """Return the flat feature vector for the pair ``(s, a)``."""
        encoded = np.zeros([self.action_size, self.state_size])
        encoded[a] = s
        return encoded.flatten()

    def _softmax(self, s, a):
        """Return the unnormalised softmax weight of action ``a`` in ``s``.

        Kept for backward compatibility; :meth:`pi` no longer calls it
        because exponentiating the raw preference can overflow.
        """
        return np.exp(self.theta.dot(self._phi(s, a)) / self.temperature)

    def pi(self, s):
        r"""Return the action distribution \pi(. | s) as a 1-D array."""
        logits = np.array([
            self.theta.dot(self._phi(s, a)) for a in range(self.action_size)
        ]) / self.temperature
        # Shifting by the maximum leaves the distribution unchanged but
        # keeps exp() from overflowing to inf (inf / inf would be NaN).
        logits = logits - np.max(logits)
        weights = np.exp(logits)
        return weights / np.sum(weights)

    def act(self, state):
        """Sample an action for ``state``.

        Returns:
            Tuple ``(action, probability)`` where ``probability`` is the
            policy's probability of the sampled action.
        """
        probs = self.pi(state)
        a = random.choices(range(0, self.action_size), weights=probs)[0]
        return (a, probs[a])

    def _gradient(self, s, a):
        """Return ``phi(s, a)`` minus the policy-weighted mean of ``phi(s, .)``."""
        probs = self.pi(s)
        expected = sum(
            probs[b] * self._phi(s, b) for b in range(self.action_size)
        )
        return self._phi(s, a) - expected

    def _R(self, t):
        """Return the discounted sum of the buffered rewards from step ``t``."""
        return sum(
            self.gamma ** (tau - t) * self.rewards[tau]
            for tau in range(t, len(self.rewards))
        )

    def _discounted_returns(self, rewards):
        """Return the discounted return of every step in one backward pass."""
        returns = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        return returns

    def train(self):
        """Update ``theta`` from the buffered episode and clear the buffers.

        Rewards are centred and, when they are not all equal, scaled to unit
        standard deviation; each step then moves ``theta`` along its gradient
        weighted by the discounted return computed from those rewards.
        """
        rewards = np.asarray(self.rewards, dtype=float)
        if rewards.size:
            rewards = rewards - rewards.mean()
            spread = rewards.std()
            # A one-step or constant-reward episode has zero spread;
            # dividing by it would turn every weight into NaN.
            if spread > 0:
                rewards = rewards / spread
        returns = self._discounted_returns(rewards)
        # theta is updated step by step, so later gradients are evaluated
        # under the already-updated policy.
        for s, a, ret in zip(self.states, self.actions, returns):
            self.theta = self.theta + self.alpha * ret * self._gradient(s, a)
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []

    def getName(self):
        """Return the agent's display name."""
        return 'LinearSoftmaxAgent'

In [3]:
# Close an environment left over from a previous run of this cell, then
# always start from a fresh one.
try:
    env.close()
except NameError:
    pass  # first run: there is no environment to close yet
env = gym.make('CartPole-v1')

state = env.reset()
SAVE_FREQUENCY = 10

score = 0
episode = 0
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
g = LinearSoftmaxAgent(state_size, action_size)

MAX_EPISODES = 100
try:
    while episode < MAX_EPISODES:  # one iteration per environment step
        env.render()
        action, prob = g.act(state)  # sample an action from the policy
        next_state, reward, done, info = env.step(action)
        if done:
            reward = -10  # penalise the step that ended the episode
        score += reward
        # Store the state the action was chosen in, not the state it led
        # to; the gradient in train() is evaluated at the stored state.
        g.store(state, action, prob, reward)
        state = next_state

        if done:
            episode += 1
            g.train()
            print('Episode: {} Score: {}'.format(episode, score))
            score = 0
            state = env.reset()
            env.render()
        time.sleep(0.03)  # slow the loop down so the rendering is watchable
finally:
    # Release the render window even if the loop is interrupted.
    env.close()

  result = entry_point.load(False)


WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
Episode: 1 Score: 73.0
Episode: 2 Score: 4.0
Episode: 3 Score: 5.0
Episode: 4 Score: 2.0
Episode: 5 Score: 7.0
Episode: 6 Score: 12.0
Episode: 7 Score: 2.0
Episode: 8 Score: 5.0
Episode: 9 Score: 3.0
Episode: 10 Score: 29.0
Episode: 11 Score: 9.0
Episode: 12 Score: 10.0
Episode: 13 Score: 1.0
Episode: 14 Score: 11.0
Episode: 15 Score: 0.0
Episode: 16 Score: 14.0
Episode: 17 Score: 22.0
Episode: 18 Score: 1.0
Episode: 19 Score: -1.0
Episode: 20 Score: 28.0
Episode: 21 Score: 1.0
Episode: 22 Score: 22.0
Episode: 23 Score: 8.0
Episode: 24 Score: -3.0
Episode: 25 Score: 0.0
Episode: 26 Score: -1.0
Episode: 27 Score: 16.0
Episode: 28 Score: 75.0
Episode: 29 Score: -1.0
Episode: 30 Score: 20.0
Episode: 31 Score: 3.0
Episode: 32 Score: 5.0
Episode: 33 Score: 4.0
Episode: 34 Score: 17.0
Episode: 35 Score: 10.0
Episode: 36 Score: 15.0
Episode: 37 Score: 3.0
Episode: 38 Score: 6.0
Episode:

In [4]:
# Close the gym environment bound to `env`.
env.close()