In [1]:
import numpy as np
import gym 

import tensorflow as tf
import tensorflow_probability as tfp

import matplotlib.pyplot as plt

%matplotlib inline
%load_ext line_profiler

In [3]:
env = gym.make("CartPole-v0")

# gym compatibility: unwrap TimeLimit
if hasattr(env,'env'):
    env=env.env

obs = env.reset()
obs = obs[:,np.newaxis].T
    
n_actions = env.action_space.n
state_dim = env.observation_space.shape

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

n_hidden_0 = 20
n_hidden_1 = 20

# <define network graph using raw tf or any deep learning library>
model = Sequential( [
            Dense( n_hidden_0, activation='relu', input_shape=state_dim ),
            Dense( n_hidden_1, activation='relu' ),    
            Dense( n_actions,  activation='linear' ),
])

optimizer = tf.optimizers.Adam(learning_rate=0.001)

In [5]:

def calc_entropy( model, _states ):
    
    logits = model(_states)    
    probs = tf.nn.softmax(logits)
    log_probs = tf.nn.log_softmax(logits)
    ent = tf.reduce_sum( -probs * log_probs, axis=1 )
    
    return ent

def calc_loss( model, _states, _actions, _rewards, _gamma, _entropy_weight ):
    """Calculate the loss function to be minimized"""
    
    entropy = tf.reduce_mean( calc_entropy( model, _states ) )
    
    logprobs = calc_logprobs_for_actions( model, _states=_states, _actions=_actions )
    
    cum_rewards = get_cumulative_rewards(_rewards=_rewards, _gamma=_gamma )    
    
    J = tf.reduce_mean( logprobs * cum_rewards )           
    
    loss = -J - _entropy_weight * entropy         
    
    return loss

def calc_gain( model, _states, _actions, _rewards, _gamma, _entropy_weight ):
    """Calculate the loss function to be minimized"""
    
    entropy = tf.reduce_mean( calc_entropy( model, _states ) )
    
    probs = tf.math.exp( calc_logprobs_for_actions( model, _states=_states, _actions=_actions ) )
    cum_rewards = get_cumulative_rewards(_rewards=_rewards, _gamma=_gamma )    
    objective = tf.reduce_mean( probs * cum_rewards )    
    
    gain = objective + _entropy_weight * entropy
    return gain

def train_step( model, optimizer, states, actions, rewards, gamma, entropy_weight ):
    """given full session, trains agent with policy gradient"""

    with tf.GradientTape() as tape:
        tape.watch(model.weights)
        current_loss = calc_loss( model, _states=states, _actions=actions, _rewards=rewards, 
                         _gamma=gamma, _entropy_weight=entropy_weight )
    
    grads = tape.gradient( current_loss, model.weights )
    optimizer.apply_gradients( zip( grads, model.weights ) )

    return current_loss

def choose_action_from_policy( model, states ):
    
    # Choose a random action
    logits = model(states)
    
    action = tf.random.categorical( logits, num_samples=1 )
    return action
    
    
def calc_logprobs_for_actions( model, _states, _actions ):    
    
    # Get the log probabilties of each action
    logits = model(_states)    
    logprob_mtx = tf.nn.log_softmax(logits)
    
    N = tf.shape(logprob_mtx)[0]
    indices = tf.stack( [ tf.range(N), tf.reshape( _actions, (N,) ) ], axis=-1)
    log_probs_for_actions = tf.gather_nd( logprob_mtx, indices)    
    
    return log_probs_for_actions


def get_cumulative_rewards(_rewards, _gamma ):
    """Get the cumulative rewards for the J(\theta) function."""

    # Calculate the discount factor. Assumes the first rewards are earliest    
    cum_rewards = [np.nan] * len(_rewards)

    cum_rewards[-1] = _rewards[-1]
    for t in range(1, _rewards.shape[0] ):
        cum_rewards[-(t+1)] = _rewards[-(t+1)] + _gamma * cum_rewards[-t] 

    output_cum_rewards = tf.convert_to_tensor( cum_rewards, tf.float32 )
    return output_cum_rewards


def generate_session( model, env, t_max, gamma, entropy_weight ):
    """play env with REINFORCE agent and train at the session end"""

    # arrays to record session
    states, actions, rewards = [], [], []

    obs = env.reset()
    obs = obs[:,np.newaxis].T

    for t in range(t_max):

        # action probabilities array aka \pi(a|s)        
        action = choose_action_from_policy( model, states=obs )
        
        # Move to next state
        new_obs, r, done, info = env.step( action.numpy()[0,0] )
        new_obs = new_obs[:,np.newaxis].T

        # record session history to train later
        states.append(obs)
        actions.append(action)
        rewards.append(r)

        obs = new_obs
        if done:
            break

    # Form Tensors out of the collected outputs
    state_mtx = tf.stack( np.vstack( states ) )
    action_mtx = tf.dtypes.cast( tf.stack( np.vstack( actions ) ), tf.int32 )
    reward_mtx = tf.stack( np.vstack( rewards ) )
    
    curr_loss = train_step( model, optimizer=optimizer, states=state_mtx, actions=action_mtx,
                   rewards=reward_mtx, gamma=gamma, entropy_weight=entropy_weight )

    # return session rewards to print them later
    return sum(rewards)

In [6]:
t_max = 1000
#gamma = tf.constant( .99, dtype=tf.float64 )
#entropy_weight = tf.constant(0.10, dtype=tf.float64)
gamma = 0.99
entropy_weight = 0.10

In [7]:
for i in range(10):

    rewards = []
    for _ in range(100):
        rewards.append( 
            generate_session( model, 
                      env, 
                      t_max=t_max, 
                      gamma=gamma, 
                      entropy_weight=entropy_weight)
        )

    print("mean reward:%.3f" % (np.mean(rewards)))

    if np.mean(rewards) > 300:
        print("You Win!") # but you can train even further
        break

mean reward:20.050
mean reward:23.550
mean reward:22.540
mean reward:22.180
mean reward:21.670
mean reward:22.540
mean reward:23.790
mean reward:20.540
mean reward:23.350


KeyboardInterrupt: 

In [None]:
_t_max = 1000
_gamma = 0.99
_entropy_weight = 0.0

gain = []
for _ in range(100):
    # arrays to record session
    states, actions, rewards = [], [], []

    obs = env.reset()
    obs = obs[:,np.newaxis].T

    for t in range(_t_max):

        # action probabilities array aka \pi(a|s)
        action = choose_action_from_policy( model, obs )

        # Move to next state
        new_obs, r, done, info = env.step( action )
        new_obs = new_obs[:,np.newaxis].T

        # record session history to train later
        states.append(obs)
        actions.append(action)
        rewards.append(r)

        obs = new_obs
        if done:
            break

    # Form Tensors out of the collected outputs
    state_mtx = tf.stack( np.vstack( states ) )
    action_mtx = tf.dtypes.cast( tf.stack( np.vstack( actions ) ), 'int32' )
    reward_mtx = tf.stack( np.vstack( rewards ) )

    optimizer.learning_rate = 0.01
    with tf.GradientTape() as t:

        loss_fun = lambda : calc_loss( model, _states=state_mtx, _actions=action_mtx, 
                              _rewards=reward_mtx, _gamma=gamma, _entropy_weight=entropy_weight )
        gain_fun = lambda : calc_gain( model, _states=state_mtx, _actions=action_mtx, 
                              _rewards=reward_mtx, _gamma=gamma, _entropy_weight=entropy_weight )        
        orig_loss = loss_fun()

        tmp = gain_fun()
        x0 = tmp.numpy()
        #print( 'Loss Before: {}'.format(orig_loss) )

        theta = model.trainable_variables

        cum_rewards = get_cumulative_rewards(_rewards=reward_mtx, _gamma=gamma )   
        logprobs = calc_logprobs_for_actions( model, _states=state_mtx, _actions=action_mtx )
        A = -tf.reduce_mean(logprobs * cum_rewards)
        #grads = t.gradient( orig_loss, theta )

        #optimizer.apply_gradients( zip( grads, theta ) )
        optimizer.minimize( loss_fun, var_list=model.weights )

        x1 = gain_fun().numpy()

        #print( 'Loss After: {}'.format( post_loss ) )
        #print( 'Loss Chg: {}'.format( ( x1 - x0 ) ) )
        gain.append(x1-x0)


In [None]:
plt.hist(gain)

In [None]:
reductions = []
for _ in range(100):
    with tf.GradientTape() as t:
        loss_fun = lambda : calc_loss( model, _states=state_mtx, _actions=action_mtx, 
                                  _rewards=reward_mtx, _gamma=gamma, _entropy_weight=entropy_weight )
        orig_loss = loss_fun()
        x0 = orig_loss.numpy()    
        #grads = t.gradient( orig_loss, model.weights )
        #optimizer.apply_gradients( zip( grads, model.weights ) )
        optimizer.minimize( loss_fun, model.weights )
        post_loss = loss_fun()
        x1 = post_loss.numpy()
        reductions.append(x1 - x0)

In [None]:
xx = np.arange( -1., 1.01, .1 )
plt.plot( xx, np.exp(xx) - 2 * xx )

In [None]:
M = Sequential( [ 
        Dense( 1, activation='relu', input_shape=(1,) ), 
        Dense( 1, activation='relu' )
] )

def lf( M, x ):
    return tf.math.exp( M(x) ) - 2 * M(x)

x = tf.reshape( tf.Variable(1.0), (1,1) )
for _ in range(100):
    with tf.GradientTape() as tape:
        L = lambda : lf(M,x)
        #fl = lambda : (x-1) ** 2
        # loss = ( x - 1 ) ** 2
        optimizer.minimize( L, var_list=M.weights )
        #grads = tape.gradient( loss, x )
        #optimizer.apply_gradients( zip( [grads], [x] ) )
        print(M(x))

In [None]:
#x = tf.Variable([4.])   

for _ in range(100):
    with tf.GradientTape() as tape:
        fl = lambda : (x-1) ** 2
        loss = ( x - 1 ) ** 2
        optimizer.minimize( fl, var_list=[x] )
        #grads = tape.gradient( loss, x )
        #optimizer.apply_gradients( zip( [grads], [x] ) )
        print(x[0])

In [89]:
tf.optimizers.Adam?

In [110]:
Solution().longestPalindrome('abacadaba')

[0, 0, -1]
[1, 1, 0]
[2, 2, 1]
[3, 3, 2]
[4, 4, 3]
[5, 5, 4]
[6, 6, 5]
[7, 7, 6]
[8, 8, 7]


'aba'