In [1]:
import re
import time

import gymnasium as gym
import numpy as np
import tensorflow as tf
import tqdm

SEED = 42
# Seed every RNG the notebook draws from: TensorFlow is used for weight
# initialisation and for action sampling in play_one_step, while pg_policy
# samples its actions with np.random.
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [2]:
# Training environment. No render_mode is requested here: on-screen ("human")
# rendering draws every step and slows data collection down to viewer speed.
# show_one_episode creates its own "human" environment for visual playback.
env = gym.make("CartPole-v1")

# Policy network: one small hidden layer followed by a single sigmoid unit
# whose output is read as the probability of choosing action 0 (push left).
model = tf.keras.Sequential([
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),  # left probability
])

#### CREDIT ASSIGNMENT

`We want to give more weight to the actions that led to more reward. To do this, each step's reward is replaced by a discounted sum: the reward at that step plus the discount rate (gamma) times the discounted sum of the rewards that follow it.`

`Discount rate (gamma): usually between 0.9 and 0.99.`

In the CartPole example, one good step followed by bad steps can still make the pole fall, so we need to assign more importance to the good step.

#### Policy Gradients
` Optimize learnable parameters of policy by following the gradients towards higher reward.`

#### Steps

`Step 1: Let the neural network play the game multiple times, and at every step compute the gradients that would make the chosen action more likely, but don't apply them immediately.`

`Step 2: Once several episodes have been completed, score each action using the discounted (and normalized) rewards.`

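`Step 3: The result of step 2 can be positive or negative. Multiply each step's gradients by its score, average them over all steps and episodes, and apply the mean gradients with the optimizer.`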

In [3]:
def pg_policy(observation, model): # policy gradient -> PG
    """Sample an action from the policy network for a single observation.

    Args:
        observation: 1-D array holding one environment observation.
        model: Callable that maps a batch of observations to the probability
            of action 0; for a batch of one it must return a (1, 1) result.

    Returns:
        int: 0 with probability equal to the model output, otherwise 1.
    """
    # Call the model directly (as play_one_step does) instead of
    # model.predict(): predict() is meant for large batches and is slow and
    # noisy when invoked once per environment step.
    left_probability = float(model(observation[np.newaxis])[0, 0])
    # Sample rather than threshold, so the policy keeps exploring.
    # Comparing two Python floats avoids int() on a (1, 1) array.
    return int(np.random.rand() > left_probability)


def play_one_step(env, observation, model, loss_fn):
    """Play one environment step and compute the gradients for the action taken.

    The action is sampled from the model's output, then treated as if it were
    the correct label: the loss is computed between that label and the
    predicted probability, and its gradients are returned without being
    applied.

    Args:
        env: Gymnasium-style environment whose step() returns
            (observation, reward, terminated, truncated, info).
        observation: 1-D array with the current observation.
        model: Keras model returning the probability of action 0.
        loss_fn: Callable invoked as loss_fn(y_target, probability).

    Returns:
        tuple: (new_observation, reward, done, grads), where done is True when
        the episode was terminated or truncated, and grads is the list of
        gradients of the loss w.r.t. model.trainable_variables.
    """
    with tf.GradientTape() as tape:
        left_probability = model(observation[np.newaxis])
        # True -> action 1, False -> action 0.
        action = tf.random.uniform([1, 1]) > left_probability
        # Target probability of action 0 given the action actually sampled.
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, left_probability))

    grads = tape.gradient(loss, model.trainable_variables) # dc/dw
    # Extract the single element explicitly before converting to int.
    chosen_action = int(action[0, 0].numpy())
    new_observation, reward, terminated, truncated, _info = env.step(chosen_action)
    # An episode is over when it either terminates or is truncated.
    done = bool(terminated or truncated)
    return new_observation, reward, done, grads

def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Run several episodes, recording rewards and gradients for every step.

    Args:
        env: Environment passed to play_one_step; reset() must return
            (observation, info).
        n_episodes: Number of episodes to play.
        n_max_steps: Upper bound on the steps played in one episode.
        model: Policy model forwarded to play_one_step.
        loss_fn: Loss function forwarded to play_one_step.

    Returns:
        tuple: (all_rewards, all_grads). Each is a list with one entry per
        episode; an entry is the list of per-step rewards (respectively
        per-step gradient lists) collected during that episode.
    """
    all_rewards, all_grads = [], []
    for _episode in range(n_episodes):
        episode_rewards, episode_grads = [], []
        observation, _info = env.reset()
        for _step in range(n_max_steps):
            observation, reward, done, grads = play_one_step(
                env, observation, model, loss_fn
            )
            episode_rewards.append(reward)
            episode_grads.append(grads)
            if done:
                # Episode finished before reaching the step cap.
                break
        all_rewards.append(episode_rewards)
        all_grads.append(episode_grads)
    return all_rewards, all_grads

def discount_rewards(rewards, discount_factor):
    """Return the discounted cumulative reward for every step of one episode.

    Element i of the result equals
    rewards[i] + discount_factor * result[i + 1], with the last element equal
    to the last reward.

    Args:
        rewards: Sequence of per-step rewards for a single episode.
        discount_factor: Multiplier (gamma) applied to future rewards.

    Returns:
        numpy.ndarray: Float array with the same length as rewards. The input
        is not modified.
    """
    # Force a float dtype: with integer rewards np.array() would produce an
    # integer array and the in-place updates below would truncate fractions.
    discounted = np.array(rewards, dtype=np.float64)
    # Walk backwards so each step can reuse the already-discounted next step.
    for step in range(len(discounted) - 2, -1, -1):
        # a_n + a_n+1*gamma
        discounted[step] += discounted[step + 1] * discount_factor
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    """Discount each episode's rewards, then standardise them across episodes.

    Args:
        all_rewards: List with one sequence of per-step rewards per episode.
        discount_factor: Discount factor forwarded to discount_rewards.

    Returns:
        list: One array per episode holding
        (discounted - mean) / std, where mean and std are computed over the
        discounted rewards of all episodes together. If every discounted
        reward is identical (std == 0) the arrays contain zeros.
    """
    all_discounted_rewards = [
        discount_rewards(episode_rewards, discount_factor)
        for episode_rewards in all_rewards
    ]

    # Statistics are taken over every step of every episode.
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # All values are equal; dividing by zero would yield NaNs that would
        # then be multiplied into the gradients. Use a neutral scale instead.
        reward_std = 1.0

    return [
        (discounted_rewards - reward_mean) / reward_std
        for discounted_rewards in all_discounted_rewards
    ]

In [4]:
# Policy-gradient training hyperparameters.
learning_rate = 0.01          # step size given to the optimizer
discount_factor = 0.95        # gamma used when discounting episode rewards
n_max_steps = 200             # cap on the steps played per episode
n_episodes_per_update = 10    # episodes collected before each gradient update
n_iterations = 150            # number of collect-then-update rounds

In [5]:
# Seed the environment once. The returned (observation, info) pair is not
# needed here because play_multiple_episodes resets the environment itself.
env.reset(seed=SEED)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.binary_crossentropy

for iteration in range(n_iterations):
    # 1) Collect rewards and per-step gradients without touching the weights.
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn
    )
    total_rewards = sum(map(sum, all_rewards))
    print(f"Iteration: {iteration + 1}/{n_iterations}",
    f"mean rewards: {total_rewards/n_episodes_per_update}"
    )
    # 2) Turn raw rewards into discounted, normalised per-step scores.
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)

    # 3) For every trainable variable, weight each step's gradient by that
    #    step's score and average over all steps of all episodes.
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        weighted_grads = [
            final_reward * all_grads[episode_index][step][var_index]
            for episode_index, final_rewards in enumerate(all_final_rewards) # rewards for every episode
            for step, final_reward in enumerate(final_rewards) # several steps
        ]
        all_mean_grads.append(tf.reduce_mean(weighted_grads, axis=0))
    # 4) Apply the averaged gradients in a single optimizer step.
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

Iteration: 1/150 mean rewards: 17.7
Iteration: 2/150 mean rewards: 15.6
Iteration: 3/150 mean rewards: 16.9
Iteration: 4/150 mean rewards: 25.7
Iteration: 5/150 mean rewards: 18.2
Iteration: 6/150 mean rewards: 17.1
Iteration: 7/150 mean rewards: 19.3
Iteration: 8/150 mean rewards: 19.4
Iteration: 9/150 mean rewards: 22.2
Iteration: 10/150 mean rewards: 21.9
Iteration: 11/150 mean rewards: 23.9
Iteration: 12/150 mean rewards: 20.2
Iteration: 13/150 mean rewards: 18.1
Iteration: 14/150 mean rewards: 18.2
Iteration: 15/150 mean rewards: 23.1
Iteration: 16/150 mean rewards: 21.1
Iteration: 17/150 mean rewards: 17.3
Iteration: 18/150 mean rewards: 29.1
Iteration: 19/150 mean rewards: 25.7
Iteration: 20/150 mean rewards: 20.2
Iteration: 21/150 mean rewards: 31.0
Iteration: 22/150 mean rewards: 18.3
Iteration: 23/150 mean rewards: 33.0
Iteration: 24/150 mean rewards: 28.9
Iteration: 25/150 mean rewards: 19.5
Iteration: 26/150 mean rewards: 25.7
Iteration: 27/150 mean rewards: 28.9
Iteration:

In [6]:
# Build a timestamped file name from time.asctime(), replacing whitespace,
# '+' and ':' with underscores so the name is safe to use as a path.
# `re` and `time` come from the notebook's import cell.
unique_name = re.sub(r"[\s+:]","_",time.asctime())
model_name = f"model_{unique_name}.h5"
model.save(model_name)
print("Model saved as {}".format(model_name))

Model saved as model_Fri_Mar_10_01_07_43_2023.h5


In [8]:
model = tf.keras.models.load_model('./model_Fri_Mar_10_01_07_43_2023.h5')



In [12]:
def show_one_episode(policy, model, n_max_steps=500, seed=42):
    """Play one on-screen CartPole episode with the given policy.

    Args:
        policy: Callable invoked as policy(observation, model) that returns
            the action to take.
        model: Model object forwarded to policy.
        n_max_steps: Maximum number of steps to play.
        seed: Seed passed to env.reset() so the starting state is repeatable.

    Returns:
        tuple: (step, obs) where step is the zero-based index of the last step
        played (0 if no step was played) and obs is the last observation.
    """
    env = gym.make("CartPole-v1", render_mode="human")
    step = 0  # keeps the return value defined when n_max_steps == 0
    try:
        obs, info = env.reset(seed=seed)
        for step in range(n_max_steps):
            # No explicit env.render(): with render_mode="human" the
            # environment draws itself on every step().
            action = policy(obs, model)
            obs, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                break
    finally:
        # Always release the viewer window, even if the policy raises.
        env.close()
    return step, obs

show_one_episode(pg_policy, model)





(236,
 array([-0.9454984 , -0.16241746,  0.22134353,  0.528573  ], dtype=float32))

In [15]:
# show_one_episode is already defined in an earlier cell; call it again here
# rather than redefining an identical copy.
show_one_episode(pg_policy, model)





(246,
 array([-2.4243603 , -1.2825011 , -0.0838647 ,  0.12254822], dtype=float32))