In [12]:
import gym
# load CartPole env version 1
env = gym.make("CartPole-v1",  render_mode="rgb_array")
# initiate one instance of cartpole env
obs = env.reset()
obs

(array([ 0.02886403,  0.00505042, -0.01196181, -0.0239739 ], dtype=float32),
 {})

In [14]:
def basic_policy(obs):
    if len(obs) == 2:
       angle = obs[0][2]
    else : 
       angle = obs[2]
    return 0 if angle < 0 else 1
totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(200):
        action = basic_policy(list(obs))
        obs, reward, done, info, _ = env.step(action)
        episode_rewards += reward
        env.render()
        if done :
            break
    totals.append(episode_rewards)

import numpy as np
print(f"mean: {np.mean(totals)},\n std: {np.std(totals)} \n min: {np.min(totals)} \n max: {np.max(totals)}")

mean: 41.964,
 std: 8.85317479777735 
 min: 25.0 
 max: 68.0


#####  **Credit Assignment-Problem** : wenn der Agent eine Belohnung erhählt, ist es schwierig, einzelne Aktionen hiefür zu loben (oder zu tadeln)
Lösung : Aktion müssen gelobt werden , wenn die nachfolgenden Belohnungen beobachtet wurden. d.h die erwarte gesamte Belohnung , die eine Aktion verursacht. 

In [15]:
import tensorflow as tf
from tensorflow import keras 
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
def play_one_step(env,  obs, model, loss_fn):
    with tf.GradientTape() as tape:
        left_prob = model(obs[np.newaxis])
        action = (tf.random.uniform([1, 1])> left_prob)
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, left_prob))
    grads = tape.gradient(loss, model.trainable_variables)

    obs, reward, done, _ , _ = env.step(int(action[0, 0].numpy()))

    return  obs, reward, done, grads

In [18]:
def play_multiple_episodes(env, n_epsiodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for _ in range(n_epsiodes):
        current_rewards = []
        current_grads = []
        obs = env.reset()[0]
        for _ in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            env.render()
            current_rewards.append(reward)
            current_grads.append(grads)
            if done :
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards , all_grads

In [19]:
def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    for step in range(len(rewards)-2, -1, -1):
        discounted[step] +=  discounted[step+1] * discount_factor 
    return discounted

def disocunt_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()

    return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]


In [20]:
n_iterations = 150
n_episode_per_update = 10
n_max_steps = 200
discount_factor = 0.95

In [21]:
n_inputs = env.observation_space.shape[0]

model = keras.models.Sequential([
      keras.layers.Dense(32, activation= "elu", input_shape=[n_inputs]),
      keras.layers.Dense(32, activation= "elu"),
      keras.layers.Dense(1, activation= "sigmoid" )
])
optimizer = keras.optimizers.Adam(learning_rate=0.001)
loss_fn = keras.losses.binary_crossentropy 

In [22]:
all_mean_rewards = []
for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(env, n_episode_per_update, n_max_steps, model, loss_fn) 
    all_final_rewards = disocunt_and_normalize_rewards(all_rewards, discount_factor)
    all_mean_grads = [ ]

    mean_reward = np.mean([reward for episode in all_rewards for reward in episode])

    total_rewards_per_episode = [sum(episode) for episode in all_rewards]


    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean([final_reward * all_grads[episode_index][step][var_index] 
                                    for episode_index , final_rewards in enumerate(all_final_rewards)
                                    for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
    all_mean_rewards.append(np.mean(total_rewards_per_episode))

In [80]:
import plotly.express as px
import pandas as pd
import plotly.io as pio
pio.renderers.default = "browser" 

def plot_func(all_mean_rewards ):

    iterations = list(range(len(all_mean_rewards))) 
    df = pd.DataFrame({"Iteration": iterations, "Mean Reward": all_mean_rewards})

    fig = px.line(df, x="Iteration", y="Mean Reward", 
                title="Distributionm",
                #labels={"Iteration": "Itération", "Mean Reward": "Récompense Moyenne"},
                line_shape="linear")

    fig.show()

## **Deep-Q-Network**

In [None]:
env = gym.make("CartPole-v0")
input_shape = env.observation_space.shape
n_outputs = env.action_space.n

In [84]:
def epsilon_greedy_policy(state, epsilon =0):
    if np.random.rand() < epsilon :
         return np.random.randint(2)
    else: 
         Q_values = model.predict(state[np.newaxis])
         return np.argmax(Q_values[0])

In [85]:
from collections import deque

replay_buffer = deque(maxlen=2000)

def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[index] for index in indices]
    states, actions, rewards, next_states, dones = [np.array([experience[field_index] for experience in batch]) for field_index in range(5)]

    return states, actions, rewards, next_states, dones

In [86]:
def play_one_step(env, state, epsilon):
    action = epsilon_greedy_policy(state, epsilon)
    next_state, reward, done, _, _ = env.step(action)
    replay_buffer.append((state, action, reward, next_state, done))
    return next_state, reward, done, _

In [100]:
batch_size = 64
discount_factor = 0.95
epsilon = 0.99
optimizer = keras.optimizers.Adam(learning_rate=0.001)
loss_fn = keras.losses.MeanSquaredError()

In [88]:
def training_step(batch_size):
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences 
    next_Q_values = model.predict(next_states)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    target_Q_values = (rewards + (1-dones)*discount_factor*max_next_Q_values)
    target_Q_values =  target_Q_values.reshape(-1, 1)
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values*mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [101]:
all_mean_rewards = []
mean_reward_epsiode = []
model = keras.models.Sequential([keras.layers.Dense(32, activation= "elu", input_shape=input_shape),
      keras.layers.Dense(32, activation= "elu" ),
      keras.layers.Dense(n_outputs)])
for episode in range(150):
    obs = env.reset()[0]
    for step in range(200):
        epsilon = max(1- epsilon/500, 0.01)
        obs , reward, done, info= play_one_step(env, obs, epsilon)
        mean_reward_epsiode.append(reward)
        if done :
            break
    all_mean_rewards.append(sum(mean_reward_epsiode)/200.0)
    mean_reward_epsiode = []
    if episode > 0 :
        training_step(batch_size)
print(f"mean reward: {np.mean(all_mean_rewards)}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15

In [99]:
plot_func(all_mean_rewards=all_mean_rewards)