REFERENCES:
https://github.com/TychoTheTaco/Car-Racing

In [1]:
'''
    The code below does not work well for Task 3 (vanilla policy gradient)
    but works well for Task 4 (PPO).
'''
#Importing necessary packages
#This environment is a slightly modified version of CarRacing-v0
from environments.custom_car_racing import CustomCarRacing
import datetime
import tensorflow as tf
from tensorflow_probability.python.distributions.beta import Beta
from tensorflow.keras import layers
import gym
import numpy as np
from pathlib import Path
from typing import Union, Optional
import matplotlib.pyplot as plt

# Training mode switch: True selects the PPO objective, False the vanilla
# policy-gradient objective (the value is forwarded to Network.train below).
ppo_train = True

# Hide every GPU from TensorFlow, then verify that none is still visible.
tf.config.set_visible_devices([], 'GPU')
visible_devices = tf.config.get_visible_devices()
assert all(device.device_type != 'GPU' for device in visible_devices)
    

TASK 1-2-3-4

In [2]:
#Implementing network
class Network():
    """Actor-critic agent for the car-racing environment.

    A small convolutional backbone is shared by two heads:

    * the actor head outputs, for each of the 3 action dimensions, the
      (alpha, beta) parameters of a Beta distribution;
    * the critic head outputs a scalar state value.

    The agent can be trained either with the clipped PPO surrogate objective
    or with a vanilla policy-gradient objective (see ``train``).
    """

    def __init__(self, env: gym.Env):
        """Store the environment, build the model and print its summary."""
        self._env = env
        self._model = self._create_model()
        self._model.summary()

    def _create_model(self):
        """Build and compile the shared-backbone actor-critic Keras model.

        Returns:
            A ``tf.keras.Model`` taking a ``(32, 32, 4)`` frame stack and
            returning ``[actor_params, value]`` where ``actor_params`` has
            shape ``(batch, 3, 2)`` and ``value`` has shape ``(batch, 1)``.
        """
        def create_conv_layer(filters, kernel_size, strides):
            return layers.Conv2D(filters, kernel_size=kernel_size, strides=strides, activation='relu', kernel_initializer=tf.initializers.glorot_normal(),
                                 bias_initializer=tf.initializers.constant(0.1))

        # Input is a stack of frames
        input_0 = layers.Input(shape=(32, 32, 4))

        # Main network backbone. This is shared by the actor and critic.
        conv_0 = create_conv_layer(8, 4, 2)(input_0)
        conv_1 = create_conv_layer(16, 3, 2)(conv_0)
        conv_2 = create_conv_layer(32, 3, 2)(conv_1)
        flat_0 = layers.Flatten()(conv_2)

        # Actor output: softplus is strictly positive, so adding 1 keeps
        # both alpha and beta > 1.
        dense_0 = layers.Dense(64, activation='relu')(flat_0)
        dense_1 = layers.Dense(6, activation='softplus')(dense_0)
        reshape_0 = layers.Reshape((3, 2))(dense_1)
        lamb_0 = layers.Lambda(lambda x: x + 1)(reshape_0)

        # Critic output
        dense_2 = layers.Dense(64, activation='relu')(flat_0)
        dense_3 = layers.Dense(1)(dense_2)

        # Compile model (only the optimizer is attached; losses are computed
        # manually during training).
        model = tf.keras.Model(inputs=[input_0], outputs=[lamb_0, dense_3])
        model.compile(optimizer=tf.optimizers.Adam(0.001))

        return model

    @staticmethod
    def _action_log_probs(params, actions):
        """Return the joint log-probability of ``actions`` under Beta(``params``).

        Args:
            params: Actor output of shape ``(batch, 3, 2)``; the last axis
                holds (alpha, beta).
            actions: Actions of shape ``(batch, 3)`` with components in (0, 1).

        Returns:
            Tensor of shape ``(batch, 1)``: the per-dimension log-probabilities
            summed over the action dimensions.
        """
        dist = Beta(params[:, :, 0], params[:, :, 1])
        return tf.reduce_sum(dist.log_prob(actions), axis=1, keepdims=True)

    @staticmethod
    def _gen_trajectory(indices, trajectory_size):
        """TASK 2: yield consecutive slices of ``indices`` of at most ``trajectory_size`` items."""
        for i in range(0, len(indices), trajectory_size):
            yield indices[i:i + trajectory_size]

    def sample_actions(self, state):
        """Sample one action for a single state.

        Args:
            state: A single observation (no batch axis); a batch axis is
                added before it is passed to the model.

        Returns:
            ``(action, log_prob)`` where ``action`` is a tensor with one
            component per action dimension, each in (0, 1), and ``log_prob``
            is the scalar sum of the per-dimension log-probabilities.
        """
        p = self._model(np.expand_dims(state, axis=0))[0][0]
        alpha, beta = p[:, 0], p[:, 1]
        beta_distribution = Beta(alpha, beta)
        action = beta_distribution.sample()
        log_prob = tf.reduce_sum(beta_distribution.log_prob(action))
        return action, log_prob

    def sample_log(self, states, actions):
        """Return the log-probabilities of ``actions`` given ``states``.

        Args:
            states: Batch of observations, shape ``(batch, 32, 32, 4)``.
            actions: Batch of actions as produced by ``sample_actions``
                (components in (0, 1)), shape ``(batch, 3)``.

        Returns:
            Tensor of shape ``(batch, 1)`` holding, for each pair, the
            log-probability of the action under the current policy.
        """
        return self._action_log_probs(self._model(states)[0], actions)

    def get_action(self, observation, action_space: gym.Space):
        """Sample an environment-ready action for a single observation.

        Args:
            observation: A single observation (no batch axis).
            action_space: Unused; kept for interface compatibility.

        Returns:
            NumPy array with one entry per action dimension. The first entry
            is rescaled from [0, 1] to [-1, 1] (presumably steering - confirm
            against the environment); the others stay in [0, 1].
        """
        p = self._model.predict(np.expand_dims(observation, axis=0))[0][0]
        alpha, beta = p[:, 0], p[:, 1]
        distribution = Beta(alpha, beta)
        action = distribution.sample().numpy()
        action[0] = np.interp(action[0], [0, 1], [-1, 1])
        return action

    def _update_policy(self, transitions, ppo, trajectory_size, gamma, ppo_epochs, clip_epsilon):
        """Run the optimisation epochs over one full buffer of transitions.

        Args:
            transitions: List of ``(state, action, log_prob, reward,
                new_state)`` tuples collected with the current policy.
            ppo: Use the clipped PPO surrogate if true, otherwise the vanilla
                policy-gradient loss.
            trajectory_size: Mini-batch size.
            gamma: Discount factor of the one-step TD target.
            ppo_epochs: Number of passes over the buffer.
            clip_epsilon: PPO clipping range.
        """
        states = tf.convert_to_tensor([t[0] for t in transitions])
        actions = tf.convert_to_tensor([t[1] for t in transitions])
        old_a_logp = tf.expand_dims(tf.convert_to_tensor([t[2] for t in transitions]), axis=1)
        rewards = tf.expand_dims(tf.convert_to_tensor([t[3] for t in transitions], dtype=np.float32), axis=1)
        new_states = tf.convert_to_tensor([t[4] for t in transitions])

        # One-step TD target and advantage, computed outside the gradient
        # tape so they act as constants during optimisation.
        # NOTE(review): `done` is not stored, so the target also bootstraps
        # from V(s') on the last transition of an episode - confirm intended.
        discounted_rewards = rewards + gamma * self._model(new_states)[1]
        adv = discounted_rewards - self._model(states)[1]

        for _ in range(ppo_epochs):
            indices = np.arange(len(transitions))
            np.random.shuffle(indices)

            for batch in self._gen_trajectory(indices, trajectory_size):
                batch_states = tf.gather(states, batch)
                batch_actions = tf.gather(actions, batch)
                batch_adv = tf.gather(adv, batch)
                batch_targets = tf.gather(discounted_rewards, batch)

                with tf.GradientTape() as tape:
                    # A single forward pass yields both actor and critic outputs.
                    params, values = self._model(batch_states)
                    a_logp = self._action_log_probs(params, batch_actions)

                    # TASK 4 & 3: action loss
                    if ppo:
                        ratio = tf.exp(a_logp - tf.gather(old_a_logp, batch))
                        surr1 = ratio * batch_adv
                        surr2 = tf.clip_by_value(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * batch_adv
                        action_loss = tf.reduce_mean(-tf.minimum(surr1, surr2))
                    else:
                        action_loss = tf.reduce_mean(-(a_logp * batch_adv))

                    # Value loss
                    value_loss = tf.reduce_mean(tf.losses.mse(batch_targets, values))

                    # Combined loss
                    loss = action_loss + 2 * value_loss

                g = tape.gradient(loss, self._model.trainable_variables)
                self._model.optimizer.apply_gradients(zip(g, self._model.trainable_variables))

    @staticmethod
    def _plot_rewards(episode_rewards, moving_average_range, model_dir):
        """Plot per-episode rewards with their moving average, save and show the figure."""
        x_axis = np.arange(len(episode_rewards))
        # Window covers the last `moving_average_range` episodes, or fewer
        # at the start of training.
        moving_averages = [
            np.mean(episode_rewards[max(0, i - (moving_average_range - 1)):i + 1])
            for i in range(len(episode_rewards))
        ]
        plt.figure(1, figsize=(16, 9))
        plt.plot(x_axis, episode_rewards, label='Episode reward')
        plt.plot(x_axis, moving_averages, color='red', label=f'{moving_average_range}-episode moving average')
        plt.title('Training Performance')
        plt.xlabel('Episode')
        plt.ylabel('Score')
        plt.legend(loc='upper left')
        plt.savefig(model_dir / 'rewards.jpg')
        plt.show()

    def train(self,
              render = False,
              ppo = False,
              episodes: int = 1000,
              log_interval: int = 10,
              model_dir: str = 'models',
              save_interval: int = 100,
              buffer_size: int = 2000,
              trajectory_size: int = 128,
              gamma: float = 0.99,
              ppo_epochs: int = 10,
              clip_epsilon: float = 0.1):
        """Train the agent with PPO (``ppo=True``) or vanilla policy gradient.

        Transitions are accumulated across episodes; each time
        ``buffer_size`` of them have been collected, the model is optimised
        on that buffer and the buffer is cleared.

        Args:
            render: Render the environment at every step.
            ppo: Use the clipped PPO objective instead of vanilla policy gradient.
            episodes: Number of episodes to run.
            log_interval: Print statistics every this many episodes.
            model_dir: Parent directory; a timestamped sub-directory is
                created for checkpoints, rewards and the plot.
            save_interval: Save a checkpoint every this many episodes.
            buffer_size: Number of transitions collected before each update.
            trajectory_size: Mini-batch size used during an update.
            gamma: Discount factor.
            ppo_epochs: Passes over the buffer per update.
            clip_epsilon: PPO clipping range.
        """
        model_dir = Path(model_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
        model_dir.mkdir(parents=True, exist_ok=True)

        training_start_time = datetime.datetime.now()
        print('Started training at', training_start_time.strftime('%d-%m-%Y %H:%M:%S'))

        # Keep track of some stats
        episode_rewards = []
        moving_average_range = 50

        transitions = []

        for episode in range(1, episodes + 1):

            # Reset environment
            observation = self._env.reset()
            episode_reward = 0

            done = False
            while not done:
                if render:
                    self._env.render()

                # Choose action
                action, log_prob = self.sample_actions(observation)

                # The policy samples in [0, 1]; the first component is
                # rescaled to [-1, 1] before being sent to the environment.
                env_action = action.numpy()
                env_action[0] = np.interp(env_action[0], [0., 1.], [-1., 1.])

                # Perform action
                new_observation, reward, done, _ = self._env.step(env_action)
                episode_reward += reward

                # Store the raw (un-rescaled) action so that its
                # log-probability can be re-evaluated during the update.
                transitions.append((observation, action, log_prob, reward, new_observation))
                if len(transitions) >= buffer_size:
                    print('learning!')
                    self._update_policy(transitions, ppo, trajectory_size, gamma, ppo_epochs, clip_epsilon)
                    transitions.clear()

                observation = new_observation

            episode_rewards.append(episode_reward)

            # Print some statistics
            if not episode % log_interval:
                print(f'Episode {episode} | Reward: {episode_reward:.02f} | Moving Average: {np.average(episode_rewards[-moving_average_range:]):.02f}')

            # Save model
            if not episode % save_interval:
                self._model.save(model_dir / f'episode-{episode}.h5')

        # Save final model
        self._model.save(model_dir / 'model.h5')

        training_end_time = datetime.datetime.now()
        print('Finished training at', training_end_time.strftime('%d-%m-%Y %H:%M:%S'))
        print('Total training time:', training_end_time - training_start_time)
        np.savetxt(model_dir / 'rewards.txt', episode_rewards)

        # Plot statistics
        self._plot_rewards(episode_rewards, moving_average_range, model_dir)





In [None]:
if __name__ == '__main__':

    # Create environment (slightly modified version of CarRacing environment)
    env = CustomCarRacing()

    try:
        # Create and train agent (PPO or vanilla policy gradient, per ppo_train)
        network = Network(env)
        network.train(episodes=2000, ppo=ppo_train, trajectory_size=512, render=True)
    finally:
        # Release the environment's resources even if training fails or is
        # interrupted.
        env.close()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 32, 32, 4)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 15, 15, 8)    520         ['input_1[0][0]']                
                                                                                                  
 conv2d_1 (Conv2D)              (None, 7, 7, 16)     1168        ['conv2d[0][0]']                 
                                                                                                  
 conv2d_2 (Conv2D)              (None, 3, 3, 32)     4640        ['conv2d_1[0][0]']               
                                                                                              



Episode 10 | Reward: -10.56 | Moving Average: -24.58
Episode 20 | Reward: -16.94 | Moving Average: -26.95
learning!
Episode 30 | Reward: -27.61 | Moving Average: -27.83
Episode 40 | Reward: -18.06 | Moving Average: -27.75
learning!
Episode 50 | Reward: -33.68 | Moving Average: -28.03
Episode 60 | Reward: -63.19 | Moving Average: -26.99
learning!
Episode 70 | Reward: -28.31 | Moving Average: -25.99
Episode 80 | Reward: -12.11 | Moving Average: -24.24
learning!
Episode 90 | Reward: -29.98 | Moving Average: -21.78
Episode 100 | Reward: -18.53 | Moving Average: -19.77
learning!
Episode 110 | Reward: -10.51 | Moving Average: -18.85
Episode 120 | Reward: 3.60 | Moving Average: -16.44
learning!
Episode 130 | Reward: -6.28 | Moving Average: -15.03
Episode 140 | Reward: 1.40 | Moving Average: -13.84
Episode 150 | Reward: -8.12 | Moving Average: -11.16
Episode 160 | Reward: -11.96 | Moving Average: -9.79
learning!
Episode 170 | Reward: -13.87 | Moving Average: -8.96
Episode 180 | Reward: -9.71 |