In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import cv2
import random
import datetime
import os
import time
from dqn import logger
from dqn.commons.schedules import LinearSchedule
from mlagents.envs import UnityEnvironment
from dqn.tf_util import get_session,initialize
from dqn.models import build_q_func
from dqn.utils import adjust_shape
from dqn.build_graph import build_train
from dqn.utils import ObservationInput
from dqn.replay_buffer import ReplayBuffer,PrioritizedReplayBuffer

%matplotlib inline

### Set environment path

Be sure to set `env_name` to the name of the Unity environment file you want to launch.

In [None]:
# Platform sub-folder under ../environment/ that holds the Unity build.
os_ = "Windows"

# Name of the Unity environment binary to launch.
env_name = "../environment/{}/Driving".format(os_)
# Whether to run the environment in training or inference mode.
train_mode = True
#help(UnityEnvironment)

### Start the environment
`UnityEnvironment` launches and begins communication with the environment when instantiated.

Environments contain _brains_ which are responsible for deciding the actions of their associated _agents_. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [None]:
# Instantiating UnityEnvironment launches the binary named by `env_name`
# and opens the communication channel with it.
env = UnityEnvironment(file_name=env_name, worker_id=4, base_port=5005)

# Show the environment parameters.
print(env)

# Use the first brain reported by the environment as the default brain.
default_brain = env.brain_names[0]
brain = env.brains[default_brain]

### Examine the observation and state spaces
We can reset the environment to be provided with an initial set of observations and states for all the agents within the environment. In ML-Agents, _states_ refer to a vector of variables corresponding to relevant aspects of the environment for an agent. Likewise, _observations_ refer to a set of relevant pixel-wise visuals for an agent.

In [None]:
# Reset the environment and keep the info object of the default brain
env_info = env.reset(train_mode=train_mode)[default_brain]

# Examine the state space for the default brain
print("Sensor data (LIDAR): \n{}".format(env_info.vector_observations[0]))

# Examine the observation space for the default brain
Num_obs = len(env_info.visual_observations)

# The original message carried an unfilled "{}" placeholder that was printed
# literally; the images themselves are rendered by matplotlib below.
print("Image data (Front Camera):")
if Num_obs == 0:
    # plt.subplots(1, 0) would raise, so report the situation instead.
    print("No visual observations are provided by this environment.")
else:
    # squeeze=False always returns a 2-D array of axes, so one loop handles
    # both the single-camera and the multi-camera case. The enlarged figure is
    # only requested when several images are shown side by side.
    f, axarr = plt.subplots(1, Num_obs, squeeze=False,
                            figsize=(20, 10) if Num_obs > 1 else None)
    for ax, observation in zip(axarr[0], env_info.visual_observations):
        # Three channels are drawn as a colour image; otherwise only the
        # first channel of the first agent's frame is shown.
        if observation.shape[3] == 3:
            frame = observation[0]
        else:
            frame = observation[0, :, :, 0]
        ax.imshow(frame)
        ax.axis('off')

In [None]:
import os
import tempfile
import zipfile
import cloudpickle

class ActWrapper(object):
    """Callable wrapper around an `act` function that can be saved and loaded.

    The wrapper stores the act function together with the parameters that were
    used to build it, so that the pair can be pickled by `save_act` and rebuilt
    by `load_act`.
    """

    def __init__(self, act, act_params):
        """Store the act callable and the parameters needed to rebuild it."""
        self._act = act
        self._act_params = act_params
        # No recurrent state is kept; exposed as None.
        self.initial_state = None

    @staticmethod
    def load_act(path):
        """Rebuild an ActWrapper from a pickle written by `save_act`.

        Parameters
        ----------
        path: str
            path of the pickle file holding `(model_data, act_params)`.

        Returns
        -------
        ActWrapper wrapping the rebuilt act function.
        """
        # SECURITY: unpickling executes arbitrary code. Only load files that
        # come from a trusted source.
        with open(path, "rb") as f:
            model_data, act_params = cloudpickle.load(f)
        # NOTE(review): `build_act` and `load_variables` are not imported in
        # the notebook cells visible here -- confirm where they come from.
        act = build_act(**act_params)
        # The session is entered and intentionally never exited here
        # (presumably so it stays the default session -- TODO confirm).
        sess = tf.Session()
        sess.__enter__()
        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)

            # Close the archive before the temporary directory is removed;
            # the original left the ZipFile handle open.
            with zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED) as archive:
                archive.extractall(td)
            load_variables(os.path.join(td, "model"))

        return ActWrapper(act, act_params)

    def __call__(self, *args, **kwargs):
        """Forward the call unchanged to the wrapped act function."""
        return self._act(*args, **kwargs)

    def step(self, observation, **kwargs):
        """Act on a single observation.

        The observation is wrapped in a one-element list before being passed
        to the act function. Returns a 4-tuple whose first item is the act
        function's result and whose remaining items are None.
        """
        # DQN doesn't use RNNs so we ignore states and masks
        kwargs.pop('S', None)
        kwargs.pop('M', None)
        return self._act([observation], **kwargs), None, None, None

    def save_act(self, path=None):
        """Save model to a pickle located at `path`.

        When `path` is None the file "model.pkl" inside `logger.get_dir()` is
        used. The pickle holds the zipped variable files and the act
        parameters.
        """
        if path is None:
            path = os.path.join(logger.get_dir(), "model.pkl")

        with tempfile.TemporaryDirectory() as td:
            # NOTE(review): `save_variables` is not imported in the notebook
            # cells visible here -- confirm where it comes from.
            save_variables(os.path.join(td, "model"))
            arc_name = os.path.join(td, "packed.zip")
            with zipfile.ZipFile(arc_name, 'w') as zipf:
                for root, dirs, files in os.walk(td):
                    for fname in files:
                        file_path = os.path.join(root, fname)
                        # Do not pack the archive into itself.
                        if file_path != arc_name:
                            zipf.write(file_path, os.path.relpath(file_path, td))
            with open(arc_name, "rb") as f:
                model_data = f.read()
        with open(path, "wb") as f:
            cloudpickle.dump((model_data, self._act_params), f)

    def save(self, path):
        """Save the variables to `path` via `save_variables`."""
        save_variables(path)
        



In [None]:
def get_obs_state_lidar(env_info):
    """Return the sensor part of the first agent's vector observation as uint8.

    Vector observation layout (the simulator provides 373 values):
      0 ~ 359: LIDAR data (one value per degree)
      360 ~ 362: left warning, right warning, forward warning (0: false, 1: true)
      363: normalized forward distance
      364: speed of the vehicle ahead
      365: speed of the host vehicle
      0 ~ 365 are used as sensor input data
      366 ~ 372 are used to send information
      366: number of overtakes in the episode
      367: number of lane changes in the episode
      368 ~ 372: longitudinal reward, lateral reward, overtake reward,
                 violation reward, collision reward
    """
    first_agent_vector = env_info.vector_observations[0]
    # Drop the last seven informational entries; only sensor data remains.
    sensor_values = first_agent_vector[:-7]
    return np.uint8(sensor_values)

def state_initialization(env_info,Num_stackFram,Num_skipFrame,Num_dataSize):
    """Build the initial stacked state and the frame history for an episode.

    Vector observation layout (the simulator provides 373 values):
      0 ~ 359: LIDAR data (one value per degree)
      360 ~ 362: left warning, right warning, forward warning (0: false, 1: true)
      363: normalized forward distance
      364: speed of the vehicle ahead
      365: speed of the host vehicle
      0 ~ 365 are used as sensor input data
      366 ~ 372 are used to send information
      366: number of overtakes in the episode
      367: number of lane changes in the episode
      368 ~ 372: longitudinal reward, lateral reward, overtake reward,
                 violation reward, collision reward

    Parameters
    ----------
    env_info: object exposing `vector_observations`
    Num_stackFram: int
        number of frames stacked into one state
    Num_skipFrame: int
        distance (in frames) between two stacked frames
    Num_dataSize: int
        length of one sensor vector (one row of the stack)

    Returns
    -------
    (state_stack, state_set): the uint8 array of shape
    (Num_stackFram, Num_dataSize) and the list holding
    Num_skipFrame * Num_stackFram copies of the first sensor vector.
    """
    # The parameter is spelled `Num_stackFram`, but the body used to read
    # `Num_stackFrame`, which is not a local name. Bind it explicitly so the
    # function uses its own argument instead of failing or reading a global.
    Num_stackFrame = Num_stackFram

    # Keep only the sensor data (drop the seven informational entries)
    state = env_info.vector_observations[0][:-7]

    # The history starts out filled with the first observation.
    state_set = [state] * (Num_skipFrame * Num_stackFrame)

    # Stack the frame according to the number of skipping and stacking frames using observation set
    state_stack = np.zeros((Num_stackFrame, Num_dataSize))
    for stack_frame in range(Num_stackFrame):
        state_stack[(Num_stackFrame - 1) - stack_frame, :] = state_set[-1 - (Num_skipFrame * stack_frame)]

    state_stack = np.uint8(state_stack)

    return state_stack, state_set

# Resize input information
def resize_state(env_info, state_set,Num_stackFram,Num_skipFrame,Num_dataSize):
    """Append the newest sensor vector to the history and rebuild the stack.

    Parameters
    ----------
    env_info: object exposing `vector_observations`
    state_set: list
        frame history; it is modified in place (newest frame appended,
        oldest frame removed)
    Num_stackFram: int
        number of frames stacked into one state
    Num_skipFrame: int
        distance (in frames) between two stacked frames
    Num_dataSize: int
        length of one sensor vector (one row of the stack)

    Returns
    -------
    (state_stack, state_set): the uint8 array of shape
    (Num_stackFram, Num_dataSize) and the same, updated history list.
    """
    # The parameter is spelled `Num_stackFram`, but the body used to read
    # `Num_stackFrame`, which is not a local name. Bind it explicitly so the
    # function uses its own argument instead of failing or reading a global.
    Num_stackFrame = Num_stackFram

    # Keep only the sensor data (drop the seven informational entries)
    state = env_info.vector_observations[0][:-7]

    # Add state to the state_set
    state_set.append(state)

    # Stack the frame according to the number of skipping and stacking frames using observation set
    state_stack = np.zeros((Num_stackFrame, Num_dataSize))
    for stack_frame in range(Num_stackFrame):
        state_stack[(Num_stackFrame - 1) - stack_frame, :] = state_set[-1 - (Num_skipFrame * stack_frame)]

    # Drop the oldest frame so the history keeps a constant length.
    del state_set[0]

    state_stack = np.uint8(state_stack)

    return state_stack, state_set



In [None]:
# Code for tensorboard
def setup_summary(Num_plot_episode):
    """Create the TensorBoard scalar summaries for the episode statistics.

    One variable, one scalar summary, one float32 placeholder and one assign
    op are created for each of: average speed, average overtake count,
    average lane-change count and average reward. `Num_plot_episode` only
    appears in the summary tags.

    Returns
    -------
    (summary_placeholders, update_ops, summary_op): the placeholders to feed,
    the ops assigning each placeholder to its variable, and the merged
    summary op.
    """
    tag_suffix = str(Num_plot_episode) + 'episodes'
    tag_prefixes = (
        'Average_Speed/',
        'Average_overtake/',
        'Average_lanechange/',
        'Average_reward/',
    )

    # Variables first, then their summaries, in the order of the tags above.
    summary_vars = [tf.Variable(0.) for _ in tag_prefixes]
    for prefix, variable in zip(tag_prefixes, summary_vars):
        tf.summary.scalar(prefix + tag_suffix, variable)

    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    update_ops = [
        variable.assign(placeholder)
        for variable, placeholder in zip(summary_vars, summary_placeholders)
    ]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op

In [None]:
# ================================================================
# Tensorboard interfacing
# ================================================================
    
import os.path as osp 
import threading, time

def launch_tensorboard_in_background(log_dir):
    """Start a `tensorboard` process reading from `log_dir` without waiting for it."""
    from subprocess import Popen

    command = ['tensorboard', '--logdir', log_dir]
    Popen(command)

def start_tensorboard(session):
    """Write the session graph under `<logger dir>/tb` and launch TensorBoard there.

    Meant to run after the graph has been built: the function first waits
    ten seconds, then creates a summary writer for `session.graph`, merges
    all summaries and starts TensorBoard on the same directory.
    """
    # Wait until graph is setup
    time.sleep(10)

    tensorboard_dir = osp.join(logger.get_dir(), 'tb')
    # The writer is created with the session graph attached.
    graph_writer = tf.summary.FileWriter(tensorboard_dir, graph=session.graph)
    merged_summaries = tf.summary.merge_all()
    launch_tensorboard_in_background(tensorboard_dir)
    


In [None]:
def load_act(path):
    """Load an act function previously saved with `ActWrapper.save_act`.

    Parameters
    ----------
    path: str
        path to the act function pickle

    Returns
    -------
    act: ActWrapper
        the wrapper rebuilt by `ActWrapper.load_act`; calling it forwards to
        the restored act function.
    """
    restored_wrapper = ActWrapper.load_act(path)
    return restored_wrapper


def learn(env,network='mlp',
          seed=None,
          lr=5e-4,
          total_timesteps=10000000,
          buffer_size=100000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1000,
          batch_size=500,
          print_freq=2,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=50000,
          gamma=0.99,
          target_network_update_freq=10000,
          prioritized_replay=True,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=True,
          callback=None,
          load_path=None,
          train_mode = True,
          **network_kwargs
         ):
    """Train a deepq model on the LIDAR/sensor observations of a Unity environment.

    Parameters
    -------
    env: UnityEnvironment
        ML-Agents environment to train on; its first brain is controlled.
    network: string or a function
        neural network to use as a q function approximator; forwarded to
        `build_q_func` together with `network_kwargs`.
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        number of episodes between two TensorBoard reports; also the number
        of finished episodes averaged into the mean reward.
        Set to None to disable reporting (the mean reward is then taken
        over the last 100 finished episodes).
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    checkpoint_path: str or None
        prefix of the run directory; the directory is
        `checkpoint_path + <timestamp> + '_DQN_sensor'` (plain string
        concatenation, so include a trailing separator). If None, a temporary
        directory is used and removed when training ends.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.
    load_path: str
        path to load the model from. (default: None)
    train_mode: bool
        passed to `env.reset`; training, target updates and checkpointing
        only happen when it is True.
    **network_kwargs
        additional keyword arguments to pass to the network builder.
    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.

    """
    # Set the default brain to work with
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    num_actions = brain.vector_action_space_size[0]

    # Create all the functions necessary to train the model

    # Cap the fraction of GPU memory this process may use.
    GPU_fraction = 0.4
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = GPU_fraction

    sess = get_session(config=config)

    # `seed` is documented but used to be ignored. Seed every random source
    # imported by this notebook when a seed is given.
    if seed is not None:
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    q_func = build_q_func(network, **network_kwargs)

    env_info = env.reset(train_mode=train_mode)[default_brain]
    reset = True
    obs = get_obs_state_lidar(env_info)

    # Capture a copy of the first observation outside the closure so that the
    # env object is not serialized by cloudpickle when serializing make_obs_ph.
    observation_space = obs.copy()

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps

        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Timestamp (date - hour/minute/second) identifying this training run
    date_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    run_name = date_time + '_DQN_sensor'

    # Number of finished episodes averaged into `mean_reward`. With
    # print_freq=None the original code crashed on `-print_freq`; fall back
    # to a 100-episode window in that case.
    reward_window = print_freq if print_freq is not None else 100

    episode_rewards = [0.0]
    saved_mean_reward = None
    speed_list = []
    overtake_list = []
    lanechange_list = []

    with tempfile.TemporaryDirectory() as td:
        # Use the requested run directory when a checkpoint path is given and
        # the temporary directory otherwise. The original expression
        # `checkpoint_path + date_time + '_DQN_sensor' or td` could never fall
        # back to `td` and raised a TypeError for checkpoint_path=None.
        if checkpoint_path is not None:
            td = checkpoint_path + run_name
            # Make folder for save data
            os.makedirs(td)

        model_file = os.path.join(td, "model")
        model_saved = False

        # Summary for tensorboard
        summary_placeholders, update_ops, summary_op = setup_summary(print_freq)
        summary_writer = tf.summary.FileWriter(td, sess.graph)

        # Initialize the parameters and copy them to the target network.
        initialize()
        update_target()

        # NOTE(review): `load_state` and `save_state` are not defined or
        # imported in the notebook cells visible here -- confirm where they
        # come from before relying on checkpointing.
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            print('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_state(load_path)
            logger.log('Loaded model from {}'.format(load_path))
            print('Loaded model from {}'.format(load_path))

        try:
            for t in range(total_timesteps):
                if callback is not None:
                    if callback(locals(), globals()):
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not param_noise:
                    update_eps = exploration.value(t)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(num_actions))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True

                action = act(np.array(obs), update_eps=update_eps, **kwargs)[0]
                # Kept so callbacks inspecting locals() still find this name.
                env_action = action
                reset = False

                # Get information for plotting (read before stepping).
                # Assuming the 373-entry layout described in
                # get_obs_state_lidar: -8 is the host vehicle speed, -7 the
                # overtake count and -6 the lane-change count.
                vehicle_speed = 100 * env_info.vector_observations[0][-8]
                num_overtake = env_info.vector_observations[0][-7]
                num_lanechange = env_info.vector_observations[0][-6]

                # Get information for update
                env_info = env.step(action)[default_brain]
                new_obs = get_obs_state_lidar(env_info)
                rew = env_info.rewards[0]
                done = env_info.local_done[0]

                # Store transition in the replay buffer.
                replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                episode_rewards[-1] += rew
                num_episodes = len(episode_rewards)

                # Mean reward of the last `reward_window` finished episodes
                # (the running episode, stored last, is excluded).
                tab = episode_rewards[-reward_window-1:-1]

                if len(tab) == 0:
                    mean_reward = episode_rewards[-1]
                else:
                    mean_reward = round(np.mean(tab), 1)

                speed_list.append(vehicle_speed)
                overtake_list.append(num_overtake)
                lanechange_list.append(num_lanechange)

                if done:
                    # Print informations if terminal
                    print('step: ' + str(t) + ' / '  + 'episode: ' + str(num_episodes) + ' / ' + ' episode_rewards: '
                          + str(episode_rewards[-1]))

                    env_info = env.reset(train_mode=train_mode)[default_brain]
                    reset = True
                    obs = get_obs_state_lidar(env_info)
                    episode_rewards.append(0.0)

                    avg_speed = np.mean(speed_list)
                    avg_overtake = np.mean(overtake_list)
                    avg_lanechange = np.mean(lanechange_list)

                if train_mode and t > learning_starts and t % train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if prioritized_replay:
                        experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                    if prioritized_replay:
                        new_priorities = np.abs(td_errors) + prioritized_replay_eps
                        replay_buffer.update_priorities(batch_idxes, new_priorities)

                if train_mode and t > learning_starts and t % target_network_update_freq == 0:
                    # Update target network periodically.
                    update_target()

                if done and print_freq is not None and num_episodes % print_freq == 0:
                    # The averages were computed in the `done` branch above.
                    tensorboard_info = [avg_speed, avg_overtake, avg_lanechange, mean_reward]

                    for i in range(len(tensorboard_info)):
                        sess.run(update_ops[i], feed_dict = {summary_placeholders[i]: float(tensorboard_info[i])})

                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, t)

                    # Start a fresh averaging window.
                    speed_list = []
                    overtake_list = []
                    lanechange_list = []

                if (train_mode and checkpoint_freq is not None and t > learning_starts
                        and num_episodes >= reward_window and t % checkpoint_freq == 0):

                    if saved_mean_reward is None or mean_reward > saved_mean_reward:

                        if print_freq is not None:
                            logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                       saved_mean_reward, mean_reward))

                            print("Saving model due to mean reward increase: {} -> {}".format(
                                       saved_mean_reward, mean_reward))
                        save_state(model_file)
                        model_saved = True
                        saved_mean_reward = mean_reward
        finally:
            # Close the event file, also on errors, before the temporary
            # directory (if used) is removed.
            summary_writer.close()

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
                print("Restored model with mean reward: {}".format(saved_mean_reward))
            # Restore the best saved model regardless of print_freq; the
            # original only restored it when printing was enabled.
            load_state(model_file)

    return act

In [None]:
# Train the agent; run data is written under 'saved_networks/'.
model = learn(
    env,
    dueling=False,
    checkpoint_path='saved_networks/',
    train_mode=True,
)