In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import cv2
import random
import datetime
import os
import tempfile
import time
#from dqn import logger
#from dqn.commons.schedules import LinearSchedule
from mlagents.envs import UnityEnvironment
#from pg_actor_critic import PolicyGradientActorCritic
#from __future__ import print_function
from collections import deque
%matplotlib inline

### Set environment path

Be sure to set `env_name` to the name of the Unity environment file you want to launch.

In [None]:
# Platform sub-folder under ../environment/ that holds the Unity build.
os_ = "Windows"

# Name of the Unity environment binary to launch.
env_name = "../environment/{}/Driving".format(os_)
# Whether to run the environment in training or inference mode.
train_mode = True

### Start the environment
`UnityEnvironment` launches and begins communication with the environment when instantiated.

Environments contain _brains_ which are responsible for deciding the actions of their associated _agents_. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [None]:
# Launch the Unity binary and start communicating with it.
env = UnityEnvironment(
    file_name=env_name,
    worker_id=2,
    base_port=5005,
)

# Examine environment parameters.
print(env)

# Use the first available brain as the one controlled from Python.
default_brain = env.brain_names[0]
brain = env.brains[default_brain]

In [None]:
def build_actor_network(state_dim,num_actions):
    """Return a function that builds the actor (policy) network.

    The returned `network_fn(states)` creates, in the current variable scope,
    a tanh hidden layer of 20 units followed by a linear layer with
    `num_actions` outputs, and returns the unnormalized action scores.
    """
    hidden_units = 20

    def network_fn(states):
        # Hidden layer: states -> tanh(states . W1 + b1)
        hidden_weights = tf.get_variable(
            "W1", [state_dim, hidden_units],
            initializer=tf.random_normal_initializer())
        hidden_bias = tf.get_variable(
            "b1", [hidden_units],
            initializer=tf.constant_initializer(0))
        hidden = tf.nn.tanh(tf.matmul(states, hidden_weights) + hidden_bias)

        # Output layer: one linear score per action.
        output_weights = tf.get_variable(
            "W2", [hidden_units, num_actions],
            initializer=tf.random_normal_initializer(stddev=0.1))
        output_bias = tf.get_variable(
            "b2", [num_actions],
            initializer=tf.constant_initializer(0))
        return tf.matmul(hidden, output_weights) + output_bias

    return network_fn

def build_critic_network(state_dim,num_actions):
    """Return a function that builds the critic (state-value) network.

    The returned `network_fn(states)` creates, in the current variable scope,
    a tanh hidden layer of 20 units followed by a linear layer with a single
    output, and returns that value estimate. `num_actions` is accepted for
    symmetry with `build_actor_network` but is not used.
    """
    hidden_units = 20

    def network_fn(states):
        # Hidden layer: states -> tanh(states . W1 + b1)
        hidden_weights = tf.get_variable(
            "W1", [state_dim, hidden_units],
            initializer=tf.random_normal_initializer())
        hidden_bias = tf.get_variable(
            "b1", [hidden_units],
            initializer=tf.constant_initializer(0))
        hidden = tf.nn.tanh(tf.matmul(states, hidden_weights) + hidden_bias)

        # Output layer: a single scalar value per state.
        value_weights = tf.get_variable(
            "W2", [hidden_units, 1],
            initializer=tf.random_normal_initializer())
        value_bias = tf.get_variable(
            "b2", [1],
            initializer=tf.constant_initializer(0))
        return tf.matmul(hidden, value_weights) + value_bias

    return network_fn


In [None]:
def get_obs_state_lidar2(env_info):
    """Build a compact 8-value state: the first agent's reading every 45 degrees.

    Vector observation layout (size 373 in this simulator):
      0 ~ 359:   LIDAR data (one ray per degree)
      360 ~ 362: left warning, right warning, forward warning (0: false, 1: true)
      363:       normalized forward distance
      364:       speed of the vehicle ahead
      365:       speed of the host vehicle
      0 ~ 365 are used as sensor input data
      366 ~ 372 are used to send information
      366:       number of overtakes in the episode
      367:       number of lane changes in the episode
      368 ~ 372: longitudinal reward, lateral reward, overtake reward,
                 violation reward, collision reward
    """
    sensor_data = env_info.vector_observations  # fetch the sensor data

    # Indices 0, 45, 90, ..., 315 of the first row.
    return np.array([sensor_data[0, angle] for angle in range(0, 360, 45)])

def get_obs_state_lidar(env_info):
    """Return the first agent's observation without its last 7 entries, cast to uint8.

    Vector observation layout (size 373 in this simulator):
      0 ~ 359:   LIDAR data (one ray per degree)
      360 ~ 362: left warning, right warning, forward warning (0: false, 1: true)
      363:       normalized forward distance
      364:       speed of the vehicle ahead
      365:       speed of the host vehicle
      0 ~ 365 are used as sensor input data
      366 ~ 372 are used to send information
      366:       number of overtakes in the episode
      367:       number of lane changes in the episode
      368 ~ 372: longitudinal reward, lateral reward, overtake reward,
                 violation reward, collision reward
    """
    first_agent_obs = env_info.vector_observations[0]

    # Drop the 7 trailing information entries; only sensor inputs remain.
    sensor_inputs = first_agent_obs[:-7]

    # NOTE(review): the uint8 cast truncates fractional readings (e.g. the
    # normalized distance) -- confirm this quantization is intended.
    return np.uint8(sensor_inputs)

In [None]:
import random
import numpy as np
import tensorflow as tf
import os

class PolicyGradientActorCritic(object):
    """Actor-critic policy-gradient agent built on a TensorFlow 1.x graph.

    The actor network maps states to unnormalized action scores and the critic
    network maps states to a value estimate. Transitions are collected with
    `storeRollout` and consumed by `updateModel`, which trains on the buffered
    rollout one transition at a time and then empties the buffers.
    """

    def __init__(self, session,
                     optimizer,
                     actor_network,
                     critic_network,
                     state_dim,
                     num_actions,
                     init_exp=0.1,         # initial exploration prob
                     final_exp=0.0,        # final exploration prob
                     anneal_steps=500000,    # N steps for annealing exploration
                     discount_factor=0.99, # discount future rewards
                     reg_param=0.001,      # regularization constants
                     max_gradient=5,       # max gradient norms
                     summary_writer=None,
                     train_mode=True,
                     summary_every=1):
        """Build the graph and initialize every global variable.

        Args:
            session: TensorFlow session used for every `run` call.
            optimizer: TensorFlow optimizer used to compute and apply gradients.
            actor_network: callable taking the states tensor and returning action scores.
            critic_network: callable taking the states tensor and returning value estimates.
            state_dim: size of one state vector.
            num_actions: number of discrete actions.
            init_exp: initial probability of picking a uniformly random action.
            final_exp: exploration probability once annealing is finished.
            anneal_steps: number of `updateModel` calls over which exploration is annealed.
            discount_factor: discount applied to future rewards.
            reg_param: weight of the sum-of-squares penalty on the network variables.
            max_gradient: norm to which each gradient is clipped.
            summary_writer: optional summary writer; no summaries are written when None.
            train_mode: when False, no exploration and no training step is performed.
            summary_every: summaries are emitted every `summary_every` calls to `updateModel`.
        """
        # tensorflow machinery
        self.session        = session
        self.optimizer      = optimizer
        self.summary_writer = summary_writer

        # model components
        self.actor_network  = actor_network
        self.critic_network = critic_network

        # training parameters
        self.state_dim       = state_dim
        self.num_actions     = num_actions
        self.discount_factor = discount_factor
        self.max_gradient    = max_gradient
        self.reg_param       = reg_param

        # exploration parameters
        self.exploration  = init_exp
        self.init_exp     = init_exp
        self.final_exp    = final_exp
        self.anneal_steps = anneal_steps

        # counters
        self.train_iteration = 0

        # rollout buffer
        self.state_buffer  = []
        self.reward_buffer = []
        self.action_buffer = []

        # Variables holding the per-episode statistics shown in the summaries;
        # the caller assigns them before calling `updateModel`.
        self.avg_speed=tf.Variable(0.)
        self.avg_overtake=tf.Variable(0.)
        self.avg_lanechange=tf.Variable(0.)
        self.mean_reward=tf.Variable(0.)

        self.train_mode=train_mode

        # create and initialize variables
        self.summary_every = summary_every
        self.create_variables()
        var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        self.session.run(tf.variables_initializer(var_lists))

        # make sure all variables are initialized
        self.session.run(tf.assert_variables_initialized())

        if self.summary_writer is not None:
            # graph was not available when the writer was created
            self.summary_writer.add_graph(self.session.graph)

    def resetModel(self):
        """Clear the rollout buffers, restart exploration and re-initialize all global variables."""
        self.cleanUp()
        self.train_iteration = 0
        self.exploration     = self.init_exp
        var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        self.session.run(tf.variables_initializer(var_lists))

    def create_variables(self):
        """Create placeholders, both networks, the losses, clipped gradients, summaries and the train op."""

        with tf.name_scope("model_inputs"):
            # raw state representation
            self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")

        # rollout action based on current policy
        with tf.name_scope("predict_actions"):
            # initialize actor-critic network
            with tf.variable_scope("actor_network"):
                self.policy_outputs = self.actor_network(self.states)
            with tf.variable_scope("critic_network"):
                self.value_outputs = self.critic_network(self.states)

            # predict actions from policy network
            self.action_scores = tf.identity(self.policy_outputs, name="action_scores")
            # Note 1: tf.multinomial is not good enough to use yet
            # so we don't use self.predicted_actions for now
            self.predicted_actions = tf.multinomial(self.action_scores, 1)

        # get variable list
        actor_network_variables  = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="actor_network")
        critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="critic_network")

        # compute loss and gradients
        with tf.name_scope("compute_pg_gradients"):
            # gradients for selecting action from policy network
            self.taken_actions = tf.placeholder(tf.int32, (None,), name="taken_actions")
            self.discounted_rewards = tf.placeholder(tf.float32, (None,), name="discounted_rewards")

            # Second pass through the same (reused) variables; despite its name,
            # `logprobs` holds the raw actor outputs that are fed as logits below.
            with tf.variable_scope("actor_network", reuse=True):
                self.logprobs = self.actor_network(self.states)

            with tf.variable_scope("critic_network", reuse=True):
                self.estimated_values = self.critic_network(self.states)

            # compute policy loss and regularization loss
            self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logprobs, labels=self.taken_actions)
            self.pg_loss            = tf.reduce_mean(self.cross_entropy_loss)
            self.actor_reg_loss     = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in actor_network_variables])
            self.actor_loss         = self.pg_loss + self.reg_param * self.actor_reg_loss

            # compute actor gradients
            self.actor_gradients = self.optimizer.compute_gradients(self.actor_loss, actor_network_variables)
            # compute advantages A(s) = R - V(s)
            # NOTE(review): `discounted_rewards` is declared (None,) while the
            # critic built by build_critic_network outputs (None, 1); the
            # subtraction broadcasts, which is harmless for the single-transition
            # batches fed by `updateModel` -- revisit before feeding larger batches.
            self.advantages = tf.reduce_sum(self.discounted_rewards - self.estimated_values)
            # compute policy gradients
            for i, (grad, var) in enumerate(self.actor_gradients):
                if grad is not None:
                    self.actor_gradients[i] = (grad * self.advantages, var)

            # compute critic gradients
            self.mean_square_loss = tf.reduce_mean(tf.square(self.discounted_rewards - self.estimated_values))
            self.critic_reg_loss  = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in critic_network_variables])
            self.critic_loss      = self.mean_square_loss + self.reg_param * self.critic_reg_loss
            self.critic_gradients = self.optimizer.compute_gradients(self.critic_loss, critic_network_variables)

            # collect all gradients
            self.gradients = self.actor_gradients + self.critic_gradients

            # clip gradients
            for i, (grad, var) in enumerate(self.gradients):
                # clip gradients by norm
                if grad is not None:
                    self.gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)

            # emit summaries
            tf.summary.scalar('Average_Speed/' + str(self.summary_every) + 'episodes',self.avg_speed)
            tf.summary.scalar('Average_overtake/' + str(self.summary_every) + 'episodes',self.avg_overtake)
            tf.summary.scalar('Average_lanechange/' + str(self.summary_every) + 'episodes',self.avg_lanechange)
            tf.summary.scalar('Average_reward/' + str(self.summary_every) + 'episodes', self.mean_reward)

        # training update
        with tf.name_scope("train_actor_critic"):
            # apply gradients to update both networks
            self.train_op = self.optimizer.apply_gradients(self.gradients)

        self.summarize = tf.summary.merge_all()
        self.no_op = tf.no_op()

    @staticmethod
    def _softmax(scores):
        """Numerically stable softmax over unnormalized scores."""
        shifted = np.exp(scores - np.amax(scores))
        return shifted / np.sum(shifted)

    def sampleAction(self, states):
        """Pick an action for `states` (a batch holding a single state).

        In training mode a uniformly random action is returned with probability
        `self.exploration`. Otherwise the action is sampled from the softmax of
        the actor's scores for the first row of `states`.
        """
        # TODO: use self.predicted_actions when tf.multinomial gets better

        # epsilon-greedy exploration strategy (training only)
        if self.train_mode and random.random() < self.exploration:
            return random.randint(0, self.num_actions-1)

        action_scores = self.session.run(self.action_scores, {self.states: states})[0]
        action_probs = self._softmax(action_scores)
        # np.random.multinomial checks the probabilities in float64, so convert
        # and renormalize to keep float32 rounding from pushing the sum above 1.
        action_probs = np.asarray(action_probs).astype('float64')
        action_probs = action_probs / np.sum(action_probs)
        return np.argmax(np.random.multinomial(1, action_probs))

    def updateModel(self,step):
        """Emit summaries if due, train on the buffered rollout, anneal exploration, clear buffers.

        Args:
            step: global step under which the summary is recorded.
        """
        N = len(self.reward_buffer)

        # compute discounted future rewards (used to approximate the Q value)
        discounted_rewards = np.zeros(N)
        running_return = 0
        for t in reversed(range(N)):
            # future discounted reward from now on
            running_return = self.reward_buffer[t] + self.discount_factor * running_return
            discounted_rewards[t] = running_return

        # whether to calculate summaries
        calculate_summaries = self.summary_writer is not None and self.train_iteration % self.summary_every == 0
        if calculate_summaries:
            summary_str = self.session.run(self.summarize)
            self.summary_writer.add_summary(summary_str, step)

        if self.train_mode:
            # update the networks with the rollout, one transition at a time
            # NOTE(review): range(N-1) leaves the final transition of the rollout
            # out of training; kept as in the original -- confirm it is intended.
            for t in range(N-1):
                # prepare inputs
                states  = self.state_buffer[t][np.newaxis, :]
                actions = np.array([self.action_buffer[t]])
                rewards = np.array([discounted_rewards[t]])

                # perform one update of training
                self.session.run(self.train_op, {
                  self.states:             states,
                  self.taken_actions:      actions,
                  self.discounted_rewards: rewards
                })

        self.annealExploration()
        self.train_iteration += 1

        # clean up
        self.cleanUp()

    def annealExploration(self, stategy='linear'):
        """Linearly interpolate exploration from `init_exp` to `final_exp` over `anneal_steps`.

        `stategy` (sic) is kept for backward compatibility and is not used.
        """
        ratio = max((self.anneal_steps - self.train_iteration)/float(self.anneal_steps), 0)
        self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp

    def storeRollout(self, state, action, reward):
        """Append one (state, action, reward) transition to the rollout buffers."""
        self.action_buffer.append(action)
        self.reward_buffer.append(reward)
        self.state_buffer.append(state)

    def cleanUp(self):
        """Empty the rollout buffers."""
        self.state_buffer  = []
        self.reward_buffer = []
        self.action_buffer = []

    # ================================================================
    # Saving variables
    # ================================================================

    def load_state(self,fname):
        """Restore the session's variables from the TensorFlow checkpoint `fname`."""
        saver = tf.train.Saver()
        saver.restore(self.session, fname)

    def save_state(self,fname):
        """Save the session's variables as a TensorFlow checkpoint at `fname`."""
        dirname = os.path.dirname(fname)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        saver = tf.train.Saver()
        saver.save(self.session, fname)

    # The methods above and below are clearly doing the same thing, and in a rather similar way
    # TODO: ensure there is no subtle differences and remove one

    def save_variables(self,save_path, variables=None, sess=None):
        """Dump `variables` (default: all global variables) to `save_path` with joblib.

        The file holds a dict mapping each variable name to its current value.
        """
        import joblib
        sess = sess or self.session
        variables = variables or tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

        ps = sess.run(variables)
        save_dict = {v.name: value for v, value in zip(variables, ps)}
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        joblib.dump(save_dict, save_path)

    def load_variables(self,load_path, variables=None, sess=None):
        """Assign `variables` (default: all global variables) from a joblib file.

        The file may hold either a list of values, matched to `variables` by
        position, or a dict keyed by variable name.
        """
        import joblib
        sess = sess or self.session
        variables = variables or tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

        # NOTE: joblib.load unpickles the file; only load files from a trusted source.
        loaded_params = joblib.load(os.path.expanduser(load_path))
        restores = []
        if isinstance(loaded_params, list):
            assert len(loaded_params) == len(variables), 'number of variables loaded mismatches len(variables)'
            for d, v in zip(loaded_params, variables):
                restores.append(v.assign(d))
        else:
            for v in variables:
                restores.append(v.assign(loaded_params[v.name]))

        sess.run(restores)

In [None]:
# Scratch cell: draws one float in [0, 1) from the `random` module's global
# generator (advancing its state) and shows it as the cell output.
random.random()

In [None]:

def learn(env,network='mlp',
          seed=None,
          lr=5e-4,
          total_timesteps=2000000,
          buffer_size=100000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1000,
          batch_size=500,
          print_freq=2,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=50000,
          gamma=0.99,
          load_path=None,
          train_mode = True,
         ):
    """Train (or just run) an actor-critic policy-gradient agent on a Unity environment.

    The agent acts on the first brain of `env`, stores every transition, and is
    updated at the end of each episode. Per-episode statistics are written as
    TensorBoard summaries, and in training mode the variables are saved with
    joblib whenever the mean episode reward improves at a checkpoint step.

    Parameters
    -------
    env: UnityEnvironment
        environment to act in; its first brain is used.
    seed: int or None
        when not None, seeds Python's `random`, NumPy and the TensorFlow graph.
        The Unity environment itself is not seeded here.
    total_timesteps: int
        number of environment steps to run.
    print_freq: int
        number of past episodes averaged into the mean reward; also how often
        (in model updates) summaries are written. Must be an integer.
    checkpoint_freq: int or None
        a save is considered every `checkpoint_freq` steps; None disables saving.
    checkpoint_path: str or None
        prefix of the run directory; a timestamp and '_ACTOR_CRTIC_sensor' are
        appended to it. When None a fresh temporary directory is used.
    gamma: float
        discount factor applied to future rewards.
    load_path: str or None
        joblib file to load the variables from before starting.
    train_mode: bool
        passed to `env.reset`; when False the agent neither explores nor trains.
    network, lr, buffer_size, exploration_fraction, exploration_final_eps,
    train_freq, batch_size, learning_starts:
        accepted for backward compatibility and currently unused. The optimizer
        is RMSProp with a fixed learning rate of 0.0001.

    Returns
    -------
    pg_reinforce: PolicyGradientActorCritic
        the agent, holding the best saved variables if any were saved.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        tf.set_random_seed(seed)

    # Limit the fraction of GPU memory this process may allocate.
    GPU_fraction = 0.4
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = GPU_fraction

    sess = tf.Session(config=config)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)

    # Set the default brain to work with
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    env_info = env.reset(train_mode=train_mode)[default_brain]
    state = get_obs_state_lidar(env_info)

    num_actions = brain.vector_action_space_size[0]
    state_dim   = state.shape[0]

    # Folder for checkpoints and TensorBoard data, named after the start time.
    # Without a checkpoint_path the original crashed on `None + str`; fall back
    # to a fresh temporary directory instead.
    if checkpoint_path is None:
        checkpoint_path = tempfile.mkdtemp() + os.sep
    date_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    run_dir = checkpoint_path + date_time + '_ACTOR_CRTIC_sensor'
    os.makedirs(run_dir, exist_ok=True)

    # Summary for tensorboard
    writer = tf.summary.FileWriter(run_dir)

    actor_network = build_actor_network(state_dim, num_actions)
    critic_network = build_critic_network(state_dim, num_actions)

    pg_reinforce = PolicyGradientActorCritic(sess, optimizer, actor_network, critic_network,
                                             state_dim, num_actions,
                                             discount_factor=gamma,
                                             summary_writer=writer, summary_every=print_freq,
                                             train_mode=train_mode)

    # Ops used to push the per-episode statistics into the agent's summary variables.
    summary_vars = [pg_reinforce.avg_speed, pg_reinforce.avg_overtake,
                    pg_reinforce.avg_lanechange, pg_reinforce.mean_reward]
    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    update_ops = [var.assign(ph) for var, ph in zip(summary_vars, summary_placeholders)]

    episode_rewards = [0.0]
    saved_mean_reward = None
    speed_list = []
    overtake_list = []
    lanechange_list = []

    no_reward_since = 0

    model_file = os.path.join(run_dir, "model")
    model_saved = False

    # Checkpoints are joblib dumps written by save_variables, so look for that
    # file rather than for a TensorFlow checkpoint index.
    if os.path.isfile(model_file):
        pg_reinforce.load_variables(model_file)
        print('Loaded model from {}'.format(model_file))
        model_saved = True
    elif load_path is not None:
        pg_reinforce.load_variables(load_path)
        print('Loaded model from {}'.format(load_path))

    # Called with empty buffers: emits the initial summary and advances the counters.
    pg_reinforce.updateModel(0)
    for t in range(total_timesteps):

        # Take action and update exploration to the newest value
        action = pg_reinforce.sampleAction(state[np.newaxis,:])

        # Get information for plotting (read from the observation before stepping)
        vehicle_speed  = 100 * env_info.vector_observations[0][-8]
        num_overtake   = env_info.vector_observations[0][-7]
        num_lanechange = env_info.vector_observations[0][-6]

        # Get information for update
        env_info = env.step(action)[default_brain]
        next_state = get_obs_state_lidar(env_info)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        pg_reinforce.storeRollout(state, action, reward)
        state = next_state

        episode_rewards[-1] += reward
        num_episodes = len(episode_rewards)

        # Mean reward of the last `print_freq` finished episodes; falls back to
        # the running reward while no episode has finished yet.
        tab = episode_rewards[-print_freq-1:-1]
        if len(tab) == 0:
            mean_reward = episode_rewards[-1]
        else:
            mean_reward = round(np.mean(tab), 1)

        speed_list.append(vehicle_speed)
        overtake_list.append(num_overtake)
        lanechange_list.append(num_lanechange)

        if done:
            # Print informations if terminal
            print('step: ' + str(t) + ' / '  + 'episode: ' + str(num_episodes) + ' / ' + ' episode_rewards: ' + str(episode_rewards[-1]))

            env_info = env.reset(train_mode=train_mode)[default_brain]
            state = get_obs_state_lidar(env_info)

            tensorboard_info = [np.mean(speed_list), np.mean(overtake_list),
                                np.mean(lanechange_list), mean_reward]
            for update_op, placeholder, value in zip(update_ops, summary_placeholders, tensorboard_info):
                sess.run(update_op, feed_dict={placeholder: float(value)})

            # if we don't see rewards in consecutive episodes
            # it's likely that the model gets stuck in bad local optima
            # we simply reset the model and try again
            # (the reset-to-zero `else` used to hang off the inner `if`, which
            # cleared the counter after every increment so it never reached 5)
            reset_model = False
            if episode_rewards[-1] <= -300:
                no_reward_since += 1
                if no_reward_since >= 5:
                    reset_model = True
            else:
                no_reward_since = 0

            # Start the bookkeeping of the next episode.
            episode_rewards.append(0.0)
            speed_list = []
            overtake_list = []
            lanechange_list = []

            if reset_model:
                print('Resetting model... start anew!')
                pg_reinforce.resetModel()  # also clears the rollout buffers
                no_reward_since = 0
                continue

            pg_reinforce.updateModel(t)

        if (train_mode and checkpoint_freq is not None and num_episodes >= print_freq and t % checkpoint_freq == 0):

            if saved_mean_reward is None or mean_reward > saved_mean_reward:

                if print_freq is not None:
                    print("Saving model due to mean reward increase: {} -> {}".format(
                               saved_mean_reward, mean_reward))
                pg_reinforce.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_reward

    if model_saved:
        if print_freq is not None:
            print("Restored model with mean reward: {}".format(saved_mean_reward))
        # Restore the best saved variables regardless of whether printing is enabled.
        pg_reinforce.load_variables(model_file)

    # Make sure the buffered summaries reach the event file.
    writer.flush()

    return pg_reinforce

In [None]:
# Run training; the run folder is created under saved_networks/.
pg_reinforce = learn(
    env,
    checkpoint_path='saved_networks/',
    train_mode=True,
)

In [None]:
#np.random.multinomial(1, [1.0 / 3, 2.0 / 3])

In [None]:
#a=[0.31795933,0.16079315,0.21846536,0.15712377,0.14565839]
#a = np.asarray(a).astype('float32')
#a = a/np.sum (a)

In [None]:
#np.random.multinomial(1, a)

In [None]:
#a.astype