In [1]:
import os

import gym
import numpy as np
import stable_baselines
from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO
from stable_baselines.common.env_checker import check_env
from stable_baselines.common.policies import MlpPolicy

import webotsgym as wg
from webotsgym.action import DiscreteAction, ContinuousAction
from webotsgym.config import WebotConfig, SimSpeedMode
from webotsgym.environment import WebotsEnv, WebotsGrid
from webotsgym.evaluate import Evaluate, EvaluateMats, EvaluatePJ0
from webotsgym.observation import Observation

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
def exponential_decay(x, N0=1, lambda_=5):
    """Evaluate the exponential decay curve ``N0 * exp(-lambda_ * x)``.

    Args:
        x: Point (or numpy array of points) at which to evaluate the curve.
        N0: Value of the curve at ``x == 0``.
        lambda_: Decay rate; larger values make the curve fall off faster.

    Returns:
        ``N0 * np.exp(-lambda_ * x)``, with the type ``np.exp`` yields for
        the given input.
    """
    exponent = -lambda_ * x
    return N0 * np.exp(exponent)

def exponential_penalty(x, step_penalty=-1, lambda_=5):
    """Scale a per-step penalty by how far ``x`` is from zero.

    Computes ``step_penalty * (1 - exponential_decay(x, lambda_=lambda_))``.
    The result is 0 at ``x == 0`` and tends towards ``step_penalty`` as
    ``lambda_ * x`` grows.

    Args:
        x: Point (or numpy array of points) at which to evaluate the penalty.
        step_penalty: Penalty approached for large ``lambda_ * x``.
        lambda_: Decay rate forwarded to ``exponential_decay``.

    Returns:
        The scaled penalty.
    """
    remaining = exponential_decay(x, lambda_=lambda_)
    return step_penalty * (1 - remaining)

class MyEval(Evaluate):
    """Reward and termination rules used for this training run.

    Reward (``calc_reward``):
        * ``TARGET_REWARD`` once the raw target distance drops below
          ``TARGET_REACHED_DISTANCE``.
        * Otherwise ``exponential_penalty`` of the normalized target distance,
          minus a quadratic penalty once ``env.gps_visited_count`` exceeds
          ``REVISIT_THRESHOLD``, minus ``ACTION_DENIED_PENALTY`` when
          ``env.state.action_denied`` is ``True``.

    Termination (``check_done``):
        * ``env.time_steps`` has reached ``MAX_TIME_STEPS``,
        * ``env.total_reward`` fell below ``MIN_TOTAL_REWARD``, or
        * the target was reached.
    """

    # Raw (non-normalized) target distance below which the target counts as
    # reached. Shared by calc_reward and check_done so they cannot drift apart.
    TARGET_REACHED_DISTANCE = 0.05
    TARGET_REWARD = 10000

    # Parameters handed to exponential_penalty for the distance-based term.
    STEP_BASE_PENALTY = -1
    DISTANCE_LAMBDA = 3

    # Revisit penalty: REVISIT_WEIGHT * (count - REVISIT_OFFSET) ** 2, applied
    # only when count > REVISIT_THRESHOLD.
    REVISIT_THRESHOLD = 3
    REVISIT_OFFSET = 2
    REVISIT_WEIGHT = 0.2

    ACTION_DENIED_PENALTY = 500

    # Episode cut-offs.
    MAX_TIME_STEPS = 300
    MIN_TOTAL_REWARD = -10000

    def __init__(self, env, config: WebotConfig = None):
        """Create the evaluator.

        Args:
            env: Environment whose state is scored.
            config: Configuration forwarded to ``Evaluate``. When omitted a
                fresh ``WebotConfig()`` is built per instance; a default
                written as ``WebotConfig()`` in the signature would be created
                once and shared by every instance.
        """
        if config is None:
            config = WebotConfig()
        super().__init__(env, config)

    def _target_reached(self):
        """Return True when the raw target distance is below the threshold."""
        distance = self.env.get_target_distance(normalized=False)
        return distance < self.TARGET_REACHED_DISTANCE

    def calc_reward(self):
        """Return the reward for the environment's current state."""
        if self._target_reached():
            return self.TARGET_REWARD

        reward = exponential_penalty(
            x=self.env.get_target_distance(normalized=True),
            step_penalty=self.STEP_BASE_PENALTY,
            lambda_=self.DISTANCE_LAMBDA,
        )

        visits = self.env.gps_visited_count
        if visits > self.REVISIT_THRESHOLD:
            reward -= self.REVISIT_WEIGHT * (visits - self.REVISIT_OFFSET) ** 2

        # Identity check kept from the original: only the literal ``True``
        # triggers the penalty.
        # NOTE(review): confirm action_denied is a plain bool (not 1/np.bool_).
        if self.env.state.action_denied is True:
            reward -= self.ACTION_DENIED_PENALTY
        return reward

    def check_done(self):
        """Return True when the current episode should end."""
        # ``>=`` rather than ``==`` so the episode still ends if the counter
        # ever moves past the limit without landing exactly on it.
        if self.env.time_steps >= self.MAX_TIME_STEPS:
            return True
        if self.env.total_reward < self.MIN_TOTAL_REWARD:
            return True
        if self._target_reached():
            return True
        return False


# Simulation settings for this training run.
config = WebotConfig()
config.sim_mode = SimSpeedMode.RUN
config.reset_env_after = 20000
config.num_obstacles = 12
config.world_size = 8

# Grid environment with train=True; rewards and episode termination are
# delegated to MyEval.
env = WebotsGrid(train=True, config=config, evaluate_class=MyEval)



Accepting on Port:  10201
sending: start env 1


In [None]:
# Training budget: learn in fixed-size chunks and checkpoint after each one,
# so an interrupted run loses at most one chunk of progress.
TOTAL_TIMESTEPS = 500000
TIMESTEPS_PER_CHUNK = 10000
MODEL_DIR = "models/keep"

time_steps = 0
model_name = "PPO_webots_v4"
model_path = "{}/{}".format(MODEL_DIR, model_name)

# Make sure the checkpoint directory exists up front, so the first save cannot
# fail after a full chunk of training has already been spent.
os.makedirs(MODEL_DIR, exist_ok=True)

model = PPO1("MlpPolicy", env, verbose=1, tensorboard_log="./PPO_webots_v4_fabian_tensorboard/")

while time_steps < TOTAL_TIMESTEPS:
    # reset_num_timesteps=False keeps the model's timestep counter running
    # across learn() calls, so logging continues as one run instead of
    # restarting from 0 for every chunk.
    model.learn(total_timesteps=TIMESTEPS_PER_CHUNK, reset_num_timesteps=False)
    model.save(model_path)
    time_steps += TIMESTEPS_PER_CHUNK
    print("++++++++++++++++++ steps so far:", time_steps, " ++++++++++++++++++")





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.










Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

********** Iteration 0 ************
sending: start env 1
sending: start env 1
sending: start env 1
sending: start env 1

Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00618 |      -0.01386 |      1.00e+07 |       0.00084 |       1.38551
     -0.02767 |      -0.01378 |      1.00e+07 |       0.00844 |       1.37818
     -0.03428 |      -0.01366 |      1.00e+07 |       0.02133 |       1.36570
     -0.03478 |      -0.01365 |      1.00e+07 |       0.02229 |       1.36477
Evaluating losses...
     -0.03793 |      -0.01369 |      1.00e+07 |       0.01761 |       1.36931
----------------------------------
| EpLenMean       | 78.3         |
| EpRewMean       | 9.83e+03     |
| EpThisIter      | 3         

----------------------------------
| EpLenMean       | 92.4         |
| EpRewMean       | 8.49e+03     |
| EpThisIter      | 4            |
| EpisodesSoFar   | 10           |
| TimeElapsed     | 176          |
| TimestepsSoFar  | 1024         |
| ev_tdlam_before | -1.79e-06    |
| loss_ent        | 1.3640745    |
| loss_kl         | 0.009231234  |
| loss_pol_entpen | -0.013640745 |
| loss_pol_surr   | -0.025676014 |
| loss_vf_loss    | 10001256.0   |
----------------------------------
********** Iteration 4 ************
sending: start env 1
Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
      0.00056 |      -0.01372 |      3.36e+06 |       0.00021 |       1.37164
     -0.00413 |      -0.01368 |      3.36e+06 |       0.00178 |       1.36825
     -0.00758 |      -0.01366 |      3.36e+06 |       0.00451 |       1.36604
     -0.00989 |      -0.01366 |      3.36e+06 |       0.00568 |       1.36630
Evaluating losses...
     -0.01367 |      -0.0136