In [1]:
import os

import webotsgym as wg

from webotsgym.config import WebotConfig
from webotsgym.environment import WebotsEnv
from webotsgym.evaluate import Evaluate, EvaluateMats, EvaluatePJ0
from webotsgym.action import DiscreteAction, ContinuousAction
from webotsgym.observation import Observation

import numpy as np
import matplotlib.pyplot as plt

import gym
import stable_baselines
from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO
from stable_baselines.common.env_checker import check_env
from stable_baselines.common.policies import MlpPolicy

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
def exponential_decay(x, N0=1, lambda_=5):
    """Evaluate the exponential decay curve ``N0 * exp(-lambda_ * x)``.

    Parameters
    ----------
    x : scalar or array-like accepted by ``np.exp``
        Point(s) at which the curve is evaluated.
    N0 : number, optional
        Value of the curve at ``x == 0``.
    lambda_ : number, optional
        Decay rate; larger values make the curve fall off faster.

    Returns
    -------
    The decayed value(s), with the same shape as ``x``.
    """
    exponent = -lambda_ * x
    return N0 * np.exp(exponent)

class MyObs(Observation):
    """Observation class passed to ``WebotsEnv`` as ``observation_class``.

    It adds no behavior of its own: construction is delegated to
    ``Observation`` and a reference to the environment is kept on the
    instance afterwards.
    """

    def __init__(self, env):
        # Let the base class initialise first, then (re)bind the environment
        # reference on this instance.
        super().__init__(env)
        self.env = env

class MyEval(Evaluate):
    """Reward and termination rules for the "find target" task.

    Per step the reward is a constant step penalty plus a shaping bonus that
    decays exponentially with the distance to the target and with the
    absolute speed, minus a penalty while the robot is touching something.
    Reaching the target (close, slow and not touching) yields a fixed bonus
    and ends the episode. The episode is also ended once the accumulated
    reward drops below a lower bound.
    """

    # Reward-shaping constants (values identical to the former inline literals).
    _STEP_PENALTY = -1          # base reward of every non-terminal step
    _TARGET_REWARD = 10000      # returned when the target is reached
    _SHAPING_SCALE = 2500       # maximum shaping bonus (distance 0, speed 0)
    _DISTANCE_DECAY = 40        # decay rate applied to the target distance
    _SPEED_DECAY = 10           # decay rate applied to the absolute speed
    _TOUCHING_PENALTY = 5       # subtracted while state.touching is truthy
    _TARGET_RADIUS = 0.1        # distance below which the target counts as hit
    _MAX_TARGET_SPEED = 0.05    # speed below which the robot counts as stopped
    _MIN_TOTAL_REWARD = -10000  # episode is aborted below this total reward

    def __init__(self, env, config: WebotConfig = None):
        """Create the evaluator.

        Parameters
        ----------
        env
            Environment whose ``state``, ``total_reward`` and
            ``get_target_distance`` are read by this evaluator.
        config : WebotConfig, optional
            Configuration forwarded to ``Evaluate``. A fresh ``WebotConfig``
            is created per instance when omitted, instead of sharing one
            default object that was built at class-definition time.
        """
        if config is None:
            config = WebotConfig()
        super(MyEval, self).__init__(env, config)
        # NOTE(review): calc_reward can return up to 10000, which lies outside
        # this declared range - confirm which of the two is intended.
        self.reward_range = (-1000, 1000)

    def _target_reached(self):
        """Return True when the robot is at the target, slow and not touching."""
        state = self.env.state
        # get_target_distance(False) presumably yields the un-normalised
        # distance (the original code stored it as `distance_abs`) - TODO confirm.
        return (self.env.get_target_distance(False) < self._TARGET_RADIUS
                and abs(state.speed) < self._MAX_TARGET_SPEED
                # Truthiness test, consistent with the collision penalty in
                # calc_reward; `is False` would miss falsy non-bool values.
                and not state.touching)

    def calc_reward(self):
        """Compute the reward for the current environment state.

        Returns
        -------
        number
            ``_TARGET_REWARD`` if the target is reached, otherwise the step
            penalty plus the shaping bonus, minus the touching penalty when
            applicable.
        """
        if self._target_reached():
            return self._TARGET_REWARD

        state = self.env.state
        distance_abs = self.env.get_target_distance(False)

        reward = self._STEP_PENALTY
        # Bonus is largest when the robot is both near the target and slow.
        reward += (self._SHAPING_SCALE
                   * exponential_decay(distance_abs, lambda_=self._DISTANCE_DECAY)
                   * exponential_decay(abs(state.speed), lambda_=self._SPEED_DECAY))

        if state.touching:
            reward -= self._TOUCHING_PENALTY
        return reward

    def check_done(self):
        """Return True when the episode should end.

        The episode ends when the accumulated reward falls below
        ``_MIN_TOTAL_REWARD`` or when the target is reached; the reason is
        printed in either case.
        """
        if self.env.total_reward < self._MIN_TOTAL_REWARD:
            print("reward boundary")
            return True
        if self._target_reached():
            print("target reached")
            return True
        return False
    

# Simulation configuration for this experiment. The attribute meanings below
# are inferred from their names only - TODO confirm against WebotConfig.
config = WebotConfig()
config.fast_simulation = False
config.reset_env_after = 200000
config.num_obstacles = 0
config.world_size = 2
config.world_scaling = 0.5

# Continuous action space using the "steering" direction type with
# relative=False, combined with the custom reward (MyEval) and observation
# (MyObs) classes defined above.
action_class = ContinuousAction(direction_type="steering", relative=False)
# NOTE(review): the recorded cell output ("Accepting on Port", "sending: env")
# suggests that constructing the environment connects to the simulator, so
# this statement has side effects beyond creating the object.
env = WebotsEnv(train=True, 
                action_class=action_class, 
                evaluate_class=MyEval,
                observation_class=MyObs,
                config=config)



Accepting on Port:  10201
sending: env


In [None]:
# Training budget and checkpointing.
time_steps = 499999   # total number of timesteps to train for
chunk_size = 50000    # timesteps per learn() call; a checkpoint follows each
model_name = "Webots_find_target_small"

model = PPO1("MlpPolicy", env)

# Make sure the checkpoint directory exists before the first save.
os.makedirs("models", exist_ok=True)

# Train in chunks until the budget is used up. The previous `while True`
# loop never terminated, so the cell could not finish and `time_steps` was
# never honoured.
steps_done = 0
while steps_done < time_steps:
    steps = min(chunk_size, time_steps - steps_done)
    model.learn(total_timesteps=steps, log_interval=10000)
    # Overwrite the checkpoint after every chunk so progress survives an
    # interrupted run.
    model.save("models/" + model_name)
    steps_done += steps

# model = PPO1.load("models/{}".format('DQN_WebotFakeMini_TRPO_pj1_nReward2_200000'))
# env = MyEnv()
# obs = env.reset()

# env.render()
# done = False
# max_num_steps = 100
# time = 0





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.









Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
sending: reset
Reward ( 250 )	 -0.9999956431018403
Reward ( 500 )	 -0.9999276072178435
Reward ( 750 )	 -0.9976977149669135
Reward ( 1000 )	 -0.991745932390656
Reward ( 1250 )	 -0.9500493189797294
Reward ( 1500 )	 -0.47200347903364703
Reward ( 1750 )	 1.1936482052495285
Reward ( 2000 )	 0.572522583278793
Reward ( 2250 )	 -0.9179160218456048
Reward ( 2500 )	 -5.950659804281127
Reward ( 2750 )	 -5.989975654322327
Reward ( 3000 )	 -5.992769091350721
Reward ( 3250 )	 -5.990092969894424
Reward ( 3500 )	 -5.9889632815410865
Reward ( 3750 )	 -5.993795467640498
reward boundary
sending: reset
Reward ( 4000 )	 -0.9999928867141173
Reward ( 4250 )	 -0.999990369831794
Reward ( 4500 )	 -0.9999920208737757
Reward ( 4750 )	 -0.9999933479919118
Reward ( 5000 )	 -0.9999

Reward ( 28500 )	 -0.9999999999142366
Reward ( 28750 )	 -5.999999999998931
Reward ( 29000 )	 -5.9999999999999565
Reward ( 29250 )	 -5.999999999999966
Reward ( 29500 )	 -5.999999999999966
Reward ( 29750 )	 -5.999999999999961
Reward ( 30000 )	 -5.999999999999966
Reward ( 30250 )	 -5.9999999999999565
reward boundary
sending: reset
Reward ( 30500 )	 -0.9999380921333383
Reward ( 30750 )	 -0.9997003710998621
Reward ( 31000 )	 -0.9999276589499746
Reward ( 31250 )	 -0.9988432333259973
Reward ( 31500 )	 -0.9812268104652674
Reward ( 31750 )	 -0.9736851981460091
Reward ( 32000 )	 -0.9986498478717881
Reward ( 32250 )	 -0.9999962961150726
Reward ( 32500 )	 -5.999999997662187
Reward ( 32750 )	 -5.999999997956034
Reward ( 33000 )	 -5.999999998816634
Reward ( 33250 )	 -5.999999998761673
Reward ( 33500 )	 -5.9999999988351815
Reward ( 33750 )	 -5.999999998184942
reward boundary
sending: reset
sending: reset
Reward ( 34000 )	 -0.999999999115092
Reward ( 34250 )	 -0.9999999991995373
Reward ( 34500 )	 -0.9