In [1]:
import os

import gym
import matplotlib.pyplot as plt
import numpy as np
import stable_baselines
from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO
from stable_baselines.common.env_checker import check_env
from stable_baselines.common.policies import MlpPolicy

import webotsgym as wg
from webotsgym.action import DiscreteAction, ContinuousAction
from webotsgym.config import WebotConfig
from webotsgym.environment import WebotsEnv
from webotsgym.evaluate import Evaluate, EvaluateMats, EvaluatePJ0
from webotsgym.observation import Observation

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
def exponential_decay(x, N0=1, lambda_=5):
    """Evaluate the exponential decay curve ``N0 * exp(-lambda_ * x)``.

    Args:
        x: Point(s) at which to evaluate the curve; anything ``np.exp``
            accepts (scalar or array).
        N0: Value of the curve at ``x == 0``.
        lambda_: Decay rate; larger values make the curve fall off faster.

    Returns:
        The decayed value, with the same shape as ``x``.
    """
    exponent = -lambda_ * x
    return N0 * np.exp(exponent)

class MyObs(Observation):
    """Observation class plugged into ``WebotsEnv`` for this experiment.

    It defers entirely to the base ``Observation`` initialiser and, in
    addition, keeps a reference to the environment on the instance.
    """

    def __init__(self, env):
        super().__init__(env)
        self.env = env

class MyEval(Evaluate):
    """Reward shaping and episode termination for the find-target task.

    Per step the agent receives:
      * ``10000`` once the target counts as reached (see ``_target_reached``);
      * otherwise ``-1`` plus a bonus of at most ``2500`` that decays
        exponentially with the distance to the target (rate 40) and with the
        absolute speed (rate 10), and a further ``-5`` while touching.

    An episode ends when the target is reached or when the environment's
    accumulated reward drops below ``-10000``.
    """

    def __init__(self, env, config: WebotConfig = None):
        """Set up the evaluator.

        Args:
            env: Environment this evaluator reads state from.
            config: Configuration forwarded to the base class. When omitted,
                a fresh ``WebotConfig`` is created for this instance.
        """
        # A `WebotConfig()` default in the signature would be built once at
        # definition time and shared by every evaluator created without an
        # explicit config, so the default is constructed here instead.
        if config is None:
            config = WebotConfig()
        super().__init__(env, config)
        # NOTE(review): calc_reward can return values outside this range
        # (e.g. 10000 on success) -- confirm how reward_range is consumed.
        self.reward_range = (-1000, 1000)

    def _target_reached(self, distance_abs):
        """Return True when the robot is within 0.1 of the target, slower
        than 0.05 and not touching anything."""
        state = self.env.state
        return (distance_abs < 0.1
                and abs(state.speed) < 0.05
                and not state.touching)

    def calc_reward(self):
        """Compute the reward for the current environment state.

        Returns:
            ``10000`` when the target is reached, otherwise the shaped
            step reward described in the class docstring.
        """
        # presumably `False` selects the non-normalised distance -- confirm
        distance_abs = self.env.get_target_distance(False)
        if self._target_reached(distance_abs):
            return 10000

        state = self.env.state
        proximity_bonus = (2500
                           * exponential_decay(distance_abs, lambda_=40)
                           * exponential_decay(abs(state.speed), lambda_=10))
        reward = -1 + proximity_bonus
        if state.touching:
            reward -= 5
        return reward

    def check_done(self):
        """Decide whether the episode is over.

        Returns:
            True if the accumulated reward fell below ``-10000`` or the
            target has been reached, False otherwise.
        """
        if self.env.total_reward < -10000:
            print("reward boundary")
            return True
        if self._target_reached(self.env.get_target_distance(False)):
            print("target reached")
            return True
        return False
    

# --- Training environment -------------------------------------------------
# Small world without obstacles, simulated in fast mode.
config = WebotConfig()
config.world_size = 2
config.world_scaling = 0.5
config.num_obstacles = 0
config.fast_simulation = True
config.reset_env_after = 200000  # presumably a step count -- TODO confirm

# Continuous actions with absolute (non-relative) steering commands.
action_class = ContinuousAction(direction_type="steering", relative=False)

env = WebotsEnv(
    train=True,
    action_class=action_class,
    evaluate_class=MyEval,
    observation_class=MyObs,
    config=config,
)



Accepting on Port:  10201
sending: env
USE FAST MODE


In [3]:
# Name under which the trained model is stored below "models/".
model_name = "Webots_find_target_small"
# Training budget in timesteps (presumably environment steps -- confirm).
time_steps = 499999

# PPO1 agent with the library's MLP policy, bound to the training env.
model = PPO1(policy="MlpPolicy", env=env)





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.









Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [4]:
# Train in fixed-size chunks and checkpoint after every chunk, so an
# interrupted run still leaves the most recent model on disk. The loop stops
# once `time_steps` timesteps have been requested in total, which keeps the
# notebook runnable from top to bottom.
CHUNK_TIMESTEPS = 50000

# Make sure the checkpoint directory exists before the first save.
os.makedirs("models", exist_ok=True)

steps_requested = 0
while steps_requested < time_steps:
    chunk = min(CHUNK_TIMESTEPS, time_steps - steps_requested)
    model.learn(total_timesteps=chunk, log_interval=10000)
    model.save("models/" + model_name)
    steps_requested += chunk

sending: env
USE FAST MODE
Reward ( 250 )	 -0.9999999999596332
Reward ( 500 )	 -0.9966694025518726
Reward ( 750 )	 -5.997706564370685
Reward ( 1000 )	 -5.99798258080306
Reward ( 1250 )	 -5.980571397427766
Reward ( 1500 )	 -5.976038702904289
Reward ( 1750 )	 -5.980439497099049
Reward ( 2000 )	 -5.977194646506338
Reward ( 2250 )	 -5.949293025894477
reward boundary
sending: env
USE FAST MODE
sending: env
USE FAST MODE
sending: env
USE FAST MODE
Reward ( 2500 )	 -0.9999967850039992
Reward ( 2750 )	 -0.9999890334818876
Reward ( 3000 )	 -5.9999999943961075
Reward ( 3250 )	 -5.999999997142746
Reward ( 3500 )	 -5.9999992765373475
Reward ( 3750 )	 -5.999999980368584
Reward ( 4000 )	 -5.99999998130258
Reward ( 4250 )	 -5.999999982118452
Reward ( 4500 )	 -5.999999993426602
reward boundary
sending: env
USE FAST MODE
Reward ( 4750 )	 -0.9999741079750614
Reward ( 5000 )	 -0.8128900993952102
Reward ( 5250 )	 -5.56410779442428
Reward ( 5500 )	 -5.530556168843183
Reward ( 5750 )	 -5.672839071990747
Rew

Reward ( 47250 )	 -5.999999963196058
Reward ( 47500 )	 -0.9997357258553634
Reward ( 47750 )	 -5.999999826658153
Reward ( 48000 )	 -5.999992973174768
Reward ( 48250 )	 -5.9999999648696125
Reward ( 48500 )	 -5.9999999777850395
reward boundary
sending: env
USE FAST MODE
sending: env
USE FAST MODE
Reward ( 48750 )	 -0.9999993641832158
target reached
sending: env
USE FAST MODE
sending: env
USE FAST MODE
sending: env
USE FAST MODE
Reward ( 49000 )	 -0.9999999990588224
Reward ( 49250 )	 -5.999999998821843
Reward ( 49500 )	 -5.9999999999770495
Reward ( 49750 )	 -5.999999994399267
target reached
sending: env
USE FAST MODE
sending: env
USE FAST MODE
Reward ( 50000 )	 -0.9999973454538715
sending: env
USE FAST MODE
Reward ( 50250 )	 -0.9999999971475546
Reward ( 50500 )	 -0.9999999336945035
Reward ( 50750 )	 -5.999999999999631
Reward ( 51000 )	 -5.999999999999333
Reward ( 51250 )	 -5.999999999424606
Reward ( 51500 )	 -5.999999997066218
Reward ( 51750 )	 -0.9995662821194875
Reward ( 52000 )	 -0.9999

Reward ( 93000 )	 0.6974577166350495
Reward ( 93250 )	 -4.51951113351755
Reward ( 93500 )	 -5.772481384153716
target reached
sending: env
USE FAST MODE
Reward ( 93750 )	 -0.9999998939083921
Reward ( 94000 )	 -5.999999943722664
Reward ( 94250 )	 -5.999999978520267
Reward ( 94500 )	 -0.9999999967700836
Reward ( 94750 )	 -0.9134910088927155
Reward ( 95000 )	 -5.980399801114525
Reward ( 95250 )	 -5.998476673155156
Reward ( 95500 )	 -0.8484116843405083
Reward ( 95750 )	 -5.999999973890789
Reward ( 96000 )	 1.654394132151467
target reached
sending: env
USE FAST MODE
sending: env
USE FAST MODE
Reward ( 96250 )	 -0.9999999649536893
Reward ( 96500 )	 -0.9999463275035345
Reward ( 96750 )	 -0.9999963606935737
Reward ( 97000 )	 -0.9999999384957287
Reward ( 97250 )	 -0.9999999967844948
Reward ( 97500 )	 -0.9996180838864437
target reached
sending: env
USE FAST MODE
Reward ( 97750 )	 -0.9999967609349496
Reward ( 98000 )	 -0.9998245614887566
Reward ( 98250 )	 -0.44900177784608597
Reward ( 98500 )	 -5.

reward boundary
sending: env
USE FAST MODE
Reward ( 138250 )	 -0.9999999994522111
Reward ( 138500 )	 -0.9903784488620885
target reached
sending: env
USE FAST MODE
sending: env
USE FAST MODE
Reward ( 138750 )	 -5.99999916675669
Reward ( 139000 )	 2.066185754900603
Reward ( 139250 )	 -5.99836139456715
target reached
sending: env
USE FAST MODE
Reward ( 139500 )	 -5.999999992826931
Reward ( 139750 )	 -0.9999999584173449
Reward ( 140000 )	 -5.999999998215897
Reward ( 140250 )	 -5.99999997133106
Reward ( 140500 )	 -5.999999999408729
Reward ( 140750 )	 -0.9999984273336435
target reached
sending: env
USE FAST MODE
target reached
sending: env
USE FAST MODE
Reward ( 141000 )	 -0.999999999092275
Reward ( 141250 )	 -0.9994530901901664
Reward ( 141500 )	 -0.9999996259992812
Reward ( 141750 )	 -5.999999972955928
Reward ( 142000 )	 -5.999999999911447
Reward ( 142250 )	 -0.9999999995289824
Reward ( 142500 )	 -5.9999999999996225
Reward ( 142750 )	 -5.9999999999987885
Reward ( 143000 )	 -0.9999999981218

Reward ( 181500 )	 -5.999999985003207
Reward ( 181750 )	 -5.9999999812189895
Reward ( 182000 )	 -5.999999996891299
Reward ( 182250 )	 -5.9999967493542
target reached
sending: env
USE FAST MODE
Reward ( 182500 )	 -0.9999999985953514
Reward ( 182750 )	 -5.99999999989737
Reward ( 183000 )	 -5.99999999999946
Reward ( 183250 )	 -0.9996756021969969
Reward ( 183500 )	 -0.9999999952965577
Reward ( 183750 )	 -5.999999999993154
Reward ( 184000 )	 -0.9845419897476965
Reward ( 184250 )	 -5.999999987378381
Reward ( 184500 )	 -5.999999994622201
Reward ( 184750 )	 -5.9999997344533575
Reward ( 185000 )	 -0.9999983389906458
reward boundary
sending: env
USE FAST MODE
Reward ( 185250 )	 -0.9999924267035205
Reward ( 185500 )	 -5.999999954205606
Reward ( 185750 )	 -5.999999964974145
Reward ( 186000 )	 -5.999999994073108
target reached
sending: env
USE FAST MODE
Reward ( 186250 )	 -0.9999999972646888
Reward ( 186500 )	 -5.999999999425349
Reward ( 186750 )	 -5.999999996256283
Reward ( 187000 )	 -5.9999999947

USE FAST MODE
Reward ( 226500 )	 -0.9999970515238286
Reward ( 226750 )	 -0.9468149647091931
Reward ( 227000 )	 -0.8265613792443202
Reward ( 227250 )	 -5.99999999887913
Reward ( 227500 )	 -0.999999024952914
Reward ( 227750 )	 -5.999999992285589
target reached
sending: env
USE FAST MODE
sending: env
USE FAST MODE
Reward ( 228000 )	 -0.9999999813429771
Reward ( 228250 )	 -5.999999955666043
Reward ( 228500 )	 -5.9999999999155555
Reward ( 228750 )	 -0.9999996126301839
target reached
sending: env
USE FAST MODE
Reward ( 229000 )	 -0.9999999887965908
Reward ( 229250 )	 -5.999999999998938
Reward ( 229500 )	 -0.9999987393030887
Reward ( 229750 )	 -0.9994134765091927
target reached
sending: env
USE FAST MODE
Reward ( 230000 )	 -0.9999974252427136
Reward ( 230250 )	 -5.999999979046935
Reward ( 230500 )	 -5.999999994553497
Reward ( 230750 )	 -5.999999999945082
Reward ( 231000 )	 -5.999999993319584
Reward ( 231250 )	 -5.999999979864281
Reward ( 231500 )	 -5.999999999870697
Reward ( 231750 )	 -0.9999

Reward ( 272500 )	 -0.9992067768174477
Reward ( 272750 )	 -0.9629784305382819
Reward ( 273000 )	 -0.9999714172921248
Reward ( 273250 )	 -0.999994302775447
Reward ( 273500 )	 -0.9999995329872785
Reward ( 273750 )	 -5.999999975356813
Reward ( 274000 )	 -5.999999991220921
Reward ( 274250 )	 -5.999999955058825
Reward ( 274500 )	 -5.9999985992321
Reward ( 274750 )	 -0.9999995943302079
Reward ( 275000 )	 -0.9999955845986267
Reward ( 275250 )	 -0.9999949964656717
Reward ( 275500 )	 -0.9879789884748699
Reward ( 275750 )	 -0.9999623228497772
Reward ( 276000 )	 -0.9999997494776093
reward boundary
sending: env
USE FAST MODE
Reward ( 276250 )	 -0.9999977903130028
Reward ( 276500 )	 -0.9990299456167879
Reward ( 276750 )	 -5.991933189118776
Reward ( 277000 )	 -5.999989300243749
Reward ( 277250 )	 -0.9999977445486857
Reward ( 277500 )	 -0.9999911161994994
Reward ( 277750 )	 -0.9999954180872219
Reward ( 278000 )	 -0.999999945369294
Reward ( 278250 )	 -5.999999958696233
Reward ( 278500 )	 -5.9999999400

FileNotFoundError: [Errno 2] No such file or directory: '/home/fabian/uni/ees-pees/controller/build/controller': '/home/fabian/uni/ees-pees/controller/build/controller'

In [None]:
# Restore the trained PPO1 agent from its checkpoint for evaluation.
model = PPO1.load(load_path="models/Webots_find_target_small")

def exponential_decay(x, N0=1, lambda_=5):
    """Evaluate the exponential decay curve ``N0 * exp(-lambda_ * x)``.

    Args:
        x: Point(s) at which to evaluate the curve; anything ``np.exp``
            accepts (scalar or array).
        N0: Value of the curve at ``x == 0``.
        lambda_: Decay rate; larger values make the curve fall off faster.

    Returns:
        The decayed value, with the same shape as ``x``.
    """
    exponent = -lambda_ * x
    return N0 * np.exp(exponent)

class MyObs(Observation):
    """Observation class plugged into ``WebotsEnv`` for this experiment.

    It defers entirely to the base ``Observation`` initialiser and, in
    addition, keeps a reference to the environment on the instance.
    """

    def __init__(self, env):
        super().__init__(env)
        self.env = env

class MyEval(Evaluate):
    """Reward shaping and episode termination for the find-target task.

    Per step the agent receives:
      * ``10000`` once the target counts as reached (see ``_target_reached``);
      * otherwise ``-1`` plus a bonus of at most ``2500`` that decays
        exponentially with the distance to the target (rate 40) and with the
        absolute speed (rate 10), and a further ``-5`` while touching.

    An episode ends when the target is reached or when the environment's
    accumulated reward drops below ``-10000``.
    """

    def __init__(self, env, config: WebotConfig = None):
        """Set up the evaluator.

        Args:
            env: Environment this evaluator reads state from.
            config: Configuration forwarded to the base class. When omitted,
                a fresh ``WebotConfig`` is created for this instance.
        """
        # A `WebotConfig()` default in the signature would be built once at
        # definition time and shared by every evaluator created without an
        # explicit config, so the default is constructed here instead.
        if config is None:
            config = WebotConfig()
        super().__init__(env, config)
        # NOTE(review): calc_reward can return values outside this range
        # (e.g. 10000 on success) -- confirm how reward_range is consumed.
        self.reward_range = (-1000, 1000)

    def _target_reached(self, distance_abs):
        """Return True when the robot is within 0.1 of the target, slower
        than 0.05 and not touching anything."""
        state = self.env.state
        return (distance_abs < 0.1
                and abs(state.speed) < 0.05
                and not state.touching)

    def calc_reward(self):
        """Compute the reward for the current environment state.

        Returns:
            ``10000`` when the target is reached, otherwise the shaped
            step reward described in the class docstring.
        """
        # presumably `False` selects the non-normalised distance -- confirm
        distance_abs = self.env.get_target_distance(False)
        if self._target_reached(distance_abs):
            return 10000

        state = self.env.state
        proximity_bonus = (2500
                           * exponential_decay(distance_abs, lambda_=40)
                           * exponential_decay(abs(state.speed), lambda_=10))
        reward = -1 + proximity_bonus
        if state.touching:
            reward -= 5
        return reward

    def check_done(self):
        """Decide whether the episode is over.

        Returns:
            True if the accumulated reward fell below ``-10000`` or the
            target has been reached, False otherwise.
        """
        if self.env.total_reward < -10000:
            print("reward boundary")
            return True
        if self._target_reached(self.env.get_target_distance(False)):
            print("target reached")
            return True
        return False
    

# --- Evaluation environment -----------------------------------------------
# Same small, obstacle-free world, with fast simulation switched off.
config = WebotConfig()
config.world_size = 2
config.world_scaling = 0.5
config.num_obstacles = 0
config.fast_simulation = False
config.reset_env_after = 200000  # presumably a step count -- TODO confirm

# Continuous actions with absolute (non-relative) steering commands.
action_class = ContinuousAction(direction_type="steering", relative=False)

# NOTE(review): train=True is kept as-is for evaluation -- confirm intended.
env = WebotsEnv(
    train=True,
    action_class=action_class,
    evaluate_class=MyEval,
    observation_class=MyObs,
    config=config,
)

# Roll out the loaded policy for one episode, capped at MAX_EVAL_STEPS steps.
MAX_EVAL_STEPS = 100000

obs = env.reset()

for _ in range(MAX_EVAL_STEPS):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    # Truthiness rather than `is True`: an identity check would miss a
    # truthy flag that is not the `True` singleton (e.g. numpy.bool_).
    if done:
        break