In [1]:
from __future__ import division
#import numpy as np
import pickle
import rospy
import sys
from geometry_msgs.msg import Twist
from sensor_msgs.msg import Imu, JointState
from std_srvs.srv import Empty
from nav_msgs.msg import Odometry
from sensor_msgs.msg import LaserScan
from std_msgs.msg import Float32, Float64
import time
import pandas as pd
from matplotlib import pyplot as plt
from collections import deque
import random
import tensorflow as tf
import numpy as np

import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
import gym
import numpy as np

from gazebo_msgs.msg import ModelStates
from tensorboardX import SummaryWriter
import math

In [2]:
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
# ROS topics on which the left / right wheel velocity commands are published
# (used as Float64 publisher topics by SELF_BALANCING.__init__).
wheel_l = "/self_balancing_ai/wheell_velocity_controller/command"
wheel_r = "/self_balancing_ai/wheelr_velocity_controller/command"


class SELF_BALANCING(object):
    """ROS interface to the simulated two-wheeled self-balancing robot.

    Publishes Float64 velocity commands to both wheel controllers, tracks the
    latest IMU orientation (converted to roll/pitch/yaw), joint positions,
    laser range and model position through subscriber callbacks, and wraps
    the Gazebo reset/pause/unpause services.
    """

    def __init__(self):
        # Velocity-command publishers, one per wheel.
        self.joint1 = rospy.Publisher(wheel_l, Float64, queue_size=1)
        self.joint2 = rospy.Publisher(wheel_r, Float64, queue_size=1)

        # Reusable command messages (one per wheel).
        self.joint1_msg = Float64()
        self.joint2_msg = Float64()

        # Initialise every piece of state *before* subscribing so that a
        # callback firing immediately never sees a half-built object.
        self.distance = 0
        self.tilt = 0
        self.position = 0
        self.postion = 0  # legacy misspelled attribute, kept for backward compatibility
        self.q = []
        self.roll = 0
        self.pitch = 0
        self.yaw = 0
        self.old_roll = 0
        self.vel = 0.0
        self.robot_configuration = []
        # Reference time for the finite-difference roll rate in get_state();
        # set here so get_state() is usable even before reset_pose().
        self.start_time = time.time()

        # Keep a reference to every subscriber instead of overwriting one name.
        self.model_states_sub = rospy.Subscriber(
            "/gazebo/model_states", ModelStates, callback=self.gazebo_callback)
        self.joint_states_sub = rospy.Subscriber(
            "/segway/joint_states", JointState, callback=self.robot_configuration_callback)
        self.imu_sub = rospy.Subscriber(
            "/imu", Imu, callback=self.imu_callback)
        # NOTE(review): "/segzay" looks like a typo for "/segway" (compare the
        # joint_states topic above) -- confirm against the robot description
        # before changing the topic name.
        self.laser_sub = rospy.Subscriber(
            "/segzay/laser/scan", LaserScan, callback=self.laser_callback)
        # Backward-compatible alias: the original code left the laser
        # subscriber bound to this attribute.
        self.subscriber = self.laser_sub

        self.reset = rospy.ServiceProxy("/gazebo/reset_simulation", Empty)
        self.pause = rospy.ServiceProxy("/gazebo/pause_physics", Empty)
        self.unpause = rospy.ServiceProxy("/gazebo/unpause_physics", Empty)

    def laser_callback(self, data):
        """Store the first range reading of the incoming LaserScan message."""
        # Ignore scans without any reading instead of raising IndexError.
        if len(data.ranges) > 0:
            self.distance = data.ranges[0]

    def reset_pose(self):
        """Reset the simulation, command zero wheel velocity and return the state.

        Returns:
            list: ``[roll, pitch, yaw, roll_rate]`` as produced by get_state().
        """
        self.reset()
        self.joint1_msg.data = 0
        self.joint2_msg.data = 0

        self.joint1.publish(self.joint1_msg)
        self.joint2.publish(self.joint2_msg)

        time.sleep(0.001)
        self.start_time = time.time()
        return self.get_state()

    def imu_callback(self, date):
        """Convert the IMU orientation quaternion to roll, pitch and yaw (radians).

        Args:
            date: incoming Imu message; only ``orientation.{x,y,z,w}`` is read.
        """
        self.q = [
            date.orientation.x,
            date.orientation.y,
            date.orientation.z,
            date.orientation.w]
        x, y, z, w = self.q

        t0 = +2.0 * (w * x + y * z)
        t1 = +1.0 - 2.0 * (x * x + y * y)
        self.roll = math.atan2(t0, t1)

        # Clamp to [-1, 1] so rounding errors cannot push asin out of domain.
        t2 = +2.0 * (w * y - z * x)
        t2 = max(-1.0, min(+1.0, t2))
        self.pitch = math.asin(t2)

        t3 = +2.0 * (w * z + x * y)
        t4 = +1.0 - 2.0 * (y * y + z * z)
        self.yaw = math.atan2(t3, t4)

    def execute_action(self, action1, action2):
        """Publish wheel velocity commands.

        Args:
            action1: command for ``joint1`` (the ``wheel_l`` topic).
            action2: command for ``joint2`` (the ``wheel_r`` topic).
        """
        self.joint1_msg.data = action1
        self.joint2_msg.data = action2
        # Each command goes to its own wheel (the two were previously swapped).
        self.joint1.publish(self.joint1_msg)
        self.joint2.publish(self.joint2_msg)

    def get_state(self):
        """Return ``[roll, pitch, yaw, roll_rate]``.

        ``roll_rate`` is a finite difference of roll over the wall-clock time
        elapsed since the previous call (or since reset_pose()).
        """
        now = time.time()
        elapsed = now - self.start_time
        self.start_time = now

        # Snapshot roll once: the IMU callback may update it concurrently.
        roll = self.roll
        if elapsed > 0:
            self.vel = (roll - self.old_roll) / elapsed
        else:
            # No measurable time has passed; avoid ZeroDivisionError.
            self.vel = 0.0
        self.old_roll = roll
        return [roll, self.pitch, self.yaw, self.vel]

    def get_reward(self):
        """Return the reward: the negated absolute roll angle."""
        return -abs(self.roll)

    def robot_configuration_callback(self, data):
        """Store the joint positions of the incoming JointState message."""
        self.robot_configuration = data.position

    def gazebo_callback(self, data):
        """Store the x position of the model at index 1 of the ModelStates message."""
        # NOTE(review): index 1 is hard-coded; presumably the robot is the
        # second model in the world -- confirm, or look it up via data.name.
        if len(data.pose) > 1:
            self.position = data.pose[1].position.x
        

In [4]:
# Training hyperparameters for the cross-entropy policy training below.
HIDDEN_SIZE = 128  # width of the hidden layer, passed to Net
BATCH_SIZE = 16  # number of episodes per batch, passed to iterate_batches
PERCENTILE = 70  # reward percentile passed to filter_batch; episodes below it are dropped

In [5]:
class Net(nn.Module):
    """Single-hidden-layer policy network.

    Maps an observation vector of length ``obs_size`` to ``n_actions`` raw
    scores (no softmax is applied here) through one ReLU-activated hidden
    layer of ``hidden_size`` units.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        # Layers are created in order and passed positionally, so the
        # sub-module names stay "0", "1", "2".
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw action scores for the batch of observations ``x``."""
        scores = self.net(x)
        return scores

In [6]:
# Register this process as a ROS node, create the robot interface (publishers,
# subscribers and Gazebo service proxies) and reset the simulation. The cell
# output is the state list returned by reset_pose().
rospy.init_node('self_balancing_dqn_ai',anonymous=True)
Robot = SELF_BALANCING()
Robot.reset_pose()

[0.0, 0.0, 0.0, 0.0]

In [7]:
# Length of the state list returned by SELF_BALANCING.get_state():
# [roll, pitch, yaw, roll rate].
obs_size = 4
# Number of discrete actions. iterate_batches maps action index a to the wheel
# command (a - 40) / 4.0, i.e. -10.0 .. 9.75 in steps of 0.25.
n_actions = 80

In [8]:
# One finished episode: its accumulated reward and the list of EpisodeStep
# records collected while it ran.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step of an episode: the observation fed to the network and the index
# of the action that was sampled for it.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

def iterate_batches(net, batch_size, action_offset=40, action_scale=4.0,
                    fall_threshold=0.6, max_episode_steps=None):
    """Run the policy on the robot forever, yielding batches of episodes.

    Each step feeds the current observation to ``net``, samples an action
    index from the softmax of its output, converts the index to a wheel
    command ``(index - action_offset) / action_scale`` and sends the same
    command to both wheels. An episode ends when ``abs(Robot.roll)`` exceeds
    ``fall_threshold`` (or, optionally, after ``max_episode_steps`` steps);
    the simulation is then reset.

    Args:
        net: callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of action scores.
        batch_size: number of finished episodes per yielded batch.
        action_offset: action index that maps to a zero command. The default
            of 40 centres an 80-action policy.
        action_scale: divisor converting the centred index to a command.
        fall_threshold: absolute roll (radians) beyond which the episode ends.
        max_episode_steps: optional cap on episode length. ``None`` (default)
            keeps the original behaviour of running until the robot falls;
            without a cap, a policy that never falls never yields a batch.

    Yields:
        list of Episode: ``batch_size`` finished episodes.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = Robot.reset_pose()
    sm = nn.Softmax(dim=1)
    while True:
        obs_v = torch.FloatTensor([obs])
        # Acting needs no gradients; skipping autograd bookkeeping keeps the
        # control loop cheap and avoids holding a graph per step.
        with torch.no_grad():
            act_probs = sm(net(obs_v)).numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        action_robot = float(action - action_offset) / action_scale
        Robot.execute_action(action_robot, action_robot)
        next_obs = Robot.get_state()
        reward = Robot.get_reward()

        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        fallen = abs(Robot.roll) > fall_threshold
        timed_out = (max_episode_steps is not None
                     and len(episode_steps) >= max_episode_steps)
        if fallen or timed_out:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = Robot.reset_pose()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs
        # Pace the control loop.
        time.sleep(0.005)


def filter_batch(batch, percentile):
    """Keep the steps of the best episodes of a batch (cross-entropy method).

    Args:
        batch: iterable of episodes exposing ``reward`` and ``steps``; each
            step exposes ``observation`` and ``action``.
        percentile: reward percentile used as the cut-off; episodes whose
            reward is strictly below it are discarded.

    Returns:
        tuple: ``(observations, actions, reward_bound, reward_mean)`` where
        ``observations`` is a FloatTensor and ``actions`` a LongTensor built
        from the steps of the retained episodes, ``reward_bound`` is the
        percentile cut-off and ``reward_mean`` the mean reward of the batch.
    """
    episode_rewards = [episode.reward for episode in batch]
    reward_mean = float(np.mean(episode_rewards))
    reward_bound = np.percentile(episode_rewards, percentile)

    # Flatten the steps of every episode that reaches the cut-off.
    elite_steps = [
        step
        for episode in batch
        if not episode.reward < reward_bound
        for step in episode.steps
    ]

    train_obs_v = torch.FloatTensor([step.observation for step in elite_steps])
    train_act_v = torch.LongTensor([step.action for step in elite_steps])
    return train_obs_v, train_act_v, reward_bound, reward_mean





In [9]:
    # Cross-entropy-method training: repeatedly collect a batch of episodes
    # with the current policy, keep the episodes at or above the PERCENTILE
    # reward cut-off, and train the network to reproduce their actions.
    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # CrossEntropyLoss takes the raw scores from Net and the sampled action
    # indices as class targets.
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)

    # NOTE(review): iterate_batches is an endless generator and this loop has
    # no stopping condition, so the cell runs until it is interrupted; the
    # trained network is not saved anywhere in this cell.
    for iter_no, batch in enumerate(iterate_batches(net, BATCH_SIZE)):
        print("Iter: {}".format(iter_no))
        # Training set = all (observation, action) pairs of the retained episodes.
        obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, acts_v)
        loss_v.backward()
        optimizer.step()

        # reward_mean is the mean episode reward of the batch; reward_bound is
        # the percentile cut-off used for filtering.
        print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (
            iter_no, loss_v.item(), reward_m, reward_b))

Iter: 0
0: loss=4.243, reward_mean=-13.4, reward_bound=-13.2
Iter: 1
1: loss=4.158, reward_mean=-12.9, reward_bound=-12.2
Iter: 2
2: loss=4.167, reward_mean=-12.8, reward_bound=-12.4
Iter: 3
3: loss=4.027, reward_mean=-12.1, reward_bound=-11.6
Iter: 4
4: loss=3.873, reward_mean=-12.1, reward_bound=-11.2
Iter: 5
5: loss=3.742, reward_mean=-11.5, reward_bound=-11.0
Iter: 6
6: loss=3.549, reward_mean=-10.9, reward_bound=-10.1
Iter: 7
7: loss=3.247, reward_mean=-10.2, reward_bound=-9.2
Iter: 8
8: loss=3.033, reward_mean=-10.2, reward_bound=-9.1
Iter: 9
9: loss=2.685, reward_mean=-9.3, reward_bound=-8.9
Iter: 10
10: loss=2.485, reward_mean=-8.8, reward_bound=-8.4
Iter: 11
11: loss=2.396, reward_mean=-9.1, reward_bound=-8.7
Iter: 12
12: loss=1.939, reward_mean=-8.5, reward_bound=-8.1
Iter: 13
13: loss=1.903, reward_mean=-8.9, reward_bound=-8.4
Iter: 14
14: loss=1.857, reward_mean=-8.1, reward_bound=-8.3
Iter: 15
15: loss=1.611, reward_mean=-8.7, reward_bound=-8.2
Iter: 16
16: loss=1.423, rew

Iter: 134
134: loss=0.626, reward_mean=-7.1, reward_bound=-6.8
Iter: 135
135: loss=0.803, reward_mean=-7.0, reward_bound=-6.7
Iter: 136
136: loss=0.687, reward_mean=-7.3, reward_bound=-7.0
Iter: 137
137: loss=0.667, reward_mean=-7.2, reward_bound=-6.9
Iter: 138
138: loss=0.610, reward_mean=-7.1, reward_bound=-6.7
Iter: 139
139: loss=0.633, reward_mean=-6.9, reward_bound=-6.6
Iter: 140
140: loss=0.549, reward_mean=-7.1, reward_bound=-6.9
Iter: 141
141: loss=0.375, reward_mean=-6.9, reward_bound=-6.6
Iter: 142
142: loss=0.712, reward_mean=-6.9, reward_bound=-6.7
Iter: 143
143: loss=0.530, reward_mean=-6.9, reward_bound=-6.3
Iter: 144
144: loss=0.496, reward_mean=-7.1, reward_bound=-6.7
Iter: 145
145: loss=0.571, reward_mean=-7.2, reward_bound=-6.6
Iter: 146
146: loss=0.500, reward_mean=-6.9, reward_bound=-6.7
Iter: 147
147: loss=0.446, reward_mean=-7.3, reward_bound=-6.7
Iter: 148
148: loss=0.461, reward_mean=-7.4, reward_bound=-7.0
Iter: 149
149: loss=0.416, reward_mean=-7.2, reward_bou

Iter: 265
265: loss=0.305, reward_mean=-7.1, reward_bound=-6.8
Iter: 266
266: loss=0.276, reward_mean=-7.1, reward_bound=-7.0
Iter: 267
267: loss=0.358, reward_mean=-6.7, reward_bound=-6.4
Iter: 268
268: loss=0.312, reward_mean=-7.0, reward_bound=-6.8
Iter: 269
269: loss=0.247, reward_mean=-6.8, reward_bound=-6.4
Iter: 270
270: loss=0.167, reward_mean=-6.8, reward_bound=-6.5
Iter: 271
271: loss=0.246, reward_mean=-6.9, reward_bound=-6.6
Iter: 272
272: loss=0.204, reward_mean=-7.0, reward_bound=-6.5
Iter: 273
273: loss=0.323, reward_mean=-6.9, reward_bound=-6.6
Iter: 274
274: loss=0.218, reward_mean=-7.2, reward_bound=-7.1
Iter: 275
275: loss=0.231, reward_mean=-7.0, reward_bound=-6.6
Iter: 276
276: loss=0.236, reward_mean=-7.0, reward_bound=-6.4
Iter: 277
277: loss=0.307, reward_mean=-7.1, reward_bound=-6.6
Iter: 278
278: loss=0.264, reward_mean=-6.9, reward_bound=-6.4
Iter: 279
279: loss=0.236, reward_mean=-7.3, reward_bound=-6.6
Iter: 280
280: loss=0.257, reward_mean=-6.9, reward_bou

Iter: 396
396: loss=0.187, reward_mean=-6.8, reward_bound=-6.4
Iter: 397
397: loss=0.279, reward_mean=-6.9, reward_bound=-6.7
Iter: 398
398: loss=0.285, reward_mean=-6.9, reward_bound=-6.7
Iter: 399
399: loss=0.286, reward_mean=-6.8, reward_bound=-6.5
Iter: 400
400: loss=0.257, reward_mean=-6.9, reward_bound=-6.6
Iter: 401
401: loss=0.217, reward_mean=-7.3, reward_bound=-6.9
Iter: 402
402: loss=0.300, reward_mean=-6.9, reward_bound=-6.6
Iter: 403
403: loss=0.284, reward_mean=-6.9, reward_bound=-6.7
Iter: 404
404: loss=0.278, reward_mean=-7.1, reward_bound=-6.9
Iter: 405
405: loss=0.222, reward_mean=-6.7, reward_bound=-6.4
Iter: 406
406: loss=0.305, reward_mean=-6.9, reward_bound=-6.6
Iter: 407
407: loss=0.275, reward_mean=-6.8, reward_bound=-6.5
Iter: 408
408: loss=0.178, reward_mean=-7.0, reward_bound=-6.5
Iter: 409
409: loss=0.196, reward_mean=-6.9, reward_bound=-6.4
Iter: 410
410: loss=0.132, reward_mean=-6.9, reward_bound=-6.5
Iter: 411
411: loss=0.243, reward_mean=-7.1, reward_bou

Iter: 527
527: loss=0.248, reward_mean=-6.9, reward_bound=-6.6
Iter: 528
528: loss=0.275, reward_mean=-6.7, reward_bound=-6.5
Iter: 529
529: loss=0.287, reward_mean=-6.8, reward_bound=-6.5
Iter: 530
530: loss=0.236, reward_mean=-6.8, reward_bound=-6.5
Iter: 531
531: loss=0.193, reward_mean=-6.8, reward_bound=-6.7
Iter: 532
532: loss=0.271, reward_mean=-6.7, reward_bound=-6.4
Iter: 533
533: loss=0.280, reward_mean=-7.0, reward_bound=-6.7
Iter: 534
534: loss=0.323, reward_mean=-6.8, reward_bound=-6.5
Iter: 535
535: loss=0.202, reward_mean=-7.1, reward_bound=-6.7
Iter: 536
536: loss=0.269, reward_mean=-6.7, reward_bound=-6.4
Iter: 537
537: loss=0.331, reward_mean=-7.1, reward_bound=-6.8
Iter: 538
538: loss=0.208, reward_mean=-7.0, reward_bound=-6.6
Iter: 539
539: loss=0.251, reward_mean=-7.1, reward_bound=-6.8
Iter: 540
540: loss=0.176, reward_mean=-6.8, reward_bound=-6.5
Iter: 541
541: loss=0.244, reward_mean=-7.1, reward_bound=-6.7
Iter: 542
542: loss=0.290, reward_mean=-6.7, reward_bou

Iter: 658
658: loss=0.139, reward_mean=-7.1, reward_bound=-6.5
Iter: 659
659: loss=0.098, reward_mean=-6.7, reward_bound=-6.5
Iter: 660
660: loss=0.159, reward_mean=-6.7, reward_bound=-6.5
Iter: 661
661: loss=0.125, reward_mean=-6.7, reward_bound=-6.5
Iter: 662
662: loss=0.179, reward_mean=-7.1, reward_bound=-6.9
Iter: 663
663: loss=0.127, reward_mean=-6.7, reward_bound=-6.4
Iter: 664
664: loss=0.141, reward_mean=-6.8, reward_bound=-6.5
Iter: 665
665: loss=0.183, reward_mean=-6.7, reward_bound=-6.4
Iter: 666
666: loss=0.165, reward_mean=-6.8, reward_bound=-6.4
Iter: 667
667: loss=0.174, reward_mean=-6.7, reward_bound=-6.5
Iter: 668
668: loss=0.174, reward_mean=-6.9, reward_bound=-6.7
Iter: 669
669: loss=0.194, reward_mean=-6.8, reward_bound=-6.7
Iter: 670
670: loss=0.185, reward_mean=-7.1, reward_bound=-6.8
Iter: 671
671: loss=0.170, reward_mean=-6.8, reward_bound=-6.4
Iter: 672
672: loss=0.160, reward_mean=-6.8, reward_bound=-6.5
Iter: 673
673: loss=0.114, reward_mean=-6.9, reward_bou

Iter: 789
789: loss=0.084, reward_mean=-6.7, reward_bound=-6.4
Iter: 790
790: loss=0.089, reward_mean=-6.6, reward_bound=-6.4
Iter: 791
791: loss=0.135, reward_mean=-7.0, reward_bound=-6.8
Iter: 792
792: loss=0.107, reward_mean=-6.6, reward_bound=-6.3
Iter: 793
793: loss=0.112, reward_mean=-6.7, reward_bound=-6.5
Iter: 794
794: loss=0.134, reward_mean=-6.8, reward_bound=-6.5
Iter: 795
795: loss=0.142, reward_mean=-6.9, reward_bound=-6.6
Iter: 796
796: loss=0.171, reward_mean=-6.9, reward_bound=-6.7
Iter: 797
797: loss=0.112, reward_mean=-6.6, reward_bound=-6.3
Iter: 798
798: loss=0.124, reward_mean=-6.9, reward_bound=-6.7
Iter: 799
799: loss=0.175, reward_mean=-7.0, reward_bound=-6.7
Iter: 800
800: loss=0.166, reward_mean=-6.8, reward_bound=-6.4
Iter: 801
801: loss=0.089, reward_mean=-6.7, reward_bound=-6.4
Iter: 802
802: loss=0.117, reward_mean=-6.9, reward_bound=-6.5
Iter: 803
803: loss=0.163, reward_mean=-6.6, reward_bound=-6.3
Iter: 804
804: loss=0.104, reward_mean=-6.6, reward_bou

Iter: 920
920: loss=0.069, reward_mean=-6.7, reward_bound=-6.4
Iter: 921
921: loss=0.065, reward_mean=-6.9, reward_bound=-6.5
Iter: 922
922: loss=0.085, reward_mean=-6.6, reward_bound=-6.4
Iter: 923
923: loss=0.115, reward_mean=-6.7, reward_bound=-6.4
Iter: 924
924: loss=0.089, reward_mean=-6.8, reward_bound=-6.5
Iter: 925
925: loss=0.101, reward_mean=-6.9, reward_bound=-6.5
Iter: 926
926: loss=0.125, reward_mean=-6.6, reward_bound=-6.4
Iter: 927
927: loss=0.142, reward_mean=-7.0, reward_bound=-6.7
Iter: 928
928: loss=0.093, reward_mean=-6.7, reward_bound=-6.5
Iter: 929
929: loss=0.173, reward_mean=-6.7, reward_bound=-6.4
Iter: 930
930: loss=0.114, reward_mean=-6.8, reward_bound=-6.4
Iter: 931
931: loss=0.076, reward_mean=-6.7, reward_bound=-6.4
Iter: 932
932: loss=0.128, reward_mean=-6.6, reward_bound=-6.4
Iter: 933
933: loss=0.184, reward_mean=-6.7, reward_bound=-6.4
Iter: 934
934: loss=0.091, reward_mean=-7.0, reward_bound=-6.6
Iter: 935
935: loss=0.070, reward_mean=-6.7, reward_bou

Iter: 1049
1049: loss=0.099, reward_mean=-6.6, reward_bound=-6.4
Iter: 1050
1050: loss=0.064, reward_mean=-6.8, reward_bound=-6.4
Iter: 1051
1051: loss=0.098, reward_mean=-6.6, reward_bound=-6.3
Iter: 1052
1052: loss=0.092, reward_mean=-6.6, reward_bound=-6.4
Iter: 1053
1053: loss=0.072, reward_mean=-6.7, reward_bound=-6.5
Iter: 1054
1054: loss=0.092, reward_mean=-6.7, reward_bound=-6.5
Iter: 1055
1055: loss=0.058, reward_mean=-6.8, reward_bound=-6.5
Iter: 1056
1056: loss=0.062, reward_mean=-6.5, reward_bound=-6.4
Iter: 1057
1057: loss=0.074, reward_mean=-6.7, reward_bound=-6.5
Iter: 1058
1058: loss=0.061, reward_mean=-6.9, reward_bound=-6.5
Iter: 1059
1059: loss=0.088, reward_mean=-6.7, reward_bound=-6.5
Iter: 1060
1060: loss=0.084, reward_mean=-6.5, reward_bound=-6.3
Iter: 1061
1061: loss=0.061, reward_mean=-6.9, reward_bound=-6.5
Iter: 1062
1062: loss=0.171, reward_mean=-6.7, reward_bound=-6.4
Iter: 1063
1063: loss=0.076, reward_mean=-6.9, reward_bound=-6.6
Iter: 1064
1064: loss=0.0

Iter: 1176
1176: loss=0.131, reward_mean=-6.6, reward_bound=-6.5
Iter: 1177
1177: loss=0.086, reward_mean=-6.8, reward_bound=-6.4
Iter: 1178
1178: loss=0.105, reward_mean=-6.9, reward_bound=-6.7
Iter: 1179
1179: loss=0.103, reward_mean=-6.8, reward_bound=-6.5
Iter: 1180
1180: loss=0.145, reward_mean=-6.8, reward_bound=-6.5
Iter: 1181
1181: loss=0.073, reward_mean=-6.8, reward_bound=-6.6
Iter: 1182
1182: loss=0.100, reward_mean=-6.8, reward_bound=-6.5
Iter: 1183
1183: loss=0.160, reward_mean=-6.9, reward_bound=-6.5
Iter: 1184
1184: loss=0.113, reward_mean=-6.5, reward_bound=-6.4
Iter: 1185
1185: loss=0.053, reward_mean=-6.5, reward_bound=-6.4
Iter: 1186
1186: loss=0.085, reward_mean=-6.7, reward_bound=-6.3
Iter: 1187
1187: loss=0.088, reward_mean=-6.7, reward_bound=-6.5
Iter: 1188
1188: loss=0.155, reward_mean=-7.0, reward_bound=-6.6
Iter: 1189
1189: loss=0.049, reward_mean=-7.0, reward_bound=-6.5
Iter: 1190
1190: loss=0.058, reward_mean=-7.0, reward_bound=-6.6
Iter: 1191
1191: loss=0.0

Iter: 1303
1303: loss=0.066, reward_mean=-6.7, reward_bound=-6.3
Iter: 1304
1304: loss=0.038, reward_mean=-6.6, reward_bound=-6.3
Iter: 1305
1305: loss=0.086, reward_mean=-6.7, reward_bound=-6.5
Iter: 1306
1306: loss=0.055, reward_mean=-6.7, reward_bound=-6.6
Iter: 1307
1307: loss=0.048, reward_mean=-6.6, reward_bound=-6.4
Iter: 1308
1308: loss=0.043, reward_mean=-6.8, reward_bound=-6.4
Iter: 1309
1309: loss=0.044, reward_mean=-6.7, reward_bound=-6.4
Iter: 1310
1310: loss=0.037, reward_mean=-6.8, reward_bound=-6.6
Iter: 1311
1311: loss=0.057, reward_mean=-6.7, reward_bound=-6.5
Iter: 1312
1312: loss=0.040, reward_mean=-6.6, reward_bound=-6.4
Iter: 1313
1313: loss=0.035, reward_mean=-6.7, reward_bound=-6.4
Iter: 1314
1314: loss=0.044, reward_mean=-6.7, reward_bound=-6.3
Iter: 1315
1315: loss=0.047, reward_mean=-6.9, reward_bound=-6.6
Iter: 1316
1316: loss=0.048, reward_mean=-6.5, reward_bound=-6.2
Iter: 1317
1317: loss=0.056, reward_mean=-6.5, reward_bound=-6.2
Iter: 1318
1318: loss=0.0

Iter: 1430
1430: loss=0.041, reward_mean=-6.4, reward_bound=-6.3
Iter: 1431
1431: loss=0.044, reward_mean=-6.7, reward_bound=-6.4
Iter: 1432
1432: loss=0.042, reward_mean=-6.8, reward_bound=-6.4
Iter: 1433
1433: loss=0.037, reward_mean=-6.8, reward_bound=-6.4
Iter: 1434
1434: loss=0.035, reward_mean=-6.7, reward_bound=-6.3
Iter: 1435
1435: loss=0.018, reward_mean=-6.9, reward_bound=-6.5
Iter: 1436
1436: loss=0.040, reward_mean=-6.9, reward_bound=-6.4
Iter: 1437
1437: loss=0.040, reward_mean=-6.7, reward_bound=-6.5
Iter: 1438
1438: loss=0.039, reward_mean=-6.8, reward_bound=-6.5
Iter: 1439
1439: loss=0.026, reward_mean=-6.7, reward_bound=-6.4
Iter: 1440
1440: loss=0.039, reward_mean=-6.5, reward_bound=-6.4
Iter: 1441
1441: loss=0.068, reward_mean=-6.5, reward_bound=-6.3
Iter: 1442
1442: loss=0.085, reward_mean=-6.7, reward_bound=-6.4
Iter: 1443
1443: loss=0.043, reward_mean=-6.6, reward_bound=-6.2
Iter: 1444
1444: loss=0.097, reward_mean=-6.6, reward_bound=-6.4
Iter: 1445
1445: loss=0.0

Iter: 1557
1557: loss=0.022, reward_mean=-6.7, reward_bound=-6.3
Iter: 1558
1558: loss=0.030, reward_mean=-6.6, reward_bound=-6.4
Iter: 1559
1559: loss=0.037, reward_mean=-6.7, reward_bound=-6.5
Iter: 1560
1560: loss=0.039, reward_mean=-6.9, reward_bound=-6.4
Iter: 1561
1561: loss=0.032, reward_mean=-6.8, reward_bound=-6.5
Iter: 1562
1562: loss=0.037, reward_mean=-6.7, reward_bound=-6.4
Iter: 1563
1563: loss=0.017, reward_mean=-6.7, reward_bound=-6.4
Iter: 1564
1564: loss=0.042, reward_mean=-6.4, reward_bound=-6.3
Iter: 1565
1565: loss=0.048, reward_mean=-6.5, reward_bound=-6.3
Iter: 1566
1566: loss=0.028, reward_mean=-6.6, reward_bound=-6.4
Iter: 1567
1567: loss=0.038, reward_mean=-6.6, reward_bound=-6.4
Iter: 1568
1568: loss=0.041, reward_mean=-6.9, reward_bound=-6.5
Iter: 1569
1569: loss=0.034, reward_mean=-6.5, reward_bound=-6.4
Iter: 1570
1570: loss=0.023, reward_mean=-6.6, reward_bound=-6.3
Iter: 1571
1571: loss=0.067, reward_mean=-6.4, reward_bound=-6.2
Iter: 1572
1572: loss=0.0

Iter: 1684
1684: loss=0.025, reward_mean=-6.9, reward_bound=-6.4
Iter: 1685
1685: loss=0.012, reward_mean=-6.5, reward_bound=-6.1
Iter: 1686
1686: loss=0.025, reward_mean=-6.6, reward_bound=-6.3
Iter: 1687
1687: loss=0.033, reward_mean=-6.6, reward_bound=-6.4
Iter: 1688
1688: loss=0.019, reward_mean=-6.6, reward_bound=-6.2
Iter: 1689
1689: loss=0.024, reward_mean=-6.8, reward_bound=-6.3
Iter: 1690
1690: loss=0.021, reward_mean=-6.5, reward_bound=-6.2
Iter: 1691
1691: loss=0.025, reward_mean=-6.5, reward_bound=-6.3
Iter: 1692
1692: loss=0.017, reward_mean=-6.7, reward_bound=-6.3
Iter: 1693
1693: loss=0.012, reward_mean=-6.7, reward_bound=-6.5
Iter: 1694
1694: loss=0.018, reward_mean=-6.6, reward_bound=-6.3
Iter: 1695
1695: loss=0.022, reward_mean=-6.7, reward_bound=-6.4
Iter: 1696
1696: loss=0.019, reward_mean=-6.6, reward_bound=-6.3
Iter: 1697
1697: loss=0.042, reward_mean=-6.7, reward_bound=-6.3
Iter: 1698
1698: loss=0.016, reward_mean=-6.6, reward_bound=-6.4
Iter: 1699
1699: loss=0.0

Iter: 1811
1811: loss=0.014, reward_mean=-6.4, reward_bound=-6.3
Iter: 1812
1812: loss=0.019, reward_mean=-6.4, reward_bound=-6.2
Iter: 1813
1813: loss=0.017, reward_mean=-6.6, reward_bound=-6.3
Iter: 1814
1814: loss=0.024, reward_mean=-6.5, reward_bound=-6.3
Iter: 1815
1815: loss=0.015, reward_mean=-6.7, reward_bound=-6.3
Iter: 1816
1816: loss=0.015, reward_mean=-6.6, reward_bound=-6.3
Iter: 1817
1817: loss=0.042, reward_mean=-6.8, reward_bound=-6.4
Iter: 1818
1818: loss=0.012, reward_mean=-6.6, reward_bound=-6.3
Iter: 1819
1819: loss=0.021, reward_mean=-6.7, reward_bound=-6.4
Iter: 1820
1820: loss=0.020, reward_mean=-6.5, reward_bound=-6.2
Iter: 1821
1821: loss=0.012, reward_mean=-6.8, reward_bound=-6.4
Iter: 1822
1822: loss=0.038, reward_mean=-6.8, reward_bound=-6.4
Iter: 1823
1823: loss=0.015, reward_mean=-6.8, reward_bound=-6.3
Iter: 1824
1824: loss=0.016, reward_mean=-6.5, reward_bound=-6.3
Iter: 1825
1825: loss=0.015, reward_mean=-6.4, reward_bound=-6.2
Iter: 1826
1826: loss=0.0

Iter: 1938
1938: loss=0.009, reward_mean=-6.6, reward_bound=-6.4
Iter: 1939
1939: loss=0.030, reward_mean=-6.7, reward_bound=-6.2
Iter: 1940
1940: loss=0.026, reward_mean=-6.9, reward_bound=-6.5
Iter: 1941
1941: loss=0.010, reward_mean=-6.5, reward_bound=-6.4
Iter: 1942
1942: loss=0.019, reward_mean=-6.6, reward_bound=-6.3
Iter: 1943
1943: loss=0.020, reward_mean=-6.4, reward_bound=-6.3
Iter: 1944
1944: loss=0.017, reward_mean=-6.5, reward_bound=-6.3
Iter: 1945
1945: loss=0.016, reward_mean=-6.9, reward_bound=-6.6
Iter: 1946
1946: loss=0.015, reward_mean=-6.6, reward_bound=-6.4
Iter: 1947
1947: loss=0.020, reward_mean=-6.6, reward_bound=-6.3
Iter: 1948
1948: loss=0.014, reward_mean=-6.6, reward_bound=-6.3
Iter: 1949
1949: loss=0.013, reward_mean=-6.5, reward_bound=-6.3
Iter: 1950
1950: loss=0.007, reward_mean=-6.4, reward_bound=-6.1
Iter: 1951
1951: loss=0.006, reward_mean=-6.5, reward_bound=-6.3
Iter: 1952
1952: loss=0.021, reward_mean=-6.5, reward_bound=-6.3
Iter: 1953
1953: loss=0.0

KeyboardInterrupt: 