In [1]:
import gym
import gym_interf
import numpy as np
import torch
from tqdm import trange

from baselines.common.vec_env.shmem_vec_env import ShmemVecEnv
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env import VecEnvWrapper


DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
N_ENVS = 1
N_STEPS = 128


class StateWrapper(gym.Wrapper):
    def __init__(self, e):
        super().__init__(e)
        self.prev_dist = None
        self.prev_angle = None
        
    def set_noise(self):
        rnd = np.random.rand()
        self.env.add_noise(rnd * 20)
        
    def _permutate(self, state):
        start = np.random.randint(0, state.shape[0])
        result = []
        for i in range(start, start + state.shape[0]):
            result.append(state[i % state.shape[0]])
        return np.asarray(result)
    
    def _fft(self, state):
        state = np.fft.fftn(state)
        return np.concatenate([np.real(state), np.imag(state)], axis=0)
        
    def reset(self, **kwargs):
        self.set_noise()
        
        self.env.x_min = -np.random.uniform(low=0.8, high=3)#-3.57 / 2
        self.env.x_max = np.random.uniform(low=0.8, high=3)#3.57 / 2

        state = self.env.reset(**kwargs)
        state = self._permutate(state)
        #state = self._fft(state)
        
        self.prev_dist = self.env.dist
        self.prev_angle = self.env.angle
        return (
            state, np.array([       
                -self.env.mirror1_screw_x,
                -self.env.mirror1_screw_y,
                -self.env.mirror2_screw_x,
                -self.env.mirror2_screw_y,
            ]), 
            1.0 - self.env.visib
        )
    
    def step(self, action):
        self.set_noise()
            
        state, reward, done, info = self.env.step(action)
        state = self._permutate(state)
        #state = self._fft(state)
        
        delta_dist = self.prev_dist - self.env.dist
        delta_angle = self.prev_angle - self.env.angle
        
        self.prev_dist = self.env.dist
        self.prev_angle = self.env.angle
                
        #reward += delta_dist + delta_angle * 1000
        
        #reward -= 1.0 / 200.0
        
        return [
            state, np.array([       
                -self.env.mirror1_screw_x,
                -self.env.mirror1_screw_y,
                -self.env.mirror2_screw_x,
                -self.env.mirror2_screw_y,
            ]), 
            1.0 - self.env.visib
        ], reward, done, info
    

class MyVecPyTorch(VecEnvWrapper):
    def __init__(self, venv, device):
        """Return only every `skip`-th frame"""
        super(MyVecPyTorch, self).__init__(venv)
        self.device = device
        # TODO: Fix data types

    def reset(self):
        obs = self.venv.reset()
        imgs = np.stack(obs[:,0])
        actions = np.stack(obs[:,1])
        visibs = np.stack(obs[:,2])
        obs = (
            torch.from_numpy(imgs).float().to(self.device),
            torch.from_numpy(actions).to(self.device),
            torch.from_numpy(visibs).to(self.device)
        )
        return obs

    def step_async(self, actions):
        if isinstance(actions, torch.LongTensor):
            # Squeeze the dimension for discrete actions
            actions = actions.squeeze(1)
        actions = actions.cpu().numpy()
        self.venv.step_async(actions)

    def step_wait(self):
        obs, reward, done, info = self.venv.step_wait()
        imgs = np.stack(obs[:,0])
        actions = np.stack(obs[:,1])
        visibs = np.stack(obs[:,2])
        obs = (
            torch.from_numpy(imgs).float().to(self.device),
            torch.from_numpy(actions).to(self.device),
            torch.from_numpy(visibs).to(self.device)
        )
        reward = torch.from_numpy(reward).unsqueeze(dim=1).float()
        return obs, reward, done, info




def make_interf_env(seed):
    env = StateWrapper(gym.make('interf-v1'))
    env.set_calc_reward('visib_minus_1')
    #env.set_calc_reward('delta_visib')
    #env.set_calc_image('gpu')
    env.seed(seed)
    return env


temp_env = make_interf_env(0)
obs = temp_env.reset()
ACTION_SPACE = temp_env.action_space
print(ACTION_SPACE)
OBS_SHAPE = obs[0].shape
N_ACTIONS = ACTION_SPACE.shape[0]


env_lambda = [
    lambda env_seed=env_seed: make_interf_env(seed=env_seed)
    for env_seed in range(N_ENVS)]

envs = SubprocVecEnv(env_lambda, context='fork')
#envs = ShmemVecEnv(env_lambda, context='fork')
envs = MyVecPyTorch(envs, DEVICE)

print(envs.reset()[0].shape)
print(envs.reset()[0].type)




print(DEVICE)
print(obs[0].dtype, obs[0].shape)
#assert obs.shape == (16, 64, 64), obs.shape
#assert obs.dtype == torch.uint8


Box(4,)
torch.Size([1, 16, 64, 64])
<built-in method type of Tensor object at 0x7f10ec2b4048>
cuda
uint8 (16, 64, 64)
SubprocVecEnv worker: got KeyboardInterrupt


In [2]:
import copy
import glob
import os
import time
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

GPU = True
device_idx = 0
if GPU:
    device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")
print(device)


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

cuda:0


In [3]:
from numpy import random

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
    
    def push(self, state, action, reward, next_state, done, action_target, value_target):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done, action_target, value_target)
        self.position = int((self.position + 1) % self.capacity)  # as a ring buffer
    
    def sample(self, batch_size):
        choices = np.array(self.buffer)
        idx = random.choice(len(choices), batch_size)
        batch = choices[idx]
        state, action, reward, next_state, done, action_target, value_target = map(np.stack, zip(*batch)) # stack for each element
        state = state.reshape(-1, 16, 64, 64)
        next_state = next_state.reshape(-1, 16, 64, 64)
        ''' 
        the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ;
        zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ;
        the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ;
        np.stack((1,2)) => array([1, 2])
        '''
        return state, action, reward, next_state, done, action_target, value_target
    
    def __len__(self):
        return len(self.buffer)


In [4]:
class CEM(object):
    """ cross-entropy method, as optimization of the action policy """
    def __init__(self, theta_dim, ini_mean_scale=0.0, ini_std_scale=1.0):
        self.theta_dim = theta_dim
        self.initialize(ini_mean_scale=ini_mean_scale, ini_std_scale=ini_std_scale)
        self.mean = None
        self.std = None

    def initialize(self, ini_mean_scale=0.0, ini_std_scale=0.33):
        self.mean = ini_mean_scale * np.ones(self.theta_dim)
        self.std = ini_std_scale * np.ones(self.theta_dim)
        
    def sample(self):
        # theta = self.mean + np.random.randn(self.theta_dim) * self.std
        theta = self.mean + np.random.normal(size=self.theta_dim) * self.std
        return theta

    def sample_multi(self, n):
        theta_list = []
        for i in range(n):
            theta_list.append(self.sample())
        return np.array(theta_list)

    def update(self, selected_samples):
        self.mean = np.mean(selected_samples, axis=0)
        self.std = np.std(selected_samples, axis=0)  # plus the entropy offset, or else easily get 0 std

        return self.mean, self.std


In [5]:
class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)


def init(module, weight_init, bias_init, gain=1):
    weight_init(module.weight.data, gain=gain)
    bias_init(module.bias.data)
    return module


class QNetwork(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size=512):
        super(QNetwork, self).__init__()
        
        init_ = lambda m: init(
            m, nn.init.orthogonal_, lambda x: nn.init.
            constant_(x, 0), nn.init.calculate_gain('relu')
        )
        
        self.main = nn.Sequential(
            init_(nn.Conv2d(num_inputs, 32, 8, stride=4)), nn.ReLU(),
            init_(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(),
            init_(nn.Conv2d(64, 32, 3, stride=1)), nn.ReLU(), Flatten(),
            init_(nn.Linear(32 * 4 * 4, hidden_size)), nn.ReLU()
        )
        
        self.linear1 = nn.Sequential(
            init_(nn.Linear(hidden_size + num_outputs, 512)), nn.ReLU()
        )
        
        init_ = lambda m: init(
            m, nn.init.orthogonal_, lambda x: nn.init.
            constant_(x, 0)
        )
        
        self.linear2 = init_(nn.Linear(512, 1))
        
    def forward(self, inputs, action):
        inputs = inputs.float()
        x = inputs / torch.mean(inputs, dim=(2, 3), keepdim=True)
        x = self.main(inputs)

        x = torch.cat([x, action], 1)
        x = self.linear1(x)
        x = self.linear2(x)

        return x

In [6]:
class QT_Opt():
    def __init__(self, replay_buffer, obs_shape, action_space, q_lr=3e-4, cem_update_itr=2, select_num=6, num_samples=64):
        self.num_samples = num_samples
        self.select_num = select_num
        self.cem_update_itr = cem_update_itr
        self.replay_buffer = replay_buffer

        num_inputs = obs_shape[0]
        num_outputs = action_space.shape[0]

        self.qnet = QNetwork(num_inputs, num_outputs).to(device) # gpu
        self.target_qnet1 = QNetwork(num_inputs, num_outputs).to(device)
        self.target_qnet2 = QNetwork(num_inputs, num_outputs).to(device)

        self.cem = CEM(theta_dim=num_outputs)  # cross-entropy method for updating

        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=q_lr)
        self.step_cnt = 0

    def update(self, batch_size, gamma=0.9, soft_tau=1e-4, update_delay=10000):
        state, action, reward, next_state, done, action_target, value_target = self.replay_buffer.sample(batch_size)
        #print('action_target', action_target.shape, 'value_target', value_target.shape)
        self.step_cnt += 1

        state = torch.from_numpy(state).to(device)
        next_state_ = torch.from_numpy(next_state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(device)  # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim;
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)

        predict_q = self.qnet(state, action) # predicted Q(s,a) value

        # get argmax_a' from the CEM for the target Q(s', a')
        new_next_action = []
        for i in range(batch_size):      # batch of states, use them one by one, to prevent the lack of memory
            new_next_action.append(self.cem_optimal_action(next_state[i].reshape(-1, 16, 64, 64)))
        new_next_action = torch.FloatTensor(new_next_action).to(device)

        target_q_min = torch.min(
            self.target_qnet1(next_state_, new_next_action),
            self.target_qnet2(next_state_, new_next_action)
        )

        target_q = reward + (1 - done) * gamma * target_q_min

        
        # MSE loss, note that original paper uses cross-entropy loss
        q_loss = ((predict_q - target_q.detach())**2).mean()
        
        ######################
        #value_target = torch.from_numpy(value_target).float().to(device)
        #my_loss = ((predict_q - value_target)**2).mean()
        #q_loss += my_loss
        #####################

        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()

        # update the target nets, according to original paper:
        # one with Polyak averaging, another with lagged/delayed update
        self.target_qnet1 = self.target_soft_update(self.qnet, self.target_qnet1, soft_tau)
        self.target_qnet2 = self.target_delayed_update(self.qnet, self.target_qnet2, update_delay)
        
        return q_loss.detach().cpu().numpy()

    def cem_optimal_action(self, state):
        """ evaluate action wrt Q(s,a) to select the optimal using CEM """
        cuda_states = torch.from_numpy(np.vstack([state] * self.num_samples)).to(device)
        # every time use a new cem, cem is only for deriving the argmax_a'
        self.cem.initialize()
        for itr in range(self.cem_update_itr):
            actions = self.cem.sample_multi(self.num_samples)
            q_values = self.target_qnet1(
                cuda_states,
                torch.from_numpy(actions).float().to(device)
            ).detach().cpu().numpy().reshape(-1) # 2 dim to 1 dim
            max_idx = q_values.argsort()[-1]  # select one maximal q
            idx = q_values.argsort()[-int(self.select_num):]  # select top maximum q
            selected_actions = actions[idx]
            _, _ = self.cem.update(selected_actions)
        optimal_action = actions[max_idx]
        return optimal_action

    def target_soft_update(self, net, target_net, soft_tau):
        """ Soft update the target net """
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(  # copy data value into target parameters
                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
            )

        return target_net

    def target_delayed_update(self, net, target_net, update_delay):
        """ delayed update the target net """
        if self.step_cnt % update_delay == 0:
            for target_param, param in zip(target_net.parameters(), net.parameters()):
                target_param.data.copy_(  # copy data value into target parameters
                    param.data 
                )

        return target_net

    def save_model(self, path):
        torch.save(self.qnet.state_dict(), path)
        torch.save(self.target_qnet1.state_dict(), path)
        torch.save(self.target_qnet2.state_dict(), path)

    def load_model(self, path):
        self.qnet.load_state_dict(torch.load(path))
        self.target_qnet1.load_state_dict(torch.load(path))
        self.target_qnet2.load_state_dict(torch.load(path))
        self.qnet.eval()
        self.target_qnet1.eval()
        self.target_qnet2.eval()


In [7]:
replay_buffer_size = 30000

print(OBS_SHAPE)
print(ACTION_SPACE)

replay_buffer = ReplayBuffer(replay_buffer_size)
qt_opt = QT_Opt(replay_buffer, OBS_SHAPE, ACTION_SPACE)

(16, 64, 64)
Box(4,)


In [8]:
from tqdm import trange
from collections import deque

def evaluate_(model, env, n_games=10):
    n_solved = 0
    n_steps = 0
    dist = 0
    angle = 0
    rewards = 0
    actions = np.zeros(ACTION_SPACE.shape[0])
    
    for i in range(n_games):
        s = env.reset()
        visib = env.env.visib
        istep = 0
        while(True):
            s = s[0]
            s = torch.FloatTensor(s.reshape(1, 16, 64, 64)).to(DEVICE)
            with torch.no_grad():
                action = model.cem_optimal_action(s.cpu())
            
            actions += action
            s, reward, done, info = env.step(action)

            istep += 1
            if done:
                rewards += env.env.visib - visib
                n_solved += istep < 200
                n_steps += istep
                dist += info['dist']
                angle += info['angle_between_beams']

                break
    return (
        n_solved / n_games, 
        n_steps / n_games, 
        dist / n_games, 
        angle / n_games, 
        rewards / n_games, 
        actions / n_steps)

In [9]:
#evaluate_(qt_opt, make_interf_env(123))

In [10]:

from tensorboardX import SummaryWriter
from tqdm import trange

max_episodes  = int(1e7)
batch_size=100
max_steps = 200
episode_rewards = []

train_env = make_interf_env(123)
test_env = make_interf_env(123)


writer = SummaryWriter(
    'logs_v11'
)


for i_episode in trange(0, max_episodes, batch_size):

    episode_reward = 0
    episode_dist = 0
    episode_angle = 0
    episode_actions = np.zeros(ACTION_SPACE.shape[0])
    
    obs, action_target, value_target = train_env.reset()
    state = obs.reshape(1, 16, 64, 64)


    for step in range(max_steps):
        # action = qt_opt.policy.act(state) 
        action = qt_opt.cem_optimal_action(state)
        (obs, next_action_target, next_value_target), reward, done, info = train_env.step(action)
        next_state = obs.reshape(1, 16, 64, 64)

        episode_reward += reward
        episode_actions += action
        
        # only last value needed
        episode_dist = info['dist']
        episode_angle = info['angle_between_beams']
        #episode_reward = info['visib']

        replay_buffer.push(state, action, reward, next_state, done, action_target, [value_target])
        state = next_state

    if len(replay_buffer) > batch_size:
        loss = qt_opt.update(batch_size)
        writer.add_scalar('loss', loss, i_episode)
    
        qt_opt.save_model('saved_model')
        
    writer.add_scalar('reward', episode_reward, i_episode)
    writer.add_scalar('dist', episode_dist, i_episode)
    writer.add_scalar('angle', episode_angle, i_episode)
    
    for i, a in enumerate(episode_actions):
        writer.add_scalar('actions_{}'.format(i), a, i_episode)
            
    print('Episode: {}  | Reward:  {}'.format(i_episode, episode_reward))


  0%|          | 1/100000 [00:01<34:36:47,  1.25s/it]

Episode: 0  | Reward:  -0.0021778969000028053


  0%|          | 2/100000 [00:02<34:21:26,  1.24s/it]

Episode: 100  | Reward:  0.004551178155628139


  0%|          | 3/100000 [00:03<34:19:03,  1.24s/it]

Episode: 200  | Reward:  -0.0004161117232501236


  0%|          | 4/100000 [00:04<33:50:05,  1.22s/it]

Episode: 300  | Reward:  -0.0011419849512813267


  0%|          | 5/100000 [00:06<33:41:36,  1.21s/it]

Episode: 400  | Reward:  0.009610435826826073


  0%|          | 6/100000 [00:07<33:38:41,  1.21s/it]

Episode: 500  | Reward:  0.004200497312558733


  0%|          | 7/100000 [00:08<33:30:25,  1.21s/it]

Episode: 600  | Reward:  0.10879770477992683


  0%|          | 8/100000 [00:09<33:44:13,  1.21s/it]

Episode: 700  | Reward:  -0.0007225370607431053


  0%|          | 9/100000 [00:10<33:32:35,  1.21s/it]

Episode: 800  | Reward:  -0.0017728522999830281


  0%|          | 10/100000 [00:12<33:17:21,  1.20s/it]

Episode: 900  | Reward:  -0.000756861004572094


  0%|          | 11/100000 [00:13<33:34:23,  1.21s/it]

Episode: 1000  | Reward:  -0.0002510094177109547


  0%|          | 12/100000 [00:14<33:34:29,  1.21s/it]

Episode: 1100  | Reward:  0.006658116435754642


  0%|          | 13/100000 [00:15<33:33:41,  1.21s/it]

Episode: 1200  | Reward:  -0.0005079181717002845


  0%|          | 14/100000 [00:16<33:16:52,  1.20s/it]

Episode: 1300  | Reward:  -0.001342895498835896


  band_width_y = InterfEnv.lamb / abs(wave_vector2[1])
  0%|          | 15/100000 [00:18<33:08:44,  1.19s/it]

Episode: 1400  | Reward:  0.00014578103318756315


  0%|          | 16/100000 [00:19<33:14:57,  1.20s/it]

Episode: 1500  | Reward:  0.004392929262466543


  0%|          | 17/100000 [00:20<33:08:19,  1.19s/it]

Episode: 1600  | Reward:  -7.51900988206352e-05


  0%|          | 18/100000 [00:21<32:45:46,  1.18s/it]

Episode: 1700  | Reward:  -0.0017531859058760942


  0%|          | 19/100000 [00:22<32:44:26,  1.18s/it]

Episode: 1800  | Reward:  -0.0021456063793642557


  0%|          | 20/100000 [00:23<32:55:46,  1.19s/it]

Episode: 1900  | Reward:  0.005590466865267746


  0%|          | 21/100000 [00:25<32:38:12,  1.18s/it]

Episode: 2000  | Reward:  -0.00020899750795798827


  0%|          | 22/100000 [00:26<32:48:45,  1.18s/it]

Episode: 2100  | Reward:  0.006353833979554391


  0%|          | 23/100000 [00:27<33:07:49,  1.19s/it]

Episode: 2200  | Reward:  -0.00014030000113186055


  0%|          | 24/100000 [00:28<32:45:42,  1.18s/it]

Episode: 2300  | Reward:  0.0020726667543225414


  0%|          | 25/100000 [00:29<32:40:27,  1.18s/it]

Episode: 2400  | Reward:  0.0005428215194869959


  0%|          | 26/100000 [00:31<33:06:10,  1.19s/it]

Episode: 2500  | Reward:  -0.0007131938718626983


  0%|          | 27/100000 [00:32<33:08:26,  1.19s/it]

Episode: 2600  | Reward:  0.005596607521172149


  0%|          | 28/100000 [00:33<33:04:48,  1.19s/it]

Episode: 2700  | Reward:  0.00035802067418580616


  0%|          | 29/100000 [00:34<32:48:56,  1.18s/it]

Episode: 2800  | Reward:  0.004266876858003821


  0%|          | 30/100000 [00:35<33:17:04,  1.20s/it]

Episode: 2900  | Reward:  -0.0007093679626500257


  0%|          | 31/100000 [00:37<32:51:07,  1.18s/it]

Episode: 3000  | Reward:  0.0021482889628098075


  0%|          | 32/100000 [00:38<33:04:52,  1.19s/it]

Episode: 3100  | Reward:  0.0028515420790263908


  0%|          | 33/100000 [00:39<33:17:43,  1.20s/it]

Episode: 3200  | Reward:  -0.0014226325634901884


  0%|          | 34/100000 [00:40<33:28:41,  1.21s/it]

Episode: 3300  | Reward:  0.04590741531880613


  0%|          | 35/100000 [00:41<33:43:03,  1.21s/it]

Episode: 3400  | Reward:  -0.0007744210606363567


  0%|          | 36/100000 [00:43<33:20:31,  1.20s/it]

Episode: 3500  | Reward:  -0.0015030984991272792


  0%|          | 37/100000 [00:44<33:09:47,  1.19s/it]

Episode: 3600  | Reward:  0.0006909045190034175


  0%|          | 38/100000 [00:45<32:56:07,  1.19s/it]

Episode: 3700  | Reward:  0.0026857933180333255


  0%|          | 39/100000 [00:46<32:46:32,  1.18s/it]

Episode: 3800  | Reward:  -0.001467917427218979


  0%|          | 40/100000 [00:47<32:40:58,  1.18s/it]

Episode: 3900  | Reward:  0.0034386884460804274


  0%|          | 41/100000 [00:48<32:38:49,  1.18s/it]

Episode: 4000  | Reward:  0.003587705212444127


  0%|          | 42/100000 [00:50<33:03:16,  1.19s/it]

Episode: 4100  | Reward:  0.0006469715087158535


  0%|          | 43/100000 [00:51<33:00:13,  1.19s/it]

Episode: 4200  | Reward:  0.0037150871506648805


  0%|          | 44/100000 [00:52<33:23:10,  1.20s/it]

Episode: 4300  | Reward:  -0.0001787409235819892


  0%|          | 45/100000 [00:53<33:41:27,  1.21s/it]

Episode: 4400  | Reward:  -0.00044084817387464144


  0%|          | 46/100000 [00:55<33:35:38,  1.21s/it]

Episode: 4500  | Reward:  0.0008754560984879744


  0%|          | 47/100000 [00:56<33:45:53,  1.22s/it]

Episode: 4600  | Reward:  0.010738684898885425


  0%|          | 48/100000 [00:57<33:32:10,  1.21s/it]

Episode: 4700  | Reward:  -0.00016123318828058415


  0%|          | 49/100000 [00:58<33:27:17,  1.20s/it]

Episode: 4800  | Reward:  -0.0028444657192274554


  0%|          | 50/100000 [00:59<33:25:27,  1.20s/it]

Episode: 4900  | Reward:  0.0007290680691738691


  0%|          | 51/100000 [01:01<35:53:20,  1.29s/it]

Episode: 5000  | Reward:  0.0006450614355751238


  0%|          | 52/100000 [01:02<37:56:00,  1.37s/it]

Episode: 5100  | Reward:  -0.00011084809665956011


  0%|          | 53/100000 [01:04<39:17:51,  1.42s/it]

Episode: 5200  | Reward:  -0.00021209339128908986


  0%|          | 54/100000 [01:05<40:31:49,  1.46s/it]

Episode: 5300  | Reward:  -0.0009652146130507739


  0%|          | 55/100000 [01:07<41:29:41,  1.49s/it]

Episode: 5400  | Reward:  9.01744163133095e-06


  0%|          | 56/100000 [01:09<42:07:18,  1.52s/it]

Episode: 5500  | Reward:  -0.0005782648703960149


  0%|          | 57/100000 [01:10<42:06:35,  1.52s/it]

Episode: 5600  | Reward:  0.001752135579558116


  0%|          | 58/100000 [01:12<43:08:41,  1.55s/it]

Episode: 5700  | Reward:  0.0018419447032538392


  0%|          | 59/100000 [01:13<42:51:54,  1.54s/it]

Episode: 5800  | Reward:  0.004902985992065539


  0%|          | 60/100000 [01:15<42:59:54,  1.55s/it]

Episode: 5900  | Reward:  0.00293831386789977


  0%|          | 61/100000 [01:16<43:04:38,  1.55s/it]

Episode: 6000  | Reward:  -0.0010746401158451236


  0%|          | 62/100000 [01:18<43:29:03,  1.57s/it]

Episode: 6100  | Reward:  0.010595743617663916


  0%|          | 63/100000 [01:20<43:52:00,  1.58s/it]

Episode: 6200  | Reward:  0.003369765946278637


  0%|          | 64/100000 [01:21<43:42:37,  1.57s/it]

Episode: 6300  | Reward:  -0.00035246026171888503


  0%|          | 65/100000 [01:23<43:09:17,  1.55s/it]

Episode: 6400  | Reward:  0.007342699569957426


  0%|          | 66/100000 [01:24<42:58:33,  1.55s/it]

Episode: 6500  | Reward:  -0.0005650717671088308


  0%|          | 67/100000 [01:26<42:56:01,  1.55s/it]

Episode: 6600  | Reward:  0.1704948971700899


  0%|          | 68/100000 [01:27<43:19:27,  1.56s/it]

Episode: 6700  | Reward:  -0.0021089927217670086


  0%|          | 69/100000 [01:29<43:40:23,  1.57s/it]

Episode: 6800  | Reward:  0.003001901302764208


  0%|          | 70/100000 [01:30<43:09:07,  1.55s/it]

Episode: 6900  | Reward:  -0.0019178237414958208


  0%|          | 71/100000 [01:32<42:43:44,  1.54s/it]

Episode: 7000  | Reward:  0.11883933253882809


  0%|          | 72/100000 [01:33<42:26:09,  1.53s/it]

Episode: 7100  | Reward:  -0.00024860271705334606


  0%|          | 73/100000 [01:35<42:32:48,  1.53s/it]

Episode: 7200  | Reward:  0.004454396586539276


  0%|          | 74/100000 [01:37<42:49:18,  1.54s/it]

Episode: 7300  | Reward:  0.00021193650824355165


  0%|          | 75/100000 [01:38<43:06:02,  1.55s/it]

Episode: 7400  | Reward:  -0.0004370398927821574


  0%|          | 76/100000 [01:40<43:26:13,  1.56s/it]

Episode: 7500  | Reward:  0.027177593898565142


KeyboardInterrupt: 

In [None]:
j