In [1]:
import time
import os
import random
import numpy as np                                                
import matplotlib.pyplot as plt                                   
import autograd, autograd.core, autograd.extend, autograd.tracer  
import autograd.numpy as anp      
import scipy, scipy.ndimage, scipy.sparse, scipy.sparse.linalg    
                                                     
import gym
# from gym import spaces
from gymnasium import spaces
import gymnasium 

from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common import results_plotter




In [2]:
# !pip3 install --upgrade stable_baselines3
# !pip3 install gym==0.22.0 

The code below is my reinforcement-learning (RL) extension to, and rework of, the following two works:
- https://www.researchgate.net/publication/360698153_A_Tutorial_on_Structural_Optimization
- https://www.sciencedirect.com/science/article/pii/S0264127522002933

In [3]:
class ObjectView:
    """Attribute-style view over a dictionary.

    The instance adopts ``d`` itself as its ``__dict__`` (no copy is made), so
    ``view.key`` reads ``d['key']`` and attributes set on the view appear in ``d``.
    """

    def __init__(self, d):
        self.__dict__ = d
    
def get_args(normals, forces, density=1e-4):  # Manage the problem setup parameters
    """Bundle the problem definition into an attribute-accessible parameter object.

    Args:
        normals: array whose non-zero entries mark fixed degrees of freedom.
            Its first two axes have sizes ``width + 1`` and ``height + 1``; the
            DOF count ``2 * (width + 1) * (height + 1)`` assumes a trailing axis
            of size 2, as built by ``mbb_beam``.
        forces: load array; it is stored flattened under ``forces``.
        density: value stored under ``density``.

    Returns:
        ObjectView exposing material properties, constraints, grid sizes
        (``nelx``/``nely``), the free/fixed DOF index arrays and the optimisation
        settings as attributes.
    """
    width = normals.shape[0] - 1
    height = normals.shape[1] - 1
    fixdofs = np.flatnonzero(normals.ravel())
    alldofs = np.arange(2 * (width + 1) * (height + 1))
    # setdiff1d returns the sorted, unique complement and keeps an integer dtype
    # even when nothing is free, so the result is always usable as an index array.
    freedofs = np.setdiff1d(alldofs, fixdofs)
    params = {
      # material properties
      'young': 1, 'young_min': 1e-9, 'poisson': 0.3, 'g': 0,
      # constraints
      'density': density, 'xmin': 0.001, 'xmax': 1.0,
      # input parameters
      'nelx': width, 'nely': height, 'mask': 1, 'penal': 3.0, 'filter_width': 1,
      'freedofs': freedofs, 'fixdofs': fixdofs, 'forces': forces.ravel(),
      # optimization parameters
      'opt_steps': 80, 'print_every': 10}
    return ObjectView(params)

def mbb_beam(width=12, height=12, density=1e-4, y=1, x=0, rd=-1):  # textbook beam example
    """Build the support and load arrays for the beam problem.

    Args:
        width, height: grid size; both arrays have shape
            ``(width + 1, height + 1, 2)``.
        density: passed through unchanged as the third return value.
        y, x: indices into the last axis used for the two components.
        rd: index along the second axis at which the load is applied on the
            last slice of the first axis.

    Returns:
        ``(normals, forces, density)``. ``normals`` is 1 at both components of
        the nodes ``[0, 0]`` and ``[0, -1]``; ``forces`` is -1 at
        ``[-1, rd, y]``; every other entry is 0.
    """
    grid_shape = (width + 1, height + 1, 2)

    normals = np.zeros(grid_shape)
    for node in (0, -1):
        for component in (x, y):
            normals[0, node, component] = 1

    forces = np.zeros(grid_shape)
    forces[-1, rd, y] = -1
    return normals, forces, density

In [4]:
def young_modulus(x, e_0, e_min, p=3):
    """Interpolate stiffness between ``e_min`` and ``e_0`` with a power law.

    Returns ``e_min + x**p * (e_0 - e_min)``, i.e. ``e_min`` at ``x == 0`` and
    ``e_0`` at ``x == 1``.
    """
    stiffness_span = e_0 - e_min
    return e_min + x ** p * stiffness_span

def physical_density(x, args, volume_contraint=False, use_filter=True):
    """Reshape the design vector to the element grid, mask it and optionally blur it.

    Args:
        x: array holding ``args.nely * args.nelx`` values.
        args: object providing ``nely``, ``nelx``, ``mask`` and, when filtering,
            ``filter_width``.
        volume_contraint: accepted for signature compatibility; not used here.
        use_filter: when true the masked grid is passed through
            ``gaussian_filter`` with width ``args.filter_width``.

    Returns:
        Array of shape ``(args.nely, args.nelx)``.
    """
    grid = x.reshape(args.nely, args.nelx)  # 1D -> 2D
    masked = args.mask * grid
    if use_filter:
        return gaussian_filter(masked, args.filter_width)
    return masked

def mean_density(x, args, volume_contraint=False, use_filter=True):
    """Return the mean of the physical density divided by the mean of ``args.mask``.

    All arguments are forwarded positionally to ``physical_density``.
    """
    density_map = physical_density(x, args, volume_contraint, use_filter)
    return anp.mean(density_map) / anp.mean(args.mask)

In [5]:
def objective(x, args, volume_contraint=False, use_filter=True):
    """Evaluate the compliance of design ``x`` for the problem described by ``args``.

    The design is turned into a physical density grid, the displacement field
    is solved for with ``displace``, and the result of ``compliance`` is returned.
    """
    material = {'penal': args.penal, 'e_min': args.young_min, 'e_0': args.young}
    x_phys = physical_density(x, args, volume_contraint=volume_contraint, use_filter=use_filter)
    ke = get_stiffness_matrix(args.young, args.poisson)  # element stiffness matrix
    u = displace(x_phys, ke, args.forces, args.freedofs, args.fixdofs, **material)
    return compliance(x_phys, u, ke, **material)

In [6]:
# @autograd.extend.primitive
def gaussian_filter(x, width):  # 2D gaussian blur/filter
    """Blur ``x`` with scipy's Gaussian filter using ``width`` as sigma and reflecting borders."""
    blurred = scipy.ndimage.gaussian_filter(x, sigma=width, mode='reflect')
    return blurred

def _gaussian_filter_vjp(ans, x, width):  # gives the gradient of orig. function w.r.t. x
    """Build the backward function for ``gaussian_filter`` with respect to ``x``.

    ``ans`` and ``x`` are part of the expected signature but are not needed: the
    returned callable applies ``gaussian_filter`` with the same ``width`` to the
    incoming gradient ``g``.
    """
    def backward(g):
        return gaussian_filter(g, width)

    return backward
# autograd.extend.defvjp(gaussian_filter, _gaussian_filter_vjp)

In [7]:
def compliance(x_phys, u, ke, *, penal=3, e_min=1e-9, e_0=1):
    """Sum the stiffness-weighted element energies ``u_e.T @ ke @ u_e`` over the grid.

    Args:
        x_phys: 2D density grid of shape ``(nely, nelx)``.
        u: flat displacement vector, indexed by global DOF number.
        ke: 8x8 element stiffness matrix.
        penal, e_min, e_0: forwarded to ``young_modulus``.

    Returns:
        Scalar total of ``young_modulus(x_phys) * (u_e.T @ ke @ u_e)``.
    """
    nely, nelx = x_phys.shape
    ely, elx = anp.meshgrid(range(nely), range(nelx))  # index maps, each (nelx, nely)

    # Node numbers of the four corners of every element.
    stride = nely + 1
    n1 = stride * elx + ely
    n2 = n1 + stride
    n3 = n2 + 1
    n4 = n1 + 1

    # Two DOFs (2n, 2n+1) per node, in node order n1..n4.
    dof_ixs = anp.array([2 * node + offset for node in (n1, n2, n3, n4) for offset in (0, 1)])
    u_elem = u[dof_ixs]  # per-element displacements, (8, nelx, nely)

    ke_u = anp.einsum('ij,jkl->ikl', ke, u_elem)
    ce = anp.einsum('ijk,ijk->jk', u_elem, ke_u)  # u.T @ ke @ u for each element
    stiffness = young_modulus(x_phys, e_0, e_min, p=penal)
    return anp.sum(stiffness * ce.T)

def get_stiffness_matrix(e, nu):  # e=young's modulus, nu=poisson coefficient
    """Return the 8x8 element stiffness matrix for modulus ``e`` and Poisson ratio ``nu``.

    The matrix holds only eight distinct coefficients; ``layout`` records which
    coefficient sits at each position, and the whole matrix is scaled by
    ``e / (1 - nu**2)``.
    """
    k = anp.array([1/2-nu/6, 1/8+nu/8, -1/4-nu/12, -1/8+3*nu/8,
                   -1/4+nu/12, -1/8-nu/8, nu/6, 1/8-3*nu/8])
    layout = (
        (0, 1, 2, 3, 4, 5, 6, 7),
        (1, 0, 7, 6, 5, 4, 3, 2),
        (2, 7, 0, 5, 6, 3, 4, 1),
        (3, 6, 5, 0, 7, 2, 1, 4),
        (4, 5, 6, 7, 0, 1, 2, 3),
        (5, 4, 3, 2, 1, 0, 7, 6),
        (6, 3, 4, 1, 2, 7, 0, 5),
        (7, 2, 1, 4, 3, 6, 5, 0),
    )
    coefficients = anp.array([[k[ix] for ix in row] for row in layout])
    return e/(1-nu**2)*coefficients

In [8]:
def get_k(stiffness, ke):
    """Produce the coordinate-format entries of the global stiffness matrix.

    Args:
        stiffness: 2D grid of per-element stiffness values, shape ``(nely, nelx)``.
        ke: 8x8 element stiffness matrix.

    Returns:
        ``(value_list, y_list, x_list)``: three flat arrays with 64 entries per
        element. ``value_list`` holds each element's stiffness times ``ke``;
        ``x_list`` repeats every element DOF eight times in a row, while
        ``y_list`` cycles through the element's eight DOFs.
    """
    nely, nelx = stiffness.shape
    ely, elx = anp.meshgrid(range(nely), range(nelx))  # element index maps
    ely = ely.reshape(-1, 1)
    elx = elx.reshape(-1, 1)

    # Node numbers of the four corners of every element, one element per row.
    stride = nely + 1
    n1 = stride * elx + ely
    n2 = n1 + stride
    n3 = n2 + 1
    n4 = n1 + 1

    # (n_elements, 8): two DOFs (2n, 2n+1) per node, in node order n1..n4.
    edof = anp.concatenate(
        [2*n1, 2*n1+1, 2*n2, 2*n2+1, 2*n3, 2*n3+1, 2*n4, 2*n4+1], axis=1)
    x_list = anp.repeat(edof, 8)
    y_list = anp.tile(edof, 8).ravel()

    # Scale a copy of ke by each element's stiffness (broadcast over elements).
    kd = stiffness.T.reshape(nelx*nely, 1, 1)
    value_list = (kd * ke).ravel()
    return value_list, y_list, x_list

def displace(x_phys, ke, forces, freedofs, fixdofs, *, penal=3, e_min=1e-9, e_0=1):
    """Solve the finite-element system for the displacement of every DOF.

    The stiffness entries restricted to the free DOFs are solved against
    ``forces[freedofs]`` (the sparse solve is most of the runtime); fixed DOFs
    get zero displacement. The result is ordered by global DOF number.
    """
    stiffness = young_modulus(x_phys, e_0, e_min, p=penal)
    entries, ylist, xlist = get_k(stiffness, ke)

    # NOTE: the y list is passed in the x slot and vice versa, as in the original call.
    index_map, keep, indices = _get_dof_indices(freedofs, fixdofs, ylist, xlist)

    free_u = solve_coo(entries[keep], indices, forces[freedofs], sym_pos=True)
    fixed_u = anp.zeros(len(fixdofs))
    return anp.concatenate([free_u, fixed_u])[index_map]

In [9]:
def _get_dof_indices(freedofs, fixdofs, k_xlist, k_ylist):
    """Map global DOF numbers to positions in the "free first, fixed last" ordering.

    Returns:
        ``(index_map, keep, indices)`` where ``index_map[d]`` is the position of
        DOF ``d`` in ``concatenate([freedofs, fixdofs])``, ``keep`` is a boolean
        mask selecting stiffness entries whose two DOFs are both free, and
        ``indices`` stacks the remapped ``k_ylist`` and ``k_xlist`` values of the
        kept entries (in that order).
    """
    dof_order = anp.concatenate([freedofs, fixdofs])
    index_map = inverse_permutation(dof_order)

    x_is_free = anp.isin(k_xlist, freedofs)
    y_is_free = anp.isin(k_ylist, freedofs)
    keep = x_is_free & y_is_free

    # Translate the surviving entries' DOF numbers into reduced-system positions.
    first = index_map[k_ylist][keep]
    second = index_map[k_xlist][keep]
    return index_map, keep, anp.stack([first, second])

def inverse_permutation(indices):  # reverses an index operation
    """Return ``inv`` such that ``inv[indices[i]] == i`` for every position ``i``."""
    size = len(indices)
    positions = np.arange(size, dtype=anp.int64)
    inverse = np.zeros(size, dtype=anp.int64)
    inverse[indices] = positions
    return inverse

In [10]:
def _get_solver(a_entries, a_indices, size, sym_pos):
    # a is (usu.) symmetric positive; could solve 2x faster w/sksparse.cholmod.cholesky(a).solve_A
    a = scipy.sparse.coo_matrix((a_entries, a_indices), shape=(size,)*2).tocsc()
    return scipy.sparse.linalg.splu(a).solve

# @autograd.primitive
def solve_coo(a_entries, a_indices, b, sym_pos=False):
    """Solve ``A x = b`` where ``A`` is given in coordinate form.

    ``A`` is the ``b.size`` x ``b.size`` sparse matrix with values ``a_entries``
    at positions ``a_indices``.
    """
    solve = _get_solver(a_entries, a_indices, b.size, sym_pos)
    return solve(b)

def grad_solve_coo_entries(ans, a_entries, a_indices, b, sym_pos=False):
    """Build the backward function of ``solve_coo`` with respect to ``a_entries``.

    The returned callable takes the gradient flowing into the solution
    (``grad_ans``), solves a second system for ``lambda_`` and returns
    ``-lambda_[i] * ans[j]`` for every stored entry ``(i, j)``. When ``sym_pos``
    is false the two index rows are swapped for that second solve.
    """
    def vjp(grad_ans):
        if sym_pos:
            adjoint_indices = a_indices
        else:
            adjoint_indices = a_indices[::-1]
        lambda_ = solve_coo(a_entries, adjoint_indices, grad_ans, sym_pos)
        first, second = a_indices
        return -lambda_[first] * ans[second]

    return vjp

# autograd.extend.defvjp(solve_coo, grad_solve_coo_entries,
#                        lambda: print('err: gradient undefined'),
#                        lambda: print('err: gradient not implemented'))

In [11]:
class Model:
    """Maps flat action numbers to grid cells and draws the resulting grid."""

    def __init__(self, x):
        """Record the shape of grid ``x`` and number its cells row by row."""
        self.flag_ = True
        self.n, self.m = x.shape
        # Action k addresses cell (k // m, k % m).
        self.actions_dic = {
            row * self.m + col: (row, col)
            for row in range(self.n)
            for col in range(self.m)
        }

    def action_space_(self, action, X):
        """Set the cell addressed by ``action`` to 1 in ``X`` (modified in place)."""
        row, col = self.actions_dic[action]
        X[row][col] = 1

    def draw(self, X):
        """Show ``X`` as an image below a short heading."""
        plt.figure(dpi=50)
        print('\nFinal Cantilever beam design:')
        plt.imshow(X)
        plt.show()

In [12]:
def fast_stopt(args, x):
    """Evaluate design ``x`` once, without optimising it.

    Returns:
        ``(value, const)``: the result of ``objective`` and the result of
        ``mean_density`` for the design reshaped to ``(args.nely, args.nelx)``.
    """
    grid = x.reshape(args.nely, args.nelx)
    value = objective(grid, args)
    # The raw mean density is returned; args.density is not subtracted.
    const = mean_density(grid, args)
    return value, const

In [13]:
class CantileverEnv(gymnasium.Env):
    """Environment in which an agent fills a grid with material one cell per step.

    * Action: index of the grid cell to set to 1 (``Discrete(nelx * nely)``).
    * Observation: the flattened density grid (``float64``, length ``nelx * nely``).
    * Reward: ``(1 / c) ** 2`` where ``c`` is the objective value returned by
      ``fast_stopt`` for the current grid.
    * Episode end: more steps than grid cells have been taken, or the mean
      density returned by ``fast_stopt`` exceeds 0.7. Both cases are reported
      through the ``terminated`` flag; ``truncated`` is always ``False``.

    The class follows the Gymnasium API (``reset`` returns ``(obs, info)``,
    ``step`` returns a 5-tuple) and therefore derives from ``gymnasium.Env``,
    matching the ``gymnasium.spaces`` objects it exposes.
    """

    # "render_modes" is the Gymnasium spelling; the legacy key is kept for older readers.
    metadata = {"render_modes": ["human"], "render.modes": ["human"]}

    def __init__(self):
        super().__init__()

        self.rd = 0
        self.args = get_args(*mbb_beam(rd=self.rd))

        n_cells = self.args.nelx * self.args.nely

        self.action_space = spaces.Discrete(n_cells)
        self.observation_space = spaces.Box(low=np.full(n_cells, -1e10),
                                            high=np.full(n_cells, 1e10),
                                            shape=(n_cells,),
                                            dtype=np.float64)

        self.x = anp.ones((self.args.nely, self.args.nelx)) * self.args.density

        self.M = Model(self.x)

        self.reward = 0
        self.step_ = 0
        self.tmp = None    # last objective value returned by fast_stopt
        self.const = None  # last mean density returned by fast_stopt
        self.needs_reset = True

    def _observation(self):
        """Return a flat copy of the grid so callers never alias internal state."""
        return self.x.flatten()

    def step(self, action):
        """Place material at ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        Raises:
            RuntimeError: if called before ``reset`` or after an episode ended.
        """
        # Guard first: a rejected call must not touch the grid or the counters.
        if self.needs_reset:
            raise RuntimeError("Tried to step environment that needs reset")

        # int() normalises numpy integer scalars / 0-d arrays to a plain dict key.
        self.M.action_space_(int(action), self.x)

        self.tmp, self.const = fast_stopt(self.args, self.x)
        self.step_ += 1
        self.reward = (1 / self.tmp) ** 2

        out_of_steps = self.step_ > self.M.n * self.M.m
        too_dense = self.const > 0.7
        done = bool(out_of_steps or too_dense)  # plain bool, not numpy.bool_
        if done:
            self.needs_reset = True

        return self._observation(), self.reward, done, False, dict()

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(obs, info)``.

        Args:
            seed: optional seed for the environment's random generator.
            options: accepted for Gymnasium API compatibility; unused.
        """
        super().reset(seed=seed)

        if not self.M.flag_:
            self.rd = int(self.np_random.choice([0, 2, -2]))
        else:
            self.rd = -1

        # The load position only changes here, so the problem setup is built
        # once per episode instead of once per step.
        self.args = get_args(*mbb_beam(rd=self.rd))
        self.x = anp.ones((self.args.nely, self.args.nelx)) * self.args.density

        self.reward = 0
        self.needs_reset = False
        self.step_ = 0

        return self._observation(), {}

    def render(self, mode="human"):
        """Draw the current grid with ``Model.draw``."""
        self.M.draw(self.x)

    def close(self):
        """Nothing to release."""
        pass

In [14]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Save the model whenever the mean training reward reaches a new best.

    The check runs every ``check_freq`` calls and looks at the mean reward of
    the last 100 logged episodes (in practice ``EvalCallback`` is recommended).

    :param check_freq: (int) number of callback calls between two checks.
    :param log_dir: (str) folder that holds the log written by the ``Monitor``
      wrapper; the model is saved there as ``best_model``.
    :param verbose: (int) values above 0 print progress messages.
    """

    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Make sure the target folder exists before the first save.
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        # Only every ``check_freq``-th call does any work.
        if self.n_calls % self.check_freq != 0:
            return True

        timesteps, episode_rewards = ts2xy(load_results(self.log_dir), "timesteps")
        if len(timesteps) == 0:
            return True  # nothing logged yet

        mean_reward = np.mean(episode_rewards[-100:])  # last 100 episodes
        chatty = self.verbose > 0
        if chatty:
            print(f"Num timesteps: {self.num_timesteps}")
            print(
                f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}"
            )

        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if chatty:
                print(f"Saving new best model to {self.save_path}.zip")
            self.model.save(self.save_path)

        return True

In [15]:
# Training budget in environment steps; an integer, since it is a step count.
ts = 5_000_000

In [16]:
# Folder that receives the Monitor log (log_dir/monitor.csv) and the saved models
log_dir = "/tmp/gym6/"
os.makedirs(log_dir, exist_ok=True)

# Build the environment, wrap it so episode statistics are logged, then validate it
env = Monitor(CantileverEnv(), log_dir)
check_env(env)

  logger.warn(


In [17]:
# Look at the logged training reward every 1000 callback calls
callback = SaveOnBestTrainingRewardCallback(log_dir=log_dir, check_freq=1000)

In [None]:
start = time.time()
# Bind the agent before training so `model` survives an interrupted run.
model = PPO("MlpPolicy", env)
try:
    # learn() expects an integer number of steps.
    model.learn(total_timesteps=int(ts), callback=callback)
finally:
    # Record the end time even when training stops early (e.g. kernel interrupt).
    end = time.time()

Num timesteps: 1000
Best mean reward: -inf - Last mean reward per episode: 0.00
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 3000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 4000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 5000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 6000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 7000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 8000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 9000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 10000
Best mean

Num timesteps: 97000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 98000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 99000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 100000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 101000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 102000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 103000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 104000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 105000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 106000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 107000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 108000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 1090

Num timesteps: 197000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 198000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 199000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 200000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 201000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 202000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 203000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 204000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 205000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 206000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 207000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 208000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 2

Num timesteps: 297000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 298000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 299000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 300000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 301000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 302000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 303000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 304000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 305000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 306000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 307000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 308000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 3

Num timesteps: 390000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 391000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 392000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 393000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 394000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 395000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 396000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 397000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 398000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 399000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 400000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 401000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 4

Num timesteps: 490000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 491000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 492000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 493000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 494000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 495000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 496000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 497000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 498000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 499000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 500000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 501000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 5

Num timesteps: 590000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 591000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 592000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 593000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 594000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 595000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 596000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 597000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 598000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 599000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 600000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 601000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 6

Num timesteps: 690000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 691000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 692000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 693000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 694000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 695000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 696000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 697000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 698000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 699000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 700000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 701000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 7

Num timesteps: 790000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 791000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 792000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 793000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 794000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 795000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 796000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 797000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 798000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 799000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 800000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 801000
Best mean reward: 0.00 - Last mean reward per episode: 0.00
Num timesteps: 8

Num timesteps: 884000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 885000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 886000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 887000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 888000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 889000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 890000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 891000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 892000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 893000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 894000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 895000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 8

Num timesteps: 979000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 980000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 981000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 982000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 983000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 984000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 985000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 986000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 987000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 988000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 989000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 990000
Best mean reward: 0.01 - L

Num timesteps: 1078000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1079000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1080000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1081000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1082000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1083000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1084000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1085000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1086000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1087000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1088000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1089000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num 

Num timesteps: 1177000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1178000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1179000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1180000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1181000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1182000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1183000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1184000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1185000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1186000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1187000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num timesteps: 1188000
Best mean reward: 0.01 - Last mean reward per episode: 0.00
Num 

Num timesteps: 1274000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1275000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1276000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1277000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1278000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1279000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1280000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1281000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1282000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1283000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1284000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1285000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num 

Num timesteps: 1370000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1371000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1372000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1373000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1374000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1375000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1376000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1377000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1378000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1379000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1380000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1381000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num 

Num timesteps: 1465000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1466000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1467000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1468000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1469000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1470000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1471000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1472000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1473000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1474000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1475000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1476000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num 

Num timesteps: 1563000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1564000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1565000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1566000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1567000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 1568000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1569000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 1570000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 1571000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1572000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 15730

Num timesteps: 1659000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1660000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1661000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1662000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1663000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1664000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1665000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1666000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1667000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1668000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 1669000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1670000
Best mean rewa

Num timesteps: 1755000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1756000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1757000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1758000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1759000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1760000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1761000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1762000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1763000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1764000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1765000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1766000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Savi

Num timesteps: 1848000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1849000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1850000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1851000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1852000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1853000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1854000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1855000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1856000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1857000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1858000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1859000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num 

Num timesteps: 1944000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1945000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1946000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1947000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1948000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1949000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1950000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1951000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1952000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1953000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1954000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 1955000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num 

Num timesteps: 2043000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2044000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2045000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2046000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2047000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2048000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2049000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2050000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2051000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2052000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2053000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2054000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num 

Num timesteps: 2142000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2143000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2144000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2145000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2146000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2147000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2148000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2149000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2150000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2151000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 21520

Num timesteps: 2238000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2239000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2240000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2241000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2242000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2243000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2244000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2245000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2246000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2247000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2248000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num timesteps: 2249000
Best mean reward: 0.01 - Last mean reward per episode: 0.01
Num 

Num timesteps: 2335000
Best mean reward: 0.02 - Last mean reward per episode: 0.01
Num timesteps: 2336000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2337000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2338000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2339000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2340000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2341000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2342000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2343000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2344000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best mode

Num timesteps: 2431000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2432000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2433000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2434000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2435000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2436000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2437000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2438000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2439000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2440000
Best mean reward: 0.02 - Last 

Num timesteps: 2516000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2517000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2518000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2519000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2520000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2521000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2522000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2523000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2524000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2525000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2526000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2527000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num 

Num timesteps: 2612000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2613000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2614000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2615000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2616000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2617000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2618000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2619000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Num timesteps: 2620000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2621000
Best mean reward: 0.02 - Last mean reward per episode: 0.02
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2622000
Best mean reward: 0.02 - Last mean reward per e

Num timesteps: 2705000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2706000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2707000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2708000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2709000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2710000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2711000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2712000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Saving new best model to /tmp/gym6/best_model.zip
Num timesteps: 2713000
Best mean rew

Num timesteps: 2801000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2802000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2803000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2804000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2805000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2806000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2807000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2808000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2809000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2810000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2811000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2812000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num 

Num timesteps: 2900000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2901000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2902000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2903000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2904000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2905000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2906000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2907000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2908000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2909000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2910000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 2911000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num 

Num timesteps: 2996000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 2997000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 2998000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 2999000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3000000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3001000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3002000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3003000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3004000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3005000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3006000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3007000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 3095000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3096000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3097000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3098000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3099000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3100000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3101000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3102000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3103000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3104000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3105000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3106000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 3194000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3195000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3196000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3197000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3198000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3199000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3200000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3201000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3202000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3203000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3204000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3205000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num 

Num timesteps: 3293000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3294000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3295000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3296000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3297000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3298000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3299000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3300000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3301000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3302000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3303000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num timesteps: 3304000
Best mean reward: 0.03 - Last mean reward per episode: 0.03
Num 

Num timesteps: 3392000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3393000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3394000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3395000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3396000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3397000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3398000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3399000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3400000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3401000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3402000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3403000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 3491000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3492000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3493000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3494000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3495000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3496000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3497000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3498000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3499000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3500000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3501000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3502000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 3590000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3591000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3592000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3593000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3594000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3595000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3596000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3597000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3598000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3599000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3600000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3601000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num 

Num timesteps: 3689000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3690000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3691000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3692000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3693000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3694000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3695000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3696000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3697000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3698000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3699000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3700000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 3788000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3789000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3790000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3791000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3792000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3793000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3794000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3795000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3796000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3797000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3798000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3799000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 3887000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3888000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3889000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3890000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3891000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3892000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3893000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3894000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3895000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3896000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3897000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3898000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num 

Num timesteps: 3986000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3987000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 3988000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3989000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3990000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3991000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3992000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3993000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3994000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3995000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3996000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 3997000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 4085000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4086000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4087000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4088000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4089000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4090000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4091000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4092000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4093000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4094000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4095000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4096000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num 

Num timesteps: 4184000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4185000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4186000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4187000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4188000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4189000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4190000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4191000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4192000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4193000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4194000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4195000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num 

Num timesteps: 4283000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4284000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4285000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4286000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4287000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4288000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4289000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4290000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4291000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4292000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4293000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4294000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num 

Num timesteps: 4382000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4383000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4384000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4385000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4386000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4387000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4388000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4389000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4390000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4391000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4392000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num timesteps: 4393000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num 

Num timesteps: 4481000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4482000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4483000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4484000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4485000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4486000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4487000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4488000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4489000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4490000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4491000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4492000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 4580000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4581000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4582000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4583000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4584000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4585000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4586000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4587000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4588000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4589000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4590000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4591000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 4679000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4680000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4681000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4682000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4683000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4684000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4685000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4686000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4687000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4688000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4689000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4690000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 4778000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4779000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4780000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4781000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4782000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4783000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4784000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4785000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4786000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4787000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4788000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4789000
Best mean reward: 0.03 - Last mean reward per episode: 0.01
Num 

Num timesteps: 4877000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4878000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4879000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4880000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4881000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4882000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4883000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4884000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4885000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4886000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4887000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4888000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

Num timesteps: 4976000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4977000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4978000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4979000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4980000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4981000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4982000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4983000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4984000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4985000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4986000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num timesteps: 4987000
Best mean reward: 0.03 - Last mean reward per episode: 0.02
Num 

In [None]:
# Report how long training took. The difference end - start is divided by 60,
# i.e. minutes if both are second-based timestamps (assumed; they are set in an earlier cell).
elapsed_minutes = (end - start)/60
print('Total time taken: {} min'.format(elapsed_minutes))

### Inference

In [None]:
# Prepare a fresh episode for evaluating the trained policy.
# NOTE(review): the meaning of env.M.flag_ is defined on env.M elsewhere in the
# notebook; the commented line below is the alternative setting.
# env.M.flag_=True
env.M.flag_=False
# reset() is indexed with [0] right after this cell, so it presumably returns
# the gymnasium-style (observation, info) tuple.
obs=env.reset()

In [None]:
# env.reset() returned a tuple (gymnasium API: (observation, info)); keep only
# the observation. The tuple check makes this cell idempotent: running it a
# second time no longer indexes into the observation itself.
if isinstance(obs, tuple):
    obs = obs[0]

In [None]:
# Roll out the policy held in `model` for one episode, capped at 1000 steps.
# After the loop, `i` is the number of steps taken before the episode-ending
# step, or 1000 if the cap was reached without the episode ending.
i=0
while i<1000:
    action, _states = model.predict(obs)
    # gymnasium step API returns (obs, reward, terminated, truncated, info).
    obs, rewards, dones, truncated, info = env.step(int(action))

    # An episode is over when it is terminated OR truncated; stepping again
    # without a reset is not allowed by the gymnasium API, so stop on either flag.
    if dones or truncated:
        break
    i+=1

In [None]:
# Step counter left by the rollout loop above (1000 means the step cap was hit).
print(i)

In [None]:
# Render the environment as left by the rollout above (what is drawn is
# defined by the env class, which is not part of this cell).
env.render()

In [None]:
# Plot the training curve from the logs stored under log_dir, with timesteps
# on the x-axis; `ts` (set in an earlier cell) limits how many timesteps are shown.
results_plotter.plot_results(
    dirs=[log_dir],
    num_timesteps=ts,
    x_axis=results_plotter.X_TIMESTEPS,
    task_name="CantileverEnv",
)

In [None]:
# Start a new episode for the best-model rollout. [0] keeps the first element of
# what reset() returns (presumably the observation of an (observation, info) tuple).
obs=env.reset()[0]

In [None]:
# Load the best checkpoint saved during training (the training log above shows
# it being written as ".../best_model.zip") and attach the current env to it.
# os.path.join keeps the path valid whether or not log_dir ends with a separator;
# plain string concatenation would yield e.g. "/tmp/gym6best_model.zip".
model_best = PPO.load(os.path.join(log_dir, "best_model.zip"), env=env)

In [None]:
# Roll out the policy held in `model_best` for one episode, capped at 1000 steps.
# After the loop, `i` is the number of steps taken before the episode-ending
# step, or 1000 if the cap was reached without the episode ending.
i=0
while i<1000:
    action, _states = model_best.predict(obs)

    # gymnasium step API returns (obs, reward, terminated, truncated, info).
    obs, rewards, dones, truncated, info = env.step(int(action))

    # An episode is over when it is terminated OR truncated; stepping again
    # without a reset is not allowed by the gymnasium API, so stop on either flag.
    if dones or truncated:
        break
    i+=1

In [None]:
# Step counter left by the best-model rollout above, shown as the cell output.
i

In [None]:
# Render the environment as left by the best-model rollout above.
env.render()

In [None]:
# TODO - https://www.sciencedirect.com/science/article/pii/S0264127522002933

# 6 by 6 grid
# different reward 3 
# training ~1.5 hr

# illegal moves reward = -1, terminate ? 

# CNN
# randomize loads 
# utilize symmetry
# reward at the end of episode
# action - remove elements not add 
# refinement 6x6 -> 12x12
# load two elements in inference - test generalizability
# input stress field instead of density field 
# test different RL algorithms 

In [None]:
import random

In [None]:
# Random-action baseline: 10 trials, each applying uniformly random action
# indices in 0..15 directly through env.M (env.step() is not used) and printing
# the last fast_stopt() result of the trial.
for j in range(10):
    i=0
    # Fresh episode for each trial; the value returned by reset() is not used below.
    obs=env.reset()
    # 16*0.68753125 is about 11.0005, so the loop body runs 12 times (i = 0..11).
    # NOTE(review): with exactly 0.6875 it would run 11 times - confirm which is intended.
    while i<16*0.68753125:
        # random.randint is inclusive on both ends: 16 possible actions.
        action = random.randint(0, 15)

        # The problem setup is rebuilt on every step from env.rd.
        env.args = get_args(*mbb_beam(rd=env.rd))

        # NOTE(review): action_space_ presumably applies the action to env.x in place
        # (its return value is ignored) - confirm against the class defining env.M.
        env.M.action_space_(action, env.x)   
        env.tmp, env.const = fast_stopt(env.args, env.x)
        
        i+=1
    # Only the values from the final step of each trial are reported.
    print(env.tmp, env.const)    

In [None]:
# Random-action baseline with 9 possible actions (indices 0..8): 10 trials, each
# applying random actions directly through env.M (env.step() is not used) and
# printing the last fast_stopt() result of the trial.
# NOTE(review): which action count (9 here, 16 in the previous cell) matches the
# current env is not visible from this cell - confirm before running.
for j in range(10):
    i=0
    # Fresh episode for each trial; the value returned by reset() is not used below.
    obs=env.reset()
    # 9*0.68753125 is about 6.19, so the loop body runs 7 times (i = 0..6).
    while i<9*0.68753125:
        # random.randint is inclusive on both ends: 9 possible actions.
        action = random.randint(0, 8)

        # The problem setup is rebuilt on every step from env.rd.
        env.args = get_args(*mbb_beam(rd=env.rd))

        # NOTE(review): action_space_ presumably applies the action to env.x in place
        # (its return value is ignored) - confirm against the class defining env.M.
        env.M.action_space_(action, env.x)   
        env.tmp, env.const = fast_stopt(env.args, env.x)
        
        i+=1
    # Only the values from the final step of each trial are reported.
    print(env.tmp, env.const)   