In [None]:
from queue import Queue
from threading import Thread
from numpy.distutils.misc_util import is_sequence
import threading
# from stable_baselines3 import PPO,DQN,A2C,SAC
import gym
from gym import spaces,Env

In [None]:
# from stable_baselines3.common.monitor import Monitor as Mon

In [None]:
# import import_ipynb
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
import pickle

### Base classes for an **AI Agent** having the following architecture:
1.1 An **AI Agent** operates in a '**World**' that calls it via '**action<-act(state)**' requests and '**reward(reward)**' notifications. (Worlds are typically wrappers around traditional RL environments; Worlds can also wrap supervised learning tasks.) The goal of the AI Agent is to maximise the *long-term steady-state average reward*. Periodically the World may also notify the Agent of the completion of an *episode* (e.g. an RL episode or the completion of an epoch).

1.2 An Agent has **Controller**, **Perception**, **Memory** and **Actor** components (in line with LeCun's "Architecture of an Autonomous AI Agent"; a World Model is to be added later). The Memory contains a Perceptual Memory as well as a State-Action-Reward memory. The Actor includes a **Model**. A Model includes a **Network** and a **Trainer** (a class that publishes the training procedure(s) for the Network). Overall orchestration of all components, including the Agent's public interface, is handled by the Controller. Further, the learning schedule used to train the Network, be it only initially, periodically, or continually online, is decided by the Controller.

1.3 Each component of an Agent may be customised for a specific World by inheriting from the default base class for that component. The Agent's *begin* method indicates that a new episode/epoch is starting and resets/increments the time/ep counters (see below); the *clear* method clears (i.e. removes all stored data from) the applicable components.

In [None]:
class AIAgent():
    """Base agent tying together a controller, perception, memory and actor.

    Every component left as ``None`` is replaced by the corresponding default
    class, built with ``parent=self``. Components supplied by the caller are
    stored as given; their ``parent`` attribute is not modified here.
    """
    def __init__(self,controller=None,perception=None,memory=None,actor=None):
        # Defaults are created in the order controller, perception, memory, actor.
        self.controller=Controller(parent=self) if controller is None else controller
        self.perception=Perception(parent=self) if perception is None else perception
        self.memory=Memory(parent=self) if memory is None else memory
        self.actor=Actor(parent=self) if actor is None else actor
        # Step counter: incremented once per act() call.
        self.time=0
        # One entry per begin() call, holding the value of self.time at that call.
        self.ep=[]
        # Flags read by the controller/memory on every call.
        self.debug=False
        self.use_memory=True
        self.limit_memory=False

    def act(self,world_state):
        """Delegate to the controller, advance ``time`` by one, return the action."""
        chosen=self.controller.act(world_state)
        # check to see if network needs training - TBDesigned
        self.time=self.time+1
        return chosen

    def reward(self,world_reward):
        """Forward the world's reward to the controller and return its result."""
        return self.controller.reward(world_reward)

    def episode(self):
        """Forward an episode notification to the controller."""
        return self.controller.episode()

    def begin(self,state=None):
        """Record the current ``time`` as the start of a new episode/epoch.

        ``state`` is accepted but not used by this base implementation.
        """
        ## TBD may need to do more - both episode and time may be needed and episode reset
        self.ep.append(self.time)

    def clear(self):
        """Clear the memory component."""
        self.memory.clear()

2.1 Control flow goes as follows: the Agent tracks a *time* counter and a list *ep*; the latter tracks episodes/epochs and records the time counter at the start of each epoch/episode, e.g. ep[e]=starting time of epoch/episode e. The World calls *act*(world-state) on the Agent, which is routed to the Controller's *act*. The incoming *world-state* is mapped to a percept using the *perceive_state* function published by the Perception module, and stored in the perceptual memory (via Memory's *add_percept*, against the current *time*).

2.2 The Actor reads from the perceptual memory and creates an actor-state by processing the current percept (and possibly also using prior rewards and actions, e.g. for meta-RL). The Actor also updates the state-action-reward memory with the previous time's percept after mapping it to an actor-state.

2.3 The Actor calls its Model to decide the action to return. Before returning the action, it is stored in the perceptual memory; a new entry is also created in the state-action-reward memory with the current actor-state and action. Also, the action is mapped to a world_action using the Perception component's *action_to_world* function.

2.4 Before completing the *act* (or *reward*) flow, Actor checks to see if any periodic or online training is needed, and updates the *training* flag accordingly. It also updates the *time* counter.

2.5 On receiving a *world_reward* from the World, the Agent passes it to the Controller, which extracts information using Perception's *perceive_reward* function. This is stored in the perceptual memory (for the prior time step, since by now the Agent's time step has been updated as soon as its action was completed) as well as appended to the latest entry (previous time step) of the state-action-reward memory. Note: the extracted information might include more than the reward itself, e.g. labels in a supervised learning scenario.

In [None]:
class Controller():
    """Orchestrates perception, memory and actor on behalf of the parent agent.

    All collaborators are reached through ``self.parent`` (``perception``,
    ``memory``, ``actor``) together with the flags ``debug``, ``use_memory``
    and ``limit_memory`` and the step counter ``time``.
    """
    def __init__(self,parent=None):
        self.parent=parent

    def act(self,world_state):
        """Turn a world state into a world action, recording the step in memory.

        Memory writes happen only while ``parent.use_memory`` is true; the
        memory is pruned via ``memory.pop()`` when ``parent.limit_memory`` is set.
        """
        agent=self.parent
        percept=agent.perception.perceive_state(world_state)
        if agent.debug:
            print(percept)
        if agent.use_memory:
            agent.memory.add_percept(percept,agent.time)
        state=agent.actor.percept_to_state(percept)
        if agent.use_memory:
            state=agent.actor.augment_state(state)
        if agent.debug:
            print('actor_state:',state)
        if agent.use_memory:
            # The state just observed is the successor of the previous step.
            agent.memory.update_next_state(state,agent.time-1)
        action=agent.actor.call_model(state)
        if agent.debug:
            print('action:',action)
        if agent.use_memory:
            stored_action=agent.perception.action_perceptual(action)
            bonus=agent.actor.intrinsic_reward(percept,stored_action,state)
            # Intrinsic reward is booked against the current step.
            agent.memory.update_reward_sar(bonus,agent.time)
            agent.memory.update_action_perceptual(stored_action,agent.time)
            agent.memory.add_state_action(state,stored_action,agent.time)
        outgoing=agent.perception.action_to_world(action)
        if agent.limit_memory:
            agent.memory.pop()
        return outgoing

    def reward(self,world_reward):
        """Record a world reward against step ``time-1`` and return the percept of it."""
        agent=self.parent
        perceived=agent.perception.perceive_reward(world_reward)
        if agent.use_memory:
            agent.memory.update_reward_perceptual(perceived,agent.time-1)
            actor_reward=agent.actor.compute_reward(perceived)
            agent.memory.update_reward_sar(actor_reward,agent.time-1)
        if agent.limit_memory:
            agent.memory.pop()
        return perceived

3.1 ***Memory*** stores are nested dictionaries indexed by *time*. Each entry is a dictionary with keys *typically* being *'percept','action','reward'* and *'state','action','reward','next_state'* for perceptual memory / state-action-reward memory respectively. However, these may be extended for specific kinds of worlds, e.g. supervised learning.

In [None]:
class Memory():
    """Time-indexed perceptual and state-action-reward (sar) stores.

    Both stores are dicts keyed by time step whose values are dicts.
    ``limit_perceptual`` / ``limit_sar`` (``None`` means unlimited) bound how
    far behind ``parent.time`` an entry may lag before ``pop`` discards it.
    """
    def __init__(self,parent=None):
        self.parent=parent
        self.clear()
        self.limit_perceptual=None
        self.limit_sar=None

    def pop(self):
        """Drop entries older than ``parent.time - limit`` from each limited store."""
        stores=((self.perceptual_memory,self.limit_perceptual),
                (self.sar_memory,self.limit_sar))
        for store,limit in stores:
            if limit is None:
                continue
            cutoff=self.parent.time-limit
            # Collect keys first: a dict must not shrink while being iterated.
            for t in [t for t in store if t<cutoff]:
                del store[t]

    def clear(self):
        """Reset both stores to empty dicts."""
        self.perceptual_memory={}
        self.sar_memory={}

    def add_percept(self,perceived_state,time):
        """Store ``perceived_state`` as the perceptual entry for ``time`` (overwrites)."""
        self.perceptual_memory[time]=perceived_state
        if self.parent.debug: print('add_percept:',self.perceptual_memory)

    def update_next_state(self,actor_state,time):
        """Set ``'next_state'`` on the sar entry for ``time``, creating it if absent."""
        self.sar_memory.setdefault(time,{})['next_state']=actor_state
        if self.parent.debug: print('update_next_state:',self.sar_memory)

    def update_action_perceptual(self,action,time):
        """Set ``'action'`` on the perceptual entry for ``time``.

        A missing entry (e.g. already pruned or cleared) is created instead
        of raising ``KeyError``, matching the sar update methods.
        """
        self.perceptual_memory.setdefault(time,{})['action']=action
        if self.parent.debug: print('update_action_perceptual:',self.perceptual_memory)

    def add_state_action(self,actor_state,action,time):
        """Set ``'state'`` and ``'action'`` on the sar entry for ``time``."""
        entry=self.sar_memory.setdefault(time,{})
        entry['state']=actor_state
        entry['action']=action
        if self.parent.debug: print('add_state_action:',self.sar_memory)

    def update_reward_perceptual(self,reward,time):
        """Set ``'reward'`` on the perceptual entry for ``time``.

        A missing entry (e.g. already pruned or cleared) is created instead
        of raising ``KeyError``, matching the sar update methods.
        """
        self.perceptual_memory.setdefault(time,{})['reward']=reward
        if self.parent.debug: print('update_reward_perceptual:',self.perceptual_memory)

    def update_reward_sar(self,reward,time):
        """Add ``reward`` to the sar entry for ``time``.

        Rewards accumulate: a second call for the same ``time`` is added to
        the value already stored rather than replacing it.
        """
        entry=self.sar_memory.setdefault(time,{})
        if 'reward' in entry:
            entry['reward']+=reward
        else:
            entry['reward']=reward
        if self.parent.debug: print('update_reward_sar:',self.sar_memory)

3.2 The default **Perception** class just copies world states/actions/rewards to actor states/actions/rewards. Should be subclassed for a given World.

In [None]:
class Perception():
    """Default pass-through mapping between world and agent representations."""
    def __init__(self,parent=None):
        self.parent=parent

    def perceive_state(self,world_state):
        """Return a percept for ``world_state``.

        A plain ``dict`` is returned unchanged; anything else (including
        instances of ``dict`` subclasses, since the check is on the exact
        type) is wrapped as ``{'percept': world_state}``.
        """
        if type(world_state) is dict:
            return world_state
        return {'percept':world_state}

    def action_to_world(self,action):
        """Map an agent action to a world action (identity)."""
        return action

    def action_perceptual(self,action):
        """Map an agent action to the form stored in memory (identity)."""
        return action

    def perceive_reward(self,reward):
        """Map a world reward to the agent's reward (identity)."""
        return reward

3.3 The default **Actor** has no Model and returns a fixed action (can be set). It copies the percept from perceptual memory directly into the actor_state. This should be subclassed and/or method *percept_to_state* or *create_actor_state* overridden for a given World.

In [None]:
class Actor():
    """Default actor: ignores its model and always returns ``default_action``."""
    def __init__(self,parent=None,model=None):
        self.parent,self.model=parent,model
        # Both values may be reassigned on the instance to change the defaults.
        self.default_action='default_action'
        self.intrinsic_reward_value=0

    def percept_to_state(self,perceived_state):
        """Extract the actor state: the ``'percept'`` entry of the percept dict."""
        actor_state=perceived_state['percept']
        return actor_state

    def augment_state(self,state):
        """Hook for enriching the state from memory; identity by default."""
        return state

    def call_model(self,actor_state):
        """Pick an action; the default ignores ``actor_state``."""
        return self.default_action

    def compute_reward(self,reward):
        """Map the perceived reward to the value stored in sar memory (identity)."""
        return reward

    def intrinsic_reward(self,precept,action,state):
        """Return the constant ``intrinsic_reward_value``; arguments are unused."""
        return self.intrinsic_reward_value

### Guidelines on overriding/augmenting base classes for world-specific agents
Template indicating methods that need to be overridden/augmented

In [None]:
class TemplateAIAgent(AIAgent):
    """Template showing which AIAgent methods a world-specific agent overrides.

    The base constructor first builds all default components; the ones being
    customised (here only the actor) are then replaced.
    """
    def __init__(self):
        super().__init__()
        ##Augmenting AIAgent
        self.actor=self.Actor(parent=self)
        # etc. for all augmented/overridden components
    def reward(self,reward):
        """Forward the reward to the base class and return its result."""
        ##Augmenting AIAgent
        return super().reward(reward)
    def begin(self,state=None):
        """Start a new episode/epoch.

        ``state`` defaults to ``None`` like ``AIAgent.begin`` so that
        ``agent.begin()`` works on the subclass as it does on the base class.
        """
        ##Augmenting AIAgent
        super().begin(state)

    class Actor(Actor):
        """Template actor that samples a random action."""
        def __init__(self,parent):
            super().__init__(parent=parent)
        def call_model(self,state):
            """Return an action for ``state``.

            NOTE: ``self.action_space`` is not set anywhere in this template;
            a concrete subclass has to assign it before this method is called.
            """
            ##Overriding AIAgent
            action=self.action_space.sample() #override with actual policy
            return action

### Generic (On-policy) Reinforcement Learning Agent including Windowing and Meta-RL
Create a local environment called by a Monitor thread running within the Agent, which implements or
re-uses an on-policy RL training procedure (such as PPO etc.). The env has an input and an output queue. If a training flag is set, the Agent uses a ProxyModel in place of its normal model to compute actions: the actor-state is placed in the env's input queue and an action is awaited from the env's output queue.

The monitor starts by calling the env.reset method that waits on the input queue to receive 
and then return a state. The monitor thread computes an action on the current state and calls
env.step(action), which places the action in the output queue and awaits a reward from the input
queue. After receiving a reward, step again waits for the next state on the input queue.
Once this is also received, both the next state and the reward are returned.

The generic RL agent incorporates windowing, i.e., past win states are concatenated, and meta-RL via the $RL^2$ algorithm (metarl parameter). Note that if win>1 then use_memory has to be true as the window computation uses memory. Further metarl=True requires win>=2.

In [None]:
class RLAgent(AIAgent):
    """Generic on-policy RL agent with optional state windowing and meta-RL.

    While ``self.training`` is true the actor does not query the model
    directly: each (windowed) actor state is pushed into the queue-backed
    ``TrainingEnv`` and the action is read back from it, so the learner
    running in the monitor thread decides the actions. Once training has
    finished, actions come from ``self.model.predict``.

    Parameters:
        algoclass: learner factory, called as
            ``algoclass(policytype, env, verbose=verbose, n_steps=n_steps)``.
            The result must offer ``learn(total_timesteps=...)`` and
            ``predict(state)``.
        monclass: wrapper applied to the training env as
            ``monclass(env, '/tmp/aiagents')``.
        action_space: action space assigned to the wrapped env.
        observation_space: observation space of a single (unwindowed) state.
        policytype: first positional argument handed to ``algoclass``.
        n_steps, verbose: forwarded to ``algoclass``.
        win: number of states concatenated into one observation.
        soclass: required when ``win > 1``; used as
            ``soclass(1, win, obs).stack_observation_space(obs)``.
        metarl: when true, the previous action and reward are appended to
            each state and the observation bounds are widened accordingly.
    """
    # RL Agent using window of previous states taken from memory: parameter win - length
    def __init__(self,algoclass,monclass,action_space,observation_space,policytype='MlpPolicy',
                n_steps=2048,verbose=1,win=1,soclass=None,metarl=False):
        ##Augmenting AIAgent
        super().__init__()
        ##Local RL environment running in a thread to interact with World via queues
        self.env=self.TrainingEnv(parent=self)
        ##Monitoring (need to move class to input parameter and cascaded for scripts to work)
        self.env=monclass(self.env,'/tmp/aiagents')
        self.env.action_space=action_space
        # 1 for spaces whose shape is (), otherwise the number of entries in nvec.
        self.action_dim=(lambda a: 1 if len(a.shape)==0 else a.nvec.flatten().shape[0])(action_space)
        if metarl: observation_space=self.expand_obs_space(observation_space,action_space)
        if win>1:self.env.observation_space=soclass(1,win,observation_space).stack_observation_space(observation_space)
        else:self.env.observation_space=observation_space
        self.monitor=self.Monitor(parent=self)
        self.set_training(True)
        self.tot_rew=0
        # Reward totals recorded by begin(); must exist before begin()/avg_rew() run.
        self.rewL=[]
        self.logL=[]
        self.kill=False
        ##Override Actor
        self.model=algoclass(policytype, self.env, verbose=verbose,n_steps=n_steps)
        self.actor=self.Actor(parent=self,model=self.model)
        # Replace the perception's reward mapping with this agent's version.
        self.perception.perceive_reward=self.perceive_reward
        ## Win/MetaRL Specific
        self.win=win
        self.metarl=metarl
        self.use_memory=True
        self.monitorname='monitor'

    def expand_obs_space(self,obs_space,action_space):
        """Widen the bounds of ``obs_space`` to make room for action and reward.

        The action part is bounded by ``[0, n]`` (or ``[0, nvec]``) and the
        reward part by ``[-inf, inf]``.
        NOTE(review): ``obs_space`` is modified in place and only ``high`` /
        ``low`` are updated, not its shape - confirm that callers expect this.
        """
        if len(action_space.shape)==0:
            high=np.concatenate([obs_space.high,np.array([action_space.n,np.inf])])
            low=np.concatenate([obs_space.low,np.array([0,-np.inf])])
        else:
            h=action_space.nvec.flatten()
            z=np.zeros(h.shape[0])
            high=np.concatenate([obs_space.high,h,np.array([np.inf])])
            low=np.concatenate([obs_space.low,z,np.array([-np.inf])])
        obs_space.high=high
        obs_space.low=low
        return obs_space

    def start(self,training_steps=20000):
        """Launch the monitor thread that trains for ``training_steps`` steps."""
        # self.monitorthread=Thread(name='monitor',target=self.monitor.run,args=(training_steps,))
        self.monitorthread=Thread(name=self.monitorname,target=self.monitor.train,args=(training_steps,))
        self.monitorthread.start()

    def stop(self):
        """Set the kill flag; it is checked at the start of ``TrainingEnv.step``."""
        self.kill=True

    def log(self,entry):
        """Append ``entry`` to the in-memory log list."""
        self.logL.append(entry)

    def set_training(self,value):
        """Switch between queue-driven training and direct model prediction."""
        self.training=value

    class TrainingEnv(Env):
        """Environment whose transitions are fed through queues by the agent."""
        def __init__(self,parent):
            self.parent=parent
            self.inputS=Queue() #states: put by Actor.call_model, read by reset and step
            self.outputS=Queue() #only referenced by print_queues in this class
            self.rewardI=Queue() #(reward,done,info): put by RLAgent.reward, read by step
            self.actionO=Queue() #actions: put by step (and Monitor.train), read by call_model
            self.counter=0
        def reset(self):
            """Block until a state arrives on ``inputS`` and return it."""
            # print('reset')
            self.parent.log(('reset waiting',self.counter))
            self.state=self.inputS.get()
            return self.state
        def step(self,action):
            """Publish ``action``, then block for the reward and the next state.

            Returns ``(next_state, reward, done, info)``. When ``done`` is true
            no further state is awaited and the state obtained by the last
            ``reset`` is returned instead.

            Raises:
                SystemExit: if the parent's kill flag is set.
            """
            # Same effect as sys.exit(-1), without needing the sys module,
            # which this file does not import.
            if self.parent.kill: raise SystemExit(-1)
            # print('step')
            self.actionO.put(action)
            self.parent.log(('step put action',self.counter))
            self.parent.log((self.counter,action))
            self.parent.log(('step waiting for reward',self.counter))
            reward,done,info=self.rewardI.get()
            if not done:
                self.parent.log(('step waiting for next state',self.counter))
                next_state=self.inputS.get()
                self.counter+=1
                # self.parent.log((self.counter,next_state))
            else: next_state=self.state
            # print(self.counter,done)
            return next_state,reward,done,info
        def print_queues(self):
            """Print the current contents of all four queues."""
            print('inputS',self.inputS.queue)
            print('actionO',self.actionO.queue)
            print('rewardI',self.rewardI.queue)
            print('outputS',self.outputS.queue)

    class Monitor():
        """Code executed in the monitor thread to drive the training env."""
        def __init__(self,parent):
            self.parent=parent
        def run(self,training_steps,N_EPISODES=10):
            """Drive the env with randomly sampled actions, then end training mode."""
            for episode in range(N_EPISODES):
                state=self.parent.env.reset()
                for steps in range(training_steps):
                    # env.render()
                    action=self.parent.env.action_space.sample()
                    state, reward, done, info = self.parent.env.step(action)
                    self.parent.log(('step returned:',state, reward, done, info))
            self.parent.set_training(False)
        def train(self,training_steps):
            """Run the model's ``learn`` and then end training mode."""
            self.parent.actor.model.learn(total_timesteps=training_steps)
            # One extra sampled action is queued after learning so that a
            # call_model still blocked on actionO can return.
            self.parent.env.actionO.put(self.parent.env.action_space.sample())
            self.parent.set_training(False)
            # self.parent.log((self.parent.training,self.parent))

    class Actor(Actor):
        """Actor that routes states through the training env while training."""
        def __init__(self,parent,model):
            super().__init__(parent=parent,model=model)
        def call_model(self,state):
            """Return an action for ``state`` (windowed first via ``get_win_state``).

            In training mode the state is queued for the training env and the
            call blocks until an action is available; otherwise the model's
            ``predict`` is used.
            """
            ##Overriding AIAgent
            time=self.parent.time
            state=self.parent.get_win_state(state)
            if self.parent.training:
                self.parent.env.inputS.put(state)
                self.parent.log(('call model put state at time',time))
                self.parent.log(('call model waiting for action at time',time))
                # Fall back to action 0 on ordinary errors only, so that
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                try: action = self.parent.env.actionO.get()#timeout=5)
                except Exception: action=0
                self.parent.log(('call model received action at time',time))
            else:
                action, _states = self.model.predict(state)
            return action
        def augment_state(self,state):
            """Append the previous action and reward to ``state`` when meta-RL is on.

            If sar memory has no entry for ``time-1``, zeros are appended.
            """
            if not self.parent.metarl: return state
            sar=self.parent.memory.sar_memory
            prev=self.parent.time-1
            if prev in sar:
                prev_action=sar[prev]['action']
                prev_reward=sar[prev]['reward']
            elif self.parent.action_dim==1: prev_action,prev_reward=0,0
            else: prev_action,prev_reward=np.zeros(self.parent.action_dim),0
            if is_sequence(prev_action):
                state=np.concatenate([state,prev_action,np.array([prev_reward])])
            else:
                state=np.concatenate([state,np.array([prev_action,prev_reward])])
            return state
    # Win Specific
    def get_win_state(self,state):
        """Concatenate ``state`` with the ``win-1`` preceding stored states.

        Order is current state first, then the ``'state'`` entries of sar
        memory for ``time-1``, ``time-2``, ... If fewer than ``win-1`` earlier
        steps exist, the current state repeated ``win`` times is returned.
        """
        if self.time-self.win+1>=0:
            prev_stateL=[self.memory.sar_memory[self.time-w]['state'] for
                         w in range(1,self.win) if self.time>=w]
            win_state=np.concatenate([state]+prev_stateL)
        else:
            prev_stateL=[state]*self.win
            win_state=np.concatenate(prev_stateL)
        return win_state
    ###
    def perceive_reward(self,reward):
        """Return the first element of the world's reward tuple."""
        return reward[0]
    def get_intrinsic_reward(self):
        """Return the reward stored in sar memory for ``time-1``, or 0 if none."""
        return self.memory.sar_memory.get(self.time-1,{}).get('reward',0)
    def reward(self,reward):
        """Process a world reward, indexed as ``reward[0]``, ``[1]``, ``[2]``.

        The reward already held in sar memory for the previous step is added
        to ``reward[0]`` before the tuple is queued for the training env.
        ``TrainingEnv.step`` unpacks that tuple as ``(reward, done, info)``.
        Returns the perceived reward, as the base class does.
        """
        ##Augmenting AIAgent
        # Read before super().reward(), which adds the world reward to the
        # same sar memory entry.
        i_rew=self.get_intrinsic_reward()
        reward_in=(reward[0]+i_rew,reward[1],reward[2])
        reward=super().reward(reward)
        self.tot_rew+=reward
        if self.training:
            self.env.rewardI.put(reward_in)
            self.log(('call model put reward at time',self.time))
        return reward
    def begin(self,state=None):
        """Record the running reward total and start a new episode/epoch."""
        ##Augmenting AIAgent
        self.rewL.append(self.tot_rew)
        super().begin(state)
    def avg_rew(self):
        """Return the mean of the recorded reward totals (0 when none recorded)."""
        if not self.rewL: return 0
        return sum(self.rewL)/len(self.rewL)

OLD STUFF can be deleted in due course

In [None]:
# class RLAgent(AIAgent):
#     def __init__(self,algoclass,action_space,observation_space,policytype='MlpPolicy',
#                 n_steps=2048,verbose=1):
#         ##Augmenting AIAgent
#         super().__init__()
#         ##Local RL environment running in a thread to interact with World via queues
#         self.env=self.TrainingEnv(parent=self)
#         self.env.action_space=action_space
#         self.env.observation_space=observation_space
#         self.monitor=self.Monitor(parent=self)
#         self.set_training(True)
#         self.tot_rew=0
#         self.logL=[]
#         self.kill=False
#         ##Override Actor
#         self.model=algoclass(policytype, self.env, verbose=verbose,n_steps=n_steps)
#         self.actor=self.Actor(parent=self,model=self.model)
#         self.perception.perceive_reward=self.perceive_reward
        
#     def start(self,training_steps=20000):
#         # self.monitorthread=Thread(name='monitor',target=self.monitor.run,args=(training_steps,))
#         self.monitorthread=Thread(name='monitor',target=self.monitor.train,args=(training_steps,))
#         self.monitorthread.start() 
    
#     def stop(self):
#         self.kill=True
    
#     def log(self,entry):
#         self.logL+=[entry]
        
#     def set_training(self,value):
#         self.training=value
        
#     class TrainingEnv(Env):
#         def __init__(self,parent):
#             self.parent=parent
#             self.inputS=Queue() #written by act read by reset and step
#             self.outputS=Queue() #written by act read by act
#             self.rewardI=Queue() #written by act and read by step
#             self.actionO=Queue() #written by step and read by act
#             self.counter=0
#         def reset(self):
#             # print('reset')
#             self.parent.log(('reset waiting',self.counter))
#             self.state=self.inputS.get()
#             return self.state
#         def step(self,action):
#             if self.parent.kill: sys.exit(-1)
#             # print('step')
#             self.actionO.put(action)
#             self.parent.log(('step put action',self.counter))
#             self.parent.log((self.counter,action))
#             self.parent.log(('step waiting for reward',self.counter))
#             reward,done,info=self.rewardI.get()
#             if not done:
#                 self.parent.log(('step waiting for next state',self.counter))
#                 next_state=self.inputS.get()
#                 self.counter+=1
#                 # self.parent.log((self.counter,next_state))
#             else: next_state=self.state
#             # print(self.counter,done)
#             return next_state,reward,done,info
#         def print_queues(self):
#             print('inputS',self.inputS.queue)
#             print('actionO',self.actionO.queue)
#             print('rewardI',self.rewardI.queue)
#             print('outputS',self.outputS.queue)
             
#     class Monitor():
#         def __init__(self,parent):
#             self.parent=parent
#         def run(self,training_steps,N_EPISODES=10):
#             for episode in range(N_EPISODES):
#                 state=self.parent.env.reset()
#                 for steps in range(training_steps):
#                     # env.render()
#                     action=self.parent.env.action_space.sample()
#                     state, reward, done, info = self.parent.env.step(action)
#                     self.parent.log(('step returned:',state, reward, done, info))
#             self.parent.set_training(False)
#         def train(self,training_steps):
#             self.parent.actor.model.learn(total_timesteps=training_steps)
#             self.parent.env.actionO.put(self.parent.env.action_space.sample())
#             self.parent.set_training(False)
#             # self.parent.log((self.parent.training,self.parent))
    
#     class Actor(Actor):
#         def __init__(self,parent,model):
#             super().__init__(parent=parent,model=model)
#         def call_model(self,state):
#         ##Overriding AIAgent
#             time=self.parent.time
#             if self.parent.training: 
#                 self.parent.env.inputS.put(state)
#                 self.parent.log(('call model put state at time',time))
#                 self.parent.log(('call model waiting for action at time',time))
#                 try: action = self.parent.env.actionO.get()#timeout=5)
#                 except: action=0
#                 self.parent.log(('call model received action at time',time))
#             else: action, _states = self.model.predict(state)
#             return action
        
#     def perceive_reward(self,reward):
#         return reward[0]
#     def reward(self,reward):
#         ##Augmenting AIAgent
#         reward_in=reward
#         reward=super().reward(reward)
#         self.tot_rew+=reward
#         if self.training: 
#             self.env.rewardI.put(reward_in)
#             self.log(('call model put reward at time',self.time))
#     def begin(self):
#         ##Augmenting AIAgent
#         self.rewL+=[self.tot_rew]
#         super().begin()
#     def avg_rew(self):
#         return sum(self.rewL)/len(self.rewL)

In [None]:
# class RLAgent(AIAgent):
#     def __init__(self,action_space,observation_space,algoclass,training_steps=20000,
#                 policytype='MlpPolicy',verbose=0):
#         ##Augmenting AIAgent
#         super().__init__()
#         ##Local RL environment running in a thread to interact with World via queues
#         self.env=self.TrainingEnv(parent=self)
#         self.env.action_space=action_space
#         self.env.observation_space=observation_space
#         self.monitor=self.Monitor(parent=self)
#         self.set_training(True)
#         self.monitorthread=Thread(name='monitor',target=self.monitor.train,args=(training_steps,))
#         self.tot_rew=0
#         self.logL=[]
#         self.kill=False
#         ##Override Actor
#         self.model=algoclass(policytype, self.env, verbose=verbose,n_steps=training_steps)
#         self.actor=self.Actor(parent=self,model=self.model)
        
#     def start(self):
#         self.monitorthread.start() 
    
#     def stop(self):
#         self.kill=True
    
#     def log(self,entry):
#         self.logL+=[entry]
        
#     def set_training(self,value):
#         self.training=value
        
#     class TrainingEnv(Env):
#         def __init__(self,parent):
#             self.parent=parent
#             # self.observation_space=spaces.Box(
#             #     low=np.array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38]), 
#             #     high=np.array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38]), 
#             #     shape=(4,), dtype=np.float32)
#             self.inputS=Queue() #written by act read by reset and step
#             self.outputS=Queue() #written by act read by act
#             self.rewardI=Queue() #written by act and read by step
#             self.actionO=Queue() #written by step and read by act
#             self.counter=0
#         def reset(self):
#             # print('reset')
#             self.parent.log(('reset waiting',self.counter))
#             self.state=self.inputS.get()
#             return self.state
#         def step(self,action):
#             if self.parent.kill: sys.exit(-1)
#             # print('step')
#             self.actionO.put(action)
#             self.parent.log(('step put action',self.counter))
#             self.parent.log((self.counter,action))
#             self.parent.log(('step waiting for reward',self.counter))
#             reward,done,info=self.rewardI.get()
#             if not done:
#                 self.parent.log(('step waiting for next state',self.counter))
#                 next_state=self.inputS.get()
#                 self.counter+=1
#                 self.parent.log((self.counter,next_state))
#             else: next_state=self.state
#             # print(self.counter,done)
#             return next_state,reward,done,info
#         def print_queues(self):
#             print('inputS',self.inputS.queue)
#             print('actionO',self.actionO.queue)
#             print('rewardI',self.rewardI.queue)
#             print('outputS',self.outputS.queue)
             
#     class Monitor():
#         def __init__(self,parent):
#             self.parent=parent
#         def run(self,training_steps,N_EPISODES=10):
#             for episode in range(N_EPISODES):
#                 state=self.parent.env.reset()
#                 for steps in range(training_steps):
#                     # env.render()
#                     action=self.parent.env.action_space.sample()
#                     state, reward, done, info = self.parent.env.step(action)
#                     # print(next_state, reward, done, info, action)
#             self.parent.set_training(False)
#         def train(self,training_steps):
#             self.parent.actor.model.learn(total_timesteps=training_steps)
#             self.parent.env.actionO.put(self.parent.env.action_space.sample())
#             self.parent.set_training(False)
#             # self.parent.log((self.parent.training,self.parent))
    
#     class Actor(Actor):
#         def __init__(self,parent,model):
#             super().__init__(parent=parent,model=model)
#         def call_model(self,state):
#         ##Overriding AIAgent
#             time=self.parent.time
#             if self.parent.training: 
#                 self.parent.env.inputS.put(state)
#                 self.parent.log(('call model put state at time',time))
#                 self.parent.log(('call model waiting for action at time',time))
#                 try: action = self.parent.env.actionO.get()#timeout=5)
#                 except: action=0
#                 self.parent.log(('call model received action at time',time))
#             else: action, _states = self.model.predict(state)
#             return action
#         def compute_reward(self,reward):
#             return reward[0]
    
#     def reward(self,reward):
#         ##Augmenting AIAgent
#         reward=super().reward(reward)
#         self.tot_rew+=reward[0]
#         if self.training: 
#             self.env.rewardI.put(reward)
#             self.log(('call model put reward at time',self.time))
#     def begin(self):
#         ##Augmenting AIAgent
#         self.rewL+=[self.tot_rew]
#         super().begin()
#     def avg_rew(self):
#         return sum(self.rewL)/len(self.rewL)