In [1]:
import gym
from gym import spaces
from gym import Env
import random
import numpy as np
from threading import Thread
import threading
from stable_baselines3 import PPO

In [2]:
# for thread in threading.enumerate(): 
#     print(thread.name)

In [3]:
# for thread in threading.enumerate(): 
#     print(thread.name)

### Agent-based RL in Simple Worlds

In [4]:
# Shared environment instance: it is handed to both GenWorld objects below and
# is stepped directly by the final evaluation loop.
env = gym.make("CartPole-v1")
# Alternative task, kept for reference:
# env = gym.make('MountainCar-v0')

In [5]:
import import_ipynb
from aiagentbase import AIAgent,Controller,Memory,Perception,Actor

importing Jupyter notebook from aiagentbase.ipynb


In [6]:
class GenWorld():
    """Drives an agent through episodes of a gym-style environment.

    The world owns the environment: it resets it, hands each observation to
    the agent, applies the action the agent returns and reports the resulting
    ``(reward, done, info)`` tuple back to the agent.

    Attributes:
        env: environment exposing ``observation_space``, ``reset()`` and
            ``step(action)``; ``step`` must return the 4-tuple
            ``(state, reward, done, info)``.
        test_episodes: indices of the episodes at whose end
            ``agent.training`` was false.
        world_over: set by ``stop()``; makes ``run`` return after the episode
            in progress.
    """
    def __init__(self,env):
        self.env=env
        self.test_episodes=[]
        self.world_over=False
    def stop(self):
        """Ask ``run`` to return once the episode in progress has finished."""
        self.world_over=True
    def run(self,agent=None,n_episodes=10,episode_maxlen=10):
        """Run up to ``n_episodes`` episodes of at most ``episode_maxlen`` steps.

        Args:
            agent: object providing ``begin()``, ``act(state)``,
                ``reward((reward, done, info))``, ``avg_rew()`` and a ``time``
                attribute. A ``training`` attribute is optional; when the
                instance lacks one it is created and set to ``False``.
            n_episodes: maximum number of episodes to play.
            episode_maxlen: maximum number of steps per episode.

        Returns:
            Whatever ``agent.avg_rew()`` returns after the last episode.

        Raises:
            ValueError: if no agent is supplied.
        """
        if agent is None:
            raise ValueError('GenWorld.run() requires an agent')
        # Always use the environment this world was built with, never a
        # notebook-level global that happens to be called `env`.
        agent.observation_space=self.env.observation_space
        if 'training' not in agent.__dict__: agent.training=False
        # `testing` remembers whether the train -> test switch was already
        # announced, so the message below is printed only once.
        testing=not agent.training
        if agent.training: print('Starting Training time: ',agent.time)
        for episode in range(n_episodes):
            state=self.env.reset()
            agent.begin()
            for t in range(episode_maxlen):
                action=agent.act(state)
                state, reward, done, info = self.env.step(action)
                agent.reward((reward,done,info))
                if done:
                    break
            if self.world_over:break
            if not agent.training: self.test_episodes+=[episode]
            # The agent may flip `training` to False from another thread;
            # report the first episode boundary at which that is observed.
            if not agent.training and not testing:
                print('Training Over at time: ',agent.time)
                testing=True
        print('Testing Done time: ', agent.time, ' Reward: ', agent.avg_rew())
        return agent.avg_rew()

In [7]:
# Doesn't use the AIAgent architecture classes but implements the same interface - for initial testing
class RandomAgent():
    """Baseline agent that samples an action from ``action_space`` each step.

    Attributes:
        action_space: object whose ``sample()`` method returns an action.
        tot_rew: reward accumulated over every step seen so far.
        rewL: snapshot of ``tot_rew`` taken at each ``begin()`` call.
        time: number of actions taken so far.
    """
    def __init__(self,action_space):
        self.action_space=action_space
        self.tot_rew=0
        self.rewL=[]
        # GenWorld.run prints agent.time, so the attribute has to exist.
        self.time=0
    def act(self,state):
        """Return a sampled action; ``state`` is ignored."""
        self.time+=1
        return self.action_space.sample()
    def reward(self,rew):
        """Accumulate the scalar reward, the first item of ``(reward, done, info)``."""
        self.tot_rew+=rew[0]
    def begin(self,state=None):
        """Mark the start of an episode by recording the running reward total.

        ``state`` is optional and unused, so the world can call ``begin()``
        with no argument.
        """
        self.rewL.append(self.tot_rew)
    def avg_rew(self):
        """Return the mean of the recorded snapshots, or 0.0 if there are none."""
        if not self.rewL:
            return 0.0
        return sum(self.rewL)/len(self.rewL)

In [8]:
class RandomAIAgent(AIAgent):
    """AIAgent whose actor samples an action from ``action_space`` each step.

    Besides what AIAgent does, the agent keeps a running reward total
    (``tot_rew``) and stores a snapshot of it in ``rewL`` every time an
    episode begins.
    """
    def __init__(self,action_space):
        super().__init__()
        # Install the sampling actor defined below.
        self.actor=self.Actor(parent=self)
        self.action_space=action_space
        self.tot_rew=0
        self.rewL=[]

    class Actor(Actor):
        """Actor whose model call ignores the state and samples an action."""
        def __init__(self,parent):
            super().__init__(parent=parent)
        def call_model(self,state):
            # Overrides the base model call: no model, just a sampled action.
            return self.parent.action_space.sample()
        def compute_reward(self,reward):
            # Only the first item of the reward tuple is used.
            return reward[0]

    def reward(self,rew):
        """Add the scalar reward to the running total, then defer to AIAgent."""
        self.tot_rew=self.tot_rew+rew[0]
        return super().reward(rew)
    def begin(self):
        """Record the running total for the new episode, then defer to AIAgent."""
        self.rewL.append(self.tot_rew)
        super().begin()
    def avg_rew(self):
        """Return the mean of the recorded running-total snapshots."""
        snapshots=self.rewL
        return sum(snapshots)/len(snapshots)

In [9]:
# Random baseline agent. With training switched off, GenWorld.run records every
# episode as a test episode.
agent=RandomAIAgent(env.action_space)
agent.training=False

In [10]:
# Flags consumed by the AIAgent base class (defined in aiagentbase, not shown
# here). Presumably use_memory enables recording into agent.memory - TODO confirm.
agent.debug=False
agent.use_memory=True

In [11]:
# World that will drive the random agent through the shared environment.
world=GenWorld(env=env)

In [12]:
# Background thread that will call world.run(agent, n_episodes=1000, episode_maxlen=200).
worldthread=Thread(name='world',target=world.run,args=(agent,1000,200))

In [13]:
# Start the run. The thread is not joined, so the "Testing Done" line is
# printed whenever the run finishes.
worldthread.start()

Testing Done time:  22281  Reward:  11120.887


In [14]:
# world.run(agent,10,10)

In [15]:
# agent.memory.sar_memory

### Training an AI Agent's Model using Generic RL Agent

In [16]:
from threading import Thread
import threading
import sys

In [17]:
from queue import Queue

In [18]:
from aiagentbase import RLAgent

In [19]:
# class RLAgent(AIAgent):
#     def __init__(self,algoclass,action_space,observation_space,policytype='MlpPolicy',
#                 n_steps=2048,verbose=1):
#         ##Augmenting AIAgent
#         super().__init__()
#         ##Local RL environment running in a thread to interact with World via queues
#         self.env=self.TrainingEnv(parent=self)
#         self.env.action_space=action_space
#         self.env.observation_space=observation_space
#         self.monitor=self.Monitor(parent=self)
#         self.set_training(True)
#         self.tot_rew=0
#         self.logL=[]
#         self.kill=False
#         ##Override Actor
#         self.model=algoclass(policytype, self.env, verbose=verbose,n_steps=n_steps)
#         self.actor=self.Actor(parent=self,model=self.model)
        
#     def start(self,training_steps=20000):
#         # self.monitorthread=Thread(name='monitor',target=self.monitor.run,args=(training_steps,))
#         self.monitorthread=Thread(name='monitor',target=self.monitor.train,args=(training_steps,))
#         self.monitorthread.start() 
    
#     def stop(self):
#         self.kill=True
    
#     def log(self,entry):
#         self.logL+=[entry]
        
#     def set_training(self,value):
#         self.training=value
        
#     class TrainingEnv(Env):
#         def __init__(self,parent):
#             self.parent=parent
#             self.inputS=Queue() #written by act read by reset and step
#             self.outputS=Queue() #written by act read by act
#             self.rewardI=Queue() #written by act and read by step
#             self.actionO=Queue() #written by step and read by act
#             self.counter=0
#         def reset(self):
#             # print('reset')
#             self.parent.log(('reset waiting',self.counter))
#             self.state=self.inputS.get()
#             return self.state
#         def step(self,action):
#             if self.parent.kill: sys.exit(-1)
#             # print('step')
#             self.actionO.put(action)
#             self.parent.log(('step put action',self.counter))
#             self.parent.log((self.counter,action))
#             self.parent.log(('step waiting for reward',self.counter))
#             reward,done,info=self.rewardI.get()
#             if not done:
#                 self.parent.log(('step waiting for next state',self.counter))
#                 next_state=self.inputS.get()
#                 self.counter+=1
#                 # self.parent.log((self.counter,next_state))
#             else: next_state=self.state
#             # print(self.counter,done)
#             return next_state,reward,done,info
#         def print_queues(self):
#             print('inputS',self.inputS.queue)
#             print('actionO',self.actionO.queue)
#             print('rewardI',self.rewardI.queue)
#             print('outputS',self.outputS.queue)
             
#     class Monitor():
#         def __init__(self,parent):
#             self.parent=parent
#         def run(self,training_steps,N_EPISODES=10):
#             for episode in range(N_EPISODES):
#                 state=self.parent.env.reset()
#                 for steps in range(training_steps):
#                     # env.render()
#                     action=self.parent.env.action_space.sample()
#                     state, reward, done, info = self.parent.env.step(action)
#                     self.parent.log(('step returned:',state, reward, done, info))
#             self.parent.set_training(False)
#         def train(self,training_steps):
#             self.parent.actor.model.learn(total_timesteps=training_steps)
#             self.parent.env.actionO.put(self.parent.env.action_space.sample())
#             self.parent.set_training(False)
#             # self.parent.log((self.parent.training,self.parent))
    
#     class Actor(Actor):
#         def __init__(self,parent,model):
#             super().__init__(parent=parent,model=model)
#         def call_model(self,state):
#         ##Overriding AIAgent
#             time=self.parent.time
#             if self.parent.training: 
#                 self.parent.env.inputS.put(state)
#                 self.parent.log(('call model put state at time',time))
#                 self.parent.log(('call model waiting for action at time',time))
#                 try: action = self.parent.env.actionO.get()#timeout=5)
#                 except: action=0
#                 self.parent.log(('call model received action at time',time))
#             else: action, _states = self.model.predict(state)
#             return action
    
#     def reward(self,reward):
#         ##Augmenting AIAgent
#         reward_in=reward
#         reward=super().reward(reward)
#         self.tot_rew+=reward
#         if self.training: 
#             self.env.rewardI.put(reward_in)
#             self.log(('call model put reward at time',self.time))
#     def begin(self):
#         ##Augmenting AIAgent
#         self.rewL+=[self.tot_rew]
#         super().begin()
#     def avg_rew(self):
#         return sum(self.rewL)/len(self.rewL)

In [26]:
# Number of training timesteps passed to agent.start() below.
training_steps=2048

In [27]:
# Rebinds `agent` to an RLAgent (imported from aiagentbase) configured with PPO
# and the action/observation spaces of the shared environment.
agent=RLAgent(algoclass=PPO,action_space=env.action_space,observation_space=env.observation_space,
              verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [28]:
# Same AIAgent flags as for the random agent. Presumably use_memory enables
# recording into agent.memory, which is inspected further down - TODO confirm.
agent.debug=False
agent.use_memory=True

In [29]:
# Reset the reward bookkeeping before the run.
# NOTE(review): the commented-out reference __init__ above sets tot_rew but not
# rewL, so this cell may be what creates rewL - confirm against aiagentbase.
agent.rewL=[]
agent.tot_rew=0

In [30]:
# Kick off training. Going by the commented-out reference copy above, start()
# presumably launches model.learn on a background 'monitor' thread - confirm
# against the imported RLAgent.
agent.start(training_steps=training_steps)

In [31]:
# Fresh world for the RL agent (rebinds `world`, so test_episodes starts empty).
world=GenWorld(env=env)

In [32]:
# Background thread that will call world.run(agent, n_episodes=2000, episode_maxlen=200).
worldthread=Thread(name='world',target=world.run,args=(agent,2000,200))

In [33]:
# Start the run. The thread is not joined, so the training and testing
# messages below are printed as the run progresses.
worldthread.start()

Starting Training time:  0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 24.5     |
|    ep_rew_mean     | 24.5     |
| time/              |          |
|    fps             | 828      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
Training Over at time:  2055
Testing Done time:  58762  Reward:  29315.1605


In [30]:
# Number of entries in the agent's log list.
# NOTE(review): this cell's execution count (30) is out of sequence with the
# cells above (26-33), so the value shown presumably comes from a different run.
len(agent.logL)

16388

In [31]:
# Current value of the agent's time counter.
# NOTE(review): the value shown is lower than the "Testing Done time" printed
# above, so it was presumably read during a run or in another session.
agent.time

56454

In [None]:
# world.run(agent,n_episodes=2000,episode_maxlen=200)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Count of test episodes recorded by the world. Indexing agent.rewL raises
# IndexError if a recorded episode index has no reward snapshot.
testing_len=len([agent.rewL[t] for t in world.test_episodes])

In [None]:
# testing_len

In [None]:
# agent.rewL

In [None]:
# Mean of the numerical gradient of agent.rewL. Presumably rewL holds cumulative
# reward snapshots, which makes this an average reward per episode - TODO confirm.
print(np.gradient(agent.rewL).mean())

In [None]:
# Plot the numerical gradient of agent.rewL against episode index.
plt.plot(np.gradient(agent.rewL))

In [None]:
# Evaluate the trained model directly against the environment, bypassing
# GenWorld and the agent's act/reward plumbing.
episodes = 500
max_steps = 200  # per-episode step cap, matching the cap passed to world.run
rewL=[]
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    steps=0
    # `<` (not `<=`) so an episode takes at most max_steps actions.
    while not done and steps<max_steps:
        # env.render()
        action,_ = agent.model.predict(state)
        state, reward, done, info = env.step(action)
        score+=reward
        steps+=1
    # print('Episode:{} Score:{}'.format(episode, score))
    rewL.append(score)
# The environment is closed here; cells that step `env` must be run before this one.
env.close()

In [None]:
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Mean score per episode over the evaluation loop above.
np.array(rewL).mean()

In [None]:
# Per-episode evaluation scores against episode index.
plt.plot(rewL)

In [None]:
# PPO??

In [None]:
# Number of entries in the agent's log list.
len(agent.logL)

In [30]:
# The agent's state/action/reward memory. The output below shows entries are
# dicts with 'state', 'action', 'reward' and 'next_state' keys.
S=agent.memory.sar_memory

In [33]:
# Show the entries stored under keys/indices 0..9.
[S[t] for t in range(10)]

[{'state': array([-0.0053942 ,  0.02053552, -0.01799585, -0.02342881]),
  'action': 0,
  'reward': 1.0,
  'next_state': array([-0.00498349, -0.17432379, -0.01846443,  0.26352234])},
 {'state': array([-0.00498349, -0.17432379, -0.01846443,  0.26352234]),
  'action': 1,
  'reward': 1.0,
  'next_state': array([-0.00846996,  0.02105678, -0.01319398, -0.0349268 ])},
 {'state': array([-0.00846996,  0.02105678, -0.01319398, -0.0349268 ]),
  'action': 0,
  'reward': 1.0,
  'next_state': array([-0.00804883, -0.1738735 , -0.01389252,  0.25356426])},
 {'state': array([-0.00804883, -0.1738735 , -0.01389252,  0.25356426]),
  'action': 1,
  'reward': 1.0,
  'next_state': array([-0.0115263 ,  0.02144403, -0.00882123, -0.04346804])},
 {'state': array([-0.0115263 ,  0.02144403, -0.00882123, -0.04346804]),
  'action': 1,
  'reward': 1.0,
  'next_state': array([-0.01109742,  0.21669136, -0.00969059, -0.33892104])},
 {'state': array([-0.01109742,  0.21669136, -0.00969059, -0.33892104]),
  'action': 0,
  '

In [34]:
# The agent's perceptual memory. The output below shows entries are dicts with
# 'percept', 'action' and 'reward' keys.
M=agent.memory.perceptual_memory

In [35]:
# Show the entries stored under keys/indices 0..9.
[M[t] for t in range(10)]

[{'percept': array([-0.0053942 ,  0.02053552, -0.01799585, -0.02342881]),
  'action': 0,
  'reward': 1.0},
 {'percept': array([-0.00498349, -0.17432379, -0.01846443,  0.26352234]),
  'action': 1,
  'reward': 1.0},
 {'percept': array([-0.00846996,  0.02105678, -0.01319398, -0.0349268 ]),
  'action': 0,
  'reward': 1.0},
 {'percept': array([-0.00804883, -0.1738735 , -0.01389252,  0.25356426]),
  'action': 1,
  'reward': 1.0},
 {'percept': array([-0.0115263 ,  0.02144403, -0.00882123, -0.04346804]),
  'action': 1,
  'reward': 1.0},
 {'percept': array([-0.01109742,  0.21669136, -0.00969059, -0.33892104]),
  'action': 0,
  'reward': 1.0},
 {'percept': array([-0.00676359,  0.02170863, -0.01646902, -0.04930967]),
  'action': 1,
  'reward': 1.0},
 {'percept': array([-0.00632942,  0.21706281, -0.01745521, -0.34714288]),
  'action': 0,
  'reward': 1.0},
 {'percept': array([-0.00198816,  0.02219344, -0.02439807, -0.06001492]),
  'action': 1,
  'reward': 1.0},
 {'percept': array([-0.00154429,  0.2