In [None]:
# !pip install --quiet stable_baselines3
# !pip install --quiet import_ipynb
# !git clone https://github.com/gmshroff/aiagentarch.git
# %cd aiagentarch

In [None]:
import gym
from gym import spaces
from gym import Env
import random
import numpy as np
from threading import Thread
import threading
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv,VecFrameStack,StackedObservations
from stable_baselines3.common.monitor import Monitor as Mon

In [None]:
# for thread in threading.enumerate(): 
#     print(thread.name)

In [None]:
# for thread in threading.enumerate(): 
#     print(thread.name)

### Agent-based RL in Simple Worlds with windowing and Meta-RL

- Uses a window of recent states so the agent can still act when the velocity components of the observation are masked.
- Can also use meta-RL: **TBD: test with varying physics in a CL setting**

In [None]:
class MaskedPole(Env):
    """CartPole-v1 wrapper that zeroes entries 1 and 3 of every observation.

    In CartPole's observation layout those entries are the cart velocity and
    the pole angular velocity, so the agent only sees positions/angles.
    The action and observation spaces are copied unchanged from the wrapped
    env, so the observation still has the original number of components.

    Uses the legacy gym API: reset() returns only the observation and step()
    returns a 4-tuple (obs, reward, done, info).
    """
    def __init__(self):
        super().__init__()
        self.env=gym.make('CartPole-v1')
        self.action_space=self.env.action_space
        self.observation_space=self.env.observation_space
    def _mask(self,obs):
        # Zero the two velocity entries in place and hand the same array back.
        obs[1]=0
        obs[3]=0
        return obs
    def reset(self):
        """Reset the wrapped env and return the masked initial observation."""
        return self._mask(self.env.reset())
    def step(self,action):
        """Step the wrapped env; return (masked obs, reward, done, info)."""
        obs, rewards, dones, info = self.env.step(action)
        return self._mask(obs), rewards, dones, info
    def render(self,mode="human"):
        """Render the wrapped env in the requested mode and return its result.

        The mode is forwarded (it was previously ignored) and the result is
        returned so that modes producing a frame, e.g. 'rgb_array', are usable.
        """
        return self.env.render(mode=mode)
    def close(self):
        """Close the wrapped env so any render window/resources are released."""
        self.env.close()

In [None]:
# Build the velocity-masked CartPole environment defined above.
env = MaskedPole()

In [None]:
# NOTE(review): this rebinds `env`, replacing the MaskedPole instance created in
# the previous cell. On a top-to-bottom run every later cell therefore uses the
# plain (unmasked) CartPole-v1; run only one of the two cells to choose.
env = gym.make("CartPole-v1")

In [None]:
import import_ipynb
from aiagentbase import AIAgent,Controller,Memory,Perception,Actor

In [None]:
class GenWorld():
    """Drives an agent through episodes of a gym-style environment.

    The environment must expose `observation_space`, `reset()` returning a
    state, and `step(action)` returning `(state, reward, done, info)`.
    The agent must expose `begin()`, `act(state)`, `reward((reward, done, info))`,
    `avg_rew()` and a `time` attribute.

    Attributes:
        env: the environment stepped by run().
        test_episodes: indices of episodes that finished while the agent was
            not in training mode.
        world_over: stop flag; set by stop() or when training is seen to end.
    """
    def __init__(self,env):
        self.env=env
        self.test_episodes=[]
        self.world_over=False
    def stop(self):
        """Ask run() to exit; the flag is checked after each episode's steps."""
        self.world_over=True
    def run(self,agent=None,n_episodes=10,episode_maxlen=10):
        """Run up to `n_episodes` episodes of at most `episode_maxlen` steps each.

        Args:
            agent: the agent to drive (required despite the None default).
            n_episodes: maximum number of episodes.
            episode_maxlen: maximum number of steps per episode.

        Returns:
            agent.avg_rew() after the last episode.

        Raises:
            ValueError: if no agent is supplied.
        """
        if agent is None:
            raise ValueError('GenWorld.run requires an agent')
        # Always use the environment this world was built with rather than a
        # notebook-global `env`, so reset() and step() hit the same object.
        env=self.env
        agent.observation_space=env.observation_space
        if 'training' not in agent.__dict__: agent.training=False
        # `testing` records whether the testing phase has already been announced.
        testing=not agent.training
        if agent.training: print('Starting Training time: ',agent.time)
        for episode in range(n_episodes):
            # print('CartAgent','starting episode')
            state=env.reset()
            agent.begin()
            # print(agent.time)#,agent.ep)
            for t in range(episode_maxlen):
                # env.render(mode='rgb_array')
                action=agent.act(state)
                # print(episode,t,'Action: ', action)
                state, reward, done, info = env.step(action)
                agent.reward((reward,done,info))
                # print(episode,t,'Reward sent: ', reward)
                if done:
                    break
            if self.world_over:break
            if not agent.training: self.test_episodes+=[episode]
            if not agent.training and not testing:
                # First episode that ended with training switched off: announce
                # it and raise the stop flag. Because the flag is only checked
                # after an episode's steps, one further episode runs before exit.
                print('Training Over at time: ',agent.time)
                testing=True
                self.world_over=True
        print('Testing Done time: ', agent.time, ' Reward: ', agent.avg_rew())
        return agent.avg_rew()

In [None]:
# Doesn't use the AIAgent architecture classes but implements the same interface - for initial testing
class RandomAgent():
    """Agent that samples a random action at every step.

    Attributes:
        action_space: object with a sample() method used to pick actions.
        tot_rew: running sum of all rewards received (never reset per episode).
        rewL: value of tot_rew recorded at the start of each episode.
        time: number of act() calls so far; provided because GenWorld.run
            reads agent.time when printing progress.
    """
    def __init__(self,action_space):
        self.action_space=action_space
        self.tot_rew=0
        self.rewL=[]
        self.time=0
    def act(self,state):
        """Return a random action; `state` is ignored."""
        self.time+=1
        action = self.action_space.sample()
        return action
    def reward(self,rew):
        """Accumulate the reward; `rew` is a (reward, done, info) tuple."""
        self.tot_rew+=rew[0]
    def begin(self,state=None):
        """Mark the start of an episode by recording the running reward total.

        `state` is optional and unused, so both begin() (as GenWorld.run calls
        it) and begin(state) work.
        """
        self.rewL+=[self.tot_rew]
    def avg_rew(self):
        """Return the mean of the recorded totals, or 0.0 if none were recorded."""
        if not self.rewL:
            return 0.0
        return sum(self.rewL)/len(self.rewL)

In [None]:
class RandomAIAgent(AIAgent):
    """Random-action agent built on the AIAgent architecture classes.

    Adds reward bookkeeping on top of AIAgent:
        tot_rew: running sum of all rewards received (never reset per episode).
        rewL: value of tot_rew recorded at the start of each episode.
    """
    def __init__(self,action_space):
        super().__init__()
        self.actor=self.Actor(parent=self)
        self.action_space=action_space
        self.tot_rew=0
        self.rewL=[]
        
    # The base class named here is the imported module-level Actor; the nested
    # class only shadows that name inside RandomAIAgent.
    class Actor(Actor):
        def __init__(self,parent): 
            super().__init__(parent=parent)
        def call_model(self,state):
        ##Overriding AIAgent.Model
            # Ignore the state and sample from the owning agent's action space.
            action = self.parent.action_space.sample()
            return action
        def compute_reward(self,reward):
            # `reward` is a (reward, done, info) tuple; keep the scalar reward.
            return reward[0]
    
    def reward(self,rew):
        ##Augmenting AIAgent
        self.tot_rew+=rew[0]
        return super().reward(rew)
    def begin(self):
        ##Augmenting AIAgent
        self.rewL+=[self.tot_rew]
        super().begin()
    def avg_rew(self):
        """Return the mean of the recorded totals, or 0.0 if none were recorded.

        The empty case occurs right after rewL has been reset and before the
        first begin() call, e.g. when polled while the world thread is starting.
        """
        if not self.rewL:
            return 0.0
        return sum(self.rewL)/len(self.rewL)

In [None]:
# Create the random baseline agent for whichever `env` is currently bound,
# and mark it as not training so GenWorld.run treats every episode as a test.
agent=RandomAIAgent(env.action_space)
agent.training=False

In [None]:
# Flags read by the imported AIAgent base class (not defined in this notebook);
# presumably they silence debug output and enable the memory component — TODO confirm.
agent.debug=False
agent.use_memory=True

In [None]:
# Memory limits consumed by the imported AIAgent/Memory classes; presumably they
# cap how many perceptual and state-action-reward entries are kept — TODO confirm.
agent.limit_memory=True
agent.memory.limit_perceptual=2
agent.memory.limit_sar=4

In [None]:
# World that will drive the agent through the currently bound `env`.
world=GenWorld(env=env)

In [None]:
# Clear the agent's reward bookkeeping before the run.
agent.tot_rew=0
agent.rewL=[]
agent.ep=[]

In [None]:
# Run the world in a background thread named 'world':
# world.run(agent, n_episodes=1000, episode_maxlen=200).
worldthread=Thread(name='world',target=world.run,args=(agent,1000,200))

In [None]:
# Start the episodes; this returns immediately while the run continues in the
# background. Call world.stop() to end it early.
worldthread.start()

In [None]:
# Mean of the recorded reward totals divided by the number of entries in agent.ep.
# NOTE(review): agent.ep was reset to [] above and is presumably filled by AIAgent
# as the run progresses; evaluating this before then raises ZeroDivisionError.
agent.avg_rew()/len(agent.ep)

In [None]:
# world.run(agent,10,10)

In [None]:
# agent.memory.perceptual_memory

### Training an AI Agent's Model using Generic RL Agent

In [None]:
from threading import Thread
import threading
import sys

In [None]:
from queue import Queue

In [None]:
from aiagentbase import RLAgent

In [None]:
# Training budget passed to agent.start() below.
training_steps=50000

In [None]:
# Rebinds `agent` to an RLAgent that uses PPO, with spaces taken from the current `env`.
# win=1 presumably means a window of a single observation (no stacking) and
# metarl=False disables the meta-RL variant — semantics live in aiagentbase; TODO confirm.
agent=RLAgent(algoclass=PPO,monclass=Mon,action_space=env.action_space,observation_space=env.observation_space,
              verbose=1,win=1,soclass=StackedObservations,metarl=False)

In [None]:
# training=True makes GenWorld.run start in its training phase; debug and
# use_memory are read by the imported agent classes (semantics not visible here).
agent.debug=False
agent.use_memory=True
agent.training=True

In [None]:
# Clear the agent's reward bookkeeping before training.
agent.rewL=[]
agent.tot_rew=0
agent.ep=[]

In [None]:
# Presumably launches the learner in the background so the world.run call below
# can feed it experience — behaviour is defined in aiagentbase.RLAgent; TODO confirm.
if agent.training: agent.start(training_steps=training_steps)

In [None]:
# Fresh world (empty test_episodes, world_over=False) for the training run.
world=GenWorld(env=env)

In [None]:
# worldthread=Thread(name='world',target=world.run,args=(agent,2000,200))

In [None]:
# worldthread.start()

In [None]:
# Run in the foreground (blocks this cell) for up to 2000 episodes of at most
# 200 steps. The run ends early shortly after agent.training is seen to be False.
world.run(agent,n_episodes=2000,episode_maxlen=200)

In [None]:
import pandas as pd
# Load the episode log; lines starting with '#' are skipped as comments.
# NOTE(review): the hardcoded /tmp path is presumably where RLAgent's monitor
# (monclass=Mon) writes its CSV — confirm, and it is not portable to Windows.
df=pd.read_csv('/tmp/aiagents.monitor.csv',comment='#')

In [None]:
import plotly.express as px
# Plot the 100-row rolling mean of column 'r' from the log to show the learning trend.
px.line(df['r'].rolling(window=100).mean().values).show()

In [None]:
# len(agent.logL)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# testing_len=len([agent.rewL[t] for t in world.test_episodes])

In [None]:
# testing_len

In [None]:
# agent.rewL

In [None]:
# Mean step-to-step change of the recorded running reward totals in agent.rewL.
print(np.gradient(agent.rewL).mean())

In [None]:
# plt.plot(np.gradient(agent.rewL))

In [None]:
# Evaluate the agent with training switched off: play `episodes` episodes and
# collect each episode's summed reward in rewL.
episodes = 50
rewL = []
agent.training = False
for episode in range(1, episodes + 1):
    state = env.reset()
    done, score, steps = False, 0, 0
    # Play until the env reports done; `steps` runs 0..1000, so the cap is 1001 steps.
    while steps <= 1000 and not done:
        action = agent.act(state)
        state, reward, done, info = env.step(action)
        score += reward
        steps += 1
    rewL.append(score)
env.close()

In [None]:
# from matplotlib import pyplot as plt
# import numpy as np

In [None]:
# Mean summed reward per evaluation episode.
print(np.array(rewL).mean())

In [None]:
# Summed reward of each evaluation episode, in play order.
plt.plot(rewL)

In [None]:
# PPO??