In [None]:
import gym
from gym import spaces
import random
from stable_baselines3 import PPO
import numpy as np
import torch

In [None]:
import import_ipynb
import utils
import models

### World and Agents for Supervised Learning Tasks

In [None]:
import import_ipynb
from aiagentbase import AIAgent,Controller,Memory,Perception,Actor

In [None]:
class SLWorld():
    """Supervised-learning task presented to an agent as a reward-driven world.

    Each ``(sample, label)`` pair is handed to the agent via ``agent.act``; the
    returned action is compared with the label and the agent is rewarded with
    1 on a match and 0 otherwise.
    """
    def __init__(self,train_ds,test_ds,n_classes):
        """Store the datasets and expose one discrete action per class.

        Args:
            train_ds: sized iterable of ``(sample, label)`` pairs used for training.
            test_ds: sized iterable of ``(sample, label)`` pairs used for evaluation.
            n_classes: size of the discrete action space.
        """
        self.train_ds=train_ds
        self.test_ds=test_ds
        self.action_space=spaces.Discrete(n_classes)

    def run(self,agent=None,n_episodes=10):
        """Train ``agent`` for ``n_episodes`` passes over ``train_ds``, then evaluate on ``test_ds``.

        The agent must provide ``set_training``, ``begin``, ``act`` and ``reward``.

        Args:
            agent: the agent to train and evaluate (required despite the default).
            n_episodes: number of passes over the training set.

        Returns:
            Mean test reward (accuracy); 0.0 when ``test_ds`` is empty.

        Raises:
            ValueError: if no agent is supplied.
        """
        if agent is None:
            raise ValueError('SLWorld.run requires an agent')
        self.test_rew=0
        self.test_rewL=[]
        agent.set_training(True)
        # Fallback for agents whose set_training() does not create a `training` attribute.
        if 'training' not in agent.__dict__: agent.training=False
        # Use this world's own dataset, not a notebook global that happens to share its name.
        n_train=len(self.train_ds)
        for episode in range(n_episodes):
            tot_rew=0
            agent.begin()
            for sample,label in self.train_ds:
                action=agent.act(sample)
                # NOTE(review): the second element is a set here but the string 'default'
                # in the test loop below -- confirm which one the agent's reward() expects.
                reward=(self.accuracy(action,label),{'default'},{'label':label})
                agent.reward(reward)
                tot_rew+=reward[0]
            print('episode: ',episode,'avg reward: ',tot_rew/n_train if n_train else 0.0)
        agent.set_training(False)
        print('Training Over')
        agent.begin()
        for sample,label in self.test_ds:
            action=agent.act(sample)
            # No label is passed along during evaluation.
            reward=(self.accuracy(action,label),'default',{})
            agent.reward(reward)
            self.test_rewL+=[reward]
            self.test_rew+=reward[0]
        n_test=len(self.test_ds)
        test_acc=self.test_rew/n_test if n_test else 0.0
        print('Test Over; Accuracy: ',test_acc)
        return test_acc

    def accuracy(self,action,label):
        """Return 1 when ``action`` equals ``label``, otherwise 0."""
        if action==label: return 1
        else: return 0

In [None]:
class MLPAgent(AIAgent):
    """Agent whose actor wraps a classifier network that is trained online.

    While ``training`` is set, every reward carrying a ``'label'`` triggers a
    one-sample, one-epoch ``models.Train`` update on the previously seen state.
    """
    def __init__(self,action_space,net):
        """
        Args:
            action_space: action space of the world the agent acts in.
            net: network used by the actor; called as ``net(state)``.
        """
        super().__init__()
        ##Augmenting AIAgent
        self.actor=self.Actor(parent=self,model=net)
        self.action_space=action_space
        self.tot_rew=0
        self.rewL=[]
        # reward() reads self.training; give it a default unless the base class already set one.
        if not hasattr(self,'training'): self.training=False

    class Actor(Actor):
        """Actor that returns the arg-max over dimension 1 of the model output."""
        def __init__(self,parent,model):
            super().__init__(parent=parent,model=model)
        def call_model(self,state):
            ##Overriding AIAgent
            lpreds=self.model(state)
            return torch.argmax(lpreds,dim=1)

    def set_training(self,value):
        """Switch online training in ``reward`` on or off."""
        self.training=value
    def avg_rew(self):
        """Mean of the entries logged in ``rewL``; 0.0 when nothing has been logged yet."""
        if not self.rewL: return 0.0
        return sum(self.rewL)/len(self.rewL)
    def reward(self,rew):
        """Optionally train on the previous state, accumulate the reward, then defer to the base class.

        Args:
            rew: tuple whose element 0 is the numeric reward and whose element 2
                is a dict holding ``'label'`` while training.
        """
        ##Augmenting AIAgent
        if self.training:
            prev_state=self.memory.sar_memory[self.time-1]['state']
            # Keep the network Train hands back, so the update is not lost if Train returns a new object.
            self.actor.model,_,_=models.Train(self.actor.model,[(prev_state,rew[2]['label'])],epochs=1)
        self.tot_rew+=rew[0]
        return super().reward(rew)
    def begin(self):
        """Log the running reward total, then start a new episode in the base class."""
        ##Augmenting AIAgent
        # tot_rew is never reset here, so rewL holds cumulative totals.
        self.rewL+=[self.tot_rew]
        super().begin()

In [None]:
# Build the datasets: 10000 samples, 20 features, 10 classes, batch size 32.
# The third return value (dloader) is not used by the cells visible below.
train_ds, test_ds, dloader = utils.euclideanDataset(
    n_samples=10000,
    n_features=20,
    n_classes=10,
    batch_size=32,
)

In [None]:
# Give every training sample and label a leading dimension of size 1.
# NOTE: this rebinds train_ds from itself, so re-running the cell adds another dimension.
train_ds=[
    (sample.unsqueeze(0),label.unsqueeze(0))
    for sample,label in train_ds
]

In [None]:
# Give every test sample and label a leading dimension of size 1.
# NOTE: this rebinds test_ds from itself, so re-running the cell adds another dimension.
test_ds=[
    (sample.unsqueeze(0),label.unsqueeze(0))
    for sample,label in test_ds
]

In [None]:
# dims presumably reads input -> hidden -> output (20 features, 32 hidden units, 10 classes);
# TODO confirm against models.MLP.
net = models.MLP(dims=[20, 32, 10])

In [None]:
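# Optional: pre-train the network on train_ds directly (disabled; the agent trains it online instead).
# net,_,_=models.Train(net,train_ds,epochs=5,verbose=True)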

In [None]:
# World that serves the prepared datasets; n_classes matches the dataset's 10 classes.
slworld = SLWorld(train_ds=train_ds, test_ds=test_ds, n_classes=10)

In [None]:
# The agent acts in the world's action space and uses `net` as its actor model.
agent = MLPAgent(action_space=slworld.action_space, net=net)

In [None]:
# Five training passes over train_ds, then evaluation on test_ds; the cell shows the returned accuracy.
slworld.run(
    agent=agent,
    n_episodes=5,
)

### Training an AI Agent's Model using an off-the-shelf RL procedure
Create a local environment that is driven by a Monitor thread running within the Agent; the monitor implements or
re-uses an on-policy RL training procedure (such as PPO). The env has an input and an output queue. If a training flag is set, the Agent uses a ProxyModel in place of its normal model to compute actions: the actor state is placed in the env's input queue and an action is awaited from the env's output queue.

The monitor starts by calling the env.reset method, which waits on the input queue to receive
a state and then returns it. The monitor thread computes an action on the current state and calls
env.step(action), which places the action in the output queue and awaits a reward from the input
queue. After receiving a reward, step waits again for the next state on the input queue.
Once this is also received, both the next state and the reward are returned.

In [None]:
import threading
from queue import Empty, Queue
from threading import Thread

In [None]:
class PPOAgent(RandomAIAgent):
    """Agent whose policy is a stable-baselines3 PPO model trained on a background thread.

    PPO expects to drive a gym environment, whereas here the world drives the
    agent. ``TrainingEnv`` bridges the two with queues: while ``training`` is
    set, ``begin``/``call_model``/``reward`` feed states and rewards into the
    env and take actions out of it; once training ends, actions come from
    ``model.predict``.

    NOTE(review): ``RandomAIAgent`` is not defined in the visible cells; it must
    already be in the kernel namespace.
    """
    def __init__(self,action_space,observation_space,training_steps=20000):
        """
        Args:
            action_space: forwarded to the base class and, when it is a gym space,
                used as the training env's action space.
            observation_space: observation space assigned to the training env.
            training_steps: ``total_timesteps`` passed to ``PPO.learn``.
        """
        ##Augmenting AIAgent
        super().__init__(action_space)
        # Bookkeeping first: the monitor thread calls log(), so logL must exist before it starts.
        self.tot_rew=0
        self.logL=[]
        self.env=self.TrainingEnv(parent=self)
        self.env.observation_space=observation_space
        # Mirror the caller's action space rather than relying on the env's Discrete(2) default.
        if isinstance(action_space,spaces.Space): self.env.action_space=action_space
        self.model=PPO('MlpPolicy', self.env, verbose=0)
        self.monitor=self.Monitor(parent=self)
        self.set_training(True)
        # For debugging, target=self.monitor.run steps the env with random actions instead.
        # daemon=True: a learner left blocked on an empty queue must not keep the process alive.
        self.monitorthread=Thread(target=self.monitor.train,args=(training_steps,),daemon=True)
        self.monitorthread.start()

    def log(self,entry):
        """Append ``entry`` to ``logL``."""
        self.logL+=[entry]

    def set_training(self,value):
        """Choose between the queue bridge (True) and ``model.predict`` (False)."""
        self.training=value

    class TrainingEnv(gym.Env):
        """Gym environment whose states and rewards are supplied by the agent through queues."""
        def __init__(self,parent):
            self.parent=parent
            self.action_space=spaces.Discrete(2) #default; PPOAgent.__init__ may replace it
            self.inputS=Queue() #states: written by the agent's begin/call_model, read by reset and step
            self.outputS=Queue() #only referenced by print_queues in this class
            self.rewardI=Queue() #written by the agent's reward, read by step
            self.actionO=Queue() #written by step, read by the agent's call_model
            self.counter=0
        def reset(self):
            """Block until the agent supplies a state, then return it."""
            state=self.inputS.get()
            return state
        def step(self,action):
            """Hand ``action`` to the agent, then block for its reward and the next state."""
            self.actionO.put(action)
            reward,done,info=self.rewardI.get()
            next_state=self.inputS.get()
            self.counter+=1
            return next_state,reward,done,info
        def print_queues(self):
            """Print the current contents of all four queues."""
            print('inputS',self.inputS.queue)
            print('actionO',self.actionO.queue)
            print('rewardI',self.rewardI.queue)
            print('outputS',self.outputS.queue)

    class Monitor():
        """Thread target that drives the training env."""
        def __init__(self,parent):
            self.parent=parent
        def run(self):
            """Debug driver: step the env 600 times with random actions and print each transition."""
            env=self.parent.env
            env.reset()
            for _ in range(600):
                action=env.action_space.sample()
                next_state, reward, done, info = env.step(action)
                print(next_state, reward, done, info, action)
            # Joining the current thread raises RuntimeError, so only join a different thread.
            worker=getattr(self.parent,'monitorthread',None)
            if worker is not None and worker is not threading.current_thread():
                worker.join()
        def train(self,training_steps):
            """Run PPO for ``training_steps`` timesteps, then switch the agent to prediction mode."""
            self.parent.model.learn(total_timesteps=training_steps)
            print('Training Over')
            self.parent.set_training(False)
            self.parent.log((self.parent.training,self.parent))

    def call_model(self,state):
        """Return an action for ``state``: from the learner while training, else from the model."""
        ##Overriding AIAgent
        if self.training:
            self.env.inputS.put(state)
            # Only a timeout falls back to action 0; other errors and KeyboardInterrupt propagate.
            try: action = self.env.actionO.get(timeout=5)
            except Empty: action=0
        else: action, _states = self.model.predict(state)
        return action
    def reward(self,reward):
        """Let the base class process the reward and forward its result to the learner while training."""
        ##Augmenting AIAgent
        reward=super().reward(reward)
        if self.training: self.env.rewardI.put(reward)
        # Returned for consistency with MLPAgent.reward.
        return reward
    def begin(self,state):
        """Feed the episode's first state to the learner while training, then start the episode."""
        ##Augmenting AIAgent
        if self.training: self.env.inputS.put(state)
        super().begin(state)

In [None]:
# NOTE(review): `env` is not defined in the visible cells; it must already exist in the kernel.
# Constructing the agent starts PPO training on a background thread.
agent = PPOAgent(
    env.action_space,
    env.observation_space,
    training_steps=300_000,
)

In [None]:
# Turn off both the debug flag and the memory flag on the agent.
agent.debug = agent.use_memory = False

In [None]:
# Start reward bookkeeping from scratch: empty history, zero running total.
agent.rewL, agent.tot_rew = [], 0

In [None]:
# NOTE(review): CartWorld and `env` are not defined in the visible cells; both must already be in the kernel.
world = CartWorld(env=env)

In [None]:
# Run 2000 episodes with episode_maxlen=200.
world.run(
    agent,
    n_episodes=2000,
    episode_maxlen=200,
)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Mean of the numerical gradient of the logged reward history.
np.mean(np.gradient(agent.rewL))

In [None]:
# Plot the numerical gradient of the logged reward history against its index.
reward_gradient = np.gradient(agent.rewL)
plt.plot(reward_gradient)

In [None]:
# List the names of all live threads, one per line (shows whether the PPO worker is still running).
print(*(live.name for live in threading.enumerate()), sep='\n')