In [1]:
import json
import websockets, asyncio
import threading

class WaitableQueue(asyncio.Queue):
    """FIFO queue with a blocking, timeout-aware ``get`` for plain threads.

    The ``asyncio.Queue`` base class is used only as item storage
    (``put_nowait`` / ``get_nowait`` / ``empty``).  Blocking is implemented
    with a ``threading.Event`` that is set while at least one item is queued.

    ``put`` and ``get`` are ordinary synchronous methods here; they replace
    the coroutine versions defined by ``asyncio.Queue``.
    """

    def __init__(self):
        super().__init__()
        # Set while the queue holds at least one item.
        self.event = threading.Event()
        # Makes "change the queue + update the event" a single atomic step.
        # Without it, a put() landing between the consumer's empty() check and
        # its event.clear() leaves an item queued with the event cleared, so
        # the next get() would time out even though data is available.
        self._lock = threading.Lock()

    def put(self, item):
        """Append ``item`` and wake up a thread blocked in ``get``."""
        with self._lock:
            super().put_nowait(item)
            self.event.set()

    def get(self,timeout=3):
        """Remove and return the oldest item, waiting for one if necessary.

        Args:
            timeout: Maximum number of seconds to wait for an item; ``None``
                waits indefinitely.

        Returns:
            The oldest queued item.

        Raises:
            TimeoutError: If no item became available within ``timeout``.
        """
        if not self.event.wait(timeout):
            raise TimeoutError("Environement is not responding.")
        with self._lock:
            res = super().get_nowait()
            # Clear the flag only when the queue was actually drained.
            if super().empty():
                self.event.clear()
        return res

# the server
class Server:
    """WebSocket server bridging a remote client and synchronous Python code.

    Incoming frames are decoded and stored in ``inQueue``.  Outgoing commands
    are queued on ``outQueue`` by ``send`` and written to the socket by a
    background sender thread.
    """

    def __init__(self):
        self.inQueue = WaitableQueue()
        self.outQueue = WaitableQueue()
        self.debug = True
        # Most recently connected websocket; None until a client connects.
        self.ws = None
        # Event loop that owns the websocket; set when main() starts running.
        self.loop = None

    def start(self):
        """Start the sender thread, then run the server (blocks forever)."""
        threading.Thread(target=self.message_sender_loop).start()
        asyncio.run(self.main())

    async def main(self):
        """Serve on localhost:8765 until the surrounding task is cancelled."""
        # Remember the loop so the sender thread can schedule sends on it.
        self.loop = asyncio.get_running_loop()
        try:
            async with websockets.serve(self.echo, "localhost", 8765):
                await asyncio.Future()  # run forever
        except websockets.exceptions.ConnectionClosedError as e: print(e)

    async def echo(self,websocket):
        """Connection handler: store every received frame in ``inQueue``.

        Frames that parse as JSON are stored decoded; anything else is stored
        as the raw message.
        """
        self.ws = websocket
        print('connect')
        #asyncio.create_task(self.message_sender_loop())
        async for message in websocket:
            try:
                self.recv(json.loads(message))
            except json.decoder.JSONDecodeError:
                self.recv(message)

    def recv(self,message):
        """Queue an incoming message and echo it to stdout when ``debug`` is set."""
        self.inQueue.put(message)
        
        if self.debug:
            print("recv: ",message)
    
    def send(self,command:str, content):
        """Queue ``{'command': command, 'content': content}`` for sending."""
        self.outQueue.put({'command':command,'content':content})

    def message_sender_loop(self):
        """Forever take messages from ``outQueue`` and send them as JSON text.

        Runs in its own thread.  The websocket belongs to the server's event
        loop, so the send coroutine is scheduled on that loop with
        ``run_coroutine_threadsafe`` instead of being driven by a second,
        unrelated loop via ``asyncio.run``.  Errors (including sending before
        any client has connected) are printed and the message is dropped.
        """
        while True:
            try:
                message = self.outQueue.get(None)
                payload = json.dumps(message, indent=4)
                pending = asyncio.run_coroutine_threadsafe(
                    self.ws.send(payload), self.loop)
                # Block until the frame is written so send errors surface here.
                pending.result()
            except websockets.exceptions.ConnectionClosedError:
                print("Connection closed")
            except Exception as e:
                print(e)

    def update(self,handler):
        """Dispatch every queued incoming message to ``handler``.

        For each message, calls ``getattr(handler, message["command"])`` with
        ``message["content"]``.

        Args:
            handler: Object exposing one method per expected command name.
        """
        # empty is a method: it must be called, otherwise the bound method is
        # always truthy and the loop body never runs.
        while not self.inQueue.empty():
            message = self.inQueue.get()
            getattr(handler, message["command"])(message["content"])

                
# start the server in a separate thread to avoid blocking
# (Server.start ends in asyncio.run(...) on a coroutine that waits forever,
# so calling it directly would never return to the notebook).
import threading  # already imported at the top of this cell
server = Server()
# The thread is created without daemon=True.
t=threading.Thread(target=server.start)
t.start()

# the interface to the server
class WSManager:
    """Interface object that holds a reference to a ``Server``."""

    def __init__(self,server:Server):
        """Remember ``server`` and start with debug output switched off."""
        self.server = server
        self.debug = False

#server.send("action",{"voltage":[1,0,0,0,100,200,100,100]})

In [2]:
import numpy as np
def flatten(list_of_lists):
    """Flatten an arbitrarily nested sequence into a single flat list.

    Any element exposing ``__iter__`` is expanded in place, depth first, so
    the left-to-right order of the leaves is preserved.  Strings are kept as
    single leaves, because expanding them never terminates (every character
    is itself an iterable string).

    The traversal is iterative, so long inputs do not hit Python's recursion
    limit and no intermediate slices are copied.

    Args:
        list_of_lists: Iterable whose elements may themselves be iterables.

    Returns:
        A new list containing all leaf elements in order.
    """
    flat = []
    # Stack of iterators: the top one is the container being expanded.
    pending = [iter(list_of_lists)]
    while pending:
        for item in pending[-1]:
            if hasattr(item, '__iter__') and not isinstance(item, str):
                # Descend into the nested container; the for loop resumes the
                # parent iterator where it left off once the child is done.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Top iterator exhausted: go back to its parent.
            pending.pop()
    return flat
def decomposeCosSin(angle):
    """Return ``[cos(angle), sin(angle)]`` computed with NumPy."""
    cosine = np.cos(angle)
    sine = np.sin(angle)
    return [cosine, sine]


In [9]:

from torch import nn
import gym
import numpy as np
import time
class Environment(gym.Env):
    """Gym environment driven by an external process over a websocket ``Server``.

    Actions are sent as ``"action"`` commands and observations are built from
    the ``"state"`` messages received in return.  The per-step reward is the
    decrease in Euclidean distance to the target during that step, plus a
    bonus of 10 when the target is reached.
    """

    def __init__(self,ws_server : Server,device = 'cpu'):
        """Store the server handle and declare observation/action spaces.

        Args:
            ws_server: Server used to exchange commands and state messages.
            device: Stored as ``self.device``; not otherwise used in this class.
        """
        self.ws = ws_server
        self.t = 0
        self.t_episode = 0
        self.device = device
        self.prevState = None
        self.prevAction = None
        self.pos = None
        self.targetPos = None
        self.noiseIntensity = 0.5
        # Target position relative to the start position; applied in reset().
        self.targetRelPos = np.array([0.,-3.])

        # Implement gym.Env
        self.observation_space = gym.spaces.Box(-np.inf,np.inf,shape=(19,),dtype=float)
        self.action_space = gym.spaces.Box(-1,1,shape=(8,),dtype=float)

    def processFeature(self,state:dict):
        """Turn a state message into a flat feature list.

        The base-link position is expressed relative to the target, and both
        orientations are encoded as (cos, sin) pairs.

        Args:
            state: Content of a ``"state"`` message.  Reads the keys
                ``baseLinkPos``, ``baseLinkOrientation``, ``baseLinkVelocity``,
                ``baseLinkAngularVelocity``, ``wheelBaseOrientation`` and
                ``wheelSpeed``.

        Returns:
            Flat list of feature values.
        """
        feature = []
        feature.append(state['baseLinkPos']['x']-self.targetPos[0].item())
        feature.append(state['baseLinkPos']['y']-self.targetPos[1].item())
        feature.append(decomposeCosSin(state['baseLinkOrientation']))
        feature.append(state['baseLinkVelocity']['x'])
        feature.append(state['baseLinkVelocity']['y'])
        feature.append(state['baseLinkAngularVelocity'])
        feature.append(decomposeCosSin(state['wheelBaseOrientation']))
        feature.append(state['wheelSpeed'])
        feature = flatten(feature)
        return feature

    def getObservation(self):
        """Return the feature list for the most recently received state."""
        return self.processFeature(self.state)

    def calculateReward(self,pos,targetPos):
        """Return the negative Euclidean distance between ``pos`` and ``targetPos``."""
        return -np.linalg.norm(pos-targetPos,2)

    def terminateCondition(self,pos,targetPos):
        """Decide whether the episode ends.

        Returns:
            ``(done, goal)``: ``goal`` is true when the distance to the target
            is below 0.5; ``done`` is true when the goal is reached, the
            distance exceeds 10, or more than 100 steps were taken.
        """
        d = np.linalg.norm(pos-targetPos,2)
        return d<0.5 or d>10 or self.t_episode>100, d<0.5

    def getPos(self,state):
        """Return the base-link ``(x, y)`` position of ``state`` as a float array."""
        return np.array([state['baseLinkPos']['x'],state['baseLinkPos']['y']],dtype=float)

    # Implement gym.Env
    
    def reset(self):
        """Start a new episode and return its first observation.

        Sends the target position, resets the remote position to the origin,
        requests a state message and waits for it.

        Raises:
            TimeoutError: If no message arrives within the queue's timeout.
            AssertionError: If the received message is not a ``"state"`` message.
        """
        self.pos = np.array([0.,0.])
        self.targetPos = self.pos + self.targetRelPos
        self.t_episode = 0
        self.prevState = None
        # The second target coordinate is sent under the 'z' key.
        self.ws.send("target",{"pos":{'x':self.targetPos[0].item(),'y':0, 'z':self.targetPos[1].item()}})
        self.ws.send("pos",{'x':0,'y':0, 'z':0})
        self.ws.send("require state",None)

        # return the initial observation
        message = self.ws.inQueue.get()
        assert message["command"]=="state"
        self.state = message["content"]
        return self.getObservation()
        
    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        ``reset`` must have been called first, since the previous state is
        needed to compute the reward.

        Args:
            action: Eight values; the first four are scaled by 50 and the last
                four by 2000 before being sent as ``"voltage"``.  The caller's
                array is left untouched.

        Raises:
            TimeoutError: If no message arrives within the queue's timeout.
            AssertionError: If the newest message is not a ``"state"`` message.
        """
        self.t_episode +=1
        # Send an action then wait for the env to run one step
        # Scale a copy: in-place scaling would silently modify the caller's array.
        scaled = np.array(action)
        scaled[0:4]*=50
        scaled[4:8]*=2000
        self.ws.send("action", {"voltage":scaled.tolist()})

        self.prevPos = self.getPos(self.state)

        # Get state and calculate stuffs in the step
        
        message = self.ws.inQueue.get()
        # Drain the backlog so only the newest message is used.  empty is a
        # method and must be called; the bare attribute is always truthy.
        while not self.ws.inQueue.empty():
            message = self.ws.inQueue.get()
        assert message["command"]=="state"
        self.state = message["content"]
        #self.targetPos = np.array([self.state["targetPos"]["x"],self.state["targetPos"]["z"]])
        observation = self.getObservation()
        self.pos = self.getPos(self.state)
        # Progress reward: how much closer to the target this step brought us.
        reward = self.calculateReward(self.pos,self.targetPos) - self.calculateReward(self.prevPos,self.targetPos)

        done, goal = self.terminateCondition(self.pos,self.targetPos)

        if goal:
            reward += 10

        info = {}

        return observation, reward, done, info


    

#env = Environment(server,device)
from torch.nn import functional as F
def soft_update_target(target:nn.Module, source:nn.Module,tau):
    """Blend ``source`` parameters into ``target`` in place.

    Each target parameter becomes ``(1 - tau) * target + tau * source``.
    Parameters are paired in the order returned by ``parameters()``.
    """
    keep = 1. - tau
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = keep * dst.data + tau * src.data
        dst.data.copy_(blended)

In [10]:
# Build the environment on top of the websocket server started above.
# The second argument is only stored as env.device by Environment.__init__.
env = Environment(server,'cuda')

In [11]:
# Per-run settings applied after the environment has been constructed.
# Target offset that reset() adds to the start position.
env.targetRelPos = np.array([-2,-2])
# NOTE(review): noiseIntensity is not read by any code shown in this notebook.
env.noiseIntensity = 0.1
# Silence the per-message "recv:" prints in Server.recv.
server.debug = False
# Discount factor with a 50-step half-life (gamma**50 == 0.5).
# NOTE(review): gamma is not passed to PPO(...) in the cells shown here.
gamma = 0.5**(1/50)

In [12]:
from stable_baselines3 import PPO
import datetime
# PPO with an MLP policy; no hyperparameters are overridden here.
# TensorBoard logs go to a per-run directory runs/<MM_DD_YYYY>/<HH_MM_SS>.
model = PPO("MlpPolicy", env, verbose=1,device = "cuda",tensorboard_log="runs/"+datetime.datetime.now().strftime("%m_%d_%Y/%H_%M_%S"))

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [14]:
# Train for 100k environment steps; every step is a round trip through the
# websocket server, so this cell needs a connected client.
model.learn(total_timesteps=100_000)

Logging to runs/06_24_2022/21_30_21\PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 87.8     |
|    ep_rew_mean     | 0.707    |
| time/              |          |
|    fps             | 104      |
|    iterations      | 1        |
|    time_elapsed    | 19       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 88.2        |
|    ep_rew_mean          | 0.816       |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 2           |
|    time_elapsed         | 42          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011981447 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.2         |
|    entropy_loss         | -11.3       |
|    explained_variance   | 0.

In [122]:
import stable_baselines3
# Evaluate the trained policy over 10 episodes.
# NOTE(review): attribute access on the package only works if the
# common.evaluation submodule has already been imported — presumably a side
# effect of importing PPO; an explicit
# `from stable_baselines3.common.evaluation import evaluate_policy` would not
# depend on that. Confirm.
stable_baselines3.common.evaluation.evaluate_policy(model, env, n_eval_episodes=10)

(11.281183837861136, 3.386120642213348)

In [114]:
# NOTE(review): this cell has execution count 114, i.e. it ran before the
# evaluation cell above (122) although it appears after it.
# NOTE(review): assigning the attribute directly skips the Monitor/DummyVecEnv
# wrapping reported when the model was created — confirm this is intended.
model.env=env

In [19]:
# Rebinds `env`: from here on it is the CartPole environment, no longer the
# websocket-backed Environment created earlier.
env = gym.make("CartPole-v1")


In [27]:
# Scratch experiment: this call raises the AssertionError shown below; the
# action space printed in a later cell is Discrete(2).
env.step((1,2))

AssertionError: (1, 2) (<class 'tuple'>) invalid

In [25]:
# Inspect which actions the CartPole environment accepts.
env.action_space

Discrete(2)

In [28]:
# Second, untrained PPO model bound to the CartPole environment.
model1 = PPO("MlpPolicy", env, verbose=1)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [34]:
# Take one step with action 0.
# NOTE(review): no env.reset() call is visible in the cells shown here.
env.step(0)

(array([-0.03819583, -0.18100157,  0.01109779,  0.3364094 ], dtype=float32),
 1.0,
 False,
 {})

In [32]:
# NOTE(review): `obs` is not assigned in any cell shown here, so this line
# depends on leftover kernel state and fails on a fresh kernel.
action, _states = model1.predict(obs, deterministic=True)

    action