In [None]:
import torch
import numpy as np
from dynamics import *
import matplotlib.pyplot as plt
from locomotion.envs.gym_envs import A1GymEnv

In [None]:
# Create the robot environment (rendering enabled, robot not held on a rack).
env = A1GymEnv(action_limit=(0.75, 0.75, 0.75), render=True, on_rack=False, lateralFriction=0.7)

# Device on which the dynamics model is evaluated.
device = "cpu"

# Default planner settings.
# NOTE(review): the rollout cell further down passes its own literal values to
# MPC, so these two names look unused there -- confirm which is intended.
num_samples = 1000
horizon = 5

# Init the NN dynamics model.
# MPC feeds the network the state concatenated with a 12-entry action and uses
# the output as the next state, so n_in is presumably n_out + 12 (19 + 12 = 31).
dynamics = Dynamics(
    n_in=31,
    n_hidden=500,
    n_out=19,
    depth=3
)

# map_location keeps the load working when the checkpoint was saved from CUDA
# tensors but this notebook runs on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
dynamics.load_state_dict(torch.load("./logs/dynamics.pt", map_location=device))

In [None]:
def get_rand_at(env):
    """Sample a random action whose trailing half is forced to zero.

    Args:
        env: Object exposing ``action_space.sample()``.

    Returns:
        A 1-D NumPy array made of the first six entries of the sampled action
        followed by six zeros.
    """
    sampled = env.action_space.sample()
    # Integer zeros, so the result dtype follows the same promotion rules as
    # concatenating with a plain Python list of ints.
    zero_tail = np.zeros(6, dtype=int)
    return np.concatenate((sampled[:6], zero_tail))


def MPC(dynamics, s0, env, horizon, num_samples, device):
    """Pick an action by random-shooting through the learned dynamics model.

    ``num_samples`` random action sequences of length ``horizon`` are rolled
    out from ``s0`` using ``dynamics``. Each sequence is scored by the summed
    negative change of state component 0 over the rollout, and the *first*
    action of the highest-scoring sequence is returned.

    Args:
        dynamics: Callable model mapping a ``(num_samples, state + action)``
            float tensor to a ``(num_samples, state)`` tensor of next states.
        s0: Current state (1-D array-like).
        env: Environment whose ``action_space`` is used to sample actions.
        horizon: Number of model steps to roll out.
        num_samples: Number of candidate action sequences.
        device: Torch device the model input is moved to.

    Returns:
        The first action (1-D NumPy array) of the best candidate sequence.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    # One copy of the start state per candidate sequence.
    st = np.tile(np.asarray(s0), (num_samples, 1))
    # First action of every candidate; this is what is eventually returned.
    a0 = np.array([get_rand_at(env) for _ in range(num_samples)])
    # Keep the score accumulator in NumPy: the per-step reward below is a NumPy
    # array and the result goes through np.argmax, so no torch tensor is needed.
    rt = np.zeros(num_samples)

    # Planning only needs forward passes, so skip autograd bookkeeping.
    with torch.no_grad():
        for t in range(horizon):
            # Step 0 uses the recorded first actions; later steps resample.
            if t != 0:
                at = np.array([get_rand_at(env) for _ in range(num_samples)])
            else:
                at = a0

            X = np.concatenate((st, at), axis=1)
            X = torch.from_numpy(X).to(device).float()

            st_1 = dynamics(X).cpu().numpy()
            # Reward is the decrease of state component 0 over this step.
            rt += -(st_1[:, 0] - st[:, 0])
            st = st_1

    idx = np.argmax(rt)
    return a0[idx]

In [None]:
# Switch the model to evaluation mode and place it on the planning device.
dynamics.eval()
dynamics = dynamics.to(device)

# rewards[k] holds the environment reward accumulated over the first k steps.
rewards = [0]

st = env.reset()
# NOTE(review): tqdm is not imported in this notebook's import cell; presumably
# it is re-exported by `from dynamics import *` -- confirm.
for _ in tqdm(range(1000)):
    # Plan with a 1-step horizon over 5000 random candidate actions.
    at = MPC(
        dynamics,
        st,
        env,
        horizon=1,
        num_samples=5000,
        device=device
    )
    st_1, reward, done, info = env.step(at)
    rewards.append(rewards[-1] + reward)

    # Stop once the episode has terminated: stepping a finished episode without
    # a reset is outside the Gym step/reset contract.
    if done:
        break

    st = st_1

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

# Flip the sign of the accumulated environment reward before plotting.
# NOTE(review): presumably because MPC rewards a *decrease* of state component 0,
# i.e. the opposite of the environment's reward direction -- confirm.
rt = np.array(rewards) * -1.0

fig, ax = plt.subplots()
ax.plot(rt)
# One entry is appended to `rewards` per env.step call, so the x-axis counts
# control steps, not episodes.
ax.set_xlabel("Step")
ax.set_ylabel("Cumulative Reward")
plt.show()