# Reflex RL - Modulating reflex module gains with RL

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import matplotlib.pyplot as plt

import ReflexInterface_RL
import numpy as np

import gymnasium
import argparse
from datetime import datetime

from gymnasium.envs.registration import register

from stable_baselines3 import PPO
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import CheckpointCallback

from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.env_checker import check_env
#from stable_baselines3.common.utils import get_device

import os
import skvideo.io
import copy

from base64 import b64encode
from IPython.display import HTML

def show_video(video_path, video_width = 500):
    """Embed a video file in the notebook as an inline, base64-encoded HTML5 player.

    Args:
        video_path: Path of the video file to embed (served with an MP4 MIME type).
        video_width: Value written to the ``width`` attribute of the ``<video>`` tag.

    Returns:
        An ``IPython.display.HTML`` object containing the ``<video>`` element.
    """
    # Read-only binary access is all that is needed; the context manager
    # guarantees the file handle is released even if reading fails.
    with open(video_path, "rb") as video_fh:
        video_bytes = video_fh.read()
    video_url = f"data:video/mp4;base64,{b64encode(video_bytes).decode()}"
    return HTML(f"""<video autoplay width={video_width} controls><source src="{video_url}"></video>""")

# Make the reflex environment available through gymnasium.make() under the id
# "MyoReflex_RL-v0", with max_episode_steps set to 2000.
register(id="MyoReflex_RL-v0", entry_point="ReflexInterface_RL:ReflexEnv", max_episode_steps=2000)

In [None]:
# Smoke test: build the environment with placeholder gains (all ones) and run
# the Stable-Baselines3 environment checker on it.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# To run the check with tuned gains instead, load them from file:
# param_filename = 'myorfl_Kine_2D_1_25_2024Nov25_1718_22mus_DelayKine_Best'
# params_0 = np.loadtxt(f"reflex_RL/reflex_param/{param_filename}.txt")
params_0 = np.ones(73)

test = gymnasium.make(
    'MyoReflex_RL-v0',
    init_pose='walk',
    dt=0.01,
    mode='2D',
    tgt_field_ver=0,
    reflex_params=params_0,
    delta_control_mode='sym',
)
test.reset()
check_env(test)

In [None]:
# Training environment: reflex gains are read from the tuned parameter file
# located relative to the current working directory.
params_0 = np.loadtxt(
    os.path.join(
        os.getcwd(), '..', 'reflex_RL', 'reflex_param',
        'myorfl_Kine_2D_1_25_2024Nov25_1718_22mus_DelayKine_Best.txt',
    )
)  # Load your params

train_env = gymnasium.make(
    'MyoReflex_RL-v0',
    render_mode=None,
    reflex_params=params_0,
    init_pose='walk',
    dt=0.01,
    mode='2D',
    tgt_field_ver=0,
    episode_limit=20,
    reward_wt=None,
    obs_param=None,
    rew_type=0,
    stim_mode='reflex',
    tgt_vel_mode='eval',
    sine_vel_args=None,
    delta_mode='delayed',
    delta_control_mode='sym',
)
train_env.reset()

In [None]:
# PPO agent with a two-layer (128, 128) MLP policy acting on the training environment.
model = PPO(
    "MlpPolicy",
    train_env,
    learning_rate=1e-4,
    n_steps=4096,
    batch_size=1024,
    n_epochs=4,
    gae_lambda=0.99,
    target_kl=0.01,
    policy_kwargs={"net_arch": [128, 128]},
    verbose=1,
)

In [None]:
# model.learn(total_timesteps=10000)
# #train_env.unwrapped.JNT_OPTIM

In [None]:
# Run clean: roll a trained PPO policy out on a fresh evaluation environment
# and write the rendered episode to a video file.
warnings.filterwarnings("ignore", category=DeprecationWarning)

param_filename = 'myorfl_Kine_2D_1_25_2024Nov25_1718_22mus_DelayKine_Best'
params = np.loadtxt(f"reflex_param/{param_filename}.txt")

eval_env = gymnasium.make('MyoReflex_RL-v0', init_pose='walk', dt=0.01, mode='2D', tgt_field_ver=0, reflex_params=params, episode_limit=2000, 
                          stim_mode='reflex', tgt_vel_mode='eval', delta_mode='delayed', delta_control_mode='sym')
obs, _ = eval_env.reset()
# os.path.join keeps the checkpoint path portable: a backslash-separated
# literal only resolves on Windows, and "\P" is an invalid escape sequence.
test_model = PPO.load(os.path.join("PPO_outputV3", "PPO_2024Nov29_1816_2D_TrainedModel"), env=eval_env)


frames = []
# obs = eval_env.reset()
for timestep in range(2000):
    # MyoEnv is an attribute of the base environment; go through .unwrapped
    # because gymnasium wrappers are not guaranteed to forward custom attributes.
    frame = eval_env.unwrapped.MyoEnv.sim.renderer.render_offscreen(camera_id=1)
    frames.append(frame)

    action, _states = test_model.predict(obs)
    # gymnasium's step() returns (obs, reward, terminated, truncated, info)
    obs, rewards, is_done, truncated, info = eval_env.step(action)

    # The episode is over in either case; stepping further would require a reset.
    if is_done or truncated:
        #print(f"Succed at {time_step}")
        break


param_filename = '2D_PPO_Test'
skvideo.io.vwrite(f"{param_filename}.mp4",
                  np.asarray(frames),inputdict={"-r":"100"}, outputdict={"-r" : "100", "-pix_fmt": "yuv420p"})
show_video(f"{param_filename}.mp4")

In [None]:
## Save states
# Builds the environment used for the reflex-only rollouts in the following cells.

# Reward configuration, option 1 - Wang algorithm
#reward_type = 1
#rw_and_wts = dict(zip(['v_tgt'], [1]) )

# Reward configuration, option 2 - Others (active)
reward_type = 0
rw_and_wts = None

# Load your params
params_0 = np.loadtxt(
    os.path.join(
        os.getcwd(), '..', 'reflex_RL', 'reflex_param',
        'myorfl_Kine_2D_1_25_2024Nov25_1718_22mus_DelayKine_Best.txt',
    )
)

rollout_env = gymnasium.make(
    'MyoReflex_RL-v0',
    init_pose='walk',
    dt=0.01,
    mode='2D',
    tgt_field_ver=0,
    reflex_params=params_0,
    episode_limit=2000,
    obs_param=None,
    rew_type=reward_type,
    reward_wt=rw_and_wts,
    stim_mode='reflex',
    tgt_vel_mode='eval',
    sine_vel_args=None,
    delta_mode='delayed',
    delta_control_mode='asym',
)
# check_env(rollout_env)
#rollout_env.reset()
#rollout_env.run_reflex_step()

In [None]:
# Reflex-only rollout: run the reflex controller for up to 20 s of simulated
# time, recording a frame and the pelvis velocity at every step.
rollout_env.reset()

# dt, MyoEnv, SENSOR_DATA and run_reflex_step are attributes of the base
# environment; gymnasium >= 1.0 no longer forwards them through wrappers.
reflex_env = rollout_env.unwrapped

frames = []
pelvisV = []

timesteps = int(20 / reflex_env.dt)

for i in range(timesteps):
    reflex_env.MyoEnv.sim.data.camera(4).xpos[2] = 2.181
    frame = reflex_env.MyoEnv.sim.renderer.render_offscreen(camera_id=4)
    frames.append(frame)

    # Store a copy so the samples stay independent in case the environment
    # updates its sensor buffer in place between steps.
    pelvisV_curr = np.array(reflex_env.SENSOR_DATA['body']['pelvis_vel'], copy=True)
    pelvisV.append(pelvisV_curr)

    _ , is_done = reflex_env.run_reflex_step()

    if is_done:
        print(f"Stopped at {i}")
        break

skvideo.io.vwrite("Test.mp4", 
                  np.asarray(frames),inputdict={"-r":"100"}, outputdict={"-r" : "100", "-pix_fmt": "yuv420p"})
# show in the notebook
show_video("Test.mp4")

In [None]:
# Plot the first component of the recorded pelvis velocity over time, together
# with its moving average.
# from above dt = 0.01
dt = 0.01

first_values = [array[0] for array in pelvisV]
indices = np.arange(len(first_values))*dt

window_size = 400

plt.plot(indices, first_values, label='First Value', alpha=0.6)
# A 'valid' moving average only exists once at least window_size samples were
# recorded; a rollout that stopped early is plotted without it instead of
# failing on mismatched array lengths.
if len(first_values) >= window_size:
    moving_avg = np.convolve(first_values, np.ones(window_size)/window_size, mode='valid')
    plt.plot(indices[window_size-1:], moving_avg, label=f'Moving Average ({window_size} indices)', color='orange', linewidth=2)
plt.xlabel('Time (s)')
plt.ylabel('Pelvis Velocity (m/s)')
plt.title('Pelvis Velocity')
plt.legend()  # the labels passed to plot() are only shown when a legend is drawn
plt.grid(True)
plt.show()

In [None]:
# Reflex-only rollout rendered to video: up to 20 s of simulated time.
rollout_env.reset()

# dt, MyoEnv and run_reflex_step are attributes of the base environment;
# gymnasium >= 1.0 no longer forwards them through wrappers.
reflex_env = rollout_env.unwrapped

frames = []

timesteps = int(20 / reflex_env.dt)

for i in range(timesteps):
    reflex_env.MyoEnv.sim.data.camera(4).xpos[2] = 2.181
    frame = reflex_env.MyoEnv.sim.renderer.render_offscreen(camera_id=4)
    frames.append(frame)

    # TODO: Data capture here, e.g.
    #   exo_r_torque = MyoEnv.env.sim.data.actuator('Exo_R').actuator_force[0]
    #   MyoEnv.env.sim.data.joint('r_ankle').qpos[0]

    _ , is_done = reflex_env.run_reflex_step()

    if is_done:
        print(f"Stopped at {i}")
        break

skvideo.io.vwrite("Test.mp4", 
                  np.asarray(frames),inputdict={"-r":"100"}, outputdict={"-r" : "100", "-pix_fmt": "yuv420p"})
# show in the notebook
show_video("Test.mp4")

In [None]:
# Using 'No_Exo_Retrain', 'myorfl_Kine_2D_1_25_2024Nov25_1718_22mus_DelayKine_Best.txt'

# Test Initialize with parts of gait cycle
# Collect rollouts of best walking from reflex ctrl
# rollout_env.reset()
# state_list, reflex_list, ref_act_list, grf_list = rollout_env.collect_reflex_rollouts()

# # Save environment and reflex controller state

# print(len(grf_list))

# 0 - Right
# 1 - Left
# plt.plot(range(2000), grf_list[:,0], label='Right')
# plt.plot(range(2000), grf_list[:,1], label='Left')
# plt.xlim([1233, 1948])
# plt.legend()

# # Check the GRF for segmenting
# np.where(grf_list[1233:1948,0] > 0)

# plt.figure()
# plt.plot(grf_list[1233:1948, 0])
# plt.plot(grf_list[1233:1948, 1])

# # Extract based on GRF values
# save_state_list = state_list[1233:1948]
# save_reflex_list = reflex_list[1233:1948]

# np.save('save_state_list_2D.npy', save_state_list)
# np.save('save_reflex_list_2D.npy', save_reflex_list)

# # ------ Rendering check -----
# # Rendering to check
# # Testing of state reset
# # 1. Set state
# # 2. Run step normally
# frames = []

# env.reset()

"""
# !! REMEMBER TO Disable warmstart for Mujoco in the XML file options
# https://github.com/google-deepmind/mujoco/issues/493
"""

# State-restore check: load a saved simulator / reflex-controller snapshot,
# push it into the environment and render a single frame to inspect the pose.
rollout_env.reset()

# These arrays must be loaded before they are indexed below. The file names
# match the np.save calls in the commented export code earlier in this cell.
# allow_pickle=True executes pickled objects: only load files you created.
loadedState = np.load("save_state_list_2D.npy", allow_pickle=True)
loadedReflex = np.load("save_reflex_list_2D.npy", allow_pickle=True)
test_state = copy.deepcopy(loadedState[40])
test_reflex = copy.deepcopy(loadedReflex[40])

# Custom methods/attributes live on the base environment, not on the gymnasium wrappers
rollout_env.unwrapped.set_reflex_env_state(test_state)
rollout_env.unwrapped.ReflexCtrl = test_reflex

test_frame = rollout_env.unwrapped.MyoEnv.unwrapped.sim.renderer.render_offscreen(camera_id=4)

plt.imshow(test_frame)

# Scripts to evaluate and generate videos of policies

In [None]:
# Collect actions from best and worst policy
warnings.filterwarnings("ignore", category=DeprecationWarning)

import copy
import skvideo.io

def evaluate_trained_policy(test_env, test_policy, max_timestep, n_eval_ep, vel_sequence):
    """Roll a policy out for several episodes and collect summary statistics.

    Args:
        test_env: Gymnasium environment. Its base environment
            (``test_env.unwrapped``) must expose a writable ``eval_x_vel``.
        test_policy: Object with ``predict(obs, deterministic=True)`` returning
            ``(action, state)``.
        max_timestep: Maximum number of steps per episode; ``vel_sequence``
            needs at least this many entries.
        n_eval_ep: Number of evaluation episodes.
        vel_sequence: Per-step target velocity written to ``eval_x_vel``.

    Returns:
        dict with keys
        ``obs`` (``(max_timestep, obs_dim)`` observations of the last episode;
        rows beyond its length keep values from earlier episodes or zeros),
        ``best_action_seq`` / ``best_reward`` (episode that was both longer and
        better rewarded than the previous best),
        ``longest_action_seq`` / ``longest_reward`` (longest episode),
        ``all_ep_len``, ``all_rewards`` (per-episode lists) and
        ``ep_mean``, ``ep_std``, ``rew_mean``, ``rew_std`` (NaN when no episode
        was run).
    """
    print(f"Max timestep : {max_timestep}, ")

    # Placeholders that are returned when no episode outlasts a single step
    best_action = np.zeros((1, 94))
    longest_action = np.zeros((1, 94))
    best_reward = 0
    longest_reward = 0

    all_ep_len = []
    all_rewards = []
    obs_list = None  # allocated once the observation size is known

    # Progress is reported roughly four times; max() avoids a zero modulus
    # when fewer than four episodes are requested.
    report_every = max(1, n_eval_ep // 4)

    test_env.unwrapped.eval_x_vel = vel_sequence[0]

    for eps in range(n_eval_ep):
        obs, _ = test_env.reset()
        if obs_list is None:
            obs_list = np.zeros((max_timestep, np.size(obs)))

        if eps % report_every == 0:
            print(f"Eval Episode: {eps}")

        ep_actions = []
        ep_reward = 0

        for timestep in range(max_timestep):
            # Override target velocity with input sequence for evaluation
            test_env.unwrapped.eval_x_vel = vel_sequence[timestep]

            obs_list[timestep, :] = np.ravel(obs)
            action, _states = test_policy.predict(obs, deterministic=True)
            # gymnasium's step() returns (obs, reward, terminated, truncated, info)
            obs, rewards, is_done, truncated, info = test_env.step(action)

            ep_actions.append(action)
            ep_reward += rewards

            if is_done:
                print(f"Succed at {timestep}")
                break
            if truncated:
                # A truncated episode must be reset before it is stepped again
                break

        # One row per executed step, so shape[0] is the episode length even
        # for one-step episodes.
        action_queue = np.vstack(ep_actions) if ep_actions else np.zeros((0, 94))

        if action_queue.shape[0] > best_action.shape[0] and best_reward < ep_reward:
            best_action = action_queue.copy()
            best_reward = ep_reward

        # Collect best longest surviving
        if action_queue.shape[0] > longest_action.shape[0]:
            longest_action = action_queue.copy()
            longest_reward = ep_reward

        all_ep_len.append(action_queue.shape[0])
        all_rewards.append(ep_reward)

        print(f"Best length : {best_action.shape[0]}")

    if obs_list is None:
        obs_list = np.zeros((max_timestep, 25))

    if all_ep_len:
        ep_mean, ep_std = np.mean(all_ep_len), np.std(all_ep_len)
        rew_mean, rew_std = np.mean(all_rewards), np.std(all_rewards)
    else:
        ep_mean = ep_std = rew_mean = rew_std = np.nan

    eval_dict = {'obs': obs_list, 'best_action_seq': best_action, 'best_reward': best_reward, 
                 'longest_action_seq': longest_action, 'longest_reward': longest_reward, 
                 'all_ep_len': all_ep_len, 'all_rewards': all_rewards, 
                 'ep_mean': ep_mean, 'ep_std': ep_std, 'rew_mean': rew_mean, 'rew_std': rew_std}

    return eval_dict

In [None]:
import copy
def evaluate_30Hz_policy(test_env, test_policy, max_timestep, vel_sequence, path, filename, reward_wt_param):
    """Run one deterministic evaluation episode and save diagnostic plots.

    The policy is queried at every step. Three figures are written to ``path``:
    ``{filename}_velocities.png``, ``{filename}_rewards.png`` and
    ``{filename}_Actions.png``.

    Args:
        test_env: Gymnasium environment. Its base environment
            (``test_env.unwrapped``) must expose ``eval_x_vel``,
            ``episode_limit``, ``avg_vel``, ``time_step`` and
            ``debug_reward_dict``.
        test_policy: Object with ``predict(obs, deterministic=True)``.
        max_timestep: Maximum number of steps; ``vel_sequence`` needs at least
            this many entries.
        vel_sequence: Per-step target velocity written to ``eval_x_vel``.
        path: Existing directory that receives the figures.
        filename: Prefix of the figure file names.
        reward_wt_param: Mapping with the weights of the reward terms
            ``v_tgt``, ``alive_rew``, ``footstep``, ``effort`` and
            ``action_penalty_zero``.

    Returns:
        ``(eval_dict, eval_vec)``. ``eval_dict`` holds the observations, the
        action sequence and the episode reward (under both the ``best_*`` and
        ``longest_*`` keys). ``eval_vec`` has one row per executed step with
        the columns: 0 ``obs[0]`` (plotted as target velocity), 1 ``obs[9]``
        (plotted as current velocity), 2 ``avg_vel`` (plotted as step
        velocity), 3-7 the weighted reward terms in the order listed above.
    """
    # Reward terms in the column order used by eval_vec[:, 3:8]; the bar-chart
    # labels below follow the same order.
    reward_terms = ('v_tgt', 'alive_rew', 'footstep', 'effort', 'action_penalty_zero')
    labels = ['Tgt_vel', 'Alive', 'Footstep', 'Effort', 'Action_Penalty']

    print(f"Max timestep : {max_timestep}, ")
    ep_reward = 0
    step_actions = []

    # Custom attributes live on the base environment, not on gymnasium wrappers
    base_env = test_env.unwrapped
    base_env.eval_x_vel = vel_sequence[0]
    eval_vec = np.zeros((max_timestep, 8))

    obs, _ = test_env.reset()
    obs_list = np.zeros((max_timestep, np.size(obs)))

    print(f"Ep limit: {base_env.episode_limit}")

    for timestep in range(max_timestep):
        # Override the target velocity with the evaluation sequence
        base_env.eval_x_vel = vel_sequence[timestep]

        action, _states = test_policy.predict(obs, deterministic=True)

        obs_list[timestep, :] = np.ravel(obs)

        obs, rewards, is_done, failed, info = test_env.step(action)
        #obs, rewards, is_done, failed, info = test_env.step(np.zeros(94,)) # Debugging forward velocity
        diag_rew_dict = copy.deepcopy(base_env.debug_reward_dict)

        eval_vec[timestep, 0] = obs[0]
        eval_vec[timestep, 1] = obs[9]
        eval_vec[timestep, 2] = base_env.avg_vel
        for col, term in enumerate(reward_terms, start=3):
            eval_vec[timestep, col] = diag_rew_dict[term] * reward_wt_param[term]

        # Collect data
        step_actions.append(action)
        ep_reward += rewards

        if timestep == max_timestep-1:
            print(f"internal time step: {base_env.time_step}")
            print(f"Terminated: {is_done}, Truncated : {failed}")

        if is_done:
            print(f"Succed at {timestep}")
            break
        if failed:
            print(f"Failed at {timestep}")
            break

    # One row per executed step, so shape[0] is the episode length even for a
    # one-step episode.
    n_steps = len(step_actions)
    action_queue = np.vstack(step_actions) if step_actions else np.zeros((0, 36))

    eval_dict = {'obs': obs_list, 'best_action_seq': action_queue, 'best_reward': ep_reward, 
                 'longest_action_seq': action_queue, 'longest_reward': ep_reward}

    print(f"Recorded best reward: {ep_reward}, ep_len: {action_queue.shape[0]}")    

    # Drop the rows of steps that were never executed
    eval_vec = eval_vec[:n_steps, :]

    # Average velocity plotting: avg_vel is only non-zero at some steps, so
    # every stretch between two such samples is filled with the value of the
    # sample that ends it.
    step_vec = eval_vec[:, 2].copy()
    if n_steps:
        step_idx = np.hstack((np.array(0), np.where(step_vec > 0)[0]))
        for start, stop in zip(step_idx[:-1], step_idx[1:]):
            step_vec[start:stop] = step_vec[stop]
        # Assume the last step has the same average velocity as the last 2nd
        step_vec[step_idx[-1]:] = step_vec[step_idx[-1]]

    steps = np.arange(n_steps)

    fig_1 = plt.figure()
    plt.plot(steps, eval_vec[:, 0], label='Target velocity')
    plt.plot(steps, eval_vec[:, 1], label='Current velocity')
    plt.plot(steps, step_vec, label='Step velocity')
    plt.legend()
    fig_1.savefig(os.path.join(path, f"{filename}_velocities.png"))

    # Horizontal bar chart with the summed, weighted reward terms
    sumedRew = np.sum(eval_vec[:, 3:8], axis=0).tolist()

    fig_2, ax = plt.subplots(figsize =(16, 9))
    ax.barh(labels, sumedRew)

    # Remove axes splines
    for s in ['top', 'bottom', 'left', 'right']:
        ax.spines[s].set_visible(False)

    # Remove x, y Ticks
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')

    # Add padding between axes and labels
    ax.xaxis.set_tick_params(pad = 5)
    ax.yaxis.set_tick_params(pad = 10)

    # Add annotation to bars
    for bar in ax.patches:
        ax.text(bar.get_width()+0.2, bar.get_y()+0.5, 
                str(round((bar.get_width()), 2)),
                fontsize = 10, fontweight ='bold',
                color ='grey')
    ax.set_title(f"Total reward: {ep_reward}", loc ='left')
    fig_2.savefig(os.path.join(path, f"{filename}_rewards.png"))

    fig_3 = plt.figure()
    plt.plot(action_queue)
    fig_3.savefig(os.path.join(path, f"{filename}_Actions.png"))

    plt.close(fig_1)
    plt.close(fig_2)
    plt.close(fig_3)

    return eval_dict, eval_vec

In [None]:
import json
import matplotlib.pyplot as plt
import copy

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps``.
    """
    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        Raises:
            TypeError: If ``obj`` is not a NumPy integer, float, bool or array
                (raised by the base class).
        """
        # The abstract scalar base classes cover every concrete width and do
        # not depend on aliases such as np.float_ that NumPy 2.0 removed.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def save_data_plots(eval_dict, path, filename):
    """Save episode-length and reward histograms plus the raw evaluation data.

    Writes ``{filename}_eval_stats.png`` and ``{filename}_eval.npy`` to ``path``.

    Args:
        eval_dict: Evaluation results; must contain ``all_ep_len`` and
            ``all_rewards``.
        path: Existing output directory.
        filename: Prefix of the output file names.
    """
    ep_lengths = np.asarray(eval_dict['all_ep_len'])
    # Episodes that reached 2000 steps are counted as successful
    ep_success = int(np.count_nonzero(ep_lengths == 2000))
    success_rate = ep_success / len(ep_lengths) if len(ep_lengths) else float('nan')

    fig_1 = plt.figure()
    plt.subplot(2,1,1)
    plt.hist(eval_dict['all_ep_len'], bins=np.arange(0,2001,200))
    plt.title(f"MyoReflexRL - Ep length distributions (Success: {success_rate})")

    plt.subplot(2,1,2)
    plt.hist(eval_dict['all_rewards'], bins=np.arange(0,501,50))

    fig_1.savefig(os.path.join(path, f"{filename}_eval_stats.png"))
    plt.close(fig_1)

    # The output location comes from the arguments rather than notebook globals.
    # np.save pickles the dict; read it back with
    # np.load(..., allow_pickle=True).item().
    np.save(os.path.join(path, f"{filename}_eval"), eval_dict)

def extract_reward_terms(test_env, best_replay, max_timestep, vel_sequence, path, filename):
    """Replay a recorded action sequence and log velocities and reward terms.

    Writes ``{filename}_velocities.png`` and ``{filename}_rewards.png`` to
    ``path``.

    Args:
        test_env: Gymnasium environment. Its base environment
            (``test_env.unwrapped``) must expose ``eval_x_vel`` and
            ``get_reward_dict_old``.
        best_replay: 2-D array of actions, one row per step; needs at least
            ``max_timestep`` rows.
        max_timestep: Number of steps to replay.
        vel_sequence: Per-step target velocity written to ``eval_x_vel``.
        path: Existing output directory.
        filename: Prefix of the output file names.

    Returns:
        ``(max_timestep, 6)`` array with the columns: 0 ``obs[0]`` (plotted as
        target velocity), 1 ``obs[9]`` (plotted as current velocity), 2 alive
        reward, 3 target-velocity reward, 4 footstep reward, 5 effort reward.
    """
    # Custom attributes live on the base environment, not on gymnasium wrappers
    base_env = test_env.unwrapped
    base_env.eval_x_vel = vel_sequence[0]

    test_env.reset()

    eval_vec = np.zeros((max_timestep, 6))

    for timestep in range(max_timestep):
        base_env.eval_x_vel = vel_sequence[timestep]
        replay_action = best_replay[timestep, :]

        obs, _, is_done, failed, _ = test_env.step(replay_action)
        # Reward dictionary of the current step
        curr_reward_dict = copy.deepcopy(base_env.get_reward_dict_old(is_done, replay_action))

        eval_vec[timestep, 0] = obs[0]
        eval_vec[timestep, 1] = obs[9]
        eval_vec[timestep, 2] = curr_reward_dict['alive']
        eval_vec[timestep, 3] = curr_reward_dict['v_tgt']
        eval_vec[timestep, 4] = curr_reward_dict['footstep']
        eval_vec[timestep, 5] = curr_reward_dict['effort']

    steps = np.arange(max_timestep)

    fig_1 = plt.figure()
    plt.plot(steps, eval_vec[:, 0], label='Target velocity')
    plt.plot(steps, eval_vec[:, 1], label='Current velocity')
    plt.legend()
    fig_1.savefig(os.path.join(path, f"{filename}_velocities.png"))

    # Plot whole columns: indexing a single row would yield a scalar that
    # cannot be plotted against the step axis.
    fig_2 = plt.figure()
    plt.plot(steps, eval_vec[:, 3], label='Tgt velocity reward')
    plt.plot(steps, eval_vec[:, 4], label='Footstep reward')
    plt.plot(steps, eval_vec[:, 5], label='Effort reward')
    plt.legend()
    fig_2.savefig(os.path.join(path, f"{filename}_rewards.png"))

    plt.close(fig_1)
    plt.close(fig_2)

    return eval_vec

In [None]:
import skvideo.io

def save_video_replay(test_env, eval_dict, path, filename):
    """Replay the best action sequence of an evaluation and save it as a video.

    Only ``eval_dict['best_action_seq']`` is rendered; the longest episode is
    reported in the printed summary but not recorded. The video is written to
    ``{path}/{filename}_best_reward_video.mp4``.

    Args:
        test_env: Gymnasium environment whose base environment exposes
            ``MyoEnv`` for offscreen rendering.
        eval_dict: Evaluation results with the keys ``best_action_seq``,
            ``best_reward``, ``longest_action_seq`` and ``longest_reward``.
        path: Existing output directory.
        filename: Prefix of the video file name.
    """
    best_reward = eval_dict['best_reward']
    best_ep_len = len(eval_dict['best_action_seq'])
    longest_reward = eval_dict['longest_reward']
    longest_ep_len = len(eval_dict['longest_action_seq'])

    print(f"Recorded best reward: {best_reward}, ep_len: {best_ep_len}")    
    print(f"Recorded longest reward: {longest_reward}, ep_len: {longest_ep_len}")

    out_action = eval_dict['best_action_seq']
    frames = []

    obs, _ = test_env.reset()
    # MyoEnv is an attribute of the base environment, not of the gymnasium wrappers
    renderer = test_env.unwrapped.MyoEnv.sim.renderer

    for time_step in range(len(out_action)):
        frames.append(renderer.render_offscreen(camera_id=1))

        obs, rewards, is_done, failed, info = test_env.step(out_action[time_step,:])

        # Stop on termination or truncation: the environment would need a
        # reset before it can be stepped again.
        if is_done or failed:
            #print(f"Succed at {time_step}")
            break
    skvideo.io.vwrite(os.path.join(path, f"{filename}_best_reward_video.mp4"),
                      np.asarray(frames),inputdict={"-r":"33"}, outputdict={"-r" : "100", "-pix_fmt": "yuv420p"})
    print('Best reward video rendered')


In [None]:
def evaluate_constant_vel(test_env, test_policy, path, filename, reward_wt_param, velocities=np.array([0.8, 1.2, 1.6, 1.8]), max_timestep=2000):
    """Evaluate the policy once per constant target velocity.

    For every entry of ``velocities`` a constant velocity sequence of length
    ``max_timestep`` is built and handed to ``evaluate_30Hz_policy``; the
    output files are prefixed ``{filename}_constant_{velocity}`` with the
    decimal point replaced by an underscore. Nothing is returned.
    """
    for vel_idx in np.arange(velocities.shape[0]):
        target_speed = velocities[vel_idx]
        print(f"Evaluating constant velocity: {target_speed}")

        speed_tag = str(target_speed).replace('.','_')
        run_name = f"{filename}_constant_{speed_tag}"
        vel_seq = target_speed * np.ones(max_timestep)

        eval_dict, eval_vec = evaluate_30Hz_policy(
            test_env, test_policy, max_timestep, vel_seq, path, run_name, reward_wt_param
        )

In [None]:
def evaluate_change_vel(test_env, test_policy, path, filename, reward_wt_param, velocities=np.array([0.8, 1.2, 1.6, 1.8]), max_timestep=2000):
    """Evaluate the policy on step changes of the target velocity.

    Each run starts at the first velocity of a pair and switches to the second
    one halfway through the episode. Results are produced by
    ``evaluate_30Hz_policy`` with the file prefix
    ``{filename}_change_{from}_to_{to}``. Nothing is returned.

    Args:
        test_env: Environment passed on to ``evaluate_30Hz_policy``.
        test_policy: Policy passed on to ``evaluate_30Hz_policy``.
        path: Output directory.
        filename: Prefix of the output file names.
        reward_wt_param: Reward weights passed on to ``evaluate_30Hz_policy``.
        velocities: Currently ignored; the (from, to) pairs below are always
            used. Kept so the signature matches the other evaluation helpers.
        max_timestep: Number of steps per evaluation run.
    """
    velocities = np.array([[1.2, 0.8], [1.2, 1.6], [0.8, 1.6], [1.6, 0.8], [0.8, 1.8]])

    # Switch at the midpoint of the run so the change also happens when
    # max_timestep differs from the default of 2000.
    switch_step = max_timestep // 2

    for vel_from, vel_to in velocities:
        print(f"Evaluating changing velocity: {np.array([vel_from, vel_to])}")
        vel_seq = np.ones(max_timestep)*vel_from
        vel_seq[switch_step:] = vel_to

        convert = str(vel_from).replace('.','_')
        convert_2 = str(vel_to).replace('.','_')

        newfilename = f"{filename}_change_{convert}_to_{convert_2}"

        eval_dict, eval_vec = evaluate_30Hz_policy(test_env, test_policy, max_timestep, vel_seq, path, newfilename, reward_wt_param)

In [None]:
# Sample
def evaluate_sinusoidal_vel(test_env, test_policy, path, filename, reward_wt_param, velocities=np.array([0.8, 1.2, 1.6, 1.8]), max_timestep=2000):
    """Evaluate the policy on sinusoidal target velocities of several periods.

    For every period a velocity sequence of length ``max_timestep`` is built
    with ``get_sinusoidal_vel`` and handed to ``evaluate_30Hz_policy``. Output
    files are prefixed ``{filename}_sine_{min}_to_{max}_period_{period}``.
    Nothing is returned.

    Args:
        test_env: Environment passed on to ``evaluate_30Hz_policy``.
        test_policy: Policy passed on to ``evaluate_30Hz_policy``.
        path: Output directory.
        filename: Prefix of the output file names.
        reward_wt_param: Reward weights passed on to ``evaluate_30Hz_policy``.
        velocities: Currently ignored; the (min, max) pair below is always
            used. Kept so the signature matches the other evaluation helpers.
        max_timestep: Number of steps per evaluation run.
    """
    velocities = np.array([0.8, 1.4])
    periods = np.array([2000, 1000, 500, 300])  # in steps, same unit as the step index below

    sine_min, sine_max = velocities
    convert = str(sine_min).replace('.','_')
    convert_2 = str(sine_max).replace('.','_')

    for period in periods:
        print(f"Evaluating sinusoidal velocity: {velocities} on period {period}")

        # One sample per step, covering exactly the number of steps that
        # evaluate_30Hz_policy will index.
        vel_seq = np.array([get_sinusoidal_vel(sine_min, sine_max, period, step)[0]
                            for step in range(max_timestep)])

        newfilename = f"{filename}_sine_{convert}_to_{convert_2}_period_{period}"

        eval_dict, eval_vec = evaluate_30Hz_policy(test_env, test_policy, max_timestep, vel_seq, path, newfilename, reward_wt_param)

In [None]:
def get_sinusoidal_vel(sine_min, sine_max, sine_period, current_time):
    """
    Evaluate a sinusoidal velocity profile at a single instant.

    The wave oscillates between sine_min and sine_max and sits at their
    midpoint when current_time is 0.
    Current time: Given in milliseconds; it has to use the same unit as
    sine_period.

    Returns a 2-element array [value, 0].
    """
    half_range = (sine_max - sine_min) / 2
    midpoint = (sine_min + sine_max) / 2

    cycles_per_unit = 1 / sine_period
    phase = 2 * np.pi * cycles_per_unit * current_time

    # Currently only for 2D walking: the second component is always 0
    return np.array([half_range * np.sin(phase) + midpoint, 0])

In [None]:
# Scripting to automatically generate all the reports and videos in a loop. Eventually make it into a python file
import os
from glob import glob
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Evaluation setup: reflex gains, reward configuration, evaluation environment
# and the list of run directories found under main_path.
# main_path = os.path.join('params', '11mus_RL', '2024_02_29', 'Asym_Simplified_256_128_RealTime') # , '2024_01_31_to_02_05', 'TgtPosVel'
param_filename = 'myorfl_Kine_2D_1_25_2024Nov25_1718_22mus_DelayKine_Best'
main_path = os.path.join('reflex_RL', 'reflex_param', 'myorfl_Kine_2D_1_25_2024Nov25_1718_22mus_DelayKine_Best') # , '2024_01_31_to_02_05', 'TgtPosVel'
params = np.loadtxt(os.path.join(main_path, f"{param_filename}.txt"))

obs_param = None #['spinal_phase', 'mus_f', 'mus_l', 'mus_v']
reward_func = 2
reward_dict = {
    'alive_rew': 1,
    'footstep': 1,
    'action_penalty_zero': 0.1,
    'effort': 0,
    'v_tgt': 5,
}
#reward_dict = {'alive_rew': 1, 'footstep': 1, 'v_tgt': 1}

# Modify to change evaluation environment
# Fixed constant velocity
tgt_vel = np.array([1.25,0])
sine_vel_args=None

# Randomized constant velocity
# tgt_vel = np.array([-1,-1])
# tgt_vel_mode='constant'
# sine_vel_args=None

# Fixed Sinusoidal velocity
# tgt_vel = np.array([-1,-1])
# tgt_vel_mode='sine' #'sine'
# sine_vel_args=dict(sine_min=0.8, sine_max=1.8, sine_period=4000)

# Randomized Sinusoidal velocity
# tgt_vel = np.array([-1,-1])
# tgt_vel_mode='sine' #'sine'
# sine_vel_args=None

eval_env = gymnasium.make(
    'MyoReflex_RL-v0',
    init_pose='walk',
    dt=0.01,
    mode='2D',
    tgt_field_ver=0,
    reflex_params=params,
    episode_limit=2000,
    obs_param=obs_param,
    rew_type=reward_func,
    reward_wt=reward_dict,
    stim_mode='reflex',
    target_vel=tgt_vel,
    tgt_vel_mode='eval',
    sine_vel_args=sine_vel_args,
    delta_mode='realtime',
    delta_control_mode='sym',
)
eval_env.reset()

# test_model = PPO.load("params/11mus_RL/2023_12_27/Obs_dist/2D_PPO_rewOld_Obs_dist_128_128/PPO_2023Dec27_2006_2D_TrainedModel.zip", env=eval_env)
test_model = PPO.load("reflex_RL/PPO_outputV3/PPO_2023Dec27_2006_2D_TrainedModel.zip", env=eval_env)

# Every sub-directory of main_path is treated as one training run to evaluate
gen_path = os.listdir(main_path)
directory_list = [entry for entry in gen_path if os.path.isdir(os.path.join(main_path, entry))]

print(f"Directory: {directory_list}")

# Evaluate the checkpoint found in every run directory at constant target velocities.
for idx, run_dir in enumerate(directory_list):
    checkpoints = glob( os.path.join(main_path, run_dir, '*.zip') )
    print(checkpoints)

    # A run directory without a saved model is skipped instead of aborting
    # the whole sweep with an IndexError.
    if not checkpoints:
        print(f"No checkpoint (*.zip) found in {run_dir}, skipping")
        continue

    test_model = PPO.load(checkpoints[0])
    # suffix = str(0.8).replace('.','_')

    evaluate_constant_vel(eval_env, test_model, main_path, f"{run_dir}", reward_dict)
    #evaluate_change_vel(eval_env, test_model, main_path, f"{directory_list[idx]}", reward_dict)
    #evaluate_sinusoidal_vel(eval_env, test_model, main_path, f"{directory_list[idx]}", reward_dict)
    
    print('Done')
    #eval_dict = evaluate_trained_policy(eval_env, test_model, vel_seq.shape[0], 1, vel_sequence=vel_seq, path=main_path, filename=f"{directory_list[idx]}_{suffix}")
    #save_data_plots(eval_dict, path=main_path, filename=f"{directory_list[idx]}_{suffix}")
    #extract_reward_terms(eval_env, np.array(eval_dict['best_action_seq']), np.array(eval_dict['best_action_seq']).shape[0], vel_sequence=vel_seq, path=main_path, filename=f"{directory_list[idx]}_{suffix}")
    #save_video_replay(eval_env, eval_dict, path=main_path, filename=f"{directory_list[idx]}_{suffix}")
    
    # out_dict = evaluate_30Hz_policy(test_env=eval_env, test_policy=test_model, max_timestep=vel_seq.shape[0], 
    #                                  vel_sequence=vel_seq, path=main_path, filename=f"{directory_list[idx]}_{suffix}")
    # np.save( os.path.join(main_path, f"{directory_list[idx]}_{suffix}_out_vel"), out_dict)
    # print('Done')
    
    # for tgt_vel in target_vel:
    
    #     eval_env = gymnasium.make('MyoReflex_RL-v0', init_pose='walk', dt=0.01, mode='2D', tgt_field_ver=0, reflex_params=params, episode_limit=2000, 
    #                               obs_param=obs_param, rew_type=reward_func, reward_wt=reward_dict, 
    #                               target_vel=np.array([tgt_vel, 0]), tgt_vel_mode=tgt_vel_mode, sine_vel_args=sine_vel_args)
    #     eval_env.reset()
    
    #     suffix = str(tgt_vel).replace('.','_')
    
    #     eval_dict = evaluate_trained_policy(eval_env, test_model, 2000, 10, target_x_vel=tgt_vel)
    
    #     save_data_plots(eval_dict, path=main_path, filename=f"{directory_list[idx]}_{suffix}")
    #     extract_reward_terms(eval_env, np.array(eval_dict['best_action_seq']), 2000, path=main_path, filename=f"{directory_list[idx]}_{suffix}")
    #     save_video_replay(eval_env, eval_dict, path=main_path, filename=f"{directory_list[idx]}_{suffix}")
    