In [1]:
import yaml
from AntController.HyperparamSearchUtils import create_data_from_data_stores, \
            create_actor_func_from_hyper_param_search, \
            create_value_func_from_hyper_param_search, \
            pretrain_predictor_as_value_func, \
            get_value_func_loss, generate_predictor_from_random_config

from AntController.AntEnvironment import EpisodeData
from AntController.HaikuPredictor import HaikuPredictor
from ServoController.WalktCycleConfigParser import WalkCycle
import os
import numpy as np

In [13]:
actor_config_path = "AntController/configs/actor_hyperparam_search_config_no_sensor_multi_step.yaml"
critic_config_path = "AntController/configs/critic_hyperparam_search_config_no_sensor_multi_step.yaml"
training_data_dir = "TrainingData/Fixed_Walk_With_Sensor"
data = create_data_from_data_stores(training_data_dir, sensor_enabled = False)


In [3]:
rewards, states, actions, shifted_states, is_non_terminal = data
discount = 0.9
test_pivot = int(0.1 * len(data[0]))

In [None]:
action = action - states

In [4]:
def get_old_state(arr, mult):
    prev = None
    out = []
    for i, ele in enumerate(arr):
        if prev is None:
            out.append(ele)
            
        else:
            out.append(prev)
        prev = ele
        if mult[i] == 0:
            prev = None
    return np.asarray(out)

In [5]:
states = np.concatenate((get_old_state(states, is_non_terminal), states), axis=1)
shifted_states = np.concatenate((get_old_state(shifted_states, is_non_terminal), shifted_states), axis=1)

array([-0.15      , -0.15      ,  0.15      ,  0.15      ,  1.        ,
        0.8       ,  1.        ,  0.8       ,  0.06658623,  0.38680655,
        0.9939087 ,  0.11020683, -0.15      , -0.15      ,  0.15      ,
        0.15      ,  1.        ,  0.8       ,  1.        ,  0.8       ,
        0.06658623,  0.38680655,  0.9939087 ,  0.11020683])

In [6]:
data = rewards, states, actions, shifted_states, is_non_terminal

In [7]:
import pickle


def save_models(path, config, model):
    saved_params = (config, model.params, model.optimizer_state)
    pickle.dump(saved_params, open( path, "wb" ) )
    
def get_model(path):
    config, params, optimizer_state = pickle.load(open(path, "rb" ))
    predictor = HaikuPredictor.generate_controller_from_config(config)
    predictor.params = params
    predictor.optimizer_state = optimizer_state
    return predictor, config


In [8]:
save_dir = "AntController/configs/"

In [9]:
epochs = 1024
batch_size = 256


In [16]:
selected_actor, selected_actor_config = get_model(os.path.join(save_dir, "selected_actor_no_sensor_multi_step.p"))

In [22]:
selected_actor.params = selected_actor.net_t.init(42, np.ones(24))

In [18]:
wc = WalkCycle("WalkConfigs/nn_training_walk_config.yaml", speed=0.3)

In [24]:
import collections
def train_in_loop(predictor):
    frames = wc.get_frames()
    cur_frame = next(frames)
    pad =  [3.60094077e-01,  4.30332007e-01,  8.99514952e-01,  8.86656866e-02]
    state = np.concatenate((cur_frame, pad, cur_frame, pad))
    actual_state = np.concatenate((cur_frame, pad, cur_frame, pad))
    actual_pos = None
    r = collections.deque(maxlen=100)
    for i in range(100000):
        next_frame = next(frames)
        action_label = next_frame - cur_frame
        
        loss1 = predictor.train_batch(state, action_label)
        
        if actual_pos is not None:
            correcting_step = next_frame - actual_pos
            actual_state = np.concatenate((actual_state,actual_pos, np.random.normal(pad, 0.1)))[12:]
            loss2 = predictor.train_batch(actual_state, correcting_step)
        

        state = np.concatenate((state,next_frame, np.random.normal(pad, 0.1)))[12:]

        
        actual_pos = cur_frame + predictor.evaluate(state)
        
        
        curr_frame = next_frame
        
        r.append(loss1)
    
        if i%1000==0:
            print(sum(r)/100)
    return sum(r)/100

In [25]:
train_in_loop(best_actor)

0.009857341
2.7675567
2.709035
2.5757046
2.751356
2.734035
2.645226
2.7444093
2.6920125
2.7034209
2.6442974
2.711759
2.6423407
2.5667794
2.6631398
2.6976268
2.672373
2.8129833
2.7427974
2.736737
2.700286
2.705213
2.6857135


KeyboardInterrupt: 

In [16]:
from tqdm import trange
def create_predictor_from_hyper_param_search(config):
    test_split_proportion = config["test_split_proportion"]
    test_pivot = int(test_split_proportion * len(data))
    best_predictor, best_config = generate_predictor_from_random_config(config)
    
    best_test_error = train_in_loop(best_predictor)
    progress_range = trange(config["model_count"])
    for i in progress_range:
        new_predictor, new_config = generate_predictor_from_random_config(config)
       
        new_test_error = train_in_loop(best_predictor)
        if best_test_error > new_test_error:
            best_predictor, best_config = new_predictor, new_config
            best_test_error = new_test_error
        progress_range.set_description(
            "Trained Model {} with Loss {:.5f}. Current Optimal Loss: {:.5f}".format(
                i, new_test_error, best_test_error
            )
        )
        progress_range.refresh()
    return best_predictor, best_test_error, best_config



In [22]:
with open(actor_config_path) as file:
    actor_config = yaml.load(file, Loader=yaml.FullLoader)
actor_config['scale_config'] =  {'input_scale': np.asarray([0.10625633, 0.10625599, 0.10625599, 0.10625633, 0.14888737,
         0.14887016, 0.14888737, 0.14887016, 0.19920768, 0.10317653,
         0.1176432 , 0.40977577, 0.10620595, 0.10620583, 0.10620583,
         0.10620595, 0.14944643, 0.14943832, 0.14944643, 0.14943832,
         0.20073376, 0.1061449 , 0.11821892, 0.41113931]),
  'input_shift': np.asarray([-4.48807042e-04, -5.24090804e-04,  5.24090804e-04,  4.48807042e-04,
          8.51440043e-01,  8.51602193e-01,  8.51440043e-01,  8.51602193e-01,
          3.52327499e-01,  4.29775303e-01,  9.00457199e-01,  8.61436631e-02,
         -2.17164698e-04, -2.69284225e-04,  2.69284225e-04,  2.17164698e-04,
          8.50725813e-01,  8.50795305e-01,  8.50725813e-01,  8.50795305e-01,
          3.60094077e-01,  4.30332007e-01,  8.99514952e-01,  8.86656866e-02]),
  'output_scale': np.asarray([0.06888717, 0.06899398, 0.06899398, 0.06888717, 0.13558397,
         0.13555782, 0.13558397, 0.13555782]),
  'output_shift': np.asarray([ 0.00020269,  0.00028376, -0.00028376, -0.00020269, -0.00071423,
         -0.00080689, -0.00071423, -0.00080689])}
best_actor, best_actor_test_error, best_actor_config = \
    create_predictor_from_hyper_param_search(actor_config)

Trained Model 63 with Loss 2.65631. Current Optimal Loss: 2.14939: 100%|██████████| 64/64 [13:06<00:00, 12.29s/it]


In [17]:
import optax
selected_actor.optimizer = optax.adam(0.001, b1=0.5, b2=0.9)

In [16]:
selected_actor_config

{'loss': 'squared_loss',
 'decay': 0.999999,
 'input_shape': 24,
 'name': 'test_hyperparam',
 'rng_key': 42,
 'scale_config': {'input_scale': array([0.10625633, 0.10625599, 0.10625599, 0.10625633, 0.14888737,
         0.14887016, 0.14888737, 0.14887016, 0.19920768, 0.10317653,
         0.1176432 , 0.40977577, 0.10620595, 0.10620583, 0.10620583,
         0.10620595, 0.14944643, 0.14943832, 0.14944643, 0.14943832,
         0.20073376, 0.1061449 , 0.11821892, 0.41113931]),
  'input_shift': array([-4.48807042e-04, -5.24090804e-04,  5.24090804e-04,  4.48807042e-04,
          8.51440043e-01,  8.51602193e-01,  8.51440043e-01,  8.51602193e-01,
          3.52327499e-01,  4.29775303e-01,  9.00457199e-01,  8.61436631e-02,
         -2.17164698e-04, -2.69284225e-04,  2.69284225e-04,  2.17164698e-04,
          8.50725813e-01,  8.50795305e-01,  8.50725813e-01,  8.50795305e-01,
          3.60094077e-01,  4.30332007e-01,  8.99514952e-01,  8.86656866e-02]),
  'output_scale': array([0.06888717, 0.0689939

In [34]:
def train_actor_data_with_noise(predictor, batch_size, epochs, data, labels):
        losses = []
        for _ in range(epochs):
            noise = np.pad(np.random.normal(0,0.1, (batch_size, 8)), ((0,0), (0,4)), 'constant',constant_values= 0)
            action_noise = np.random.normal(0,0.1, (batch_size, 8))
            noise2 = np.pad(action_noise, ((0,0), (0,4)), 'constant',constant_values= 0)
            batch_index = np.random.choice(range(len(data)), batch_size)
            loss = predictor.train_batch(data[batch_index] + np.concatenate((noise, noise2), axis=1), labels[batch_index]-action_noise)
            losses.append(loss)
        return np.mean(losses)/batch_size

In [14]:
for _ in range(1000):
    print("train error: ", train_actor_data_with_noise(selected_actor, batch_size, epochs, states, actions))

NameError: name 'train_actor_data_with_noise' is not defined

In [32]:
save_models(os.path.join(save_dir, "selected_actor_trained_with_loop_no_sensor_multi_step.p"), selected_actor_config, selected_actor)

In [1]:
import numpy as np
from AntController.JaxUtils import *

In [12]:
with open(actor_config_path) as file:
    actor_config = yaml.load(file, Loader=yaml.FullLoader)
best_actor, best_actor_test_error, best_actor_config = \
    create_actor_func_from_hyper_param_search(actor_config, data)

FileNotFoundError: [Errno 2] No such file or directory: 'AntController/configs/actor_hyperparam_search_config_no_sensor_mult_step.yaml'

In [9]:
actor_config

{'mlp_search_config': {'permitted_activations': ['relu', 'sigmoid', 'lrelu'],
  'hidden_layer_size_exp_range': [5, 11],
  'hidden_layer_count_range': [1, 3],
  'output_size': 8,
  'scale': 1,
  'shift': 0},
 'loss': 'squared_loss',
 'rng_key': 42,
 'learning_rate_exp_range': [-7, -4],
 'decay': 0.999999,
 'input_shape': 12,
 'name': 'test_hyperparam',
 'batch_size': 256,
 'epochs': 1024,
 'model_count': 256,
 'test_split_proportion': 0.1,
 'discount': 0.98,
 'scale_config': {'input_scale': array([0.10625633, 0.10625599, 0.10625599, 0.10625633, 0.14888737,
         0.14887016, 0.14888737, 0.14887016, 0.19920768, 0.10317653,
         0.1176432 , 0.40977577, 0.10620595, 0.10620583, 0.10620583,
         0.10620595, 0.14944643, 0.14943832, 0.14944643, 0.14943832,
         0.20073376, 0.1061449 , 0.11821892, 0.41113931]),
  'input_shift': array([-4.48807042e-04, -5.24090804e-04,  5.24090804e-04,  4.48807042e-04,
          8.51440043e-01,  8.51602193e-01,  8.51440043e-01,  8.51602193e-01,
   

In [10]:
save_models(os.path.join(save_dir, "selected_actor_no_sensor_multi_step.p"), best_actor_config, best_actor)

NameError: name 'best_actor_config' is not defined

In [22]:
save_models(os.path.join(save_dir, "selected_critic_no_sensor_multi_step.p"), best_critic_config, best_critic)

In [25]:
get_old_state(states, is_non_terminal)

array([[-0.15      , -0.15      ,  0.15      , ...,  0.38680655,
         0.9939087 ,  0.11020683],
       [-0.15      , -0.15      ,  0.15      , ...,  0.38680655,
         0.9939087 ,  0.11020683],
       [-0.15      ,  0.15      , -0.15      , ...,  0.38806674,
         0.99639404,  0.0848466 ],
       ...,
       [ 0.075     , -0.075     ,  0.075     , ...,  0.42561141,
         0.97543067,  0.22030647],
       [ 0.15      , -0.15      ,  0.15      , ...,  0.42993438,
         0.97444797,  0.2246134 ],
       [ 0.15      , -0.15      ,  0.15      , ...,  0.43688837,
         0.97716719,  0.21247165]])

In [18]:
with open(critic_config_path) as file:
    critic_config = yaml.load(file, Loader=yaml.FullLoader)
best_critic, best_critic_test_error, best_critic_config = \
    create_value_func_from_hyper_param_search(critic_config, data)


Running random search on value critic...


Trained Model 63 with Loss 0.08863. Current Optimal Loss: 0.00838: 100%|██████████| 64/64 [07:56<00:00,  7.45s/it]


In [34]:
states.shape

(25902, 24)

In [None]:

selected_critic_no_sensor, selected_critic_config = get_model(os.path.join(save_dir, "selected_critic_no_sensor.p"))

In [17]:
pretrain_predictor_as_value_func(
        selected_critic_no_sensor, discount, data, epochs, batch_size, test_pivot
)

ValueError: Incompatible shapes for broadcasting: ((256, 24), (1, 12))