In [1]:
from imitation_workshop.iqlearn import IQLearn
from imitation_gym_wrappers.recorder_wrapper import RecorderWrapper
import imitation_workshop.envs
import gymnasium as gym

def regularizer(x):
  return x**2/40

device = 'cuda'
env = gym.make("InternalStateCarRacing-v0")
iqlearn = IQLearn(env, regularizer=regularizer, sac_args={'device': device})


In [2]:
recording_name = 'recordings/recording'

recorder = RecorderWrapper(env, 10000)
recorder.load_buffer(recording_name)
iqlearn.set_demonstration_buffer(recorder.get_sb3_buffer())

In [None]:
iqlearn.learn(5000)

In [None]:
import pickle

agent_name = 'carracing'
with open(f'agents/{agent_name}_{iqlearn.global_step}.agent', 'wb') as f:
  pickle.dump(iqlearn, f)

In [3]:
import pickle
with open('agents/carracing_45693.agent', 'rb') as f:
    iqlearn = pickle.load(f)
iqlearn.actor.to(device)

Actor(
  (fc1): Linear(in_features=10, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=32, bias=True)
  (fc_mean): Linear(in_features=32, out_features=2, bias=True)
  (fc_logstd): Linear(in_features=32, out_features=2, bias=True)
)

In [18]:
from stable_baselines3 import SAC
import torch
import time

class KLShaping(gym.Wrapper):
    def __init__(self, env, iqlearn, mu=1):
        super().__init__(env)
        self.iqlearn = iqlearn
        self.model = None
        self.mu = mu

    def set_model(self, sac):
        self.model = sac

    def step(self, action):
        obs, reward, terminated, truncated, info = super().step(action)

        torch_obs = torch.Tensor(obs).unsqueeze(0).to(device)
        
        iqlearn_mean, iqlearn_log_std = self.iqlearn.actor(torch_obs)
        iqlearn_std = iqlearn_log_std.exp()

        model_mean, model_log_std = self.model.actor(torch_obs)
        model_std = model_log_std.exp()

        # from https://mr-easy.github.io/2020-04-16-kl-divergence-between-2-gaussian-distributions, but summing as two univariate
        kl_divergence = torch.log(iqlearn_std/model_std) - 1 + (model_mean-iqlearn_mean)**2/iqlearn_std + model_std/iqlearn_std
        kl_divergence = 0.5*kl_divergence.detach().cpu().numpy().sum()

        self.model.writer.add_scalar("train/kl_divergence", 
                                     kl_divergence,               
                                     self.model.n_updates)
        self.model.writer.add_scalar("train/reward", 
                                     reward,               
                                     self.model.n_updates)
        
        reward -= self.mu * kl_divergence
        
        return obs, reward, terminated, truncated, info
        

shaping_env = KLShaping(env, iqlearn, 0.0)
sac = IQLearn(shaping_env, sac_args={'use_targets': True, 'buffer_size': 10000, 'tau': 0.0005, 'device': device})
sac.actor.load_state_dict(iqlearn.actor.state_dict())
sac.actor_optimizer = torch.optim.Adam(
    list(sac.actor.parameters()), lr=0.0
)
sac.a_optimizer = torch.optim.Adam([sac.log_alpha], lr=0.0)


shaping_env.set_model(sac)

In [27]:
sac.sac_learn(10000)

Output()

In [21]:
sac.recreate_optimizers()

Output()

In [26]:
import pickle

agent_name = 'sac'
with open(f'agents/{agent_name}_{sac.n_updates}.agent', 'wb') as f:
  pickle.dump(sac, f)

In [9]:
model.n_updates

5000