In [2]:
%load_ext autoreload
%autoreload 2
import os
os.chdir("..")

In [3]:
import torch
import pandas as pd
import d3rlpy
import numpy as np

import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# torch / pandas / d3rlpy; consider narrowing the filter when debugging.
warnings.filterwarnings("ignore")

from augrl.augmentations import exrp
from augrl.augmentations.spin_cartpole.cartpole import CartPoleEnv

In [4]:
# Environment instance handed to ExplicitRewardPredictor.from_env further down.
# NOTE(review): presumably game=False turns off the interactive/manual mode —
# confirm against CartPoleEnv; the trailing remark cannot be checked from here.
env = CartPoleEnv(game=False) # parameter update starts after 1K steps

## Train on manually collected data
Because of what the model is trying to approximate, it is hard to evaluate directly.

In [5]:
# Prefer the first CUDA device when one is available; otherwise use the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cpu')

In [6]:
# Build the reward predictor from the environment and place it on `device`.
# lr=7e-5 is presumably the optimizer learning rate — confirm in exrp.
predictor = exrp.ExplicitRewardPredictor.from_env(env, device, lr=7e-5)

In [12]:
# Load the hand-labelled rollouts. The columns used below are "segment" (an
# integer id), "preference" (1 = spin, 0 = no spin), "state" and "action".
# NOTE(review): read_pickle can execute arbitrary code stored in the file; only
# load pickles that come from a trusted source.
user_rollouts = pd.read_pickle("augrl/augmentations/spin_cartpole/preferences/handmade_results_150.pickle")

EVAL_FRACTION = 0.1  # share of segment ids held out for evaluation
SPLIT_SEED = 0       # fixed seed so the train/eval split is reproducible


def _column_tensor(frame, column):
    """Convert one column of per-row values from `frame` into a float32 tensor."""
    return torch.tensor(list(frame[column].values), dtype=torch.float32)


# Segment ids are assumed to run from 0 to max(); the count is kept under its
# own name so it cannot overwrite the `segments` dict built by get_segments.
n_segments = user_rollouts["segment"].max() + 1

# Sample the held-out ids WITHOUT replacement: the default (with replacement)
# can draw the same id several times and silently shrink the evaluation set.
split_rng = np.random.default_rng(SPLIT_SEED)
eval_segments = split_rng.choice(n_segments, size=int(EVAL_FRACTION * n_segments), replace=False)

# Split rows by segment id so rows of one segment never straddle both sets.
is_eval = user_rollouts["segment"].isin(eval_segments)
train_rollouts = user_rollouts[~is_eval]
eval_rollouts = user_rollouts[is_eval]

# Separate each split by the recorded preference label.
user_rollouts_spin = train_rollouts[train_rollouts["preference"] == 1]
user_rollouts_no_spin = train_rollouts[train_rollouts["preference"] == 0]
user_rollouts_spin_eval = eval_rollouts[eval_rollouts["preference"] == 1]
user_rollouts_no_spin_eval = eval_rollouts[eval_rollouts["preference"] == 0]

# The numbers reported here are row counts of the corresponding frames.
print("Training on {} segments ({} with spin, {} without)".format(len(train_rollouts), len(user_rollouts_spin), len(user_rollouts_no_spin)))
print("Evaluating on {} segments ({} with spin, {} without)".format(len(eval_rollouts), len(user_rollouts_spin_eval), len(user_rollouts_no_spin_eval)))

# Observation / action tensors for the training split.
obs_spin = _column_tensor(user_rollouts_spin, "state")
act_spin = _column_tensor(user_rollouts_spin, "action")
obs_no_spin = _column_tensor(user_rollouts_no_spin, "state")
act_no_spin = _column_tensor(user_rollouts_no_spin, "action")

# Observation / action tensors for the held-out split.
obs_spin_eval = _column_tensor(user_rollouts_spin_eval, "state")
act_spin_eval = _column_tensor(user_rollouts_spin_eval, "action")
obs_no_spin_eval = _column_tensor(user_rollouts_no_spin_eval, "state")
act_no_spin_eval = _column_tensor(user_rollouts_no_spin_eval, "action")

Training on 436 segments (228 with spin, 208 without)
Evaluating on 45 segments (20 with spin, 25 without)


### Collect segment tuples with preference

In [7]:
# Build the comparison data for the training split and for the held-out split
# from the spin / no-spin observation and action tensors.
segments = exrp.get_segments(
    obs_spin,
    act_spin,
    obs_no_spin,
    act_no_spin,
)
segments_eval = exrp.get_segments(
    obs_spin_eval,
    act_spin_eval,
    obs_no_spin_eval,
    act_no_spin_eval,
)

# Report how many entries each split holds under its "obs_left" key.
n_train_sequences = len(segments["obs_left"])
n_eval_sequences = len(segments_eval["obs_left"])
print("Training on {} actual sequences".format(n_train_sequences))
print("Evaluating on {} actual sequences".format(n_eval_sequences))

Training on 47632 actual sequences
Evaluating on 475 actual sequences


### Train

In [8]:
# Fit the predictor on the training comparisons: 8 epochs, batches of 32.
# The recorded run below took about 37 minutes (the device cell reported CPU).
# NOTE(review): `show_pregress` looks like a misspelling of "show_progress";
# the recorded run accepted this spelling, so it is left untouched — check
# the signature of ExplicitRewardPredictor.train before renaming it.
predictor.train(segments, show_pregress=True, epochs=8, batch_size=32)

Epoch  8 Loss 0.175: 100%|██████████| 8/8 [37:30<00:00, 281.33s/it]


### Evaluate

In [13]:
# Compare the predictor's choices with the recorded labels on the held-out
# comparisons; an entry counts as correct when the difference is exactly zero.
predicted_preferences = predictor.prefer(segments_eval)
recorded_preferences = segments_eval["preferences"]
n_correct = sum(predicted_preferences - recorded_preferences == 0)
acc = n_correct / len(recorded_preferences)
print("Predicting preferences with an accuracy of {:.2f}%".format(100 * acc))

Predicting preferences with an accuracy of 100.00%


In [7]:
# Load a saved predictor from disk; the path is relative to the working directory.
# NOTE(review): presumably this replaces whatever parameters `predictor`
# currently holds — confirm in exrp. The execution counters of this cell and
# the training cells overlap, so they appear to come from different sessions.
predictor.load("augrl/augmentations/spin_cartpole/predictors/predictor_150.pt")

## Generate exploratory data

In [8]:
# Rebind `env` to a new environment that receives the predictor as `reward_fn`.
# NOTE(review): presumably the environment then reports the predictor's
# output as its reward — confirm against CartPoleEnv.
env = CartPoleEnv(game=False, reward_fn=predictor)

In [None]:
# TODO: construct or load the agent whose `predict` method drives the rollout
# cell that follows. While this stays None, that cell cannot run.
agent = None

In [None]:
# Roll out a single rendered episode with `agent` acting in `env`.

# Upper bound on consecutive predictions outside the valid action set. Without
# it the loop never ends when the agent keeps returning the same invalid
# action, because the state is not advanced in that case.
MAX_INVALID_ACTIONS = 100

if agent is None:
    raise RuntimeError("`agent` is not set; create or load an agent before running the rollout.")

state = env.reset()
invalid_streak = 0
while True:
    # The agent is queried with a leading batch axis of size 1 and the first
    # (only) prediction is used.
    action = agent.predict(state.reshape(1, *state.shape))[0]
    if action not in (0, 1):
        # Only actions 0 and 1 are forwarded to the environment; anything else
        # is skipped and the agent is asked again, up to the bound above.
        invalid_streak += 1
        if invalid_streak >= MAX_INVALID_ACTIONS:
            raise ValueError(
                "agent returned {} consecutive actions outside (0, 1); last action: {!r}".format(
                    invalid_streak, action
                )
            )
        continue
    invalid_streak = 0
    state, reward, terminal, _ = env.step(action)
    env.render()
    if terminal:
        break