<a href="https://colab.research.google.com/github/eisbetterthanpi/pytorch/blob/main/bprabhakar_upsidedown_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random, torch
from torch import nn
from torch.utils.data import DataLoader as TorchDataLoader
from torch.distributions.categorical import Categorical
import numpy as np
from tqdm.notebook import tqdm
!pip install gym[box2d]
import gym
# Reference implementation (presumably the code this notebook follows; the later
# cells link to its model.py, replay_buffer.py and utils.py):
# https://github.com/bprabhakar/upside-down-reinforcement-learning
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# https://wandb.ai/quickstart/pytorch
!pip install wandb
import wandb
wandb.login()  # authenticate with Weights & Biases before starting a run
# Start the run that the wandb.log(...) calls in the training cells report to.
wandb.init(project="my-test-project", entity="bobdole")


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[34m[1mwandb[0m: Currently logged in as: [33mbobdole[0m. Use [1m`wandb login --relogin`[0m to force relogin


#### Behavior Net

In [None]:
# Behavior Net
from torch import nn
#https://github.com/bprabhakar/upside-down-reinforcement-learning/blob/master/model.py
class BehaviorNet(nn.Module):
    """Behavior function: maps an augmented state to log-probabilities over actions.

    The input is the environment state with two command values appended
    (command horizon and command reward). The output of the final LogSoftmax
    layer is a vector of log-probabilities, one per action.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        command_dim = 2  # (command_steps, command_reward)
        widths = [state_dim + command_dim, 32, 64, 32]

        # Hidden stack: Linear -> ReLU for each consecutive pair of widths.
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())

        # Output head: one log-probability per action.
        layers.append(nn.Linear(widths[-1], action_dim))
        layers.append(nn.LogSoftmax(dim=-1))

        # `device` is the notebook-level torch.device selected in the first cell.
        self.model = nn.Sequential(*layers).to(device)

    def forward(self, features):
        """Run the network on a batch of augmented states.

        Input:  features = [state command_steps command_reward] x batch_size
        Output: log-probabilities = [action_dim] x batch_size
        """
        return self.model(features)


#### Replay Buffer

In [None]:
# Replay Buffer
import heapq
import random
import numpy as np
# https://github.com/bprabhakar/upside-down-reinforcement-learning/blob/master/replay_buffer.py
class _RewardOrderedItem(tuple):
    """A ``(episode_reward, episode)`` pair that is ordered by reward only.

    With plain tuples, two entries with equal rewards fall back to comparing
    the episodes themselves; those hold numpy arrays, and that comparison
    raises ``ValueError``. ``heapq`` only uses ``<``, so overriding ``__lt__``
    is enough to keep ties away from the arrays.
    """
    __slots__ = ()

    def __lt__(self, other):
        return self[0] < other[0]


class ReplayBuffer(object):
    """Fixed-capacity store of whole episodes, prioritised by total episode reward.

    The buffer is a min-heap (managed with ``heapq``) of
    ``(episode_reward, episode)`` items, so once it is full the lowest-reward
    episode is the one that gets evicted. Unlike usual RL buffers, entire
    trajectories are stored together instead of individual transitions.
    """
    # def __init__(self, size, seed=0):
    def __init__(self, size, seed=None):
        """
        size : maximum number of episodes kept
        seed : seed for the random generator used by ``sample_episodes``
        """
        self.size = size
        self.buffer = []  # a regular list kept in heap order by the heapq functions
        self.rg = np.random.RandomState(seed)

    def __getitem__(self, key):
        return self.buffer[key]

    def __len__(self):
        return len(self.buffer)

    def add_episode(self, S, A, R, S_):
        """ all inputs are numpy arrays; num_rows = timesteps
        S  : states
        A  : actions
        R  : rewards
        S_ : next states

        Episodes that last a single step are ignored. When the buffer is
        full, the lowest-reward entry (possibly the new one) is dropped."""
        if S.shape[0] <= 1:  # ignore episodes that only last 1 step
            return
        item = _RewardOrderedItem((np.sum(R), (S, A, R, S_)))
        if len(self.buffer) < self.size:
            heapq.heappush(self.buffer, item)
        else:
            heapq.heappushpop(self.buffer, item)  # the evicted item is discarded

    def top_episodes(self, K):
        """ Returns the K episodes with the highest total rewards, best first.
        Output: [(state_arr, action_arr, reward_arr, next_state_arr), ... ]"""
        # A heap list is only partially ordered, so slicing its tail does not
        # yield the best entries; nlargest with a key selects them by reward
        # without ever comparing the episode arrays.
        best = heapq.nlargest(K, self.buffer, key=lambda item: item[0])
        return [item[1] for item in best]

    def sample_episodes(self, K):
        """ Returns K episodes drawn uniformly at random, with replacement.
        Output: [(state_arr, action_arr, reward_arr, next_state_arr), ... ]

        Raises IndexError if K > 0 and the buffer is empty."""
        if K <= 0:
            return []
        if not self.buffer:
            raise IndexError("cannot sample episodes from an empty replay buffer")
        # Draw through self.rg so that the `seed` given to __init__ takes effect.
        picks = self.rg.randint(len(self.buffer), size=K)
        return [self.buffer[i][1] for i in picks]


#### utils

In [None]:
# utils
import random
import numpy as np
import torch
from torch.utils.data import DataLoader as TorchDataLoader
from torch.utils.data import Dataset as TorchDataset

# https://github.com/bprabhakar/upside-down-reinforcement-learning/blob/master/utils.py
def augment_state(state, command, command_scale):
    """Return `state` with the scaled command appended.

    command       : (target horizon, target return)
    command_scale : (horizon scale, return scale), applied element-wise
    Output        : flat array [state..., horizon * horizon_scale, return * return_scale]
    """
    (horizon, ret), (h_scale, r_scale) = command, command_scale
    scaled_command = [horizon * h_scale, ret * r_scale]
    return np.append(state, scaled_command)

class BehaviorDataset(TorchDataset):
    """Samples behavior segments for supervised learning from the given episodes.

    Every item is drawn at random: an episode is picked, then a start step
    within it. The features are the state at that step augmented with the
    scaled command (steps remaining, reward from that step to the end of the
    episode), and the label is the action taken at that step.
    """
    def __init__(self, episodes, size, horizon_scale, return_scale):
        """
        episodes      : list of (S, A, R, S_) numpy-array tuples
        size          : nominal dataset length reported by __len__
        horizon_scale : multiplier applied to the command horizon
        return_scale  : multiplier applied to the command return
        """
        super().__init__()
        self.episodes = episodes
        self.horizon_scale = horizon_scale
        self.return_scale = return_scale
        self.size = size

    def __len__(self):
        return self.size  # nominal length; items are sampled, not indexed

    def __getitem__(self, idx):
        """Return {'features': float tensor, 'label': long tensor} for a random segment.

        `idx` is deliberately ignored because every item is sampled afresh.
        The previous tensor-to-int conversion of `idx` was dead code and
        raised TypeError for 0-d tensor indices, so it has been removed.
        """
        S, A, R, S_ = random.choice(self.episodes)  # randomly sample an episode
        # extract behavior segment
        episode_len = S.shape[0]
        start_index = np.random.choice(episode_len - 1)  # ensures cmd_steps >= 1
        command_horizon = episode_len - start_index - 1
        command_return = np.sum(R[start_index:])
        # construct sample
        features = augment_state(
            S[start_index, :],
            (command_horizon, command_return),
            (self.horizon_scale, self.return_scale),
        )
        label = A[start_index]  # action taken
        return {
            'features': torch.tensor(features, dtype=torch.float),
            'label': torch.tensor(label, dtype=torch.long),  # categorical val
        }


#### Training setup

#### Initialize replay buffer and warm-up using random policy

In [None]:

# general hyperparams
NUM_WARMUP_EPISODES = 10      # No of warm-up episodes at the beginning
REPLAY_SIZE = 300             # Max size of the replay buffer (in episodes)
RETURN_SCALE = 0.01           # Scaling factor for desired return input (reward)
HORIZON_SCALE = 0.01          # Scaling factor for desired horizon input (steps)
# training hyperparams
BATCH_SIZE = 512              # No of (input, target) pairs/batch for training the behavior function
NUM_UPDATES_PER_ITER  = 100   # No of gradient-based updates of the behavior function per step of UDRL training
LEARNING_RATE = 1e-3          # LR for ADAM optimizer
# generating episodes hyperparams
NUM_EPISODES_PER_ITER = 10    # No of exploratory episodes generated per step of UDRL training
LAST_FEW = 25                 # No of episodes requested from replay_buffer.top_episodes when sampling exploratory commands


# Initialize replay buffer and warm-up using random policy
training_step = 0
replay_buffer = ReplayBuffer(REPLAY_SIZE)   # init replay buffer
env = gym.make("LunarLander-v2")            # init gym env
for _ in tqdm(range(NUM_WARMUP_EPISODES)):
    episode = {'states': [], 'actions': [], 'rewards': [], 'next_states': [],}
    episode_reward = 0
    state = env.reset()
    done = False
    while not done:
        episode['states'].append(state)
        action = env.action_space.sample()   # uniformly random action
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode['actions'].append(action)
        episode['next_states'].append(state)
        if not done:
            episode['rewards'].append(0)         # because 'sparse' lunar lander
    episode['rewards'].append(episode_reward)    # finally add total episode reward
    # wandb.log({"episode_reward": episode_reward})
    # add episode data to the replay buffer
    # NOTE: the builtin float/int are used as dtypes; the np.float / np.int
    # aliases were deprecated in NumPy 1.20 and removed in NumPy 1.24.
    replay_buffer.add_episode(
        np.array(episode['states'], dtype=float),
        np.array(episode['actions'], dtype=int),
        np.array(episode['rewards'], dtype=float),
        np.array(episode['next_states'], dtype=float),
    )

  0%|          | 0/10 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [None]:

# Size the behavior network from the environment's observation and action spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
policy = BehaviorNet(state_dim, action_dim)
# BehaviorNet ends in LogSoftmax, so NLLLoss on its output is the matching
# classification loss over actions.
loss_func = nn.NLLLoss()
optimizer = torch.optim.Adam(policy.parameters(), lr=LEARNING_RATE)


#### Main learning loop

In [None]:
# Main learning loop

# Loop-level settings, named here instead of being inlined as magic numbers.
NUM_TRAINING_ITERS = 200       # No of UDRL iterations (train -> sample commands -> explore)
NUM_EPISODES_TO_TRAIN = 5      # No of episodes drawn from the replay buffer per iteration
MAX_COMMAND_HORIZON = 200      # Upper bound on the commanded horizon (steps)


def generate_command(tgt_horizon, tgt_reward_mean, tgt_reward_std):
    """Sample one exploratory command.

    Returns (horizon, reward): the horizon is `tgt_horizon` capped at
    MAX_COMMAND_HORIZON, and the reward is drawn uniformly from
    [tgt_reward_mean, tgt_reward_mean + tgt_reward_std) and rounded to a
    whole number.
    """
    tgt_horizon = min(tgt_horizon, MAX_COMMAND_HORIZON)
    tgt_reward = round(np.random.random_sample()*tgt_reward_std + tgt_reward_mean, 0)
    return tgt_horizon, tgt_reward


for _ in tqdm(range(NUM_TRAINING_ITERS)):
    # 1 - Train Policy Network by sampling behavior segments from buffer.
    episodes_to_train = replay_buffer.sample_episodes(NUM_EPISODES_TO_TRAIN)
    train_dset = BehaviorDataset(episodes_to_train,
                                size=BATCH_SIZE*NUM_UPDATES_PER_ITER,
                                horizon_scale=HORIZON_SCALE,
                                return_scale=RETURN_SCALE)
    training_behaviors = TorchDataLoader(train_dset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    policy.train()
    for behavior_batch in training_behaviors: # this runs for NUM_UPDATES_PER_ITER rounds
        policy.zero_grad()
        logprobs = policy(behavior_batch['features'].to(device))
        loss = loss_func(logprobs, behavior_batch['label'].to(device))
        loss.backward()
        optimizer.step()
        training_step += 1
        wandb.log({"batch_loss": loss.item()})  # log a plain float, not a tensor

    # 2 - Sample exploratory target commands for future exploration.
    top_episodes = replay_buffer.top_episodes(LAST_FEW) # [(S,A,R,S_), ... ]
    top_lengths = [ep_states.shape[0] for ep_states, _, _, _ in top_episodes]
    top_returns = [np.sum(ep_rewards) for _, _, ep_rewards, _ in top_episodes]  # computed once, used for mean and std
    tgt_horizon = int(np.mean(top_lengths))
    tgt_reward_mean = np.mean(top_returns)
    tgt_reward_std = np.std(top_returns)

    wandb.log({"tgt_reward_mean": tgt_reward_mean})

    # 3 - Generate new trajectories using latest policy network and generated commands
    # use the latest policy network & sampled commands to generate new trajectories & add them to the replay buffer
    for ep in range(NUM_EPISODES_PER_ITER):
        episode = {'states': [], 'actions': [], 'rewards': [], 'next_states': [],}
        episode_reward = 0
        # start interactions
        state = env.reset()
        done = False
        command_horizon, command_reward = generate_command(tgt_horizon, tgt_reward_mean, tgt_reward_std)
        wandb.log({"command_horizon": command_horizon})
        wandb.log({"command_reward": command_reward})
        while not done:
            episode['states'].append(state)
            state_ = augment_state(state,
                                command=(command_horizon, command_reward),
                                command_scale=(HORIZON_SCALE, RETURN_SCALE))
            state_ = torch.tensor(state_, dtype=torch.float)
            with torch.no_grad():
                action_logprobs = policy(state_.to(device))
                action_distribution = Categorical(logits=action_logprobs)
                action = action_distribution.sample().item()
            state, reward, done, info = env.step(action)

            episode['actions'].append(action)
            episode['next_states'].append(state)
            command_horizon = max(1, command_horizon-1)  # one step used; never command fewer than 1 step

            # Sparse lunar lander: every intermediate step is recorded with zero
            # reward and the whole episode return is emitted on the final step.
            # The command reward is reduced by whatever reward was recorded.
            episode_reward += reward
            step_reward = episode_reward if done else 0
            episode['rewards'].append(step_reward)
            command_reward -= step_reward

        wandb.log({"episode_reward": episode_reward})
        replay_buffer.add_episode(
            np.array(episode['states'], dtype=float),
            np.array(episode['actions'], dtype=int),
            np.array(episode['rewards'], dtype=float),
            np.array(episode['next_states'], dtype=float),
        )


  0%|          | 0/200 [00:00<?, ?it/s]

#### save

In [None]:
# Persist only the policy's parameters (its state_dict) to a file in the
# current working directory.
name = "model.pth"
torch.save(policy.state_dict(), name)


In [None]:

# Take one environment step with the trained policy under a high-return command.
max_reward = 200   # commanded return; also reused as the commanded horizon, as before

# Start from a fresh episode: the `state` left over from the training cell
# belongs to an episode that has already terminated.
state = env.reset()

# Scale the command exactly as during training. The policy was fitted on
# commands multiplied by HORIZON_SCALE / RETURN_SCALE, so appending the raw
# values would feed it inputs far outside what it was trained on.
state_ = augment_state(state,
                       command=(max_reward, max_reward),
                       command_scale=(HORIZON_SCALE, RETURN_SCALE))
state_ = torch.tensor(state_, dtype=torch.float)
with torch.no_grad():
    action_logprobs = policy(state_.to(device))
    action_distribution = Categorical(logits=action_logprobs)
    action = action_distribution.sample().item()
state, reward, done, info = env.step(action)
print(reward)
# episode_reward += reward


# episodes = 5
# for ep in range(episodes):
#     obs = env.reset()
#     done = False
#     while not done:
#         action, _states = policy(torch.from_numpy(obs).to(device))
#         obs, rewards, done, info = env.step(action)
#         # env.render()
#         print(rewards)

-100
