

*   multiple models that get good accuracy
*   but we need the model with the least weights
*   get linearly dependent data by transforming. fetch y by some model
*   train some model directly and look at the data gradients (average them)
*   train model with saliency and look at the data gradients (average them)

See there is a difference in grads but not the converged loss







In [1]:
import warnings
warnings.filterwarnings("ignore")

import gym
from itertools import chain, combinations
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.autograd import grad, Variable

In [2]:
import sys
from common import SubprocVecEnv

num_envs = 16
env_name = "CartPole-v1"

def make_env():
    def _thunk():
        env = gym.make(env_name, new_step_api=False)
        return env

    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name, new_step_api=False)

  deprecation(
  deprecation(
  deprecation(
  deprecation(


In [3]:
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()
        print(num_inputs, num_outputs, hidden_size)

        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
            nn.Softmax(dim=1),
        )
        
    def forward(self, x):
        value = self.critic(x)
        probs = self.actor(x)
        dist  = Categorical(probs)
        return dist, value, probs

In [4]:
class Hparams:
    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n
    hidden_size = 64
    lr = 1e-3
    num_steps = 5
    max_frames = 15000
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(Hparams.device)

cpu


In [5]:
def test_env(model, transform=None):
    if transform is None:
        state = env.reset()
    else:
        state = transform(env.reset())

    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(Hparams.device)
        dist, _, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        if transform is None:
            state = next_state
        else:
            state = transform(next_state)
        total_reward += reward

    return total_reward

In [6]:
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns

In [7]:
class LossObject:
    def __init__(self):
        self.reset()
        self.frame_idx = 0

    def reset(self):
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.masks = []
        self.grad_reg = []
        self.entropy = 0
            
    def update(self, log_prob, value, reward, done, grad_reg_entropy, entropy):
        self.log_probs.append(log_prob)
        self.values.append(value)
        self.rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(Hparams.device))
        self.masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(Hparams.device))
        self.grad_reg.append(grad_reg_entropy)
        self.entropy += entropy
        self.frame_idx += 1

    def compute_loss(self, next_value, gamma=0.99):
        R = next_value
        returns = []
        for step in reversed(range(len(self.rewards))):
            R = self.rewards[step] + gamma * R * self.masks[step]
            returns.insert(0, R)
        
        grad_reg = torch.cat(self.grad_reg)
        log_probs = torch.cat(self.log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(self.values)

        advantage = returns - values
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * self.entropy - 3e-3 * grad_reg.mean()
        return loss

In [8]:
def model_env_forward(model, state, envs):
    state = torch.FloatTensor(state).to(Hparams.device)
    state.requires_grad = True
    dist, value, probs = model(state)

    action = dist.sample()
    entropy = dist.entropy().mean()
    next_state, reward, done, _ = envs.step(action.cpu().numpy())

    log_prob = dist.log_prob(action)

    saliency = grad(probs[:,0].sum(), state, retain_graph=True)[0]
    saliency = torch.softmax(saliency**2, axis=1)
    grad_reg_entropy = - (saliency * torch.log(saliency)).sum(axis=1).mean(axis=0, keepdim=True)

    return next_state, log_prob, value, reward, done, grad_reg_entropy, entropy

def train_model():
    transform = None

    # init model and optimizer
    model = ActorCritic(
        num_inputs=Hparams.num_inputs,
        num_outputs=Hparams.num_outputs,
        hidden_size=Hparams.hidden_size,
    ).to(Hparams.device)

    optimizer = optim.Adam(lr=Hparams.lr, params=model.parameters())

    # init env
    max_avg_reward = 0
    state = envs.reset()
    loss_obj = LossObject()
    
    while loss_obj.frame_idx < Hparams.max_frames:
        loss_obj.reset()

        for _ in range(Hparams.num_steps):
            state, *update_args = model_env_forward(model, state, envs)
            loss_obj.update(*update_args)

            if loss_obj.frame_idx % 500 == 0:
                # average reward over 500 episodes
                avg_reward = np.mean([test_env(model) for _ in range(500)])
                max_avg_reward = max(max_avg_reward, avg_reward)
                print(f"Frame={loss_obj.frame_idx} => avg_reward={avg_reward:.4f}")

        state = torch.FloatTensor(state).to(Hparams.device)
        next_value= model(state)[1]

        loss = loss_obj.compute_loss(next_value)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_reward = np.mean([test_env(model) for _ in range(500)])
    max_avg_reward = max(max_avg_reward, avg_reward)
    return max_avg_reward

episode wise loop and avg across training loop

where regularization
how does training look like
concretely formalize idea

saliency?
algorithm? ac or dqn

log rewards
loss
entropy


best model we track
reward
loss_without_entropy
entropy
grads of states in test rollouts (average across states)
perturb useless input then train model using both methods (with and without entropy) then see model does better with entropy

understand scale of loss and entropy
use lightning and tensorboard to log them
start with high weightage

In [9]:
avg_reward = train_model()

4 2 64
Frame=500 => avg_reward=22.1640
Frame=1000 => avg_reward=23.0640
Frame=1500 => avg_reward=16.1780
Frame=2000 => avg_reward=16.3280
Frame=2500 => avg_reward=18.6080
Frame=3000 => avg_reward=23.3320
Frame=3500 => avg_reward=31.4580
Frame=4000 => avg_reward=35.8300
Frame=4500 => avg_reward=41.2000
Frame=5000 => avg_reward=43.6640
Frame=5500 => avg_reward=52.5380
Frame=6000 => avg_reward=59.4160
Frame=6500 => avg_reward=62.5980
Frame=7000 => avg_reward=65.6500
Frame=7500 => avg_reward=77.9020
Frame=8000 => avg_reward=72.6340
Frame=8500 => avg_reward=77.1120
Frame=9000 => avg_reward=103.5940
Frame=9500 => avg_reward=151.3460
Frame=10000 => avg_reward=180.2940
Frame=10500 => avg_reward=199.5440
Frame=11000 => avg_reward=255.5420
Frame=11500 => avg_reward=253.1040
Frame=12000 => avg_reward=291.6420
Frame=12500 => avg_reward=274.2760
Frame=13000 => avg_reward=295.8280
Frame=13500 => avg_reward=168.6960
Frame=14000 => avg_reward=110.0720
Frame=14500 => avg_reward=157.4440
Frame=15000 => 

In [10]:
avg_reward

295.828