In [1]:
import argparse
import math
import random
from copy import deepcopy
from torch.distributions import normal

import numpy as np
import torch
import torch.optim as optim
from helpers import ReplayBuffer, make_atari, make_gym_env, wrap_deepmind, wrap_pytorch
from models import Deep_feature, CnnDQN

In [2]:
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    print("Using GPU: GPU requested and available.")
    dtype = torch.cuda.FloatTensor
    dtypelong = torch.cuda.LongTensor
else:
    print("NOT Using GPU: GPU not requested or not available.")
    dtype = torch.FloatTensor
    dtypelong = torch.LongTensor

NOT Using GPU: GPU not requested or not available.


In [3]:
feature_dimension = 512
learning_rate = 0.0025 
replay_buffer_size = 1000000
max_time_step = 5 * 10**6

sigma = 0.001
sigma_n = 1

start_train_ts = 10*5
batch_size = 32
gamma = 0.99

target_network_update_f = 100#10000

target_W_update = 1 #10
target_batch_size = 500 #5000

log_every = 2000

env_name = "PongNoFrameskip-v4"  # Set the desired environment
env = make_atari(env_name)
env = wrap_pytorch(wrap_deepmind(env, scale=True))
num_action = env.action_space.n

In [4]:
def compute_td_loss(batch_size, replay_buffer, optimizer):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)
    state = torch.tensor(np.float32(state)).type(dtype)
    next_state = torch.tensor(np.float32(next_state)).type(dtype)
    action = torch.tensor(action).type(dtypelong)
    reward = torch.tensor(reward).type(dtype)
    done = torch.tensor(done).type(dtype)

    _, argmax_Q = torch.max(torch.mm(deep_feature(next_state), W_mean.transpose(0, 1)),dim=1,keepdim=True)
    Q_target = torch.mm(deep_target_feature(next_state), W_target.transpose(0, 1))
    Q_target = torch.gather(Q_target, 1, argmax_Q).squeeze() * (1 - done)
    Q = torch.mm(deep_feature(state), W_mean.transpose(0, 1))
    Q = torch.gather(Q, 1, action.type(dtypelong).unsqueeze(1)).squeeze()
    target = (reward + gamma * Q_target).data
    loss = (Q - target).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

In [5]:
def BayesReg(phiphiT, phiY, target_batch_size):
    with torch.no_grad():
        chunk_size = 10#1000
        num_chunks = int(target_batch_size / chunk_size)

        for _ in range(num_chunks):
            state, action, reward, next_state, done = replay_buffer.sample(chunk_size)
            _, argmax_Q = torch.max(torch.mm(deep_feature(next_state), W_mean.transpose(0, 1)),dim=1,keepdim=True)
            Q_target = torch.mm(deep_target_feature(next_state), W_target.transpose(0, 1))
            Q_target = torch.gather(Q_target, 1, argmax_Q).squeeze() * (1 - done)
            target = (reward + gamma * Q_target).data
            
            feature_rep = deep_feature(state).unsqueeze(1).detach()
            for i in range(num_action):
                action_ = action == i # I am not sure it is a right way of doing it
                print(action_)
                feature_rep_of_action = torch.mm(feature_rep,action_)
                phiphiT[i] = torch.mm(feature_rep_of_action.transpose(0, 1),feature_rep_of_action)
                phiY[i] = torch.mm(feature_rep_of_action,target)

        for i in range(num_action):
            inv = np.linalg.inv(
                ((phiphiT[i] / sigma_n + 1 / sigma * eye).cpu()).numpy()
            )
            W[i] = torch.tensor(np.dot(inv, phiY[0].cpu().data) / sigma_n).type(
                dtype
            )
            Cov_W[i] = torch.tensor(sigma * inv).type(dtype)
        return phiphiT, phiY, W_mean, Cov_W

In [6]:
def Sample_W(W_mean, Cov_W_decom):
    dist = normal.Normal(loc=0, scale=1)
    for i in range(num_action):
        sam = dist.sample((feature_dimension, 1)).type(dtype)
        W[i] = W_mean[i] + torch.mm(Cov_W_decom[i], sam)[:, 0]
    return W

In [7]:
deep_feature = Deep_feature(env.observation_space.shape,feature_dimension, env.action_space.n)
deep_target_feature = deepcopy(deep_feature)

optimizer = optim.RMSprop(deep_feature.parameters(), lr=learning_rate)
replay_buffer = ReplayBuffer(replay_buffer_size)

In [8]:
eye = torch.eye(feature_dimension).type(dtype)
dist = normal.Normal(loc=0, scale=0.01)
W = dist.sample((num_action, feature_dimension)).type(dtype)
W_target = dist.sample((num_action, feature_dimension)).type(dtype)
W_mean = dist.sample((num_action, feature_dimension)).type(dtype)
Cov_W = torch.eye(feature_dimension).repeat(num_action, 1, 1).type(dtype)
Cov_W_decom = Cov_W
Cov_W_target = Cov_W
phiphiT = torch.zeros((num_action, feature_dimension, feature_dimension)).type(dtype)
phiY = torch.zeros((num_action, feature_dimension)).type(dtype)

In [9]:
losses, all_rewards = [], []
state = env.reset()
c_t = 0
episode_reward = 0
replay_memory = ReplayBuffer(replay_buffer_size)

for ts in range(1, max_time_step + 1):
    torch_state = torch.tensor(np.float32(state)).type(dtype).unsqueeze(0)
    action = torch.mm(W, deep_feature(torch_state).transpose(0, 1)).squeeze()
    action = torch.argmax(action)
    
    next_state, reward, done, _ = env.step(int(action.cpu()))
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        W = Sample_W(W_mean, Cov_W_decom)

    if ( len(replay_buffer) > start_train_ts) and (ts % target_network_update_f == 0):
        loss = compute_td_loss(batch_size, replay_buffer, optimizer)
        losses.append(loss.data)
    
    if ts % target_network_update_f == 0:
        for t_param, param in zip(deep_target_feature.parameters(), deep_feature.parameters()):
            new_param = param.data
            t_param.data.copy_(new_param)
        c_t += 1
        print(c_t == target_W_update)
        if c_t == target_W_update:
            c_t = 0 
            phiphiT, phiY, W_mean, Cov_W = BayesReg(phiphiT, phiY, target_batch_size)
            W_target = W_mean
            Cov_W_target = Cov_W

            for ii in range(num_action):
                Cov_W_decom[ii] = torch.tensor(
                    np.linalg.cholesky(
                        (((Cov_W[ii] + Cov_W[ii].transpose(0, 1))) / 2.0).cpu()
                    )
                ).type(dtype) # in pytorch has stable cholesky decomposinong, it is better to use it, mxnet did not have
        
        if len(replay_memory) < 100000:
            target_batch_size = len(replay_memory)
        else:
            target_batch_size = 100000
            

    if ts % log_every == 0:
        out_str = "Timestep {}".format(ts)
        if len(all_rewards) > 0:
            out_str += ", Reward: {}".format(all_rewards[-1])
        if len(losses) > 0:
            out_str += ", TD Loss: {}".format(losses[-1])
        print(out_str)


True


TypeError: conv2d(): argument 'input' (position 1) must be Tensor, not numpy.ndarray