In [1]:
# Activate the project environment found one directory above the current
# working directory, so the `using` statements in the following cells resolve
# against that project's dependencies.
import Pkg
Pkg.activate("..")

[32m[1m  Activating[22m[39m project at `~/Projects/latentplan.jl`


In [2]:
using Test
using PyCall
using Knet
using Debugger: @enter, @bp, @run
using CUDA

# Array type used for model weights and data: a Knet GPU array when CUDA reports
# itself functional, otherwise a plain host array.
atype = CUDA.functional() ? KnetArray{Float32} : Array{Float32}

# Host array type, independent of whether a GPU is available.
cputype = Array{Float32}

Array{Float32}

In [43]:
# Load the project sources into this session. The include order is kept as
# written; later files may rely on definitions from earlier ones.
include("datasets/sequence.jl")
include("models/common.jl")
include("models/transformers.jl")
include("models/vqvae.jl")
include("setup.jl")

# Bind the Python `torch` module through PyCall.
@pyimport torch

# Checkpoint loaded with `torch.load`; later cells look up entries by string key
# (e.g. "model.embed.weight") and convert them with `[:cpu]()[:numpy]()`.
weights = torch.load("test/files/gpt_weights_hopper.pt")

# Bind the Python `numpy` module. NOTE(review): the visible cells never refer to
# the `numpy` binding by name - confirm whether this import is still needed.
@pyimport numpy



# Setup

In [51]:
# Overrides handed to the project's argument parser for a "train" experiment.
super_args = Dict{String, Any}(
    "dataset" => "hopper-medium-replay-v2",
    "exp_name" => "debug",
    "seed" => 42,
    "config" => "../config/vqvae.jl",
)
args = parser(super_args, experiment="train")

# Model configuration: an independent copy of `args` extended with fixed sizes.
# NOTE(review): 425 presumably equals sequence length 25 x transition_dim 17 -
# confirm before changing either value.
config = deepcopy(args)
for (key, val) in (
    "block_size" => 425,
    "observation_dim" => 11,
    "action_dim" => 3,
    "transition_dim" => 17,
    "n_embd" => args["n_embd"] * args["n_head"],
    "vocab_size" => args["N"],
)
    config[key] = val
end

vq_model = VQContinuousVAE(config);

# The padding vector is the normalized all-zero vector of length
# `transition_dim - 1`. `dataset` is constructed in the DataLoader section
# further down, so that cell has to be executed before this one.
vq_model.padding_vector = atype(
    normalize_joined_single(dataset, atype(zeros(vq_model.transition_dim-1)))
);

[ utils/setup ] Reading config: ../config/vqvae.jl:hopper_medium_replay_v2
/Users/mehmeteneserciyes/logs_julia/hopper-medium-replay-v2/debug/ already exists. Proceeding...
Made directory /Users/mehmeteneserciyes/logs_julia/hopper-medium-replay-v2/debug/


In [52]:
# Restore the codebook state of the global `vq_model` from the global `weights`
# checkpoint. `embedding` and `ema_w` are stored as the adjoint (') of the
# checkpoint arrays; `ema_count` is stored as is. Each field is re-wrapped in a
# `Param` of type `atype`. The value of the final assignment is returned.
function reset_codebook()
    codebook = vq_model.model.codebook
    # Checkpoint tensor under `key`, moved to the CPU and converted to numpy.
    from_checkpoint(key) = weights[key][:cpu]()[:numpy]()

    codebook.embedding = Param(atype(from_checkpoint("model.codebook.embedding")'))
    codebook.ema_count = Param(atype(from_checkpoint("model.codebook.ema_count")))
    codebook.ema_w = Param(atype(from_checkpoint("model.codebook.ema_w")'))
end

reset_codebook (generic function with 1 method)

# Load weights

In [37]:
# Copy every tensor of the PyTorch checkpoint `weights` into `vq_model`.
# The per-layer assignments are factored into small helpers so the encoder and
# decoder stacks share one code path instead of two hand-copied loops.

# Checkpoint tensor stored under `key`, moved to the CPU and converted to numpy.
pretrained_array(key) = weights[key][:cpu]()[:numpy]()

# Checkpoint tensor stored under `key`, wrapped as a `Param` of type `atype`.
pretrained_param(key) = Param(atype(pretrained_array(key)))

# Fill fields `w` and `b` of `layer` from `<prefix>.weight` and `<prefix>.bias`.
function load_linear!(layer, prefix)
    layer.w = pretrained_param("$(prefix).weight")
    layer.b = pretrained_param("$(prefix).bias")
    return layer
end

# Fill fields `a` and `b` of `layer` from `<prefix>.weight` and `<prefix>.bias`.
function load_layernorm!(layer, prefix)
    layer.a = pretrained_param("$(prefix).weight")
    layer.b = pretrained_param("$(prefix).bias")
    return layer
end

# Fill one transformer block (ln1, ln2, attention projections, MLP) from the
# checkpoint entries that start with `prefix`.
function load_transformer_block!(block, prefix)
    load_layernorm!(block.ln1, "$(prefix).ln1")
    load_layernorm!(block.ln2, "$(prefix).ln2")
    for name in ("key", "query", "value", "proj")
        load_linear!(getproperty(block.attn, Symbol(name)), "$(prefix).attn.$(name)")
    end
    # Julia `mlp.layers[1]` / `[3]` take the checkpoint entries `mlp.0` / `mlp.2`.
    load_linear!(block.mlp.layers[1], "$(prefix).mlp.0")
    load_linear!(block.mlp.layers[3], "$(prefix).mlp.2")
    return block
end

# Load encoder, decoder, output head and codebook of `vq` from the checkpoint.
# `n_layer` is the number of transformer blocks in each of the two stacks.
# Returns `vq`.
function load_pretrained!(vq, n_layer)
    net = vq.model

    # encoder
    load_linear!(net.embed, "model.embed")
    # The positional embedding is stored with its three axes reversed.
    net.pos_emb = Param(atype(permutedims(pretrained_array("model.pos_emb"), (3, 2, 1))))
    for i in 1:n_layer
        # Checkpoint layer indices are zero-based.
        load_transformer_block!(net.encoder.layers[i], "model.encoder.$(i-1)")
    end
    load_linear!(net.cast_embed, "model.cast_embed")

    # decoder
    load_linear!(net.latent_mixing, "model.latent_mixing")
    for i in 1:n_layer
        load_transformer_block!(net.decoder.layers[i], "model.decoder.$(i-1)")
    end
    load_layernorm!(net.ln_f, "model.ln_f")
    load_linear!(net.predict, "model.predict")

    # codebook: `embedding` and `ema_w` are stored as the adjoint of the
    # checkpoint arrays, `ema_count` is stored as is.
    net.codebook.embedding = Param(atype(pretrained_array("model.codebook.embedding")'))
    net.codebook.ema_count = pretrained_param("model.codebook.ema_count")
    net.codebook.ema_w = Param(atype(pretrained_array("model.codebook.ema_w")'))

    return vq
end

load_pretrained!(vq_model, config["n_layer"]);

# padding vector
vq_model.padding_vector = atype(normalize_joined_single(dataset, atype(zeros(vq_model.transition_dim-1))));

# DataLoader

In [46]:
# Environment name: the dataset name as given when it already contains "-v",
# otherwise the dataset name with "-v0" appended.
env_name = if occursin("-v", args["dataset"])
    args["dataset"]
else
    args["dataset"] * "-v0"
end

# env params
sequence_length = args["subsampled_sequence_length"] * args["step"]

# Expand a leading "~" in both log locations and make sure the save directory exists.
for key in ("logbase", "savepath")
    args[key] = expanduser(args[key])
end
isdir(args["savepath"]) || mkpath(args["savepath"])

println("Loading dataset..")

# Build the sequence dataset from the parsed experiment arguments; arrays are
# requested in the `atype` selected at the top of the notebook.
dataset = SequenceDataset(
    env_name;
    penalty=args["termination_penalty"],
    sequence_length=sequence_length,
    step=args["step"],
    discount=args["discount"],
    disable_goal=args["disable_goal"],
    normalize_raw=args["normalize"],
    normalize_reward=args["normalize_reward"],
    max_path_length=args["max_path_length"],
    atype=atype,
)

println("Setup done..")

Loading dataset..
[ datasets/sequence ] Sequence length: 25 | Step: 1 | Max path length: 1000
[ datasets/sequence ] Loading...
✓
[ datasets/sequence ] Segmenting...


load datafile: 100%|███████████████████████████| 11/11 [00:00<00:00, 13.95it/s]


✓


[32mCalculating values 100%|█████████████████████████████████| Time: 0:00:05[39m


Setup done..


In [53]:
# Batch iterator over `dataset` with shuffling switched off; the batch size
# comes from the parsed experiment arguments.
loader = DataLoader(
    dataset;
    batch_size=args["batch_size"],
    shuffle=false,
);

In [54]:
# Two optimizer templates that differ only in `weight_decay`.
opt_decay = AdamW(
    lr=args["learning_rate"],
    beta1=0.9,
    beta2=0.95,
    weight_decay=0.1,
    gclip=1.0,
)
opt_no_decay = AdamW(
    lr=args["learning_rate"],
    beta1=0.9,
    beta2=0.95,
    weight_decay=0.0,
    gclip=1.0,
)

# Each parameter gets its own clone of the template for its group.
foreach(paramlist_decay(vq_model)) do p
    p.opt = clone(opt_decay)
end
foreach(paramlist_no_decay(vq_model)) do p
    p.opt = clone(opt_no_decay)
end

In [55]:
# Replace the three codebook fields of `model` (`embedding`, `ema_count`,
# `ema_w`) with the result of `value(...)` applied to their current contents.
# Returns the value assigned to `ema_w`.
function zerograd_embedding(model::VQContinuousVAE)
    codebook = model.model.codebook
    codebook.embedding = value(codebook.embedding)
    codebook.ema_count = value(codebook.ema_count)
    unwrapped_ema_w = value(codebook.ema_w)
    codebook.ema_w = unwrapped_ema_w
    return unwrapped_ema_w
end
# Scalar training objective: the mean of the sum of entries 2, 3 and 4 of
# `prediction` (entry 1 is not used).
function losssum(prediction)
    combined = prediction[2] + prediction[3] + prediction[4]
    return mean(combined)
end

losssum (generic function with 1 method)

In [56]:
# Run training steps for batches number `start_it` through `end_it` of `loader`.
batch = nothing
start_it = 1
end_it = 20
for (iter, batch) in enumerate(loader)
    # Skip batches before the window and stop once past it.
    iter < start_it && continue
    iter > end_it && break

    # Record the forward pass so gradients can be queried per parameter.
    loss = @diff losssum(vq_model(batch...))
    println("Loss", iter, ": ", value(loss))

    foreach(paramlist(vq_model)) do p
        update!(p, grad(loss, p))
    end

    # Unwrap the codebook fields again after the parameter update.
    zerograd_embedding(vq_model)
    # Force a full garbage collection after every step.
    GC.gc(true)
end

Loss1: 22.369543
Loss2: 22.044884
Loss3: 22.045149
Loss4: 22.791351
Loss5: 22.968828
Loss6: 18.977854
Loss7: 14.574955
Loss8: 13.411827
Loss9: 13.778966
Loss10: 9.11558
Loss11: 7.1508503
Loss12: 6.0912333
Loss13: 8.363685
Loss14: 12.680482
Loss15: 14.417554
Loss16: 20.291061
Loss17: 18.609686
Loss18: 20.447655
Loss19: 17.55527
Loss20: 9.726214
