In [1]:
import numpy as np

In [2]:
# Sum of p*log(p) for a uniform distribution over 5 outcomes (p = 0.2 each),
# i.e. the negative of its entropy: 5 * 0.2 * log(0.2) = log(0.2) = -log(5).
5 * 0.2 * np.log([0.2])

array([-1.60943791])

In [3]:
# Same quantity for a uniform choice between 2 outcomes: log(0.5) = -log(2).
# Its magnitude (~0.693) is the largest entropy a two-action policy can have.
2 * 0.5 * np.log([0.5])

array([-0.69314718])

In [1]:
import numpy as np
import torch
import cv2
from dm_control import suite

import lib_duju.utils as duju_utils

from Model.ReplayBuffer import ReplayBuffer
from Model.FrameBuffer import FrameBuffer
from Model.SAC_base import target_initialize

from Model.DiscreteConv_SAC import DiscreteConvSAC
from Model.DiscreteConv_SAC import train_discrete_Conv_SAC_max

# ---- Experiment configuration ----------------------------------------------
# Everything a reader might tune lives in this cell; later cells only read it.
exp_title = "Conv_Discrete_SAC_black_and_white_32_q1_only_skip2"
print(exp_title)

env = suite.load(domain_name="cartpole",task_name="swingup")

# Discrete action index -> scalar control value handed to env.step().
action_dict = { 0 : -0.5,
               1 : 0.5 }

# Derived from the table above so the two can never disagree.
action_dim = len(action_dict)

# state related variables
step_size = 3       # number of frames stacked into one state
channel_size = 1    # channels per stacked frame
height = 64
width = 96
skip_frame = 2      # environment steps taken per agent decision

input_channel_size = step_size * channel_size

reward_compensate = 10 # inverse alpha

lr = 3e-4
gamma = 0.99
# Fall back to the CPU instead of failing on machines without a usable GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_episode = 10000
batch_size = 32
# Integer capacity: 1e5 is a float, which is not a valid element count.
buffer_size = int(1e5)

replay_buffer = ReplayBuffer(buffer_size)
frame_buffer = FrameBuffer(step_size, channel_size, height, width)

q_main = DiscreteConvSAC(step_size, channel_size, height, width, action_dim, lr, device)
q_target = DiscreteConvSAC(step_size, channel_size, height, width, action_dim, lr, device)

# Bring q_target in line with q_main before training starts
# (presumably a weight copy; see Model.SAC_base.target_initialize).
target_initialize(q_main, q_target)

Conv_Discrete_SAC_black_and_white_32_q1_only_skip2
input channel size :  3
fc input size :  384
input channel size :  3
fc input size :  384


In [2]:

def _render_frame():
    """Render the current physics state at the (height, width) the networks expect."""
    return env.physics.render(camera_id=0, height=height, width=width)


def _begin_episode():
    """Reset the environment and fill the frame buffer with the first frame.

    Returns:
        (state, end): the stacked-frame state from frame_buffer.get_buffer()
        and whether the reset timestep already reports the end of the episode.
    """
    # timestep fields: step_type, reward, discount, observation
    step_type, _, _, _ = env.reset()
    first_frame = _render_frame()
    for _ in range(step_size):
        frame_buffer.dm_add(first_frame)
    return frame_buffer.get_buffer(), step_type.last()


def _to_network_input(state):
    """Wrap a stacked-frame state as a batch-of-one float tensor on `device`."""
    return torch.FloatTensor(state).to(device).view(1, input_channel_size, height, width)


def _step_with_skip(a_deploy):
    """Repeat `a_deploy` for up to `skip_frame` environment steps.

    The repeat stops as soon as the episode ends: stepping a dm_control
    environment after its last timestep silently starts a new episode (with
    reward None), which would hide the terminal flag from the caller.

    Returns:
        (next_state, last_reward, reward_sum, end) where `last_reward` is the
        reward of the final step taken and `reward_sum` the total over all
        steps taken in this call.
    """
    reward_sum = 0.0
    for _ in range(skip_frame):
        step_type, r, _, _ = env.step(a_deploy)
        reward_sum += r
        if step_type.last():
            break
    frame_buffer.dm_add(_render_frame())
    return frame_buffer.get_buffer(), r, reward_sum, step_type.last()


# Gradient updates per episode (1000 divided by the frame skip, as before).
updates_per_episode = 1000 // skip_frame

for epi_i in range(1, max_episode + 1):
    print(epi_i, end = "\t")

    #### Collect one training episode with the stochastic policy ####

    s, end = _begin_episode()
    ep_reward = 0.0

    while not end:
        a_category = q_main.get_stochastic_action(_to_network_input(s))
        a_deploy = action_dict[a_category]

        s2, r, skipped_reward, end = _step_with_skip(a_deploy)

        # NOTE(review): the stored reward is still only the last skipped
        # step's reward (scaled), exactly as before; rewards from the
        # intermediate skipped steps are not part of the training signal.
        replay_buffer.add(s, np.array([a_category]), np.array([r * reward_compensate]),np.array([end]), s2)

        s = s2
        # Report the true return: sum of every reward actually received,
        # rather than approximating it as r * skip_frame.
        ep_reward += skipped_reward

    #### Train ####

    for _idx in range(updates_per_episode):
        max_q1, max_q2, max_entropy = train_discrete_Conv_SAC_max(q_main, q_target, replay_buffer, batch_size, gamma)

    print(ep_reward, "***", (float(max_q1), float(max_q2), float(max_entropy)))

    #### Eval ####

    # Only touch the environment when an evaluation is actually due.
    if (epi_i % 5) == 0 :
        s, end = _begin_episode()
        eval_ep_reward = 0.0
        eval_action = []

        while not end:
            a_category = q_main.get_max_action(_to_network_input(s))
            a_deploy = action_dict[a_category]
            eval_action.append(a_deploy)

            s, _, skipped_reward, end = _step_with_skip(a_deploy)
            eval_ep_reward += skipped_reward

        print("Eval! *** ", eval_ep_reward)

    if (epi_i % 10) == 0:
        print("Networks Saved!")
        duju_utils.torch_network_save(q_main,"../trained/"+exp_title+"_q_main_"+str(epi_i)+".torch")
        duju_utils.torch_network_save(q_target, "../trained/"+exp_title+"_q_target_"+str(epi_i)+".torch")

1	75.20731548963907 *** (5.032853603363037, 5.380969047546387, 0.6931347846984863)
2	14.890053873749038 *** (10.33411693572998, 10.352701187133789, 0.693071186542511)
3	46.69274459163575 *** (13.895553588867188, 12.641948699951172, 0.6931470632553101)
4	13.227351191526003 *** (25.65631103515625, 22.86658477783203, 0.6929023265838623)
5	17.574116317499463 *** (22.372453689575195, 22.603254318237305, 0.6931460499763489)
Eval! ***  133.18635995196803
6	66.19552322040512 *** (35.738037109375, 36.36039352416992, 0.693117082118988)
7	7.692160348961531 *** (42.88447570800781, 44.9186897277832, 0.6931328773498535)
8	22.981758190979015 *** (39.89846420288086, 38.559234619140625, 0.693135678768158)
9	18.787651718502612 *** (54.22481155395508, 55.16899108886719, 0.6931429505348206)
10	55.217475728638235 *** (48.12155532836914, 55.574424743652344, 0.6931464076042175)


KeyboardInterrupt: 

In [None]:
# Draw 200 stored transitions for manual inspection.
# The unpacking order mirrors replay_buffer.add(s, a, r, end, s2) in the
# training loop -- presumably states, action indices, rewards (already scaled
# by reward_compensate), terminal flags, next states; verify against
# ReplayBuffer.sample_batch.
ss, aas, rs, ts, s2s = replay_buffer.sample_batch(200)

In [None]:
ss.shape

In [None]:
aas

In [None]:
np.mean(rs)

In [None]:
ts

In [None]:
s2s.shape

In [None]:
env.reset()

In [None]:
import matplotlib.pyplot as plt

In [None]:
env.reset()

In [None]:
# Load the episode-230 q_main checkpoint into q_main (file name follows the
# exp_title + "_q_main_" + episode pattern used when saving).
# NOTE(review): the training loop saves under "../trained/" but this reads
# from "trained/" -- confirm which working directory is intended.
duju_utils.torch_network_load(q_main,"trained/Conv_Discrete_SAC_black_and_white_32_q1_only_skip2_q_main_230.torch")

In [None]:
import torch.nn.functional as F

In [None]:
# Discrete action index -> control value: index 0 maps to -0.5, index 1 to +0.5.
action_dict = dict(enumerate((-0.5, 0.5)))

In [None]:
# NOTE(review): aa4 is only assigned inside the manual forward-pass rollout
# cell further down, so on a fresh kernel this cell raises NameError unless
# that cell has been run first.
aa4

In [None]:
from torch.distributions.categorical import Categorical

In [None]:
# Softmax over a pair of nearly equal Q-values, to see how close to uniform
# the resulting action probabilities are.
q_pair = torch.FloatTensor(np.array([182.1922, 181.9607]))
F.softmax(q_pair, dim=0)

In [None]:
# Entropy of the categorical distribution given by the softmax of the same
# pair of Q-values.
pair_probs = F.softmax(torch.FloatTensor(np.array([182.1922, 181.9607])), dim=0)
Categorical(probs=pair_probs).entropy()

In [None]:
# Greedy rollout that evaluates q_main's Q1 head layer by layer (instead of
# calling the model's own action method) so every intermediate activation
# stays available for inspection afterwards.
# NOTE(review): this steps the environment once per decision, whereas the
# training loop repeats each action skip_frame times -- confirm the mismatch
# is intended.
env.reset()

# Prime the frame buffer with one frame per stacked slot.
for _ in range(step_size):
    t = env.physics.render(camera_id=0, height = height, width = width)
    frame_buffer.dm_add(t)
    k = frame_buffer.get_buffer()

rc = 0.0
# Inference only: no_grad avoids building an autograd graph on every step.
with torch.no_grad():
    for _ in range(1000):
        t = env.physics.render(camera_id=0, height = height, width = width)
        frame_buffer.dm_add(t)
        k = frame_buffer.get_buffer()

        # Batch of one; channel count comes from the config rather than a literal 3.
        input_ = torch.FloatTensor(k.reshape([1, input_channel_size, height, width])).to(device)

        a1 = q_main.q1_conv1(input_)
        aa1 = F.relu(a1)

        a2 = q_main.q1_conv2(aa1)
        aa2 = F.relu(a2)

        a3 = q_main.q1_conv3(aa2)
        aa3 = F.relu(a3)

        a4 = q_main.q1_conv4(aa3)
        aa4 = F.relu(a4)

        # Flatten the conv features for the fully connected layers.
        fc1 = aa4.view(1,-1)

        f1 = q_main.q1_fc1(fc1)
        ff1 = F.relu(f1)

        f2 = q_main.q1_fc2(ff1)
        ff2 = F.relu(f2)

        # Final layer output is used as-is (no activation): one value per action.
        f3 = q_main.q1_fc3(ff2)
        ff3 = f3

        action = int(torch.argmax(ff3))
        action = action_dict[action]
        step_type, r, _, _ = env.step(action)
        rc +=r

        print(ff3, action)

        # Stepping past the terminal timestep would start a new episode and
        # return a reward of None, so stop here.
        if step_type.last():
            break
print(rc)

In [None]:

# Single-step version of the manual Q1 forward pass, for poking at the
# intermediate activations of one frame stack `k`.
# The input shape and layer sequence mirror the rollout cell above
# (input_channel_size x height x width, four conv layers, three fc layers);
# the previous hard-coded [1, 12, 48, 64] reshape and 3-conv/2-fc path no
# longer matched the configured input.
input_ = torch.FloatTensor(k.reshape([1, input_channel_size, height, width])).to(device)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    a1 = q_main.q1_conv1(input_)
    aa1 = F.relu(a1)

    a2 = q_main.q1_conv2(aa1)
    aa2 = F.relu(a2)

    a3 = q_main.q1_conv3(aa2)
    aa3 = F.relu(a3)

    a4 = q_main.q1_conv4(aa3)
    aa4 = F.relu(a4)

    # Flatten the conv features for the fully connected layers.
    fc1 = aa4.view(1,-1)

    f1 = q_main.q1_fc1(fc1)
    ff1 = F.relu(f1)

    f2 = q_main.q1_fc2(ff1)
    ff2 = F.relu(f2)

    # Final layer output is used as-is: one value per action.
    f3 = q_main.q1_fc3(ff2)
    ff3 = f3

action = int(torch.argmax(ff3))
action = action_dict[action]
env.step(action)

In [None]:
action

In [None]:
# A single all-zero RGB pixel, shape (1, 1, 3), to check how imshow draws it.
test_image = np.zeros((1, 1, 3), dtype=int)

plt.imshow(test_image)

In [None]:
# NOTE(review): kkk is never assigned anywhere in the visible notebook, and
# this cell rescales it in place, so every re-run divides by 256 again.
# Presumably kkk was an RGB frame left over from an earlier session -- confirm.
kkk = kkk / 256

In [None]:
# Weighted sum of the three channels on the last axis, giving a 2-D image.
# The weights look like the commonly used Rec. 601 luma coefficients, which
# assumes the channel order is R, G, B -- TODO confirm.
kkkk = kkk[:,:,0] * 0.2989 + kkk[:,:,1] * 0.5870 + kkk[:,:,2] * 0.1140

In [None]:
env = suite.load(domain_name="cartpole",task_name="swingup")

In [None]:
frame = env.physics.render(camera_id=0, height = 32, width = 32)

In [None]:
frame.shape

In [None]:
# Scale the rendered [height, width, channel] frame by 1/256, collapse the
# three channels into one weighted channel (kept as a length-1 last axis),
# then move that channel axis to the front.
frame = frame / 256.0
red, green, blue = frame[:, :, 0:1], frame[:, :, 1:2], frame[:, :, 2:3]
frame = red * 0.2989 + green * 0.5870 + blue * 0.1140
frame = np.transpose(frame, (2, 0, 1))

In [None]:
frame.shape

In [None]:
plt.imshow(frame[0], cmap=plt.get_cmap('gray'), vmin=0,vmax=1)

In [None]:
plt

In [None]:
# Random 64 x 64 x 3 array used to experiment with axis reordering below.
t = np.random.standard_normal((64, 64, 3))

In [None]:
t.shape

In [None]:
# Move the last axis to the front: (64, 64, 3) -> (3, 64, 64).
tt = np.transpose(t, (2, 0, 1))

In [None]:
tt.shape

In [None]:
# Inverse permutation: move the first axis back to the end, undoing the
# reordering applied to produce tt.
ttt = np.transpose(tt, (1, 2, 0))
ttt.shape

In [None]:
plt.imshow(env.physics.render(camera_id=0, height=64, width=96))

In [None]:
cv2.resize(ttt,(128,128))

In [None]:
1.07 ** 10