<a href="https://colab.research.google.com/github/gunil0817/code_archive/blob/master/poolcart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured the same way on every visible GPU,
        # so apply it to all of them rather than only the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set at program start, before the GPUs are initialized.
        print(e)

def _build_value_net():
    """Build one 4 -> 5 -> 10 -> 1 dense network (ReLU hidden layers, linear output)."""
    return keras.Sequential(
        [
            keras.Input(shape=(4,)),
            layers.Dense(5, activation="relu"),
            layers.Dense(10, activation="relu"),
            layers.Dense(1, name='output'),
        ]
    )


def _compile_value_net(net):
    """Compile a network with plain SGD (lr=0.001, no momentum) and an MSE loss/metric."""
    net.compile(
        optimizer=keras.optimizers.SGD(learning_rate=0.001, momentum=0),
        loss=keras.losses.MeanSquaredError(),
        metrics=[keras.metrics.MeanSquaredError()],
    )


# One scalar-output network per action: model0 is used for action 0, model1 for action 1.
# Both networks are built first and compiled afterwards.
model0 = _build_value_net()
model1 = _build_value_net()
#, kernel_initializer=keras.initializers.RandomNormal(mean=0.5, stddev=0.05, seed=None)

_compile_value_net(model0)
_compile_value_net(model1)


In [None]:
import gym
import time
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured the same way on every visible GPU,
        # so apply it to all of them rather than only the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set at program start, before the GPUs are initialized.
        print(e)

def _hidden_kernel_init():
    """Return a new RandomNormal(mean=1, stddev=0.1) initializer.

    A fresh instance is created on every call so that each hidden layer
    gets its own initializer object.
    """
    return keras.initializers.RandomNormal(mean=1, stddev=0.1, seed=None)


# Single network with two outputs (one per action): 4 -> 5 -> 30 -> 2.
_hidden_5 = layers.Dense(5, activation="relu", kernel_initializer=_hidden_kernel_init())
_hidden_30 = layers.Dense(30, activation="relu", kernel_initializer=_hidden_kernel_init())
model = keras.Sequential(
    [
        keras.Input(shape=(4,)),
        _hidden_5,
        _hidden_30,
        layers.Dense(2, name='output'),
    ]
)

# Huber loss for training; MSE is only tracked as a metric.
model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=0.001),
    loss=keras.losses.Huber(delta=1.0),
    metrics=[keras.metrics.MeanSquaredError()],
)

# Constants
STOP_CRT = 1000      # hard cap on the number of steps played in one episode
EPSILON = 0.1        # probability of taking a uniformly random action
DISCOUNT = 0.99      # discount applied to the bootstrapped next-state value
EPOCH = 1000         # number of episodes to play
BATCH_SIZE = 32      # transitions sampled from history for each update
MIN_HISTORY = 5000   # updates start once history holds more than this many transitions
# Generate Environment
# NOTE(review): the loop below uses the classic gym API (reset() -> observation,
# step() -> 4-tuple); confirm the installed gym version still provides it.
env = gym.make('CartPole-v1')

# Every transition played so far, stored as [state, value, action, new_state, done].
# state / new_state are tensors with a leading batch axis of 1; value is the
# model output that was used to pick the action.
history = []

best = 0

for l in np.arange(EPOCH):
    state = env.reset()
    state = tf.convert_to_tensor(state)
    state = tf.expand_dims(state, 0)

    num_step = 0
    num_action = np.zeros([2,], dtype=int)
    done = False
    while (not done) and (num_step < STOP_CRT):
        value = model(state)
        # Epsilon-greedy action selection
        if np.random.rand(1) < EPSILON:
            action = np.random.randint(0, 2)
        else:
            action = int(np.argmax(value))
        # Count how often each action was taken (reported in the episode summary).
        num_action[action] += 1

        new_state, reward, done, _ = env.step(action)
        new_state = tf.convert_to_tensor(new_state)
        new_state = tf.expand_dims(new_state, 0)

        history.append([state, value, action, new_state, int(done)])

        state = new_state
        num_step += 1

    # `best` is printed before it is updated, so it shows the best of the previous episodes.
    print("Episode : {0} | score : {1} | best : {2} | {3} vs {4}".format(l, num_step, best, int(num_action[0]), int(num_action[1])))
    # Check Max Performance
    if best < num_step:
        best = num_step

    # One Q-learning update per episode on a random minibatch of past transitions.
    if len(history) > MIN_HISTORY:
        sample_batch = random.sample(history, BATCH_SIZE)
        list_state, _, list_action, list_new_state, list_done = zip(*sample_batch)

        # Each stored state already has a batch axis of 1, so concatenating
        # along axis 0 yields one row per sampled transition.
        batch_state = tf.concat(list(list_state), axis=0)
        batch_new_state = tf.concat(list(list_new_state), axis=0)
        batch_action = np.asarray(list_action, dtype=int)
        batch_done = np.asarray(list_done, dtype=np.float32)

        # Bootstrapped target: 1 for the step itself plus the discounted best
        # next-state value; the bootstrap term is dropped when the episode ended.
        # The per-step reward is taken to be 1 (the env reward is not stored).
        next_best = np.amax(np.array(model(batch_new_state)), axis=1)
        value = 1 + DISCOUNT * next_best * (1 - batch_done)

        # Only the output of the action actually taken is moved towards the
        # target; the other action keeps the model's current prediction.
        target = np.array(model(batch_state))  # np.array() gives a writable copy
        target[np.arange(BATCH_SIZE), batch_action] = value
        model.fit(batch_state, target, epochs=1, verbose=0)



#
#
# state = env.reset().reshape([-1,4])
#
# print(model(state).numpy())
# print(model1(state).numpy())
# print(model.get_weights())
#
# # Generate Environment
# state = env.reset()
# # For Every Learning Iterations
# done = False
# playtime = 0
# while not done:
#     state = state.reshape([-1, 4])
#     values = np.array([model(state)[0,0],model1(state)[0,0]])
#     action =  np.argmax(values)
#     state, reward, done, info = env.step(action)
#     env.render()
#     time.sleep(0.01)
#     playtime += 1
# print(playtime)
# env.close()


In [None]:

# Constants
STOP_CRT = 100       # maximum number of steps recorded per episode
TEMP = 1             # multiplier on the values inside the softmax action selection
DISCOUNT = 0.99      # discount factor of the return used as regression target

# Generate Environment
# NOTE(review): uses the classic gym API (reset() -> observation, step() -> 4-tuple).
env = gym.make('CartPole-v1')

# Per-step buffers, overwritten every episode; only the first num_step rows
# belong to the current episode.
state_history = np.zeros((STOP_CRT, 4))
estimated_value_history = np.zeros((STOP_CRT, 2))
action_history = np.zeros((STOP_CRT, 1), dtype=int)

for l in np.arange(100):
    state = env.reset().reshape([-1, 4])
    num_step = 0
    num_balanced = 0
    done = False
    # Stop as soon as the episode ends: stepping an environment after
    # done=True is not defined behaviour in gym.
    while (not done) and (num_step < STOP_CRT):
        values = np.array([model0(state)[0, 0], model1(state)[0, 0]])
        # Select Action: softmax over the two value estimates, sampled once.
        if np.random.random(1) < (np.exp(values[0] * TEMP) / (np.sum(np.exp(values * TEMP)))):
            action = int(0)
        else:
            action = int(1)
        # Save Current Step
        state_history[num_step, :] = state
        estimated_value_history[num_step, :] = values
        action_history[num_step, 0] = action
        # Do Action
        state, reward, done, _ = env.step(action)
        state = state.reshape([-1, 4])
        # Only steps that did not end the episode count as "balanced".
        if not done:
            num_balanced += 1
        num_step += 1
    print("Epoch : {0} | num_balanced : {1}".format(l, num_balanced))

    # Generate Value Array
    # For i < num_balanced the target is sum_{j < num_balanced - i} DISCOUNT**j
    # (closed form of the geometric series); every later step, including the
    # one that ended the episode, gets a target of 0.
    true_value = np.append(
        (np.power(DISCOUNT, np.arange(num_balanced, 0, -1)) - 1) / (DISCOUNT - 1),
        np.zeros(STOP_CRT - num_balanced)
    )

    # Split this episode's samples by the action that was taken. Boolean masks
    # keep each state paired with its own target.
    episode_actions = action_history[:num_step, 0]
    took_action0 = episode_actions == 0
    took_action1 = ~took_action0

    num_action0 = int(np.sum(took_action0))
    num_action1 = int(np.sum(took_action1))

    model0_state = state_history[:num_step][took_action0]
    model1_state = state_history[:num_step][took_action1]

    model0_value = true_value[:num_step][took_action0].reshape(-1, 1)
    model1_value = true_value[:num_step][took_action1].reshape(-1, 1)

    # Learning
    # fit() is called with its default single epoch; passing initial_epoch=1
    # here would make Keras skip training because epochs is the index of the
    # final epoch, not a count.
    if num_action0 > 0:
        model0.fit(model0_state, model0_value)
    if num_action1 > 0:
        model1.fit(model1_state, model1_value)




Epoch : 0 | num_balanced : 8
Epoch : 1 | num_balanced : 100
Epoch : 2 | num_balanced : 12
Epoch : 3 | num_balanced : 24
Epoch : 4 | num_balanced : 25
Epoch : 5 | num_balanced : 48
Epoch : 6 | num_balanced : 13
Epoch : 7 | num_balanced : 26
Epoch : 8 | num_balanced : 24
Epoch : 9 | num_balanced : 12
Epoch : 10 | num_balanced : 16
Epoch : 11 | num_balanced : 12
Epoch : 12 | num_balanced : 94
Epoch : 13 | num_balanced : 24
Epoch : 14 | num_balanced : 16
Epoch : 15 | num_balanced : 14
Epoch : 16 | num_balanced : 17
Epoch : 17 | num_balanced : 15
Epoch : 18 | num_balanced : 16
Epoch : 19 | num_balanced : 11
Epoch : 20 | num_balanced : 14
Epoch : 21 | num_balanced : 11
Epoch : 22 | num_balanced : 31
Epoch : 23 | num_balanced : 10
Epoch : 24 | num_balanced : 16
Epoch : 25 | num_balanced : 21
Epoch : 26 | num_balanced : 17
Epoch : 27 | num_balanced : 39
Epoch : 28 | num_balanced : 19
Epoch : 29 | num_balanced : 14
Epoch : 30 | num_balanced : 20
Epoch : 31 | num_balanced : 25
Epoch : 32 | num_b

In [None]:
# Play one episode greedily (no exploration) and report how many steps it lasted.
state = env.reset()
done = False
playtime = 0
while not done:
    # The networks expect a batch axis, so present the observation as one row of 4.
    state = np.reshape(state, (-1, 4))
    # One scalar estimate per action: index 0 from model0, index 1 from model1.
    values = np.array([net(state)[0, 0] for net in (model0, model1)])
    action = np.argmax(values)
    state, reward, done, info = env.step(action)
    playtime = playtime + 1
print(playtime)
env.close()

351


**New Version**

In [None]:
# Constants
STOP_CRT = 300       # maximum number of steps played (and recorded) per episode
TEMP = 0.3           # multiplier on the values inside the softmax action selection
DISCOUNT = 0.99      # discount factor of the return used as regression target
EPOCH = 500          # number of episodes to play
# Generate Environment
# NOTE(review): uses the classic gym API (reset() -> observation, step() -> 4-tuple).
env = gym.make('CartPole-v1')

# Per-step buffers, overwritten every episode; only the first num_step rows
# belong to the current episode.
state_history = np.zeros((STOP_CRT, 4))
estimated_value_history = np.zeros((STOP_CRT, 2))
action_history = np.zeros((STOP_CRT, 1), dtype=int)
best = 0

for l in np.arange(EPOCH):
    state = env.reset().reshape([-1, 4])
    num_step = 0
    num_action = np.zeros((2,), dtype=int)
    done = False
    #//// saving done=True states ?
    while (not done) and (num_step < STOP_CRT):
        values = np.array([model0(state)[0, 0], model1(state)[0, 0]])
        # Select Action: softmax over the two value estimates, sampled once.
        # //// Maybe going greedy after some amount of step is advisable
        #action = np.random.randint(0,2)
        if np.random.random(1) < (np.exp(values[0] * TEMP) / (np.sum(np.exp(values * TEMP)))):
            action = int(0)
        else:
            action = int(1)
        num_action[action] += int(1)
        # Save Current Step
        state_history[num_step, :] = state
        estimated_value_history[num_step, :] = values
        action_history[num_step, 0] = action
        # Do Action
        state, _, done, _ = env.step(action)
        state = state.reshape([-1, 4])
        num_step += 1
    # `best` is printed before it is updated, so it shows the best of the previous episodes.
    print("Epoch : {0} | score : {1} | best : {2} | {3} vs {4}".format(l, num_step, best, num_action[0], num_action[1]))

    if best < num_step:
        best = num_step

    # Generate Value Array
    # true_value[i] = sum_{j < num_step - i} DISCOUNT**j (closed form of the
    # geometric series), i.e. the discounted count of steps still to come from
    # step i. The trailing 0 is an extra entry past the last played step.
    true_value = np.append((np.power(DISCOUNT, np.arange(num_step, 0, -1)) - 1) / (DISCOUNT - 1), 0)

    # Split this episode's samples by the action taken, preserving time order.
    # Boolean masks replace row-by-row stacking, which copied the whole array
    # on every step.
    episode_actions = action_history[:num_step, 0]
    took_action0 = episode_actions == 0
    took_action1 = ~took_action0

    model0_state = state_history[:num_step][took_action0]
    model1_state = state_history[:num_step][took_action1]

    model0_value = true_value[:num_step][took_action0].reshape(-1, 1)
    model1_value = true_value[:num_step][took_action1].reshape(-1, 1)

    # Learning: each network is fitted on at most its 10 most recent samples of the episode.
    if model0_value.shape[0] > 0:
        recent0 = min(10, int(num_action[0]))
        model0.fit(model0_state[-recent0:], model0_value[-recent0:])
    if model1_value.shape[0] > 0:
        recent1 = min(10, int(num_action[1]))
        model1.fit(model1_state[-recent1:], model1_value[-recent1:])


Epoch : 0 | score : 12 | best : 0 | 10 vs 2
Epoch : 1 | score : 11 | best : 12 | 9 vs 2
Epoch : 2 | score : 10 | best : 12 | 7 vs 3
Epoch : 3 | score : 89 | best : 12 | 48 vs 41
Epoch : 4 | score : 91 | best : 89 | 51 vs 40
Epoch : 5 | score : 48 | best : 91 | 25 vs 23
Epoch : 6 | score : 90 | best : 91 | 51 vs 39
Epoch : 7 | score : 107 | best : 91 | 59 vs 48
Epoch : 8 | score : 84 | best : 107 | 47 vs 37
Epoch : 9 | score : 48 | best : 107 | 28 vs 20
Epoch : 10 | score : 47 | best : 107 | 25 vs 22
Epoch : 11 | score : 14 | best : 107 | 5 vs 9
Epoch : 12 | score : 16 | best : 107 | 5 vs 11
Epoch : 13 | score : 12 | best : 107 | 4 vs 8
Epoch : 14 | score : 97 | best : 107 | 53 vs 44
Epoch : 15 | score : 21 | best : 107 | 9 vs 12
Epoch : 16 | score : 82 | best : 107 | 45 vs 37
Epoch : 17 | score : 39 | best : 107 | 22 vs 17
Epoch : 18 | score : 31 | best : 107 | 15 vs 16
Epoch : 19 | score : 10 | best : 107 | 2 vs 8
Epoch : 20 | score : 21 | best : 107 | 10 vs 11
Epoch : 21 | score : 11

In [None]:
# Print both networks' estimates for a freshly reset state, then model0's weights.
new_state = np.reshape(env.reset(), (-1, 4))
for net in (model0, model1):
    print(net(new_state).numpy())
print(model0.get_weights())

[[45.460842]]
[[11.589589]]
[array([[-0.19199362, -0.79946697,  0.35423177,  0.37033904,  0.57396024],
       [ 1.2549629 ,  0.47474214,  0.9743056 ,  0.6592582 ,  1.0227792 ],
       [ 0.9113481 ,  0.51702064,  0.5590204 , -0.12086061,  0.70064634],
       [-0.35446692, -0.05711846,  0.49514034, -2.084933  , -0.45837846]],
      dtype=float32), array([ 1.3453412,  0.8679711,  1.0537301, -0.9916672,  1.8024334],
      dtype=float32), array([[-0.3329887 ,  1.2293314 , -0.50693804,  0.6815692 , -0.13374008,
        -0.028402  , -0.38363907,  1.2468998 , -0.2930184 , -0.25176623],
       [ 0.41041186,  0.8126293 ,  0.2526903 , -0.29210514,  0.38200143,
        -0.44016987, -0.4187584 ,  0.7218626 , -0.50769883, -0.41984132],
       [ 0.68241036,  0.55315006, -0.02615865,  0.20306334,  0.1270867 ,
        -0.71339947,  0.5021563 ,  1.0020087 , -0.55876654, -0.33553186],
       [ 0.00412283, -0.65956455,  0.24245855, -0.09576132, -0.60158145,
         0.3746137 ,  0.5595936 , -1.8806552 , -

In [None]:
# Show the last min(10, num_action[0]) rows of model0_state, the same slice
# that is passed to model0.fit in the training cell. When num_action[0] is 0
# the slice start is -0, which selects the whole array.
model0_state[-1 * np.min([10,num_action[0]]):]

8