# OpenAI CartPole-v0 환경을 사용하여, 처음으로 총보상이 200이 되었을 때 훈련을 멈추고 그때까지 훈련된 Q 값으로 CartPole을 시연하시오.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym

In [2]:
# Create the CartPole-v0 environment used for both training and the demo.
env = gym.make('CartPole-v0')

In [3]:
from keras.models import Sequential
from keras.layers import Dense

from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
# Q-network: takes the 4-element observation and outputs one Q-value per
# action (two actions, 0 or 1), with two hidden ReLU layers of 16 units.
model = Sequential([
    Dense(16, activation='relu', input_shape=(4,)),
    Dense(16, activation='relu'),
    Dense(2, activation='linear'),
])

# Regress the Q-values with mean squared error, optimized by Adam.
model.compile(optimizer='adam', loss='mse')







In [5]:
# Print the layer-by-layer architecture and parameter counts of the network.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34        
Total params: 386
Trainable params: 386
Non-trainable params: 0
_________________________________________________________________


In [6]:
import collections

# Replay memory of transitions; once 10000 entries are stored, appending a new
# one drops the oldest.
memory = collections.deque(maxlen=10000)

In [7]:
# Reset the environment and keep the initial observation as the current state.
s = env.reset()

In [8]:
# Pre-fill the replay memory with 32 transitions collected by acting randomly,
# so that the first training step can already draw a 32-sample minibatch.
for _ in range(32):
    action = env.action_space.sample()  # alternative: np.random.randint(2)
    s2, r, done, _info = env.step(action)

    # Each stored transition is [state, action, reward, done, next_state].
    memory.append([s, action, r, done, s2])

    # Start a new episode when the current one has ended; otherwise advance.
    if done:
        s = env.reset()
    else:
        s = s2

In [9]:
# Show the oldest and the newest transition currently held in the replay memory.
display(memory[0], memory[-1])

[array([-0.01774083,  0.0139485 ,  0.00890822,  0.01286946]),
 0,
 1.0,
 False,
 array([-0.01746186, -0.18130007,  0.00916561,  0.30834967])]

[array([-0.01895414, -0.96563078,  0.0162453 ,  1.27665253]),
 0,
 1.0,
 False,
 array([-0.03826676, -1.16095607,  0.04177835,  1.57437772])]

In [10]:
# Number of environment steps taken so far; the training loop uses it to decay
# the exploration rate.
count = 0

# Total reward of each finished episode.
returns = []

# Discount factor: how strongly future rewards count toward the Q-target.
gamma = 0.99

# Exploration rate; begins at 1.0, i.e. purely random actions.
epsilon = 1.0

In [11]:
### Train for at most 300 episodes, stopping at the first episode whose total
### reward reaches 200 (the maximum, given the 200-step cap per episode).
batch_size = 32  # number of transitions replayed per learning step

for episode in range(300):

    total_reward = 0  # total reward collected in this episode

    s = env.reset()

    for step in range(200):  # at most 200 actions per episode

        ### Exploration probability: decays exponentially with the global
        ### step count from 1 towards a floor of 1%.
        epsilon = 0.01 + (1 - 0.01) * np.exp(-0.0001 * count)
        count += 1

        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # random (exploratory) action
        else:
            # Greedy action: the one with the highest predicted Q-value.
            action = np.argmax(model.predict(s.reshape(1, 4))[0])

        s2, r, done, _ = env.step(action)

        total_reward += r

        memory.append([s, action, r, done, s2])

        ### Learning step on a random minibatch of stored transitions.
        # Never request more samples than the memory holds; with
        # replace=False that would raise a ValueError.
        n_samples = min(batch_size, len(memory))
        indices = np.random.choice(len(memory), n_samples, replace=False)
        samples = [memory[j] for j in indices]  # each: [s, a, r, done, s2]

        X = np.array([sample[0] for sample in samples])  # input states
        next_states = np.array([sample[4] for sample in samples])
        actions = np.array([sample[1] for sample in samples], dtype=int)
        rewards = np.array([sample[2] for sample in samples], dtype=float)
        dones = np.array([sample[3] for sample in samples], dtype=bool)

        # Two batched forward passes instead of two per sample. The targets
        # start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss.
        y = model.predict(X)  # target Q-values
        next_q_max = np.max(model.predict(next_states), axis=1)

        # Terminal transitions use the bare reward; the others bootstrap from
        # the best predicted Q-value of the next state.
        # NOTE(review): an episode cut off by the 200-step time limit is also
        # stored with done=True and therefore does not bootstrap; confirm
        # whether that is intended.
        y[np.arange(n_samples), actions] = np.where(
            dones, rewards, rewards + gamma * next_q_max
        )

        model.fit(X, y, epochs=1, verbose=False)

        if done:
            break
        else:
            s = s2

    print('Episode: %d, Reward: %d, Epsilon: %.5f' % (episode+1, total_reward, epsilon))
    returns.append(total_reward)
    if total_reward >= 200:
        # First perfect episode: stop training and keep the current network.
        break



Episode: 1, Reward: 62, Epsilon: 0.99398
Episode: 2, Reward: 10, Epsilon: 0.99300
Episode: 3, Reward: 38, Epsilon: 0.98927
Episode: 4, Reward: 16, Epsilon: 0.98770
Episode: 5, Reward: 37, Epsilon: 0.98409
Episode: 6, Reward: 38, Epsilon: 0.98040
Episode: 7, Reward: 15, Epsilon: 0.97894
Episode: 8, Reward: 68, Epsilon: 0.97238
Episode: 9, Reward: 14, Epsilon: 0.97103
Episode: 10, Reward: 22, Epsilon: 0.96892
Episode: 11, Reward: 21, Epsilon: 0.96691
Episode: 12, Reward: 16, Epsilon: 0.96538
Episode: 13, Reward: 17, Epsilon: 0.96375
Episode: 14, Reward: 41, Epsilon: 0.95985
Episode: 15, Reward: 19, Epsilon: 0.95805
Episode: 16, Reward: 21, Epsilon: 0.95606
Episode: 17, Reward: 20, Epsilon: 0.95417
Episode: 18, Reward: 21, Epsilon: 0.95219
Episode: 19, Reward: 24, Epsilon: 0.94993
Episode: 20, Reward: 86, Epsilon: 0.94188
Episode: 21, Reward: 41, Epsilon: 0.93807
Episode: 22, Reward: 35, Epsilon: 0.93483
Episode: 23, Reward: 22, Epsilon: 0.93279
Episode: 24, Reward: 18, Epsilon: 0.93113

In [12]:
# Demonstrate the trained network: run three rendered episodes, always taking
# the action with the highest predicted Q-value.
for epoch in range(3):
    # Keep the observation returned by reset(); otherwise the first action of
    # the episode would be chosen from a stale state left over from earlier.
    s = env.reset()

    done = False
    n = 0  # number of steps taken in this episode
    while not done:
        a = np.argmax(model.predict(s.reshape(1,4))[0])
        s, r, done, info = env.step(a)
        n += 1

        env.render()

    print('Epoch: %d, Count: %d' % (epoch+1, n))
    # Final observation, reward, done flag and info dict of the episode.
    print(s, r, done, info)

Epoch: 1, Count: 200
[-2.47618562e-01  7.08104694e-05 -2.70725755e-02 -2.54421282e-01] 1.0 True {'TimeLimit.truncated': True}
Epoch: 2, Count: 200
[1.74797099 1.45630992 0.1496148  0.26134511] 1.0 True {'TimeLimit.truncated': True}
Epoch: 3, Count: 200
[0.14560026 0.01957706 0.00141913 0.03699403] 1.0 True {'TimeLimit.truncated': True}


In [13]:
# Close the environment once the demo is finished.
env.close()