In [2]:
# coding:utf-8
# [0]ライブラリのインポート
import gym  #倒立振子(cartpole)の実行環境
from gym import wrappers  #gymの画像保存
import numpy as np
import time


# [1] Helpers that discretize observations for the Q-function ------------
# Convert an observed continuous value range into bin edges
def bins(clip_min, clip_max, num):
    """Return the interior edges splitting [clip_min, clip_max] into `num` equal bins.

    Args:
        clip_min: lower end of the range.
        clip_max: upper end of the range.
        num: number of equal-width bins.

    Returns:
        A numpy array with the `num - 1` interior edges; the two outer
        endpoints are dropped.
    """
    all_edges = np.linspace(clip_min, clip_max, num + 1)
    interior_edges = all_edges[1:-1]
    return interior_edges
# Example:
# bins(1, 20, 5)
# array([ 4.8,  8.6, 12.4, 16.2])

# Convert each observed value into a discrete level and combine them
def digitize_state(observation):
    """Map a 4-element observation onto a single discrete state index.

    Each component is binned with `np.digitize` over a symmetric range split
    into `num_dizitized` bins (a module-level global), and the four levels are
    combined as digits of a base-`num_dizitized` number.

    Args:
        observation: iterable of exactly four values, unpacked as
            (cart_pos, cart_v, pole_angle, pole_v).

    Returns:
        The combined state index.
    """
    cart_pos, cart_v, pole_angle, pole_v = observation
    # (value, symmetric clipping bound) for each component, in digit order
    components = (
        (cart_pos, 2.4),
        (cart_v, 3.0),
        (pole_angle, 0.5),
        (pole_v, 2.0),
    )
    state_index = 0
    for power, (value, bound) in enumerate(components):
        level = np.digitize(value, bins=bins(-bound, bound, num_dizitized))
        state_index = state_index + level * (num_dizitized**power)
    return state_index

In [13]:
np.digitize(1.5, bins=bins(-3.0, 3.0, 16))
# Split the range -3 to 3 into 16 bins and find which bin index 1.5 falls into

12

In [8]:
# [2] Function that chooses the action a(t) -------------------------------------
def get_action(next_state, episode):
    """Pick an action for `next_state` with an epsilon-greedy policy.

    The exploration probability is 0.5 / (episode + 1), so the policy shifts
    gradually toward always taking the greedy action. The greedy action is
    read from the module-level `q_table`.

    Args:
        next_state: discrete state index used to look up `q_table`.
        episode: current episode number, used to decay epsilon.

    Returns:
        The chosen action: the argmax of the Q-table row, or a uniformly
        random pick from [0, 1] when exploring.
    """
    explore_threshold = 0.5 * (1 / (episode + 1))
    draw = np.random.uniform(0, 1)
    if explore_threshold <= draw:
        # Exploit: take the action with the highest current Q-value
        return np.argmax(q_table[next_state])
    # Explore: pick one of the two actions at random
    return np.random.choice([0, 1])


# [3] Function that updates the Q-table -------------------------------------
def update_Qtable(q_table, state, action, reward, next_state,
                  gamma=0.99, alpha=0.5):
    """Apply one Q-learning update to `q_table` in place and return it.

    Update rule:
        Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * (reward + gamma * max_a' Q[s', a'])

    Args:
        q_table: 2-D numpy array indexed as [state, action].
        state: index of the state the action was taken in.
        action: index of the action taken.
        reward: reward received for the transition.
        next_state: index of the resulting state.
        gamma: discount factor (default 0.99, the previously hard-coded value).
        alpha: learning rate (default 0.5, the previously hard-coded value).

    Returns:
        The same `q_table` object, modified in place.
    """
    # Take the maximum over every action column rather than only columns 0 and 1,
    # so the update stays correct for tables with more than two actions.
    next_max_q = np.max(q_table[next_state])
    q_table[state, action] = (1 - alpha) * q_table[state, action] +\
            alpha * (reward + gamma * next_max_q)

    return q_table

In [15]:
# [4] Main section: parameter settings --------------------------------------------------------
env = gym.make('CartPole-v0')
max_number_of_steps = 200  # number of steps per episode
num_consecutive_iterations = 100  # number of recent episodes averaged when judging whether training is complete
num_episodes = 2000  # total number of episodes
goal_average_reward = 195  # training is treated as finished once the mean reward reaches this value (no control toward the center)
# Discretize each of the 4 state variables into 6 bins (6^4 states) and create the Q-function table
num_dizitized = 6  # number of bins per variable
q_table = np.random.uniform(
    low=-1, high=1, size=(num_dizitized**4, env.action_space.n))

total_reward_vec = np.zeros(num_consecutive_iterations)  # stores the reward of each recent episode
final_x = np.zeros((num_episodes, 1))  # after training, stores the cart x position at the end of each episode
islearned = 0  # flag set once training has finished
isrender = 0  # rendering flag

In [18]:
# Inspect the Q-table dimensions: num_dizitized**4 discrete states (rows)
# by env.action_space.n actions (columns).
q_table.shape

(1296, 2)

In [19]:
# [5] Main routine --------------------------------------------------
# Runs up to num_episodes episodes of tabular Q-learning. Once the rolling mean
# reward reaches goal_average_reward, islearned is set and later episodes are rendered.
# NOTE(review): assumes the classic gym API where reset() returns the observation
# and step() returns a 4-tuple — TODO confirm against the installed gym version.
for episode in range(num_episodes):  # repeat for the number of episodes
    # Initialize the environment
    observation = env.reset()
    state = digitize_state(observation)
    action = np.argmax(q_table[state])
    episode_reward = 0

    for t in range(max_number_of_steps):  # loop over one episode
        if islearned == 1:  # once training has finished, render the cart-pole
            env.render()
            time.sleep(0.1)
            print (observation[0])  # print the cart's x position

        # Execute action a_t to obtain s_{t+1}, r_{t}, etc.
        observation, reward, done, info = env.step(action)

        # Set the reward (overrides the reward returned by env.step)
        if done:
            if t < 195:
                reward = -200  # penalty when the episode ends early (pole fell)
            else:
                reward = 1  # no penalty when the episode ends while still upright
        else:
            reward = 1  # reward for every step the pole stays up

        episode_reward += reward  # accumulate the reward

        # Compute the discrete state s_{t+1} and update the Q-function
        next_state = digitize_state(observation)  # discretize the observation at t+1
        q_table = update_Qtable(q_table, state, action, reward, next_state)
        
        # Choose the next action a_{t+1}
        action = get_action(next_state, episode)    # a_{t+1} 
        
        state = next_state
        
        # End-of-episode handling
        if done:
            # NOTE: the mean printed here is taken before this episode's reward
            # is appended to total_reward_vec on the following lines.
            print('%d Episode finished after %f time steps / mean %f' %
                  (episode, t + 1, total_reward_vec.mean()))
            total_reward_vec = np.hstack((total_reward_vec[1:],
                                          episode_reward))  # record the reward (drops the oldest entry)
            if islearned == 1:  # if training has finished, store the final x coordinate
                final_x[episode, 0] = observation[0]
            break

    if (total_reward_vec.mean() >=
            goal_average_reward):  # success if the mean over the most recent episodes reaches the target reward
        print('Episode %d train agent successfuly!' % episode)
        islearned = 1
        #np.savetxt('learned_Q_table.csv',q_table, delimiter=",") # to save the Q-table
        if isrender == 0:
            #env = wrappers.Monitor(env, './movie/cartpole-experiment-1') # to save a video
            isrender = 1
    # To see how the agent behaves after only 10 episodes, uncomment the lines below
    #if episode>10:
    #    if isrender == 0:
    #        env = wrappers.Monitor(env, './movie/cartpole-experiment-1') # to save a video
    #        isrender = 1
    #    islearned=1;

# Persist the recorded final cart positions only if training reached the goal
if islearned:
    np.savetxt('final_x.csv', final_x, delimiter=",")

0 Episode finished after 16.000000 time steps / mean 0.000000
1 Episode finished after 10.000000 time steps / mean -1.850000
2 Episode finished after 18.000000 time steps / mean -3.760000
3 Episode finished after 21.000000 time steps / mean -5.590000
4 Episode finished after 13.000000 time steps / mean -7.390000
5 Episode finished after 31.000000 time steps / mean -9.270000
6 Episode finished after 80.000000 time steps / mean -10.970000
7 Episode finished after 10.000000 time steps / mean -12.180000
8 Episode finished after 65.000000 time steps / mean -14.090000
9 Episode finished after 13.000000 time steps / mean -15.450000
10 Episode finished after 34.000000 time steps / mean -17.330000
11 Episode finished after 33.000000 time steps / mean -19.000000
12 Episode finished after 9.000000 time steps / mean -20.680000
13 Episode finished after 12.000000 time steps / mean -22.600000
14 Episode finished after 84.000000 time steps / mean -24.490000
15 Episode finished after 16.000000 time st

138 Episode finished after 200.000000 time steps / mean 6.550000
139 Episode finished after 200.000000 time steps / mean 9.540000
140 Episode finished after 52.000000 time steps / mean 12.140000
141 Episode finished after 159.000000 time steps / mean 8.650000
142 Episode finished after 200.000000 time steps / mean 6.230000
143 Episode finished after 188.000000 time steps / mean 9.040000
144 Episode finished after 160.000000 time steps / mean 8.990000
145 Episode finished after 187.000000 time steps / mean 9.190000
146 Episode finished after 163.000000 time steps / mean 10.180000
147 Episode finished after 200.000000 time steps / mean 10.890000
148 Episode finished after 189.000000 time steps / mean 14.030000
149 Episode finished after 160.000000 time steps / mean 11.910000
150 Episode finished after 142.000000 time steps / mean 12.680000
151 Episode finished after 150.000000 time steps / mean 13.170000
152 Episode finished after 109.000000 time steps / mean 14.050000
153 Episode finish

263 Episode finished after 151.000000 time steps / mean 54.640000
264 Episode finished after 163.000000 time steps / mean 52.140000
265 Episode finished after 13.000000 time steps / mean 52.580000
266 Episode finished after 11.000000 time steps / mean 51.530000
267 Episode finished after 128.000000 time steps / mean 47.630000
268 Episode finished after 128.000000 time steps / mean 44.900000
269 Episode finished after 104.000000 time steps / mean 45.020000
270 Episode finished after 125.000000 time steps / mean 44.580000
271 Episode finished after 178.000000 time steps / mean 44.360000
272 Episode finished after 13.000000 time steps / mean 44.670000
273 Episode finished after 78.000000 time steps / mean 43.380000
274 Episode finished after 170.000000 time steps / mean 40.150000
275 Episode finished after 177.000000 time steps / mean 37.840000
276 Episode finished after 187.000000 time steps / mean 35.610000
277 Episode finished after 200.000000 time steps / mean 33.470000
278 Episode fi

390 Episode finished after 142.000000 time steps / mean 43.110000
391 Episode finished after 144.000000 time steps / mean 40.520000
392 Episode finished after 200.000000 time steps / mean 40.660000
393 Episode finished after 198.000000 time steps / mean 40.660000
394 Episode finished after 135.000000 time steps / mean 40.640000
395 Episode finished after 200.000000 time steps / mean 37.980000
396 Episode finished after 200.000000 time steps / mean 37.980000
397 Episode finished after 200.000000 time steps / mean 37.980000
398 Episode finished after 200.000000 time steps / mean 40.920000
399 Episode finished after 165.000000 time steps / mean 43.950000
400 Episode finished after 200.000000 time steps / mean 43.960000
401 Episode finished after 152.000000 time steps / mean 46.510000
402 Episode finished after 200.000000 time steps / mean 46.820000
403 Episode finished after 154.000000 time steps / mean 46.850000
404 Episode finished after 200.000000 time steps / mean 46.500000
405 Episod

523 Episode finished after 200.000000 time steps / mean 123.240000
524 Episode finished after 200.000000 time steps / mean 123.240000
525 Episode finished after 200.000000 time steps / mean 125.630000
526 Episode finished after 200.000000 time steps / mean 125.630000
527 Episode finished after 200.000000 time steps / mean 128.260000
528 Episode finished after 200.000000 time steps / mean 130.800000
529 Episode finished after 200.000000 time steps / mean 133.270000
530 Episode finished after 200.000000 time steps / mean 133.270000
531 Episode finished after 200.000000 time steps / mean 133.270000
532 Episode finished after 200.000000 time steps / mean 133.270000
533 Episode finished after 200.000000 time steps / mean 133.270000
534 Episode finished after 200.000000 time steps / mean 133.270000
535 Episode finished after 200.000000 time steps / mean 133.270000
536 Episode finished after 173.000000 time steps / mean 135.820000
537 Episode finished after 200.000000 time steps / mean 133.54

649 Episode finished after 200.000000 time steps / mean 135.330000
650 Episode finished after 200.000000 time steps / mean 137.600000
651 Episode finished after 200.000000 time steps / mean 137.600000
652 Episode finished after 200.000000 time steps / mean 137.600000
653 Episode finished after 200.000000 time steps / mean 137.600000
654 Episode finished after 198.000000 time steps / mean 137.600000
655 Episode finished after 200.000000 time steps / mean 137.580000
656 Episode finished after 200.000000 time steps / mean 137.580000
657 Episode finished after 200.000000 time steps / mean 137.580000
658 Episode finished after 200.000000 time steps / mean 137.580000
659 Episode finished after 171.000000 time steps / mean 137.580000
660 Episode finished after 151.000000 time steps / mean 135.280000
661 Episode finished after 200.000000 time steps / mean 132.780000
662 Episode finished after 200.000000 time steps / mean 132.780000
663 Episode finished after 200.000000 time steps / mean 132.78

778 Episode finished after 200.000000 time steps / mean 101.710000
779 Episode finished after 200.000000 time steps / mean 101.710000
780 Episode finished after 200.000000 time steps / mean 101.710000
781 Episode finished after 200.000000 time steps / mean 101.710000
782 Episode finished after 200.000000 time steps / mean 104.020000
783 Episode finished after 200.000000 time steps / mean 107.170000
784 Episode finished after 200.000000 time steps / mean 107.170000
785 Episode finished after 200.000000 time steps / mean 109.370000
786 Episode finished after 200.000000 time steps / mean 109.370000
787 Episode finished after 200.000000 time steps / mean 109.370000
788 Episode finished after 179.000000 time steps / mean 109.370000
789 Episode finished after 200.000000 time steps / mean 107.150000
790 Episode finished after 200.000000 time steps / mean 107.150000
791 Episode finished after 200.000000 time steps / mean 109.620000
792 Episode finished after 200.000000 time steps / mean 109.62

905 Episode finished after 147.000000 time steps / mean 178.650000
906 Episode finished after 45.000000 time steps / mean 176.110000
907 Episode finished after 200.000000 time steps / mean 172.550000
908 Episode finished after 200.000000 time steps / mean 172.550000
909 Episode finished after 200.000000 time steps / mean 172.550000
910 Episode finished after 200.000000 time steps / mean 172.550000
911 Episode finished after 200.000000 time steps / mean 175.010000
912 Episode finished after 200.000000 time steps / mean 175.010000
913 Episode finished after 200.000000 time steps / mean 175.010000
914 Episode finished after 200.000000 time steps / mean 177.470000
915 Episode finished after 200.000000 time steps / mean 177.470000
916 Episode finished after 200.000000 time steps / mean 177.470000
917 Episode finished after 200.000000 time steps / mean 177.470000
918 Episode finished after 159.000000 time steps / mean 177.470000
919 Episode finished after 200.000000 time steps / mean 175.050

1032 Episode finished after 200.000000 time steps / mean 193.610000
1033 Episode finished after 200.000000 time steps / mean 193.610000
1034 Episode finished after 200.000000 time steps / mean 193.610000
1035 Episode finished after 200.000000 time steps / mean 193.610000
1036 Episode finished after 200.000000 time steps / mean 193.610000
1037 Episode finished after 200.000000 time steps / mean 193.610000
1038 Episode finished after 200.000000 time steps / mean 193.610000
1039 Episode finished after 200.000000 time steps / mean 193.610000
1040 Episode finished after 200.000000 time steps / mean 193.610000
1041 Episode finished after 200.000000 time steps / mean 193.610000
1042 Episode finished after 200.000000 time steps / mean 193.610000
1043 Episode finished after 200.000000 time steps / mean 193.610000
1044 Episode finished after 200.000000 time steps / mean 193.610000
1045 Episode finished after 200.000000 time steps / mean 193.610000
1046 Episode finished after 200.000000 time step

-0.12861976589881835
-0.1351173854482033
-0.14550032622710096
-0.15196502323397024
-0.16231543555494096
-0.16874771394472027
-0.17906564675188363
-0.18546519832651756
-0.19574989314239444
-0.20211560276884788
-0.21236548515676007
-0.21869541812223045
-0.22890807131997096
-0.24300145206135887
-0.2531735367669703
-0.26722782696943453
-0.27736168575604986
-0.2913786577549397
-0.3014756580108953
-0.3154561452964824
-0.32551672338645277
-0.33946064772525325
-0.3494843325676062
-0.36339071148251223
-0.373376128180992
-0.38724306293699845
-0.40498965338887616
-0.41881425836831465
-0.43652024104930726
-0.4503052607034959
-0.467972781956533
-0.4817199491796105
-0.4993501872779049
-0.513060275573964
-0.530653475068201
-0.5443263297809675
-0.5618818160442173
-0.575516365177464
-0.5930325391760805
-0.6066267846096192
-0.6241010982320215
-0.6454533701812899
-0.6706834560671071
-0.6997931559557791
-0.7249836528726064
-0.7540616963281472
-0.7792264277458467
-0.8082858854968321
-0.8334380036060661
-0.

0.01964567338058796
0.020492051831190773
0.017442969508470806
0.018303758972466833
0.015269093425205545
0.016144395195846255
0.013124438080387348
0.01401471613411955
0.011010124776849088
0.011916212258993178
0.008928021579174984
0.00985113618448723
0.006880779714956715
0.007822548085992493
0.004871888398181425
0.005834379800583428
0.0029057487824909163
0.0038915184119897514
0.0009877679512296524
-0.005803574105777446
-0.00867899267423615
-0.015443501348139994
-0.01829314437593891
-0.025032988586747695
-0.02785873703013246
-0.03457544897713754
-0.037378562282118236
-0.04407308017739685
-0.04685423500942064
-0.053526931787305804
-0.0562862464102789
-0.06293694485901691
-0.06567398981818033
-0.07230196445542553
-0.0750157600342229
-0.08161972588502063
-0.08430872662522178
-0.09088681489596165
-0.09354888132492453
-0.10009860183298186
-0.11053410063113023
-0.11705226882440417
-0.12745783883061138
-0.1339471895870194
-0.14432513037589795
-0.15078764679660528
-0.16113954823020263
-0.16757651

KeyboardInterrupt: 

In [22]:
def digitize_state(observation):
    """Debug variant of `digitize_state` that prints its intermediate values.

    Redefines (and therefore shadows) the earlier `digitize_state` in this
    notebook. It prints the per-variable discrete levels and each weighted
    term before returning the combined index.

    Args:
        observation: iterable of exactly four values, unpacked as
            (cart_pos, cart_v, pole_angle, pole_v).

    Returns:
        The combined state index: the sum of level_i * num_dizitized**i.
    """
    cart_pos, cart_v, pole_angle, pole_v = observation
    digitized = [
        np.digitize(cart_pos, bins=bins(-2.4, 2.4, num_dizitized)),
        np.digitize(cart_v, bins=bins(-3.0, 3.0, num_dizitized)),
        np.digitize(pole_angle, bins=bins(-0.5, 0.5, num_dizitized)),
        np.digitize(pole_v, bins=bins(-2.0, 2.0, num_dizitized))
    ]
    print(digitized)
    # Use the same global (`num_dizitized`) as the binning above. The print loop
    # previously read a differently spelled name, which is undefined unless a
    # second global happens to exist in the kernel.
    weighted_terms = [x * (num_dizitized**i) for i, x in enumerate(digitized)]
    for term in weighted_terms:
        print(term)
    return sum(weighted_terms)

In [24]:
# NOTE(review): this spelling (`num_digitized`) differs from `num_dizitized`
# used when the Q-table is created — confirm which name is intended.
num_digitized = 6
# Reset the environment and discretize the initial observation as a quick check
observation = env.reset()
state = digitize_state(observation)
state

[3, 3, 2, 3]
3
18
72
648


741