In [None]:
# coding:utf-8
# [0]ライブラリのインポート
import gym  #倒立振子(cartpole)の実行環境
from gym import wrappers  #gymの画像保存
import numpy as np
import time


# [1]Q関数を離散化して定義する関数　------------
# 観測した状態を離散値にデジタル変換する
def bins(clip_min, clip_max, num):
    """Return the interior bin edges that split a range into ``num`` bins.

    The closed interval [clip_min, clip_max] is divided into ``num``
    equal-width bins; the two outer edges are dropped so that only the
    ``num - 1`` inner boundaries remain (suitable for ``np.digitize``).
    """
    all_edges = np.linspace(clip_min, clip_max, num + 1)
    inner_edges = all_edges[1:-1]
    return inner_edges

# 各値を離散値に変換
def digitize_state(observation):
    """Map a 4-element observation to a single discrete state index.

    Each component (cart position, cart velocity, pole angle, pole
    velocity) is binned into ``num_dizitized`` buckets over a fixed range,
    and the four bucket indices are combined as digits of a
    base-``num_dizitized`` number (first component = least significant).
    """
    cart_pos, cart_v, pole_angle, pole_v = observation
    # (value, lower clip, upper clip) for every observed quantity, in
    # order of increasing digit significance.
    components = (
        (cart_pos, -2.4, 2.4),
        (cart_v, -3.0, 3.0),
        (pole_angle, -0.5, 0.5),
        (pole_v, -2.0, 2.0),
    )
    state_index = 0
    place_value = 1
    for value, low, high in components:
        bucket = np.digitize(value, bins=bins(low, high, num_dizitized))
        state_index = state_index + bucket * place_value
        place_value *= num_dizitized
    return state_index


# [2]行動a(t)を求める関数 -------------------------------------
def get_action(next_state, episode):
    """Choose the next action with an epsilon-greedy policy.

    The exploration rate decays as ``0.5 / (episode + 1)``, so the agent
    gradually shifts towards always taking the greedy action read from the
    module-level ``q_table``.
    """
    epsilon = 0.5 * (1 / (episode + 1))
    # Draw once; explore only when the draw falls strictly below epsilon.
    draw = np.random.uniform(0, 1)
    if draw < epsilon:
        return np.random.choice([0, 1])
    return np.argmax(q_table[next_state])


# [3]Qテーブルを更新する関数 -------------------------------------
def update_Qtable(q_table, state, action, reward, next_state,
                  gamma=0.99, alpha=0.5):
    """Apply one tabular Q-learning update in place and return the table.

    Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))

    Args:
        q_table: 2-D array indexed as ``q_table[state, action]``.
        state: index of the state in which ``action`` was taken.
        action: index of the action that was taken.
        reward: reward received for the transition.
        next_state: index of the state reached after the transition.
        gamma: discount factor applied to the bootstrapped value.
        alpha: learning rate (weight of the new estimate).

    Returns:
        The same ``q_table`` object, modified in place.
    """
    # Take the maximum over every action column so the update stays correct
    # for action spaces with more than two actions.
    next_max_q = np.max(q_table[next_state])
    q_table[state, action] = (1 - alpha) * q_table[state, action] +\
            alpha * (reward + gamma * next_max_q)

    return q_table

# [4]. メイン関数開始 パラメータ設定--------------------------------------------------------
# Script body: build the CartPole environment, set the training parameters,
# then run tabular Q-learning for up to num_episodes episodes.
env = gym.make('CartPole-v0')
max_number_of_steps = 200  # maximum number of steps per episode
num_consecutive_iterations = 100  # number of recent episodes averaged to judge whether learning is complete
num_episodes = 2000  # total number of episodes
goal_average_reward = 195  # learning is treated as finished once the mean reward reaches this (no control toward the center)
# Discretize each of the 4 state variables into 6 bins and build the Q-function (table)
num_dizitized = 6  # number of bins per state variable
# One row per discretized state, one column per action, random initial values
q_table = np.random.uniform(
    low=-1, high=1, size=(num_dizitized**4, env.action_space.n))

total_reward_vec = np.zeros(num_consecutive_iterations)  # rewards of the most recent episodes
final_x = np.zeros((num_episodes, 1))  # after learning: cart x position at the end of each episode
islearned = 0  # flag set to 1 once learning has finished
isrender = 0  # rendering flag


# [5] Main routine --------------------------------------------------
for episode in range(num_episodes):  # repeat for the number of episodes
    # Reset the environment
    observation = env.reset()
    state = digitize_state(observation)
    action = np.argmax(q_table[state])
    episode_reward = 0

    for t in range(max_number_of_steps):  # loop over the steps of one episode
        if islearned == 1:  # once learning has finished, render the cart-pole
            env.render()
            time.sleep(0.1)
            print (observation[0])  # print the cart's x position

        # Execute action a_t to obtain s_{t+1}, r_t, etc.
        observation, reward, done, info = env.step(action)

        # Replace the environment's reward with a custom one
        if done:
            if t < 195:
                reward = -200  # penalty for ending early (pole fell over)
            else:
                reward = 1  # no penalty when the episode ends while still upright
        else:
            reward = 1  # reward for every step the pole stays upright

        episode_reward += reward  # accumulate the episode reward

        # Compute the discrete state s_{t+1} and update the Q-function
        next_state = digitize_state(observation)  # convert the observation at t+1 to a discrete state
        q_table = update_Qtable(q_table, state, action, reward, next_state)
        
        # Choose the next action a_{t+1}
        action = get_action(next_state, episode)    # a_{t+1} 
        
        state = next_state
        
        # End-of-episode handling
        if done:
            # NOTE(review): the mean printed here is computed before this
            # episode's reward is pushed into total_reward_vec below, so it
            # lags by one episode -- confirm whether that is intended.
            print('%d Episode finished after %f time steps / mean %f' %
                  (episode, t + 1, total_reward_vec.mean()))
            # Sliding window: drop the oldest reward and append this episode's
            total_reward_vec = np.hstack((total_reward_vec[1:],
                                          episode_reward))  # record the reward
            if islearned == 1:  # if learning has finished, store the final x coordinate
                final_x[episode, 0] = observation[0]
            break

    if (total_reward_vec.mean() >=
            goal_average_reward):  # success when the mean over the recent-episode window reaches the goal
        print('Episode %d train agent successfuly!' % episode)
        islearned = 1
        #np.savetxt('learned_Q_table.csv',q_table, delimiter=",") # use this to save the Q-table
        if isrender == 0:
            #env = wrappers.Monitor(env, './movie/cartpole-experiment-1') # use this to save a video
            isrender = 1
    # To see how the agent behaves after only 10 episodes, uncomment the lines below
    #if episode>10:
    #    if isrender == 0:
    #        env = wrappers.Monitor(env, './movie/cartpole-experiment-1') # use this to save a video
    #        isrender = 1
    #    islearned=1;

# Persist the recorded final cart positions only if learning succeeded
if islearned:
    np.savetxt('final_x.csv', final_x, delimiter=",")

0 Episode finished after 19.000000 time steps / mean 0.000000
1 Episode finished after 16.000000 time steps / mean -1.820000
2 Episode finished after 15.000000 time steps / mean -3.670000
3 Episode finished after 44.000000 time steps / mean -5.530000
4 Episode finished after 95.000000 time steps / mean -7.100000
5 Episode finished after 10.000000 time steps / mean -8.160000
6 Episode finished after 15.000000 time steps / mean -10.070000
7 Episode finished after 32.000000 time steps / mean -11.930000
8 Episode finished after 14.000000 time steps / mean -13.620000
9 Episode finished after 8.000000 time steps / mean -15.490000
10 Episode finished after 30.000000 time steps / mean -17.420000
11 Episode finished after 112.000000 time steps / mean -19.130000
12 Episode finished after 99.000000 time steps / mean -20.020000
13 Episode finished after 140.000000 time steps / mean -21.040000
14 Episode finished after 28.000000 time steps / mean -21.650000
15 Episode finished after 10.000000 time 

126 Episode finished after 119.000000 time steps / mean 19.390000
127 Episode finished after 70.000000 time steps / mean 19.040000
128 Episode finished after 75.000000 time steps / mean 15.730000
129 Episode finished after 98.000000 time steps / mean 16.090000
130 Episode finished after 133.000000 time steps / mean 16.850000
131 Episode finished after 15.000000 time steps / mean 17.250000
132 Episode finished after 9.000000 time steps / mean 16.850000
133 Episode finished after 98.000000 time steps / mean 12.930000
134 Episode finished after 14.000000 time steps / mean 13.260000
135 Episode finished after 21.000000 time steps / mean 12.740000
136 Episode finished after 18.000000 time steps / mean 11.970000
137 Episode finished after 112.000000 time steps / mean 11.770000
138 Episode finished after 11.000000 time steps / mean 8.880000
139 Episode finished after 47.000000 time steps / mean 7.180000
140 Episode finished after 200.000000 time steps / mean 6.020000
141 Episode finished afte

252 Episode finished after 173.000000 time steps / mean 52.150000
253 Episode finished after 200.000000 time steps / mean 52.670000
254 Episode finished after 162.000000 time steps / mean 52.670000
255 Episode finished after 113.000000 time steps / mean 53.070000
256 Episode finished after 121.000000 time steps / mean 53.010000
257 Episode finished after 125.000000 time steps / mean 52.550000
258 Episode finished after 103.000000 time steps / mean 52.590000
259 Episode finished after 175.000000 time steps / mean 52.340000
260 Episode finished after 145.000000 time steps / mean 52.260000
261 Episode finished after 200.000000 time steps / mean 49.700000
262 Episode finished after 200.000000 time steps / mean 52.290000
263 Episode finished after 148.000000 time steps / mean 52.290000
264 Episode finished after 159.000000 time steps / mean 52.070000
265 Episode finished after 112.000000 time steps / mean 52.400000
266 Episode finished after 200.000000 time steps / mean 52.240000
267 Episod

383 Episode finished after 200.000000 time steps / mean 70.880000
384 Episode finished after 140.000000 time steps / mean 70.880000
385 Episode finished after 200.000000 time steps / mean 68.270000
386 Episode finished after 200.000000 time steps / mean 71.030000
387 Episode finished after 200.000000 time steps / mean 74.270000
388 Episode finished after 200.000000 time steps / mean 74.270000
389 Episode finished after 200.000000 time steps / mean 74.270000
390 Episode finished after 173.000000 time steps / mean 77.300000
391 Episode finished after 200.000000 time steps / mean 75.020000
392 Episode finished after 200.000000 time steps / mean 78.780000
393 Episode finished after 200.000000 time steps / mean 82.460000
394 Episode finished after 140.000000 time steps / mean 82.460000
395 Episode finished after 200.000000 time steps / mean 79.850000
396 Episode finished after 200.000000 time steps / mean 79.850000
397 Episode finished after 200.000000 time steps / mean 83.760000
398 Episod

512 Episode finished after 200.000000 time steps / mean 131.130000
513 Episode finished after 190.000000 time steps / mean 131.130000
514 Episode finished after 154.000000 time steps / mean 129.030000
515 Episode finished after 116.000000 time steps / mean 126.560000
516 Episode finished after 142.000000 time steps / mean 123.710000
517 Episode finished after 200.000000 time steps / mean 121.120000
518 Episode finished after 200.000000 time steps / mean 121.120000
519 Episode finished after 200.000000 time steps / mean 121.120000
520 Episode finished after 200.000000 time steps / mean 121.120000
521 Episode finished after 200.000000 time steps / mean 121.120000
522 Episode finished after 200.000000 time steps / mean 121.120000
523 Episode finished after 200.000000 time steps / mean 121.120000
524 Episode finished after 200.000000 time steps / mean 121.120000
525 Episode finished after 200.000000 time steps / mean 121.120000
526 Episode finished after 200.000000 time steps / mean 123.67

637 Episode finished after 186.000000 time steps / mean 72.800000
638 Episode finished after 200.000000 time steps / mean 70.650000
639 Episode finished after 200.000000 time steps / mean 70.650000
640 Episode finished after 200.000000 time steps / mean 70.650000
641 Episode finished after 200.000000 time steps / mean 73.160000
642 Episode finished after 185.000000 time steps / mean 75.230000
643 Episode finished after 200.000000 time steps / mean 73.070000
644 Episode finished after 200.000000 time steps / mean 75.440000
645 Episode finished after 200.000000 time steps / mean 78.430000
646 Episode finished after 134.000000 time steps / mean 81.380000
647 Episode finished after 140.000000 time steps / mean 81.510000
648 Episode finished after 199.000000 time steps / mean 81.800000
649 Episode finished after 197.000000 time steps / mean 84.910000
650 Episode finished after 140.000000 time steps / mean 87.190000
651 Episode finished after 173.000000 time steps / mean 87.620000
652 Episod

763 Episode finished after 200.000000 time steps / mean 8.240000
764 Episode finished after 200.000000 time steps / mean 10.640000
765 Episode finished after 200.000000 time steps / mean 13.190000
766 Episode finished after 200.000000 time steps / mean 15.380000
767 Episode finished after 152.000000 time steps / mean 18.080000
768 Episode finished after 200.000000 time steps / mean 18.020000
769 Episode finished after 200.000000 time steps / mean 20.500000
770 Episode finished after 157.000000 time steps / mean 22.770000
771 Episode finished after 200.000000 time steps / mean 23.010000
772 Episode finished after 200.000000 time steps / mean 25.230000
773 Episode finished after 200.000000 time steps / mean 27.920000
774 Episode finished after 182.000000 time steps / mean 30.240000
775 Episode finished after 200.000000 time steps / mean 30.770000
776 Episode finished after 200.000000 time steps / mean 33.040000
777 Episode finished after 200.000000 time steps / mean 35.470000
778 Episode

891 Episode finished after 200.000000 time steps / mean 186.660000
892 Episode finished after 200.000000 time steps / mean 186.660000
893 Episode finished after 200.000000 time steps / mean 186.660000
894 Episode finished after 200.000000 time steps / mean 186.660000
895 Episode finished after 200.000000 time steps / mean 186.660000
896 Episode finished after 200.000000 time steps / mean 186.660000
897 Episode finished after 200.000000 time steps / mean 186.660000
898 Episode finished after 200.000000 time steps / mean 186.660000
899 Episode finished after 200.000000 time steps / mean 186.660000
900 Episode finished after 200.000000 time steps / mean 186.660000
901 Episode finished after 200.000000 time steps / mean 186.660000
902 Episode finished after 200.000000 time steps / mean 186.660000
903 Episode finished after 200.000000 time steps / mean 186.660000
904 Episode finished after 200.000000 time steps / mean 186.660000
905 Episode finished after 200.000000 time steps / mean 186.66

1014 Episode finished after 200.000000 time steps / mean 187.810000
1015 Episode finished after 200.000000 time steps / mean 190.350000
1016 Episode finished after 200.000000 time steps / mean 190.350000
1017 Episode finished after 200.000000 time steps / mean 190.350000
1018 Episode finished after 200.000000 time steps / mean 190.350000
1019 Episode finished after 200.000000 time steps / mean 190.350000
1020 Episode finished after 200.000000 time steps / mean 190.350000
1021 Episode finished after 200.000000 time steps / mean 190.350000
1022 Episode finished after 200.000000 time steps / mean 190.350000
1023 Episode finished after 200.000000 time steps / mean 190.350000
1024 Episode finished after 200.000000 time steps / mean 190.350000
1025 Episode finished after 200.000000 time steps / mean 190.350000
1026 Episode finished after 200.000000 time steps / mean 190.350000
1027 Episode finished after 200.000000 time steps / mean 192.890000
1028 Episode finished after 200.000000 time step

1137 Episode finished after 200.000000 time steps / mean 167.660000
1138 Episode finished after 200.000000 time steps / mean 170.180000
1139 Episode finished after 200.000000 time steps / mean 170.180000
1140 Episode finished after 200.000000 time steps / mean 170.180000
1141 Episode finished after 200.000000 time steps / mean 170.180000
1142 Episode finished after 200.000000 time steps / mean 173.250000
1143 Episode finished after 200.000000 time steps / mean 173.250000
1144 Episode finished after 200.000000 time steps / mean 173.250000
1145 Episode finished after 200.000000 time steps / mean 173.250000
1146 Episode finished after 200.000000 time steps / mean 173.250000
1147 Episode finished after 200.000000 time steps / mean 173.250000
1148 Episode finished after 200.000000 time steps / mean 173.250000
1149 Episode finished after 200.000000 time steps / mean 173.250000
1150 Episode finished after 182.000000 time steps / mean 173.250000
1151 Episode finished after 200.000000 time step

1262 Episode finished after 200.000000 time steps / mean 166.100000
1263 Episode finished after 200.000000 time steps / mean 166.100000
1264 Episode finished after 200.000000 time steps / mean 168.640000
1265 Episode finished after 200.000000 time steps / mean 168.640000
1266 Episode finished after 200.000000 time steps / mean 168.640000
1267 Episode finished after 200.000000 time steps / mean 168.640000
1268 Episode finished after 200.000000 time steps / mean 168.640000
1269 Episode finished after 200.000000 time steps / mean 168.640000
1270 Episode finished after 200.000000 time steps / mean 168.640000
1271 Episode finished after 200.000000 time steps / mean 168.640000
1272 Episode finished after 162.000000 time steps / mean 168.640000
1273 Episode finished after 200.000000 time steps / mean 166.250000
1274 Episode finished after 200.000000 time steps / mean 166.250000
1275 Episode finished after 200.000000 time steps / mean 168.640000
1276 Episode finished after 200.000000 time step

1385 Episode finished after 200.000000 time steps / mean 187.080000
1386 Episode finished after 200.000000 time steps / mean 187.080000
1387 Episode finished after 200.000000 time steps / mean 187.080000
1388 Episode finished after 200.000000 time steps / mean 187.080000
1389 Episode finished after 200.000000 time steps / mean 187.080000
1390 Episode finished after 200.000000 time steps / mean 187.080000
1391 Episode finished after 200.000000 time steps / mean 187.080000
1392 Episode finished after 200.000000 time steps / mean 187.080000
1393 Episode finished after 200.000000 time steps / mean 187.080000
1394 Episode finished after 200.000000 time steps / mean 187.080000
1395 Episode finished after 200.000000 time steps / mean 189.610000
1396 Episode finished after 200.000000 time steps / mean 189.610000
1397 Episode finished after 200.000000 time steps / mean 189.610000
1398 Episode finished after 200.000000 time steps / mean 189.610000
1399 Episode finished after 200.000000 time step

-0.24578237177857826
-0.24662499805085006
-0.2513741391199387
-0.2522243540033425
-0.25698145597272126
-0.25784007263161035
-0.26260604260524906
-0.2634740754078536
-0.2682500311563909
-0.26912871860019916
-0.27391601353785505
-0.27480684644384734
-0.2796070999764296
-0.2805118555357569
-0.2853269893490465
-0.2862477717058254
-0.2910800526381473
-0.2920193407833944
-0.29687143097868374
-0.29783213480050763
-0.2949033426339377
-0.28808508941331107
-0.2773755846584648
-0.2705758247822786
-0.2598767724654678
-0.2530803308422109
-0.25018187131240105
-0.2511782497907543
-0.25606805057133336
-0.2648516677541008
-0.27753123931745877
-0.2863056166859772
-0.2911797628866647
-0.29215729059812956
-0.28924015743030046
-0.28242853387570893
-0.27172082412779525
-0.26491863847392216
-0.26201747861399527
-0.2630142679423672
-0.26790760909621136
-0.2766978711122622
-0.2893871216102438
-0.2981740265828754
-0.30306380615602996
-0.3040602853701323
-0.30116559381944424
-0.2943800309835414
-0.28370207679532