In [1]:
# coding: utf-8
from collections import deque
import os

import numpy as np
import time

import tensorflow as tf

import gym
from gym import wrappers

np.random.seed(7)

# 過去何コマを見るか
STATE_NUM = 4


class DQNAgent():
    """
    Multi Layer Perceptron with Experience Replay
    """

    def __init__(self, epsilon=0.99):
        # parameters
        # self.name = os.path.splitext(os.path.basename(__file__))[0]
        # self.environment_name = environment_name
        self.enable_actions = [0,1] 
        self.n_actions = len(self.enable_actions)
        self.minibatch_size = 32
        self.replay_memory_size = 300*100
        self.learning_rate = 0.001
        self.discount_factor = 0.9
        self.exploration = 0.1
        self.epsilon = epsilon
        self.experienceMemory=[] # 経験メモリ（エピソードローカル）
        self.experienceMemory_local=[] # 経験メモリ（エピソードローカル）
        self.memSize = 300*100  # 経験メモリのサイズ(300サンプリングx100エピソード)
        self.experienceMemory_local=[] # 経験メモリ（エピソードローカル）
        self.memPos = 0 #メモリのインデックス
        self.batch_num = 32 # 学習に使うバッチサイズ
        self.gamma = 0.9       # 割引率
        self.loss=0
        self.total_reward_award=np.ones(100)*-1000 #100エピソード
        


        # replay memory
        self.D = deque(maxlen=self.replay_memory_size)

        # model
        self.init_model()

        # variables
        self.current_loss = 0.0

    def init_model(self):

        # input layer (1 x 4)
        self.x = tf.placeholder(tf.float32, [4])

        # flatten (64)
        x_flat = tf.reshape(self.x, [-1, 4])

        # fully connected layer (32)
        W_fc1 = tf.Variable(tf.truncated_normal([4 ,16], stddev=0.01))
        b_fc1 = tf.Variable(tf.zeros([16]))
        h_fc1 = tf.nn.relu(tf.matmul(x_flat, W_fc1) + b_fc1)

        # output layer (n_actions)
        W_out = tf.Variable(tf.truncated_normal([16, self.n_actions], stddev=0.01))
        b_out = tf.Variable(tf.zeros([self.n_actions]))
        self.y = tf.matmul(h_fc1, W_out) + b_out

        # loss function
        self.y_ = tf.placeholder(tf.float32, [self.n_actions])
        self.loss = tf.reduce_mean(tf.square(self.y_ - self.y))

        # train operation
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        self.training = optimizer.minimize(self.loss)

        # saver
        self.saver = tf.train.Saver()

        # session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def get_action_value(self, state):
        #Q_values(self, state): #
        # Q(state, action) of all actions
        return self.sess.run(self.y, feed_dict={self.x: state})[0]

    def reduce_epsilon(self):
        self.epsilon-=1.0/100000

    def get_epsilon(self):
        return self.epsilon

    def get_action(self, state, train):
        if train==True and np.random.rand() < self.epsilon:
            # random
            return np.random.choice(self.enable_actions)
        else:
            # max_action Q(state, action)
            return self.enable_actions[np.argmax(self.get_action_value(state))]

    def experience_local(self,old_seq, action, reward, new_seq):
        #エピソードローカルな記憶
        self.experienceMemory_local.append( np.hstack([old_seq,action,reward,new_seq]) )
 
    def store_experience(self, state, action, reward, state_1, train):
        self.D.append((state, action, reward, state_1, train))


    def experience_global(self,total_reward):
        #グローバルな記憶
        #ベスト100に入る経験を取り込む
        if np.min(self.total_reward_award)<total_reward:
            i=np.argmin(self.total_reward_award)
            self.total_reward_award[i]=total_reward

            # GOOD EXPERIENCE REPLAY
            for x in self.experienceMemory_local:
                self.experience( x )

        #一定確率で優秀でないものも取り込む
        if np.random.random()<0.01:
            # # NORMAL EXPERIENCE REPLAY
            for x in self.experienceMemory_local:
                self.experience( x )

        self.experienceMemory_local=[]

    def experience(self,x):
        if len(self.experienceMemory)>self.memSize:
            self.experienceMemory[int(self.memPos%self.memSize)]=x
            self.memPos+=1
        else:
            self.experienceMemory.append( x )
        
    def experience_replay(self):
        state_minibatch = []
        y_minibatch = []

        # sample random minibatch
        minibatch_size = min(len(self.D), self.minibatch_size)
        minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)

        for j in minibatch_indexes:
            state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
            action_j_index = self.enable_actions.index(action_j)
            state_j
            y_j = self.get_action_value(state_j)

            if terminal:
                y_j[action_j_index] = reward_j
            else:
                # reward_j + gamma * max_action' Q(state', action')
                y_j[action_j_index] = reward_j + self.discount_factor * np.max(self.get_action_value(state_j_1))  # NOQA

            state_minibatch.append(state_j)
            y_minibatch.append(y_j)

        # training
        self.sess.run(self.training, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})

        # for log
        self.current_loss = self.sess.run(self.loss, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
    
    
    def update_model(self,old_seq, action, reward, new_seq):
        '''
        モデルを更新する
        '''
        # 経験メモリにたまってない場合は更新しない
        if len(self.experienceMemory)<self.batch_num:
            return

        # 経験メモリからバッチを作成
        memsize=len(self.experienceMemory)
        batch_index = list(np.random.randint(0,memsize,(self.batch_num)))
        batch =np.array( [self.experienceMemory[i] for i in batch_index ])
        x = batch[:,0:STATE_NUM].reshape( (self.batch_num,-1)).astype(np.float32)
        # print(x)
        targets=[]
        # targets=np.zeros((x.shape[0],4))
        # targets=np.array([self.get_action_value(x[j]) for i in x.shape[0]])
        for j in range(x.shape[0]): 
        #   targets[j]=self.get_action_value(x[j])        
            targets.append(np.ndarray.tolist(self.get_action_value(x[j])))
        #targets=self.get_action_value(x).data.copy()
        targets = np.array(targets)
        # print("targets=",targets)
        
        for i in range(self.batch_num):
            #[ seq..., action, reward, seq_new]
            a = batch[i,STATE_NUM]
            r = batch[i, STATE_NUM+1]
            ai=int((a+1)/2) #±1 をindex(0,1)に。
            new_seq= batch[i,(STATE_NUM+2):(STATE_NUM*2+2)]
            targets[i,ai]=( r+ self.gamma * np.max(self.get_action_value(new_seq)))
        t =targets
        #t = Variable(np.array(targets).reshape((self.batch_num,-1)).astype(np.float32))
         

        # ネットの更新
        # self.model.zerograds()
        # loss=self.model(x ,t)
        # self.loss = loss.data
        # loss.backward()
        # self.optimizer.update()

        # training
        for i in range(x.shape[0]):
            x1 = x[i]
            t2 = t[i]
            self.sess.run(self.training, feed_dict={self.x: x1, self.y_: t2})
            # for log
            self.current_loss = self.sess.run(self.loss, feed_dict={self.x: x1 ,self.y_: t2})


class pendulumEnvironment():
    '''
    model
    '''
    def __init__(self):
        self.env = wrappers.Monitor(gym.make('CartPole-v0'), './private/tmp/cartpole-experiment-3', force = True)

    def reset(self):
        self.env.reset()

    def step(self, action):
        return self.env.step(action)

    def monitor_close(self):
        self.env.close()

# シミュレータ。
class simulator:
    def __init__(self, environment, agent):
        self.agent = agent
        self.env = environment
        self.num_seq=STATE_NUM
        self.reset_seq()
        self.learning_rate=1.0
        self.highscore=0
        self.log=[]

    def reset_seq(self):
        self.seq=np.zeros(self.num_seq)

    def push_seq(self, state):
        self.seq[1:self.num_seq]=self.seq[0:self.num_seq-1]
        self.seq[0]=state

    def run(self, train=True):

        self.env.reset()
        self.reset_seq()
        total_reward=0

        for i in range(300):
            # 現在のstateからなるシーケンスを保存
            old_seq = self.seq.copy()

            # エージェントの行動を決める
            action = self.agent.get_action(old_seq,train)

            # 環境に行動を入力する
            observation, reward, done, info =  self.env.step(action)
            total_reward +=reward

            # 結果を観測してstateとシーケンスを更新する
            state = observation[2]
            self.push_seq(state)
            new_seq = self.seq.copy()

            # エピソードローカルなメモリに記憶する
            self.agent.store_experience(old_seq, action, reward, new_seq,train)
            self.agent.experience_local(old_seq, action, reward, new_seq)

            if done:
                print("Episode finished after {} timesteps".format(i+1))
                break

        # エピソードローカルなメモリ内容をグローバルなメモリに移す
        self.agent.experience_global(total_reward)

        if train:
            # 学習用メモリを使ってモデルを更新する
            self.agent.update_model(old_seq, action, reward, new_seq)
            # self.agent.experience_replay()
            self.agent.reduce_epsilon()

        return total_reward

if __name__ == '__main__':
    agent=DQNAgent()
    env=pendulumEnvironment()
    sim=simulator(env,agent)

    best_reward1 = 0
    for i in range(1000):
        total_reward1 = sim.run(train=True)
        if best_reward1 < total_reward1:
            best_reward1 = total_reward1

        print(str(i) + " " + str(total_reward1) + " " + str(best_reward1))
        env.reset()

        if best_reward1 > 195:
            break

    env.monitor_close()
    gym.upload('./private/tmp/cartpole-experiment-3', api_key='sk_GDB9izzTxu1PyxNAdhcw')


[2017-05-31 06:59:02,419] Making new env: CartPole-v0
[2017-05-31 06:59:02,468] Clearing 24 monitor files from previous run (because force=True was provided)
[2017-05-31 06:59:02,474] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000000.mp4
[2017-05-31 06:59:04,937] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000001.mp4


Episode finished after 48 timesteps
0 48.0 48.0
Episode finished after 16 timesteps


[2017-05-31 06:59:05,158] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000008.mp4


1 16.0 48.0
Episode finished after 10 timesteps
2 10.0 48.0
Episode finished after 36 timesteps
3 36.0 48.0
Episode finished after 15 timesteps
4 15.0 48.0
Episode finished after 19 timesteps
5 19.0 48.0
Episode finished after 26 timesteps
6 26.0 48.0
Episode finished after 26 timesteps
7 26.0 48.0
Episode finished after 19 timesteps


[2017-05-31 06:59:05,918] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000027.mp4


8 19.0 48.0
Episode finished after 21 timesteps
9 21.0 48.0
Episode finished after 24 timesteps
10 24.0 48.0
Episode finished after 26 timesteps
11 26.0 48.0
Episode finished after 13 timesteps
12 13.0 48.0
Episode finished after 30 timesteps
13 30.0 48.0
Episode finished after 14 timesteps
14 14.0 48.0
Episode finished after 19 timesteps
15 19.0 48.0
Episode finished after 32 timesteps
16 32.0 48.0
Episode finished after 13 timesteps
17 13.0 48.0
Episode finished after 43 timesteps
18 43.0 48.0
Episode finished after 13 timesteps
19 13.0 48.0
Episode finished after 41 timesteps
20 41.0 48.0
Episode finished after 17 timesteps
21 17.0 48.0
Episode finished after 31 timesteps
22 31.0 48.0
Episode finished after 15 timesteps
23 15.0 48.0
Episode finished after 9 timesteps
24 9.0 48.0
Episode finished after 32 timesteps
25 32.0 48.0
Episode finished after 14 timesteps
26 14.0 48.0
Episode finished after 32 timesteps
27 32.0 48.0
Episode finished after 25 timesteps
28 25.0 48.0
Episode fin

[2017-05-31 06:59:06,762] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000064.mp4


29 18.0 48.0
Episode finished after 38 timesteps
30 38.0 48.0
Episode finished after 17 timesteps
31 17.0 48.0
Episode finished after 26 timesteps
32 26.0 48.0
Episode finished after 13 timesteps
33 13.0 48.0
Episode finished after 25 timesteps
34 25.0 48.0
Episode finished after 31 timesteps
35 31.0 48.0
Episode finished after 40 timesteps
36 40.0 48.0
Episode finished after 13 timesteps
37 13.0 48.0
Episode finished after 28 timesteps
38 28.0 48.0
Episode finished after 14 timesteps
39 14.0 48.0
Episode finished after 12 timesteps
40 12.0 48.0
Episode finished after 64 timesteps
41 64.0 64.0
Episode finished after 11 timesteps
42 11.0 64.0
Episode finished after 26 timesteps
43 26.0 64.0
Episode finished after 36 timesteps
44 36.0 64.0
Episode finished after 10 timesteps
45 10.0 64.0
Episode finished after 15 timesteps
46 15.0 64.0
Episode finished after 17 timesteps
47 17.0 64.0
Episode finished after 20 timesteps
48 20.0 64.0
Episode finished after 22 timesteps
49 22.0 64.0
Episode

[2017-05-31 06:59:08,616] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000125.mp4


58 15.0 64.0
Episode finished after 12 timesteps
59 12.0 64.0
Episode finished after 21 timesteps
60 21.0 64.0
Episode finished after 21 timesteps
61 21.0 64.0
Episode finished after 64 timesteps
62 64.0 64.0
Episode finished after 11 timesteps
63 11.0 64.0
Episode finished after 19 timesteps
64 19.0 64.0
Episode finished after 19 timesteps
65 19.0 64.0
Episode finished after 18 timesteps
66 18.0 64.0
Episode finished after 12 timesteps
67 12.0 64.0
Episode finished after 12 timesteps
68 12.0 64.0
Episode finished after 72 timesteps
69 72.0 72.0
Episode finished after 14 timesteps
70 14.0 72.0
Episode finished after 11 timesteps
71 11.0 72.0
Episode finished after 26 timesteps
72 26.0 72.0
Episode finished after 12 timesteps
73 12.0 72.0
Episode finished after 12 timesteps
74 12.0 72.0
Episode finished after 13 timesteps
75 13.0 72.0
Episode finished after 26 timesteps
76 26.0 72.0
Episode finished after 38 timesteps
77 38.0 72.0
Episode finished after 46 timesteps
78 46.0 72.0
Episode

[2017-05-31 06:59:10,482] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000216.mp4


104 18.0 72.0
Episode finished after 18 timesteps
105 18.0 72.0
Episode finished after 29 timesteps
106 29.0 72.0
Episode finished after 13 timesteps
107 13.0 72.0
Episode finished after 27 timesteps
108 27.0 72.0
Episode finished after 10 timesteps
109 10.0 72.0
Episode finished after 41 timesteps
110 41.0 72.0
Episode finished after 12 timesteps
111 12.0 72.0
Episode finished after 24 timesteps
112 24.0 72.0
Episode finished after 11 timesteps
113 11.0 72.0
Episode finished after 14 timesteps
114 14.0 72.0
Episode finished after 14 timesteps
115 14.0 72.0
Episode finished after 16 timesteps
116 16.0 72.0
Episode finished after 28 timesteps
117 28.0 72.0
Episode finished after 16 timesteps
118 16.0 72.0
Episode finished after 23 timesteps
119 23.0 72.0
Episode finished after 12 timesteps
120 12.0 72.0
Episode finished after 13 timesteps
121 13.0 72.0
Episode finished after 26 timesteps
122 26.0 72.0
Episode finished after 15 timesteps
123 15.0 72.0
Episode finished after 25 timesteps


[2017-05-31 06:59:13,606] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000343.mp4


168 28.0 72.0
Episode finished after 18 timesteps
169 18.0 72.0
Episode finished after 14 timesteps
170 14.0 72.0
Episode finished after 29 timesteps
171 29.0 72.0
Episode finished after 24 timesteps
172 24.0 72.0
Episode finished after 53 timesteps
173 53.0 72.0
Episode finished after 23 timesteps
174 23.0 72.0
Episode finished after 23 timesteps
175 23.0 72.0
Episode finished after 16 timesteps
176 16.0 72.0
Episode finished after 18 timesteps
177 18.0 72.0
Episode finished after 18 timesteps
178 18.0 72.0
Episode finished after 35 timesteps
179 35.0 72.0
Episode finished after 33 timesteps
180 33.0 72.0
Episode finished after 25 timesteps
181 25.0 72.0
Episode finished after 12 timesteps
182 12.0 72.0
Episode finished after 29 timesteps
183 29.0 72.0
Episode finished after 12 timesteps
184 12.0 72.0
Episode finished after 34 timesteps
185 34.0 72.0
Episode finished after 18 timesteps
186 18.0 72.0
Episode finished after 12 timesteps
187 12.0 72.0
Episode finished after 45 timesteps


[2017-05-31 06:59:17,032] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000512.mp4


255 12.0 72.0
Episode finished after 24 timesteps
256 24.0 72.0
Episode finished after 11 timesteps
257 11.0 72.0
Episode finished after 15 timesteps
258 15.0 72.0
Episode finished after 13 timesteps
259 13.0 72.0
Episode finished after 13 timesteps
260 13.0 72.0
Episode finished after 19 timesteps
261 19.0 72.0
Episode finished after 19 timesteps
262 19.0 72.0
Episode finished after 55 timesteps
263 55.0 72.0
Episode finished after 35 timesteps
264 35.0 72.0
Episode finished after 21 timesteps
265 21.0 72.0
Episode finished after 22 timesteps
266 22.0 72.0
Episode finished after 14 timesteps
267 14.0 72.0
Episode finished after 27 timesteps
268 27.0 72.0
Episode finished after 18 timesteps
269 18.0 72.0
Episode finished after 18 timesteps
270 18.0 72.0
Episode finished after 13 timesteps
271 13.0 72.0
Episode finished after 20 timesteps
272 20.0 72.0
Episode finished after 22 timesteps
273 22.0 72.0
Episode finished after 14 timesteps
274 14.0 72.0
Episode finished after 21 timesteps


[2017-05-31 06:59:21,840] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video000729.mp4


363 53.0 72.0
Episode finished after 29 timesteps
364 29.0 72.0
Episode finished after 36 timesteps
365 36.0 72.0
Episode finished after 34 timesteps
366 34.0 72.0
Episode finished after 35 timesteps
367 35.0 72.0
Episode finished after 11 timesteps
368 11.0 72.0
Episode finished after 28 timesteps
369 28.0 72.0
Episode finished after 20 timesteps
370 20.0 72.0
Episode finished after 21 timesteps
371 21.0 72.0
Episode finished after 10 timesteps
372 10.0 72.0
Episode finished after 24 timesteps
373 24.0 72.0
Episode finished after 11 timesteps
374 11.0 72.0
Episode finished after 21 timesteps
375 21.0 72.0
Episode finished after 19 timesteps
376 19.0 72.0
Episode finished after 18 timesteps
377 18.0 72.0
Episode finished after 27 timesteps
378 27.0 72.0
Episode finished after 20 timesteps
379 20.0 72.0
Episode finished after 17 timesteps
380 17.0 72.0
Episode finished after 11 timesteps
381 11.0 72.0
Episode finished after 10 timesteps
382 10.0 72.0
Episode finished after 21 timesteps


[2017-05-31 06:59:27,303] Starting new video recorder writing to /Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3/openaigym.video.0.67716.video001000.mp4


495 16.0 72.0
Episode finished after 32 timesteps
496 32.0 72.0
Episode finished after 18 timesteps
497 18.0 72.0
Episode finished after 18 timesteps
498 18.0 72.0
Episode finished after 25 timesteps
499 25.0 72.0
Episode finished after 15 timesteps
500 15.0 72.0
Episode finished after 37 timesteps
501 37.0 72.0
Episode finished after 13 timesteps
502 13.0 72.0
Episode finished after 10 timesteps
503 10.0 72.0
Episode finished after 21 timesteps
504 21.0 72.0
Episode finished after 14 timesteps
505 14.0 72.0
Episode finished after 58 timesteps
506 58.0 72.0
Episode finished after 47 timesteps
507 47.0 72.0
Episode finished after 15 timesteps
508 15.0 72.0
Episode finished after 14 timesteps
509 14.0 72.0
Episode finished after 13 timesteps
510 13.0 72.0
Episode finished after 11 timesteps
511 11.0 72.0
Episode finished after 21 timesteps
512 21.0 72.0
Episode finished after 20 timesteps
513 20.0 72.0
Episode finished after 13 timesteps
514 13.0 72.0
Episode finished after 13 timesteps


[2017-05-31 06:59:47,607] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/nanaki/Documents/desk/code/AI_gym/CarPole/private/tmp/cartpole-experiment-3')
[2017-05-31 06:59:47,617] [CartPole-v0] Uploading 1000 episodes of training data


Episode finished after 81 timesteps
996 81.0 83.0
Episode finished after 13 timesteps
997 13.0 83.0
Episode finished after 25 timesteps
998 25.0 83.0
Episode finished after 29 timesteps
999 29.0 83.0


[2017-05-31 06:59:51,509] [CartPole-v0] Uploading videos of 11 training episodes (20626 bytes)
[2017-05-31 06:59:53,055] [CartPole-v0] Creating evaluation object from ./private/tmp/cartpole-experiment-3 with learning curve and training video
[2017-05-31 06:59:53,668] 
****************************************************
You successfully uploaded your evaluation on CartPole-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_HnX9rsnfQUSLo49IW4K0Pg

****************************************************
