In [1]:
# coding:utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import chainer
from chainer import Function, Variable, optimizers, serializers
from chainer import Link, Chain
import chainer.functions as F
import chainer.links as L
import copy
import gym

# Layer sizes handed to NeuralNetwork(...) in main()/experiment().
# NUM_IN matches the two-component state vector built by get_state_vec
# (pos, spe); NUM_OUT matches the three action indices [0, 1, 2] used by
# policy_egreedy_tri.
NUM_IN   = 2
NUM_HID1 = 1000
NUM_HID2 = 500
NUM_HID3 = 250
NUM_OUT  = 3
# Maximum number of stored transitions sampled per experience_replay call.
BATCH_SIZE = 50
# NOTE(review): ACTION is not referenced anywhere in the visible code —
# confirm whether it is still needed.
ACTION = [-2, 0, 2]
# Number of episodes run by deep_q_learn.
EPOCH = 3
# Discount factor applied to the next-state Q-value in make_target.
GAMMA = 0.9
# Initial exploration rate passed to policy_egreedy_tri by deep_q_learn.
EPSIL = 0.05
# File used by NeuralNetwork.save_weight / load_weight.
DQNMODEL_PATH = "./DQN_model/mounaincarver3.model"

def deep_q_learn(env, q_network, target_network, experince):
    """Train ``q_network`` online on ``env`` for ``EPOCH`` episodes.

    Every step selects an action with ``policy_egreedy_tri``, applies it via
    ``agent_action``, and immediately fits ``q_network`` on the single
    target produced by ``make_target``. Transitions are also appended to a
    local record list, which is emptied once it exceeds 1500 entries
    (the batch replay call itself is currently disabled).

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        q_network: network being trained (``forward``, ``init_grads``,
            ``backpropagation`` are used through the helpers).
        target_network: network used to evaluate the next state; replaced
            locally by a deep copy of ``q_network`` every 100 steps.
        experince: unused; kept for interface compatibility.

    Returns:
        None.
    """
    # Select the array module (processor).
    xp = switch_ProccerUnit("CPU")
    epsil = EPSIL
    record = []
    for episode in range(1, EPOCH+1):
        print(episode)
        # Define the initial state.
        now_state = env.reset()
        pos, spe = get_state(now_state)
        state_vec = get_state_vec(pos, spe, xp, 2)
        epsil = reduce_epsil(episode, epsil)
        for step in range(20000):
            # Choose an action according to the policy.
            action, _, action_list = policy_egreedy_tri(state_vec, q_network, epsil, xp)
            # Observe the next state produced by the action.
            next_state, reward, terminal, info = agent_action(env, action)
            if reward != -1: print(next_state, reward)
            # Unpack once; the values are reused for the state update below.
            next_pos, next_spe = get_state(next_state)
            deep_learn(action, q_network, state_vec, xp, [make_target(action, reward, next_state, terminal, target_network, xp, action_list)])
            if reward == 1:
                print("成功")
                # Was assigned to a misspelled name (`revord`) before.
                record = stock_record(record, pos, spe, action, 0, 0, reward, True, action_list)
                break
            # Update the current state.
            state_vec = get_state_vec(next_pos, next_spe, xp, 2)
            # Store the transition. The next speed is passed here; previously
            # the whole next_state array was passed in its place.
            record = stock_record(record, pos, spe, action, next_pos, next_spe, reward, 0, action_list)
            pos, spe = next_pos, next_spe
            # env.render()

            # Batch learning via experience replay (currently disabled).
            if len(record) > 1000:
                # experience_replay(record, q_network, target_network, xp)
                if len(record) > 1500:
                    record = []

            # Refresh the target network.
            if step % 100 == 0:
                print(step)
                # q_network.save_weight(DQNMODEL_PATH)
                target_network = update_target_network(q_network)
        
def get_state(state):
    """Return the first two components of ``state`` as a ``(pos, spe)`` tuple."""
    position = state[0]
    speed = state[1]
    return position, speed

def get_state_vec(pos, spe, xp, flg):
    """Build a float32 state vector from ``pos`` and ``spe``.

    Args:
        pos: first state component.
        spe: second state component.
        xp: array module providing ``array`` and ``float32``.
        flg: ``1`` returns the flat vector ``[pos, spe]``; any other value
            returns it wrapped in one extra leading dimension.

    Returns:
        An ``xp`` float32 array, either ``[pos, spe]`` or ``[[pos, spe]]``.
    """
    flat = xp.array([pos, spe], dtype=xp.float32)
    want_flat = (flg == 1)
    if not want_flat:
        # Wrap into a single-row batch.
        return xp.array([flat], dtype=xp.float32)
    return flat
    
    
def experience_replay(record, q_network, target_network, xp):
    """Fit ``q_network`` on a random mini-batch drawn from ``record``.

    At most ``BATCH_SIZE`` transitions are sampled without replacement.
    A target is built for each sampled transition with ``make_target`` and
    the whole batch is passed to ``deep_learn`` in one call.

    Args:
        record: list of transitions as appended by ``stock_record``.
        q_network: network to train.
        target_network: network used by ``make_target`` for the next state.
        xp: array module (must provide ``random.permutation``).

    Returns:
        None. Does nothing when ``record`` is empty.
    """
    # Nothing to sample from; avoid feeding an empty batch to the network.
    if len(record) == 0:
        return
    state_vecs, actions, next_states, rewards, terminals, action_lists = transelate(record, xp)
    # perm holds min(len(record), BATCH_SIZE) distinct indices.
    perm = xp.random.permutation(len(record))[:BATCH_SIZE]
    x_batch_state_vecs   = state_vecs[perm]
    x_batch_action       = actions[perm]
    x_batch_rewards      = rewards[perm]
    x_batch_terminals    = terminals[perm]
    x_batch_action_lists = action_lists[perm]
    # Iterate over the actual batch size: a fixed range(BATCH_SIZE) raised
    # IndexError whenever the record held fewer than BATCH_SIZE transitions.
    y_batch_targets = [
        make_target(x_batch_action[index], x_batch_rewards[index], next_states[perm[index]], x_batch_terminals[index], target_network, xp, x_batch_action_lists[index])
        for index in range(len(perm))
    ]
    deep_learn(x_batch_action, q_network, x_batch_state_vecs, xp, y_batch_targets)

def transelate(record, xp):
    """Split stored transitions into one collection per field.

    Each entry of ``record`` is indexed as
    ``[(pos, spe), action, next_state, reward, terminal, action_list]``.

    Args:
        record: list of transitions.
        xp: array module used to build the returned arrays.

    Returns:
        A 6-tuple ``(state_vecs, actions, next_states, rewards, terminals,
        action_lists)``. All items are ``xp`` arrays except ``next_states``,
        which stays a plain Python list.
    """
    state_vecs   = [get_state_vec(entry[0][0], entry[0][1], xp, 1) for entry in record]
    actions      = [entry[1] for entry in record]
    next_states  = [entry[2] for entry in record]
    rewards      = [entry[3] for entry in record]
    terminals    = [entry[4] for entry in record]
    action_lists = [entry[5] for entry in record]
    return (
        xp.array(state_vecs, dtype=xp.float32),
        xp.array(actions),
        next_states,
        xp.array(rewards),
        xp.array(terminals),
        xp.array(action_lists),
    )
          
    
def stock_record(record, pos, spe, action, next_pos, next_spe, reward, terminal, action_list):
    """Append one transition to ``record`` and return the same list.

    The stored entry has the layout
    ``[(pos, spe), action, (next_pos, next_spe), reward, terminal, action_list]``.

    Args:
        record: list to append to (mutated in place).
        pos, spe: components of the state the action was taken in.
        action: index of the action taken.
        next_pos, next_spe: components of the resulting state.
        reward: reward observed for the transition.
        terminal: terminal marker for the transition.
        action_list: Q-values the action was selected from.

    Returns:
        ``record`` itself, after the append.
    """
    # The next state previously stored next_pos twice and dropped next_spe.
    record.append([(pos, spe), action, (next_pos, next_spe), reward, terminal, action_list])
    return record
       
def make_target(action, reward, next_state, tereminal, target_network, xp, action_list):
    """Build the training target vector for one transition.

    The target is a copy of ``action_list`` in which only the entry for
    ``action`` is replaced: by ``reward`` when ``tereminal`` is truthy,
    otherwise by ``reward + GAMMA * max_q`` where ``max_q`` is the maximum
    value reported by ``policy_egreedy_tri`` on ``target_network`` for
    ``next_state``.

    Args:
        action: index of the entry to overwrite.
        reward: observed reward.
        next_state: indexable with at least two components.
        tereminal: truthy when the transition ends the episode.
        target_network: network evaluated on the next state.
        xp: array module used for the returned array.
        action_list: current Q-values; not modified (a deep copy is used).

    Returns:
        An ``xp`` float32 array with the updated values.
    """
    target_values = copy.deepcopy(action_list)
    next_vec = get_state_vec(next_state[0], next_state[1], xp, 2)
    # epsil=0 is passed; only the reported max Q-value is used here.
    _, best_next_q, _ = policy_egreedy_tri(next_vec, target_network, 0, xp)
    if tereminal:
        target_values[action] = reward
    else:
        target_values[action] = reward + GAMMA * best_next_q
    return xp.array(target_values, dtype=np.float32)

def deep_learn(action, q_network, state_vec, xp, y_target, flg=None):
    """Run one gradient update of ``q_network`` towards ``y_target``.

    Args:
        action: unused here.
        q_network: object providing ``init_grads()``,
            ``forward(1, x, t)`` and ``backpropagation(loss)``.
        state_vec: network input batch.
        xp: array module used to convert the target to float32.
        y_target: target values; converted with ``xp.array``.
        flg: unused.

    Returns:
        None.
    """
    target_batch = xp.array(y_target, dtype=xp.float32)
    # Reset gradients, compute the loss, then step the optimizer.
    q_network.init_grads()
    q_network.backpropagation(q_network.forward(1, state_vec, target_batch))
    
    
def policy_egreedy_tri(state, neural, epsil, xp):
    """Pick an action epsilon-greedily from one of two forward passes.

    The network is evaluated three times on ``state``. The first two
    outputs are candidates and the third is a reference; the candidate with
    the higher cosine similarity to the reference is used (the first one
    wins unless the second is strictly more similar).

    Args:
        state: network input.
        neural: object whose ``forward(0, state)`` returns something with a
            ``.data`` attribute; row 0 of ``.data`` is taken as Q-values.
        epsil: exploration threshold; a uniform random action from
            ``[0, 1, 2]`` is taken when ``random.random() > epsil`` is false.
        xp: array module used to wrap each candidate for the similarity.

    Returns:
        ``(action, max_q, q_values)`` where ``q_values`` is the chosen
        candidate and ``max_q`` its maximum.
    """
    import scipy.spatial.distance
    first_candidate = neural.forward(0, state).data[0]
    second_candidate = neural.forward(0, state).data[0]
    reference = np.array(neural.forward(0, state).data[0])
    similarities = [
        1 - scipy.spatial.distance.cosine(xp.array(candidate), reference)
        for candidate in (first_candidate, second_candidate)
    ]
    chosen = second_candidate if similarities[0] < similarities[1] else first_candidate
    # Draw the exploration sample before anything else, as the action is
    # the first element evaluated for the returned tuple.
    if random.random() > epsil:
        action = list(chosen).index(max(chosen))
    else:
        action = random.choice([0,1,2])
    return action, max(chosen), chosen
        
        
        
def agent_action(env, action):
    """Apply ``action`` to ``env`` and return the resulting transition.

    Args:
        env: object whose ``step(action)`` returns a 4-tuple
            ``(next_state, reward, terminal, info)``.
        action: action passed straight to ``env.step``.

    Returns:
        ``(next_state, reward, False, info)``. The third element is always
        ``False``, regardless of what the environment reported.
    """
    observation, reward, _env_terminal, info = env.step(action)
    # NOTE(review): the environment's terminal flag is discarded and False
    # is always returned, so callers never see an end-of-episode signal
    # from here — confirm this is intentional before changing it.
    return observation, reward, False, info

def update_target_network(q_network):
    """Return an independent deep copy of ``q_network`` to use as the target network."""
    snapshot = copy.deepcopy(q_network)
    return snapshot

def reduce_epsil(epoch, epsil):
    """Return the exploration rate for ``epoch``.

    Currently no decay is applied: ``epsil`` is returned unchanged and
    ``epoch`` is ignored.
    """
    unchanged_epsil = epsil
    return unchanged_epsil
                 
    
def switch_ProccerUnit(pu):
    """Return the array module to compute with.

    Args:
        pu: ``"GPU"`` selects ``chainer.cuda.cupy``; any other value
            selects NumPy.

    Returns:
        The selected array module.
    """
    if pu == "GPU":
        # `cuda` was never imported in this file, so this branch used to
        # raise NameError. Import it lazily so CPU-only runs are unaffected.
        from chainer import cuda
        return cuda.cupy
    return np
                 
    
def main():
    """Create the MountainCar-v0 environment, load saved weights and train.

    Builds the Q-network and a target network with the module-level layer
    sizes, loads weights from ``DQNMODEL_PATH`` into the Q-network, replaces
    the target network with a copy of it, and calls ``deep_q_learn``.
    """
    layer_sizes = (NUM_IN, NUM_HID1, NUM_HID2, NUM_HID3, NUM_OUT)
    env = gym.make("MountainCar-v0")
    q_network = NeuralNetwork(*layer_sizes)
    target_network = NeuralNetwork(*layer_sizes)
    q_network.load_weight(DQNMODEL_PATH)
    # The target network starts as an exact copy of the loaded Q-network.
    target_network = update_target_network(q_network)
    deep_q_learn(env, q_network, target_network, [0])
        

In [2]:
def start(env, q_network, target_network):
    """Run 50 greedy episodes on ``env`` while continuing to train ``q_network``.

    Each step selects an action with exploration rate 0.0, applies it, and
    fits ``q_network`` on the single resulting target. An episode ends in
    one of two ways:

    * the observed position is identical to the previous one: the step is
      trained once more with reward 1 and a terminal target, the weights
      are saved to ``DQNMODEL_PATH``, and the episode stops;
    * the step counter reaches 8000: "失敗" is printed, the weights are
      reloaded from ``DQNMODEL_PATH``, and the episode stops.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and
            ``render()``.
        q_network: network used for action selection and trained in place.
        target_network: network used for targets; replaced locally by a
            deep copy of ``q_network`` every 500 steps.

    Returns:
        None.
    """
    # Select the array module (processor).
    xp = switch_ProccerUnit("CPU")
    for i in range(50):
        # Define the initial state.
        now_state = env.reset()
        pos, spe = get_state(now_state)
        state_vec = get_state_vec(pos, spe, xp, 2)
        step = 1
        while True:
            action, _, action_list = policy_egreedy_tri(state_vec, q_network, 0.0, xp)
            next_state, reward, terminal, _ = agent_action(env, action)
            deep_learn(action, q_network, state_vec, xp, [make_target(action, reward, next_state, terminal, target_network, xp, action_list)])
            # Position unchanged since the previous observation: treated as
            # the end of the episode and rewarded with 1.
            if next_state[0] - pos == 0:
                print(next_state,reward,terminal)
                deep_learn(action, q_network, state_vec, xp, [make_target(action, 1, next_state, True, target_network, xp, action_list)])
                q_network.save_weight(DQNMODEL_PATH)
                break
            # Update the current state.
            pos, spe = get_state(next_state)
            state_vec = get_state_vec(pos, spe, xp, 2) 
            env.render()

            # Refresh the target network.
            if step % 500 == 0: target_network = update_target_network(q_network)
            # Step budget exhausted: discard this episode's updates by
            # reloading the last saved weights.
            if step % 8000 == 0: 
                print("失敗")
                q_network.load_weight(DQNMODEL_PATH)
                break
            step += 1
        
def experiment():
    """Create the MountainCar-v0 environment, load saved weights and run ``start``.

    Builds the Q-network and a target network with the module-level layer
    sizes, loads weights from ``DQNMODEL_PATH`` into the Q-network, replaces
    the target network with a copy of it, and hands everything to ``start``.
    """
    layer_sizes = (NUM_IN, NUM_HID1, NUM_HID2, NUM_HID3, NUM_OUT)
    env = gym.make("MountainCar-v0")
    q_network = NeuralNetwork(*layer_sizes)
    target_network = NeuralNetwork(*layer_sizes)
    q_network.load_weight(DQNMODEL_PATH)
    # The target network starts as an exact copy of the loaded Q-network.
    target_network = update_target_network(q_network)
    start(env, q_network, target_network)

In [3]:
# coding:utf-8
import chainer
from chainer import Function, Variable, optimizers, serializers
from chainer import Link, Chain
import chainer.functions as F
import chainer.links as L
import matplotlib.pyplot as plt
import numpy as np

class NeuralNetwork:
    """Fully connected network with three hidden layers and an Adam optimizer.

    Wraps a Chainer ``Chain`` (``self.model``) and the optimizer set up on
    it (``self.optimizer``).
    """

    def __init__(self, num_in, num_hid1, num_hid2, num_hid3, num_out):
        """Create the layers and attach an Adam optimizer.

        Args:
            num_in: input size.
            num_hid1, num_hid2, num_hid3: sizes of the three hidden layers.
            num_out: output size.
        """
        # The output layer starts from all-zero weights.
        zero_out_weights = np.zeros((num_out, num_hid3), dtype=np.float32)
        self.model = Chain(
            hid_layer1=L.Linear(num_in, num_hid1),
            hid_layer2=L.Linear(num_hid1, num_hid2),
            hid_layer3=L.Linear(num_hid2, num_hid3),
            out_layer=L.Linear(num_hid3, num_out, initialW=zero_out_weights)
        )
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)

    def forward(self, flg, x, t = None):
        """Run the network on ``x``.

        Args:
            flg: when truthy, return the mean squared error between the
                output and ``t``; otherwise return the raw output. Callers
                in this file pass ``1`` (training) or ``0`` (inference).
            x: input array, wrapped in a ``Variable``.
            t: target array; wrapped in a ``Variable`` only when
                ``flg == 1``.

        Returns:
            The loss when ``flg`` is truthy, otherwise the output of the
            last layer.
        """
        hidden = Variable(x)
        if flg == 1:
            target = Variable(t)
        # NOTE(review): F.dropout is called on every pass, including
        # flg=0 — whether it is active then depends on the Chainer
        # version/configuration; confirm.
        for layer in (self.model.hid_layer1, self.model.hid_layer2, self.model.hid_layer3):
            hidden = F.dropout(F.relu(layer(hidden)))
        output = self.model.out_layer(hidden)
        if flg:
            return F.mean_squared_error(output, target)
        return output

    def backpropagation(self, loss):
        """Backpropagate ``loss`` and apply one optimizer update."""
        loss.backward()
        self.optimizer.update()

    def init_grads(self):
        """Reset gradients through the optimizer."""
        self.optimizer.zero_grads()

    def save_weight(self, model):
        """Save the model parameters to the npz file at path ``model``."""
        serializers.save_npz(model, self.model)

    def load_weight(self, model):
        """Load the model parameters from the npz file at path ``model``."""
        serializers.load_npz(model, self.model)

    def policy_greedy(self, actions):
        """Return the maximum of ``actions.data`` along axis 1."""
        row_max = np.max(actions.data, axis = 1)
        return row_max

In [4]:
# Load the saved weights and run the greedy training/evaluation loop.
experiment()

[2017-05-13 22:12:12,045] Making new env: MountainCar-v0


[ 0.6         0.03824944] -1.0 False
[ 0.6         0.01887101] -1.0 False
[ 0.6         0.02786123] -1.0 False
[ 0.6         0.02813036] -1.0 False
[ 0.6         0.02952298] -1.0 False
[ 0.6         0.02793485] -1.0 False
[ 0.6         0.02505589] -1.0 False
[ 0.6         0.02017654] -1.0 False
[ 0.6         0.03250361] -1.0 False
[ 0.6         0.01602706] -1.0 False
[ 0.6        0.0197521] -1.0 False
[ 0.6         0.02076825] -1.0 False
[ 0.6         0.02484054] -1.0 False
失敗
[ 0.6         0.01296905] -1.0 False
[ 0.6        0.0181213] -1.0 False
[ 0.6        0.0302109] -1.0 False
[ 0.6         0.02483365] -1.0 False
失敗
[ 0.6         0.03762514] -1.0 False
[ 0.6         0.01847052] -1.0 False
[ 0.6         0.03372728] -1.0 False
失敗
[ 0.6         0.03846623] -1.0 False
失敗
[ 0.6         0.04307957] -1.0 False
[ 0.6         0.02603241] -1.0 False
失敗
[ 0.6         0.02062242] -1.0 False
失敗
[ 0.6         0.04521863] -1.0 False
[ 0.6         0.02368322] -1.0 False
[ 0.6         0.01906601] 