In [8]:
# coding:utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import chainer
from chainer import Function, Variable, optimizers, serializers
from chainer import Link, Chain
import chainer.functions as F
import chainer.links as L
import copy
import gym

# --- Network layout ---------------------------------------------------------
NUM_IN = 2        # input width; get_state_vec builds 2-element state vectors
NUM_HID1 = 1000   # width of the first hidden layer
NUM_HID2 = 500    # width of the second hidden layer
NUM_HID3 = 250    # width of the third hidden layer
NUM_OUT = 3       # output width; deep_learn builds one target per ACTION entry

# --- Training settings ------------------------------------------------------
BATCH_SIZE = 32       # not referenced by the code visible in this notebook
ALPH = 0.1            # not referenced by the code visible in this notebook
ACTION = [-2, 0, 2]   # only len(ACTION) is used; env.step receives the index
EPOCH = 100           # number of episodes run by deep_q_learn
GAMMA = 0.99          # discount applied to the bootstrapped value in deep_learn
# Checkpoint file used by save_weight / load_weight. The spelling is part of
# the on-disk name, so it must stay as it is to keep old checkpoints loadable.
DQNMODEL_PATH = "./DQN_model/mounaincar.model"

def deep_q_learn(env, q_network, target_network, experince, max_steps=200):
    """Train ``q_network`` on ``env`` for ``EPOCH`` episodes.

    Each episode resets the environment and then, for at most ``max_steps``
    steps, picks an action with ``policy_egreedy_tri``, fits the network with
    ``deep_learn`` and advances the environment. Weights are written to
    ``DQNMODEL_PATH`` every 10th episode.

    Args:
        env: Gym-style environment exposing ``reset()`` and ``step(action)``.
        q_network: Network being trained; must provide ``forward``,
            ``init_grads``, ``backpropagation`` and ``save_weight``.
        target_network: Network used by ``deep_learn`` to evaluate next states.
        experince: Unused; kept so existing callers keep working.
        max_steps: Upper bound on steps per episode (defaults to the
            previously hard-coded 200).

    Returns:
        None.
    """
    # Select the array backend (numpy on CPU).
    xp = switch_ProccerUnit("CPU")
    epsil = 0.2
    for episode in range(1, EPOCH+1):
        # Define the initial state of the episode.
        now_state = env.reset()
        pos, spe = get_state(now_state)
        state_vec = get_state_vec(pos, spe, xp, 2)
        epsil = reduce_epsil(episode, epsil)
        for step in range(max_steps):
            action, max_q = policy_egreedy_tri(env, state_vec, q_network, epsil, xp)
            # Fit the network BEFORE stepping the real environment: deep_learn
            # simulates every action from a copy of ``env``, so ``env`` must
            # still be in the state that ``state_vec`` describes. Calling it
            # after the step paired targets for the next state with the
            # vector of the previous one.
            deep_learn(q_network, target_network, state_vec, env, xp)
            next_state, reward, terminal, info = agent_action(env, action)
            # Stop as soon as the environment reports the episode is over;
            # the ``reward == 1`` test is kept for backward compatibility.
            if terminal or reward == 1:
                break
            # Move on to the observed next state.
            pos, spe = get_state(next_state)
            state_vec = get_state_vec(pos, spe, xp, 2)

        # NOTE(review): target_network is never refreshed inside this loop;
        # confirm whether a periodic update_target_network() call is wanted.
        if episode % 10 == 0:
            q_network.save_weight(DQNMODEL_PATH)
        
def get_state(state):
    """Return the first two components of ``state`` as a 2-tuple.

    Callers in this notebook unpack the result as ``pos, spe``.
    """
    first = state[0]
    second = state[1]
    return first, second
                 

def get_state_vec(pos, spe, xp, flg):
    """Pack ``pos`` and ``spe`` into a float32 array built with ``xp``.

    Args:
        pos: First state component.
        spe: Second state component.
        xp: Array module providing ``array`` and ``float32`` (e.g. numpy).
        flg: ``1`` returns the flat 2-element vector; any other value returns
            that vector wrapped in one extra leading axis.

    Returns:
        A float32 array created via ``xp.array``.
    """
    flat = xp.array([pos, spe], dtype=xp.float32)
    if flg != 1:
        # Wrap the vector in a list to add a leading axis of length one.
        return xp.array([flat], dtype=xp.float32)
    return flat
        
    
    
def deep_learn(q_network, target_network, state_vec, env, xp, flg=None):
    """Build one-step targets for every action and fit ``q_network`` to them.

    For each action index in ``range(len(ACTION))`` the environment is
    deep-copied, the action is applied to the copy, and the target is
    ``reward + GAMMA * max_q`` where ``max_q`` comes from ``target_network``
    evaluated on the resulting state. When the simulated step ends the
    episode, the target is the reward alone.

    Args:
        q_network: Network to update (``init_grads``, ``forward``,
            ``backpropagation``).
        target_network: Network used to evaluate the simulated next states.
        state_vec: Input passed to ``q_network.forward`` for the update.
        env: Environment to simulate from; it is copied, never stepped.
        xp: Array module used to build the target arrays.
        flg: When truthy, return the target vector and skip the update.

    Returns:
        The float32 target vector when ``flg`` is truthy, otherwise ``None``.
    """
    y_targets = []
    for action in range(len(ACTION)):
        # Work on a copy so the caller's environment is left untouched.
        env_try = copy.deepcopy(env)
        # Observe the next state reached by this action.
        next_state, reward, terminal, info = agent_action(env_try, action)
        if terminal:
            # Nothing follows a terminal transition, so do not bootstrap.
            y_target = reward
        else:
            next_pos, next_spe = get_state(next_state)
            # Greedy value of the next state according to the target network.
            # The reward used below is the one produced by ``action`` itself;
            # the former extra simulated step overwrote it with the reward of
            # the following transition.
            _, max_q = policy_egreedy_tri(env_try, get_state_vec(next_pos, next_spe, xp, 2), target_network, 0, xp)
            y_target = reward + GAMMA * max_q
        y_targets.append(y_target)
    y_targets = xp.array(y_targets, dtype=np.float32)
    if flg: return y_targets
    # Add a leading axis of length one before handing the targets to forward().
    y_target = xp.array([y_targets], dtype=np.float32)
    q_network.init_grads()
    loss = q_network.forward(1, state_vec, y_target)
    q_network.backpropagation(loss)

def policy_egreedy_tri(env, state, neural, epsil, xp):
    """Choose an action epsilon-greedily from three forward passes.

    ``neural.forward(0, state)`` is evaluated three times. The first two
    outputs are candidates and the third is a reference; the candidate with
    the higher cosine similarity to the reference is kept (the first one wins
    ties). With probability ``epsil`` a uniformly random action index is
    returned instead of the candidate's arg-max.

    Args:
        env: Unused; kept for interface compatibility.
        state: Input forwarded to ``neural.forward``.
        neural: Object whose ``forward(0, state)`` result has a ``data``
            attribute; ``data[0]`` is treated as the action-value vector.
        epsil: Exploration probability. ``0`` gives purely greedy behaviour,
            identical to the previous implementation, which ignored it.
        xp: Array module used to wrap each candidate before comparison.

    Returns:
        ``(action_index, q_value)`` where ``q_value`` is the kept candidate's
        value for the returned action (its maximum when acting greedily).
    """
    import scipy.spatial.distance

    # Keep the three forward passes in their original order.
    first = neural.forward(0, state).data[0]
    second = neural.forward(0, state).data[0]
    reference = np.array(neural.forward(0, state).data[0])

    sim_first = 1 - scipy.spatial.distance.cosine(xp.array(first), reference)
    sim_second = 1 - scipy.spatial.distance.cosine(xp.array(second), reference)
    chosen = second if sim_first < sim_second else first

    # Exploration: previously ``epsil`` was accepted but never consulted, so
    # the policy could not explore. No random number is drawn when epsil is 0.
    if epsil > 0 and random.random() < epsil:
        action = random.randrange(len(chosen))
    else:
        action = list(chosen).index(max(chosen))
    return action, chosen[action]
        
def agent_action(env, action):
    """Apply ``action`` to ``env`` and return whatever ``env.step`` returns.

    Callers in this notebook unpack the result as
    ``next_state, reward, terminal, info``.
    """
    outcome = env.step(action)
    return outcome

def update_target_network(q_network):
    """Return an independent deep copy of ``q_network``.

    The argument itself is left untouched.
    """
    snapshot = copy.deepcopy(q_network)
    return snapshot

def reduce_epsil(epoch, epsil):
    """Return the exploration rate to use for ``epoch``.

    Currently a pass-through: ``epsil`` is returned unchanged and ``epoch``
    is not consulted.
    """
    current = epsil
    return current
                 
def switch_ProccerUnit(pu):
    """Return the array module to compute with.

    Args:
        pu: ``"GPU"`` selects ``chainer.cuda.cupy``; any other value selects
            numpy.

    Returns:
        The selected array module.
    """
    if pu == "GPU":
        # Imported here because none of the visible import cells binds the
        # name ``cuda``; the GPU branch previously raised NameError. The lazy
        # import also keeps CPU-only runs free of any CUDA requirement.
        from chainer import cuda
        return cuda.cupy
    return np
                 
def main():
    """Train a fresh Q-network on MountainCar-v0.

    Creates the environment and the Q-network, derives the target network as
    a deep copy of the Q-network and runs ``deep_q_learn``. The environment
    is closed afterwards, including when training raises.
    """
    env = gym.make("MountainCar-v0")
    try:
        q_network = NeuralNetwork(NUM_IN, NUM_HID1, NUM_HID2, NUM_HID3, NUM_OUT)
        # To resume from a checkpoint, load it here before copying:
        # q_network.load_weight(DQNMODEL_PATH)
        # The target network is simply a copy of the Q-network; the separate
        # NeuralNetwork that used to be built first was discarded immediately.
        target_network = update_target_network(q_network)
        deep_q_learn(env, q_network, target_network, [0])
    finally:
        env.close()
        

In [7]:
# Rebinds the module-level EPOCH that the training cell sets to 100. Both
# deep_q_learn and start look EPOCH up when they are called, so the value in
# effect depends on which of the two cells was executed last.
EPOCH = 500
def start(env, q_network, target_network, experince, max_steps=None):
    """Run and render a single rollout of ``q_network`` on ``env``.

    The environment is reset once; then up to ``max_steps`` actions chosen by
    ``policy_egreedy_tri`` are applied, rendering after every step. The
    rollout ends early when the environment reports the episode is over.

    Args:
        env: Gym-style environment (``reset``, ``step``, ``render``).
        q_network: Network used to pick actions.
        target_network: Unused; kept for interface compatibility.
        experince: Unused; kept for interface compatibility.
        max_steps: Maximum number of steps to run. ``None`` (the default)
            uses the module-level ``EPOCH`` at call time, as before.

    Returns:
        None.
    """
    if max_steps is None:
        max_steps = EPOCH
    # Select the array backend (numpy on CPU).
    xp = switch_ProccerUnit("CPU")
    epsil = 0.2
    # Reset exactly once. Resetting inside the loop restarted the episode on
    # every iteration, which threw away the state update at the loop's end.
    now_state = env.reset()
    pos, spe = get_state(now_state)
    state_vec = get_state_vec(pos, spe, xp, 2)
    for episode in range(1, max_steps + 1):
        epsil = reduce_epsil(episode, epsil)
        action, max_q = policy_egreedy_tri(env, state_vec, q_network, epsil, xp)
        next_state, reward, terminal, info = agent_action(env, action)
        env.render()
        # Stop once the environment signals the end of the episode; the
        # ``reward == 1`` test is kept for backward compatibility.
        if terminal or reward == 1:
            break
        # Move on to the observed next state.
        pos, spe = get_state(next_state)
        state_vec = get_state_vec(pos, spe, xp, 2)
        
def experiment():
    """Load the saved Q-network and render a rollout on MountainCar-v0.

    Weights are read from ``DQNMODEL_PATH``. The environment is closed when
    the rollout finishes, including when it raises.
    """
    env = gym.make("MountainCar-v0")
    try:
        q_network = NeuralNetwork(NUM_IN, NUM_HID1, NUM_HID2, NUM_HID3, NUM_OUT)
        q_network.load_weight(DQNMODEL_PATH)
        # The target network is a copy of the loaded Q-network; the separate
        # NeuralNetwork that used to be built first was discarded immediately.
        target_network = update_target_network(q_network)
        start(env, q_network, target_network, [0])
    finally:
        env.close()

In [9]:
# Run the evaluation rollout; experiment() loads weights from DQNMODEL_PATH.
experiment()

[2017-05-12 11:18:00,365] Making new env: MountainCar-v0


In [3]:
# coding:utf-8
import chainer
from chainer import Function, Variable, optimizers, serializers
from chainer import Link, Chain
import chainer.functions as F
import chainer.links as L
import matplotlib.pyplot as plt
import numpy as np

class NeuralNetwork:
    """Fully connected network with three hidden layers and an Adam optimizer.

    The layers live in ``self.model`` (a Chainer ``Chain``) under the names
    ``hid_layer1``, ``hid_layer2``, ``hid_layer3`` and ``out_layer``;
    ``self.optimizer`` is set up on that chain.
    """

    def __init__(self, num_in, num_hid1, num_hid2, num_hid3, num_out):
        """Create the four linear layers and attach an Adam optimizer.

        Args:
            num_in: Input width.
            num_hid1: Width of the first hidden layer.
            num_hid2: Width of the second hidden layer.
            num_hid3: Width of the third hidden layer.
            num_out: Output width.
        """
        self.model = Chain(hid_layer1 = L.Linear(num_in, num_hid1),
                           hid_layer2 = L.Linear(num_hid1, num_hid2),
                           hid_layer3 = L.Linear(num_hid2, num_hid3),
                           out_layer  = L.Linear(num_hid3, num_out))
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)

    def forward(self, flg, x, t = None):
        """Run the network on ``x``.

        Args:
            flg: Falsy returns the raw output of the last layer; truthy
                returns the mean squared error between that output and ``t``.
            x: Input array, wrapped in a ``Variable``.
            t: Target array; required when ``flg`` is truthy.

        Returns:
            The output ``Variable`` or the loss ``Variable``, depending on
            ``flg``.

        Raises:
            ValueError: If ``flg`` is truthy and ``t`` is ``None``.
        """
        # One flag decides both whether the target is built and whether the
        # loss is returned. Previously the target was only built for
        # ``flg == 1`` while the loss branch ran for any truthy ``flg``.
        training = bool(flg)
        if training and t is None:
            raise ValueError("forward() needs a target array `t` when flg is truthy")
        _x = Variable(x)
        # NOTE(review): dropout is applied on every call, including the
        # ``flg == 0`` path; whether it is active then depends on the Chainer
        # version/configuration. policy_egreedy_tri compares repeated forward
        # passes, so confirm before changing this.
        h1  = F.dropout(F.relu(self.model.hid_layer1(_x)))
        h2  = F.dropout(F.relu(self.model.hid_layer2(h1)))
        h3  = F.dropout(F.relu(self.model.hid_layer3(h2)))
        u3  = self.model.out_layer(h3)
        if not training:
            return u3
        return F.mean_squared_error(u3, Variable(t))

    def backpropagation(self, loss):
        """Back-propagate ``loss`` and apply one optimizer update."""
        loss.backward()
        self.optimizer.update()

    def init_grads(self):
        """Reset the gradients through the optimizer before a new backward pass."""
        # NOTE(review): newer Chainer releases may prefer
        # ``self.model.cleargrads()`` — confirm against the installed version.
        self.optimizer.zero_grads()

    def save_weight(self, model):
        """Serialize the chain's parameters to ``model`` with ``save_npz``.

        Args:
            model: Destination handed to ``serializers.save_npz``. When it is
                a ``str`` path, its parent directory is created if missing.
        """
        import os

        if isinstance(model, str):
            directory = os.path.dirname(model)
            # Create the checkpoint directory up front so that a missing
            # folder does not abort a long training run at the first save.
            if directory and not os.path.isdir(directory):
                os.makedirs(directory)
        serializers.save_npz(model, self.model)

    def load_weight(self, model):
        """Load parameters from ``model`` into the chain with ``load_npz``."""
        serializers.load_npz(model, self.model)

    def policy_greedy(self, actions):
        """Return the maximum of ``actions.data`` along axis 1."""
        return np.max(actions.data, axis = 1)