In [94]:
# coding:utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import chainer
from chainer import Function, Variable, optimizers, serializers
from chainer import Link, Chain
import chainer.functions as F
import chainer.links as L
import copy
import gym

NUM_IN   = 2
NUM_HID1 = 1000
NUM_HID2 = 500
NUM_HID3 = 250
NUM_OUT  = 3
BATCH_SIZE = 32
ALPH   = 0.1
ACTION = [-2, 0, 2]
EPOCH = 100
GAMMA = 0.1
EPSIL = 0.3
DQNMODEL_PATH = "./DQN_model/mounaincar.model"

def deep_q_learn(env, q_network, target_network, experince):
    # プロッセサーの選択
    xp = switch_ProccerUnit("CPU")
    epsil = EPSIL
    record = []
    for episode in range(1, EPOCH+1):
        print(episode)
        # 初期状態の定義
        now_state = env.reset()
        pos, spe = get_state(now_state)
        state_vec = get_state_vec(pos, spe, xp, 2)
        epsil = reduce_epsil(episode, epsil)
        for step in range(2000):
            # 方策に従って行動を選択
            action, _ = policy_egreedy_tri(state_vec, q_network, epsil, xp)
            # 行動による次状態を観測
            next_state, reward, terminal, info = agent_action(env, action)
            next_pos, next_spe = get_state(next_state)
            deep_learn(action, q_network, state_vec, xp, [make_target(action, reward, next_state, terminal, target_network, xp)])
            if reward == 1:
                stock_record(pos, spe, action, 0, 0, reward, 1)
                break
            # 状態更新
            next_pos, next_spe = get_state(next_state)
            state_vec = get_state_vec(next_pos, next_spe, xp, 2)
            # レコード蓄積
            record = stock_record(record, pos, spe, action, next_pos, next_state, reward, 0)
            pos, spe = next_pos, next_spe
            # env.render()
            # Experience_Replayによるバッチ学習
            if len(record) > 500:
                experience_replay(record, q_network, target_network, xp)
                if len(record) > 2000:
                    record = []
            # Target_networkの更新
            if step % 200 == 0:
                q_network.save_weight(DQNMODEL_PATH)
        
def get_state(state):
    return state[0], state[1]

def get_state_vec(pos, spe, xp, flg):
    vec = xp.array([pos, spe], dtype=xp.float32)
    if flg == 1: return vec
    return xp.array([vec], dtype=xp.float32) 
        
    
    
def experience_replay(record, q_network, target_network, xp):
    state_vecs, actions, next_states, rewards, terminals = transelate(record, xp)
    perm = xp.random.permutation(len(record))[:BATCH_SIZE]
    x_batch_state_vecs = state_vecs[perm[0:BATCH_SIZE]]
    x_batch_action     = actions[perm[0:BATCH_SIZE]]
    x_batch_rewards    = rewards[perm[0:BATCH_SIZE]]
    x_batch_terminals  = terminals[perm[0:BATCH_SIZE]]
    y_batch_targets    = []
    for index in range(BATCH_SIZE):
        y_batch_targets.append(make_target(x_batch_action[index], x_batch_rewards[index], next_states[perm[index]], x_batch_tereminals[index], xp))
    deep_learn(x_batch_action, q_network, x_batch_state_vecs, xp, y_targets)

def transelate(record, xp):
    state_vecs  = []
    actions     = []
    rewards     = []
    terminals   = []
    next_states = []
    for data in record:
        state_vecs.append(get_state_vec(data[0][0],data[0][1], xp, 1))
        actions.append(data[1])
        next_states.append(data[2])
        rewards.append(data[3])
        terminals.append(data[4])
    return xp.array(state_vecs, dtype=xp.float32), xp.array(actions), next_states, xp.array(rewards), xp.array(terminals)   
          
def stock_record(record, pos, spe, action, next_pos, next_spe, reward, terminal):
    record.append([(pos, spe), action, (next_pos, next_pos), reward, terminal])
    return record
    
def make_target(action, reward, next_state, tereminal, target_network, xp):
    y_target = [0 for i in range(3)]
    _, max_q = policy_egreedy_tri(get_state_vec(next_state[0], next_state[1], xp, 2), target_network, 0, xp)
    y_target[action] = reward if tereminal else reward + GAMMA * max_q    
    y_target = xp.array(y_target, dtype=np.float32)
    return y_target

def deep_learn(action, q_network, state_vec, xp, y_target, flg=None):
    y_target = xp.array(y_target, dtype=xp.float32)
    q_network.init_grads()
    loss = q_network.forward(1, state_vec, y_target)
    q_network.backpropagation(loss)
    
def policy_egreedy_tri(state, neural, epsil, xp):
    import scipy.spatial.distance
    qvalue_list = []
    tmp = []
    qvalue_list.append(neural.forward(0, state).data[0])
    qvalue_list.append(neural.forward(0, state).data[0])
    qvalue_vec = np.array(neural.forward(0, state).data[0])
    for qvalue in qvalue_list:
        sim = 1 - scipy.spatial.distance.cosine(xp.array(qvalue), qvalue_vec)
        tmp.append(sim)
    if tmp[0] < tmp[1]:
        return (list(qvalue_list[1]).index(max(qvalue_list[1])) if random.random()>epsil else random.choice([0,1,2])), max(qvalue_list[1]) 
    else:
        return (list(qvalue_list[0]).index(max(qvalue_list[0])) if random.random()>epsil else random.choice([0,1,2])), max(qvalue_list[0])
        
def agent_action(env, action):
    return env.step(action)

def update_target_network(q_network):
    return copy.deepcopy(q_network)

def reduce_epsil(epoch, epsil):
    return epsil
                 
def switch_ProccerUnit(pu):
    return cuda.cupy if pu == "GPU" else np
                 
def main():
    env = gym.make("MountainCar-v0")
    q_network = NeuralNetwork(NUM_IN, NUM_HID1, NUM_HID2, NUM_HID3, NUM_OUT)
    target_network = NeuralNetwork(NUM_IN, NUM_HID1, NUM_HID2, NUM_HID3, NUM_OUT)
    q_network.load_weight(DQNMODEL_PATH)
    target_network = update_target_network(q_network)
    deep_q_learn(env, q_network, target_network, [0])
        

In [95]:
# coding:utf-8
import chainer
from chainer import Function, Variable, optimizers, serializers
from chainer import Link, Chain
import chainer.functions as F
import chainer.links as L
import matplotlib.pyplot as plt
import numpy as np

class NeuralNetwork:
    def __init__(self, num_in, num_hid1, num_hid2, num_hid3, num_out):
        self.model = Chain(hid_layer1 = L.Linear(num_in, num_hid1),
                           hid_layer2 = L.Linear(num_hid1, num_hid2),
                           hid_layer3 = L.Linear(num_hid2, num_hid3),
                           out_layer  = L.Linear(num_hid3, num_out))
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)
    
    def forward(self, flg, x, t = None):
        _x = Variable(x)
        if flg == 1: _t = Variable(t)
        h1  = F.dropout(F.relu(self.model.hid_layer1(_x)))
        h2  = F.dropout(F.relu(self.model.hid_layer2(h1)))
        h3  = F.dropout(F.relu(self.model.hid_layer3(h2)))
        u3  = self.model.out_layer(h3)
        # return F.softmax_cross_entropy(u2, _t) if flg else F.softmax(u2)
        # return F.mean_squared_error(self.policy_greedy(u3), _t) if flg else u3
        return F.mean_squared_error(u3, _t) if flg else u3
    
    def backpropagation(self, loss):
        loss.backward()
        self.optimizer.update()
    
    def init_grads(self):
        self.optimizer.zero_grads()
        
    def save_weight(self, model):
        serializers.save_npz(model, self.model)
        
    def load_weight(self, model):
        serializers.load_npz(model, self.model)
        
    def policy_greedy(self, actions):
        return np.max(actions.data, axis = 1)

In [96]:
main()

[2017-05-12 20:51:20,338] Making new env: MountainCar-v0


1


NameError: name 'x_batch_tereminal' is not defined

In [None]:
# coding:utf-8
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import chainer
from chainer import Function, Variable, optimizers, serializers
from chainer import Link, Chain
import chainer.functions as F
import chainer.links as L
import copy
"""
MAZE = [[-1,-1,-1,-1,-1,-1,-1,-1,-1,-1],
        [-1, 0, 0, 0, 0, 0, 0, 0, 0,-1],
        [-1,-1,-1,-1,-1, 0,-1,-1, 0,-1],
        [-1,-1, 0, 0, 0, 0, 0,-1, 0,-1],
        [-1, 0,-1,-1,-1,-1,-1,-1, 0,-1],
        [-1, 0,-1, 0, 0, 0, 0, 0, 0,-1],
        [-1, 0,-1,-1,-1,-1,-1,-1, 0,-1],
        [-1, 0, 0, 0, 0, 0, 0, 0, 0,-1],
        [-1,-1, 0,-1,-1,-1,-1,-1,-9,-1],
        [-1,-1, 0, 0, 0, 0, 0, 0, 1,-1],
        [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]]
"""
MAZE = [[-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1],
        [-1, 0, 0, 0, 0, 0, 0, 0, 0,-1,-1,-1],
        [-1,-1,-1,-1,-1, 0,-1,-1, 0,-1,-1,-1],
        [-1,-1, 0, 0, 0, 0, 0,-1, 0,-1,-1,-1],
        [-1, 0,-1,-1,-1,-1,-1,-1, 0,-1,-1,-1],
        [-1, 0, 0, 0, 0, 0, 0, 0 ,0,-1,-1,-1],
        [-1, 0,-1,-1,-1,-1,-1,-1, 0,-1,-1,-1],
        [ 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-1,-1],
        [ 0,-1, 0,-1,-1,-1,-1,-1,-1,-1,-1,-1],
        [ 0,-1, 0, 0, 0, 0, 0, 0, 0,-1,-1,-1],
        [ 0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1],
        [ 0,-1, 0, 0, 0, 0,-1, 0, 0, 0, 1,-1],
        [ 0, 0, 0,-1,-1, 0, 0, 0,-1,-1,-1,-1]]
START  = (1, 1)
ACTION = [(-1, 0), (1, 0), (0, -1), (0, 1)] # [上, 下, 左, 右]
EPOCH  = 30
RESULT = []
NUM_IN   = (len(MAZE)+1) * (len(MAZE[0])+1)
NUM_HID1 = 1000
NUM_HID2 = 500
NUM_HID3 = 250
NUM_OUT  = 4
BATCH_SIZE = 32
ALPH   = 0.1
GAMMA  = np.array([0.99 for i in range(BATCH_SIZE)], np.float32)

def deep_qq_learn(q_network, target_network, records):
    file = open("/Users/chan-p/Desktop/action2.txt", "w")
    result_list  = []
    next_records = []
    best_records = []
    state_index  = init_index()
    EPSIL  = 0.1
    state_vecs, actions, rewords ,next_state_vecs, terminals, next_states, now_states= translate(records)
    for epoch in range(1, EPOCH+1):
        EPSIL = decay_EPSIL(epoch, EPSIL)
        state_vec = init_state_vec()
        now_state = START
        state_vec[state_index[START]] = 1
        record_write(file, 0, 0, 3)
        episode_records = []
        while True:
            # 状態sをDNN用にベクトル化
            state_vec = get_state_vec(state_vec)
            # 状態sをDNNの入力として状態sにおける各行動aの行動価値を算出：Q(s,a)
            # 方策：e-greedy
            if epoch < EPOCH*(1/100):
                action, q_max, action_list = policy_egreedy(state_vec, q_network, EPSIL)
            else:
                action, q_max, action_list = policy_greedy_tri(state_vec, q_network, EPSIL)
            # 次の状態s'を決定
            next_state = get_next_state(now_state, action)
            # 次の状態が迷路外ならエピソード終了
            if state_check(next_state) == 0 or MAZE[next_state[0]][next_state[1]] == -1:
                print(now_state)
                # 罰則による学習
                deep_learn(q_network, target_network, state_vec, now_state, next_state, 0)
                record_write(file, action[0], action[1], 1)
                next_records.append((now_state, action, -1, 1))
                next_records.extend(episode_records)
                result_list.append(0)
                break
            # 即時報酬
            reward = MAZE[next_state[0]][next_state[1]]
            if reward == 1:
                print(epoch)
                # 成功報酬による学習
                deep_learn(q_network, target_network, state_vec, now_state, next_state, 0)
                record_write(file, 0, 0, 2)
                # 成功体験を優先してExperiment_Replayレコードに追加
                best_records.append((now_state, action, 1, 1))
                best_records.extend(episode_records)
                result_list.append(1)
                break
            # 実行動による学習
            deep_learn(q_network, target_network, state_vec, now_state, next_state, 1)
            # Experiment_Replayレコードに追加
            episode_records.append((now_state, action, reward, 0))
            record_write(file, action[0], action[1], 0)
            now_state = next_state
            state_vec = init_state_vec()
            state_vec[state_index[now_state]] = 1

            # 学習
            # experiment_replayによるバッチ学習
            perm = np.random.permutation(len(records))[:BATCH_SIZE]
            x_batch_state   = state_vecs[perm[0:BATCH_SIZE]]
            x_batch_action  = actions[perm[0:BATCH_SIZE]]
            x_batch_rewords = rewords[perm[0:BATCH_SIZE]]
            x_batch_now_state = []
            x_batch_next_state = []
            for index in perm:
                x_batch_now_state.append(now_states[index])
                x_batch_next_state.append(next_states[index])
            x_batch_next_state_vec = next_state_vecs[perm[0:BATCH_SIZE]]
            y_batch_target = []
            # ニューラルの重み更新
            for index in range(BATCH_SIZE):
                y_batch_target.append(deep_learn(q_network, target_network, x_batch_state[index], x_batch_now_state[index], x_batch_next_state[index], 1))
            y_batch_target = np.array(y_batch_target, dtype=np.float32)
            q_network.init_grads()
            loss = q_network.forward(1, x_batch_state, y_batch_target)
            q_network.backpropagation(loss)
              
        # レコードの更新
        if len(next_records) + len(best_records) > 5000:
            best_records.extend(next_records)
            state_vecs, actions, rewords ,next_state_vecs, terminals, next_states, now_states = translate(best_records)
            records = copy.copy(best_records)
            if len(best_records) > 5000:
                best_records = []
            next_records = []
        
        # target networkの更新
        if epoch % 5 == 0:
            print(epoch)
            target_network = update_target_network(q_network)
            
        if epoch % 1 == 0:
            q_network.save_weight()
            
    file.close()
    return result_list

def init_state_vec():
    return np.array([0 for i in range(NUM_IN)], dtype=np.float32)

def get_state_vec(state_vec):
    return np.array([state_vec], dtype = np.float32)

def get_next_state(state, action):
    return (state[0]+action[0], state[1]+action[1])

def deep_learn(q_network, target_network, state_vec, now_state, next_state, flg):
    y_targets = []
    state_index = init_index()
    for action in ACTION:
        next_state_vec = init_state_vec()
        next_state = get_next_state(now_state, action)
        # 次の状態が迷路外
        # 報酬(罰則)のみ
        if next_state[0] < 0 or next_state[0] > len(MAZE)-1 or next_state[1] > len(MAZE[0])-1 or next_state[1] < 0 or MAZE[next_state[0]][next_state[1]] == -1:
            y_target = -2
        else:
            next_state_vec[state_index[(next_state[0], next_state[1])]] = 1
            next_actions = target_network.forward(0, np.array([next_state_vec], dtype=np.float32))
            max_q = np.max(next_actions.data[0], axis = 0)
            y_target = MAZE[next_state[0]][next_state[1]] + 0.1 * max_q
        y_targets.append(y_target)
    y_targets = np.array(y_targets, dtype=np.float32)
    if flg: return y_targets
    y_target = np.array([y_targets], dtype=np.float32)
    q_network.init_grads()
    loss = q_network.forward(1, state_vec, y_target)
    q_network.backpropagation(loss)

def update_target_network(q_network):
    return copy.deepcopy(q_network)

def state_check(state):
    if (state[0] < 0) or (state[1] < 0) or (len(MAZE)-1) < state[0] or (len(MAZE[0])-1 < state[1]):
        RESULT.append(0)
        return 0
    return 1 

def policy_egreedy(state, neural, EPSIL):
    qvalue = neural.forward(0, state).data[0]
    return (ACTION[random.choice([i for i, x in enumerate(qvalue) if x == max(qvalue)])] if EPSIL < random.random() else random.choice(ACTION)), max(qvalue), qvalue           

def policy_greedy_tri(state, neural, EPSIL):
    import scipy.spatial.distance
    qvalue_list = []
    tmp = []
    qvalue_list.append(neural.forward(0, state).data[0])
    qvalue_list.append(neural.forward(0, state).data[0])
    qvalue_vec = np.array(neural.forward(0, state).data[0])
    for qvalue in qvalue_list:
        sim = 1 - scipy.spatial.distance.cosine(np.array(qvalue), qvalue_vec)
        tmp.append(sim)
    if tmp[0] < tmp[1]:
        return ACTION[list(qvalue_list[1]).index(max(qvalue_list[1]))], max(qvalue_list[1]), qvalue_list[1]
    else:
        return ACTION[list(qvalue_list[0]).index(max(qvalue_list[0]))], max(qvalue_list[0]), qvalue_list[0]

def translate(records):
    now_states = []
    state_vecs  = []
    actions = []
    rewords = []
    next_states = []
    terminals = []
    next_state_vecs = []
    state_index = init_index()
    state_vec = np.array([0 for i in range(NUM_IN)], dtype=np.float32)
    next_state_vec = np.array([0 for i in range(NUM_IN)], dtype=np.float32)
    for record in records:
        now_states.append(record[0])
        next_state = (record[0][0]+record[1][0], record[0][1]+record[1][1])
        next_states.append(next_state)
        state_vec = np.array([0 for i in range(NUM_IN)], dtype=np.float32)
        state_vec[state_index[record[0]]] = 1
        state_vecs.append(state_vec)
        actions.append(record[1])
        rewords.append(record[2])
        terminals.append(record[3])
        if record[3] == 1:
            next_state_vec = np.array([0 for i in range(NUM_IN)], dtype=np.float32)
            next_state_vecs.append(next_state_vec)
        else:
            next_state_vec = np.array([0 for i in range(NUM_IN)], dtype=np.float32)
            next_state_vec[state_index[next_state]] = 1
            next_state_vecs.append(next_state_vec)
    return np.array(state_vecs, dtype=np.float32), np.array(actions), np.array(rewords, dtype=np.float32), np.array(next_state_vecs, dtype=np.float32), np.array(terminals, dtype=np.float32), next_states, now_states

def experience_replay():
    records = []
    with open("./record.csv") as f:
        for line in f:
            line   = line[:-1].split(",")
            state  = (int(line[0]), int(line[1]))
            action = (int(line[2]), int(line[3]))
            reword = int(line[4])
            terminal = int(line[5])
            record = (state, action, reword, terminal)
            records.append(record)
    return records

def record_write(file, state_y, state_x, terminal):
    file.write(str(state_y) + "," + str(state_x) + "," +str(terminal) + "\n")

def init_index():
    qtable_index = {}
    num = 0
    for y in range(len(MAZE)+1):
        for x in range(len(MAZE[0])+1):
            qtable_index[(y, x)] = num
            num += 1
    return qtable_index

def decay_EPSIL(epoch, EPSIL):
    if epoch > (EPOCH/3)*2:
        return EPSIL/(epoch)*(EPOCH/10)
    return EPSIL

def main():
    q_network = NeuralNetwork(NUM_IN, NUM_HID1, NUM_HID2, NUM_HID3, NUM_OUT)
    target_network = NeuralNetwork(NUM_IN, NUM_HID1, NUM_HID2, NUM_HID3, NUM_OUT)
    q_network.load_weight()
    target_network = update_target_network(q_network)
    return deep_q_learn(q_network, target_network, experience_replay())

In [1]:
# coding:utf-8
import gym
import copy

def pp(env):
    return copy.deepcopy(env)

def main1():
    env = gym.make('MountainCar-v0')
    # env1 = gym.make('MountainCar-v0')
    ob1 = env.reset()
    # ob2 = env1.reset()
    ob2 = ob1
    env1 = copy.deepcopy(env)
    for i in range(3100):
        # env.render()
        action = env.action_space.sample()
        # action = 2
        print("asd:" + str(env.step(2)))
        for o in range(10):
            action = env1.action_space.sample()
            env1.step(action)
            env1 = pp(env)
            if o == 9999: print(env1.step(2))
        print("asd:" + str(env.step(2)))
        env1 = copy.deepcopy(env)
        print(str(env.step(0)))

In [None]:
main1()