In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime; force_remount=True re-mounts
# even when a previous mount is still active.
drive.mount('/content/drive', force_remount=True)

# Notebooks do not define __file__, so it is set by hand. Later cells use it
# both to derive the project directory and as the directory in which model
# checkpoints are written and read.
__file__ = '/content/drive/Othercomputers/MacBook_Air/path-finding-rl/data'

Mounted at /content/drive


In [None]:
from string import ascii_uppercase
#from draw_utils import *
#from pyglet.gl import *
import numpy as np
import pandas as pd
import os
import random
from datetime import datetime
import pytz
import matplotlib.pyplot as plt

# Rewards handed out by Simulator.get_reward:
#   move_reward - an ordinary move
#   obs_reward  - leaving the grid or entering a blocked (-10) cell
#   goal_reward - reaching the current target
move_reward = -1
obs_reward = -10
goal_reward = 100

###################################
# Select train or test mode. Simulator.__init__ reads this module-level flag
# to decide which order file to load.
train_mode = True
###################################
print('reward:' , move_reward, obs_reward, goal_reward)

#__file__ = '/home/ogangza/heung_path_finding/path-finding-rl/data'

# Directory that contains __file__; the CSV paths used by Simulator are
# joined onto it.
local_path = os.path.abspath(os.path.join(os.path.dirname(__file__)))


class Simulator:
    """Grid-world environment in which an agent collects ordered items.

    The world is a ``height`` x ``width`` (10 x 9) float16 grid. Every episode
    corresponds to one row of the order file: the agent starts at
    ``START_POINT``, has to visit the shelf of every ordered item in the order
    stored in ``self.local_target`` and finally come back to ``START_POINT``.

    The constructor reads the module-level ``train_mode`` and ``local_path``;
    ``get_reward`` reads the module-level ``move_reward``, ``obs_reward`` and
    ``goal_reward``.
    """

    # Values written into the grid cells.
    CELL_FREE = -1       # ordinary walkable cell
    CELL_BLOCKED = -10   # obstacle, or a shelf that is currently not enterable
    CELL_ORDERED = 50    # shelf of an ordered item that is not the current target
    CELL_TARGET = 100    # the target the agent has to reach next
    CELL_AGENT = 0       # cell currently occupied by the agent

    # Start cell; it is also appended as the last target of every episode.
    START_POINT = (9, 4)

    def __init__(self):
        '''
        Load the order file and set the static grid parameters.

        height : grid height
        width : grid width
        inds : list of the item letters A ~ Q
        '''
        # Load train or test orders depending on the module-level flag.
        if train_mode:
            self.files = pd.read_csv(os.path.join(local_path, "data/factory_order_train.csv"))
            print('data/factory_order_train.csv used')
        else:
            self.files = pd.read_csv(os.path.join(local_path, "data/factory_order_test.csv"))
            print('data/factory_order_test.csv used')
        self.height = 10
        self.width = 9
        self.inds = list(ascii_uppercase)[:17]

    def set_box(self):
        '''
        Mark the shelves on the grid and build the target list of the episode.

        Every shelf listed in box.csv is first marked as blocked (-10). The
        shelves holding an item of the current order are then marked 50 and
        their coordinates are appended to self.local_target in box.csv order.
        The start point is appended as the final target because the agent has
        to return to where it started, and the first target is marked 100.
        '''
        box_data = pd.read_csv(os.path.join(local_path, "./data/box.csv"))

        # Shelf coordinates, kept so that step() can tell whether the agent
        # is leaving a shelf cell.
        self.box_list = box_data[['row', 'col']]

        # Every shelf that may hold an item.
        for box in box_data.itertuples(index=True, name='Pandas'):
            self.grid[box.row][box.col] = self.CELL_BLOCKED

        # Shelves that hold an item of this order. self.items is a string, so
        # set(self.items) is a set of characters; intersecting it with the
        # item letters drops brackets, quotes, commas and spaces.
        order_item = list(set(self.inds) & set(self.items))
        order_csv = box_data[box_data['item'].isin(order_item)]

        for order_box in order_csv.itertuples(index=True, name='Pandas'):
            self.grid[order_box.row][order_box.col] = self.CELL_ORDERED
            self.local_target.append([order_box.row, order_box.col])

        # The episode ends back at the start point.
        self.local_target.append(list(self.START_POINT))

        # Highlight the first target.
        first_row, first_col = self.local_target[0]
        self.grid[first_row][first_col] = self.CELL_TARGET

    def set_obstacle(self):
        '''
        Mark every cell listed in obstacles.csv as blocked (-10).
        '''
        obstacles_data = pd.read_csv(os.path.join(local_path, "./data/obstacles.csv"))
        for obstacle in obstacles_data.itertuples(index=True, name='Pandas'):
            self.grid[obstacle.row][obstacle.col] = self.CELL_BLOCKED

    def reset(self, epi):
        '''
        Start episode ``epi`` and return the freshly built grid.

        epi : row index into self.files selecting the order of this episode
        '''
        # Per-episode bookkeeping.
        self.epi = epi
        self.items = list(self.files.iloc[self.epi])[0]
        self.cumulative_reward = 0
        self.terminal_location = None
        self.local_target = []
        self.actions = []

        # Every cell starts as free; shelves and obstacles are painted on top.
        self.grid = np.full((self.height, self.width), float(self.CELL_FREE), dtype="float16")
        self.set_box()
        self.set_obstacle()

        # Place the agent on the start point.
        self.curloc = list(self.START_POINT)
        self.grid[int(self.curloc[0])][int(self.curloc[1])] = self.CELL_AGENT

        self.done = False
        return self.grid

    def apply_action(self, action, cur_x, cur_y):
        '''
        Return the cell reached from (cur_x, cur_y) by ``action``.

        action : 0 = up (row - 1), 1 = down (row + 1), 2 = left (col - 1),
                 anything else = right (col + 1)
        The result is not bounds-checked.
        '''
        new_x = cur_x
        new_y = cur_y
        if action == 0:
            new_x = cur_x - 1
        elif action == 1:
            new_x = cur_x + 1
        elif action == 2:
            new_y = cur_y - 1
        else:
            new_y = cur_y + 1

        return int(new_x), int(new_y)

    def get_reward(self, new_x, new_y, out_of_boundary):
        '''
        Return the reward for having tried to move to (new_x, new_y).

        out_of_boundary : iterable of booleans, any true value means the move
                          left the grid
        Returns obs_reward for leaving the grid or hitting a blocked cell,
        goal_reward for reaching self.terminal_location, move_reward otherwise.
        '''
        # The grid is only indexed when the move stayed inside it.
        if any(out_of_boundary):
            return obs_reward
        if self.grid[new_x][new_y] == self.CELL_BLOCKED:
            return obs_reward
        if new_x == self.terminal_location[0] and new_y == self.terminal_location[1]:
            return goal_reward
        return move_reward

    def step(self, action):
        '''
        Apply ``action`` and advance the episode by one move.

        Returns a 7-tuple:
            grid               - self.grid (the same array, mutated in place)
            reward             - reward of this move
            cumulative_reward  - sum of the rewards of this episode
            done               - True after leaving the grid, hitting a blocked
                                 cell, or reaching the final end point
            goal_ob_reward     - True when the current target was reached
            curloc             - [row, col] of the agent after the move
            target             - the next target; once every target has been
                                 reached, the target that was just reached
        '''
        self.terminal_location = self.local_target[0]
        cur_x, cur_y = self.curloc
        self.actions.append((cur_x, cur_y))

        goal_ob_reward = False

        new_x, new_y = self.apply_action(action, cur_x, cur_y)
        out_of_boundary = [new_x < 0, new_x >= self.height, new_y < 0, new_y >= self.width]

        if any(out_of_boundary):
            # Leaving the grid ends the episode; the agent does not move.
            self.done = True
        elif self.grid[new_x][new_y] == self.CELL_BLOCKED:
            # Hitting an obstacle ends the episode; the agent does not move.
            self.done = True
        elif new_x == self.terminal_location[0] and new_y == self.terminal_location[1]:
            # Current target reached. Reaching the end point finishes the episode.
            if [new_x, new_y] == list(self.START_POINT):
                self.done = True

            self.local_target.pop(0)

            # Clear the vacated cell first so that it cannot wipe out the
            # marker of the next target when that target is the cell just left.
            self.grid[cur_x][cur_y] = self.CELL_FREE
            # After the final target the list is empty and there is nothing
            # left to highlight.
            if self.local_target:
                next_row, next_col = self.local_target[0]
                self.grid[next_row][next_col] = self.CELL_TARGET
            self.grid[new_x][new_y] = self.CELL_AGENT

            goal_ob_reward = True
            self.curloc = [new_x, new_y]
        else:
            # Ordinary move. A shelf cell turns back into a blocked cell once
            # the agent leaves it; any other cell becomes free again.
            leaving_box = (
                (self.box_list['row'] == cur_x) & (self.box_list['col'] == cur_y)
            ).any()
            self.grid[cur_x][cur_y] = self.CELL_BLOCKED if leaving_box else self.CELL_FREE
            self.grid[new_x][new_y] = self.CELL_AGENT
            self.curloc = [new_x, new_y]

        reward = self.get_reward(new_x, new_y, out_of_boundary)
        self.cumulative_reward += reward

        # Keep the last element a [row, col] pair even when no target remains,
        # so callers that stack it into an array keep working.
        if self.local_target:
            next_target = self.local_target[0]
        else:
            next_target = self.terminal_location
        return self.grid, reward, self.cumulative_reward, self.done, goal_ob_reward, self.curloc, next_target

reward: -1 -10 100


In [None]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Hyperparameters
learning_rate = 0.0005  # step size passed to the Adam optimizer
gamma         = 0.9     # discount factor used in the TD target
buffer_limit  = 50000   # maximum number of transitions kept by ReplayBuffer
batch_size    = 2       # transitions sampled per gradient step in train()

class ReplayBuffer():
    """Bounded FIFO store of transitions for experience replay.

    A transition is a 7-tuple
    ``(s, a, r, s_prime, done_mask, cur_location, target_location)``.
    """

    def __init__(self):
        # The deque silently drops its oldest entry once buffer_limit
        # transitions are stored.
        self.buffer = collections.deque(maxlen=buffer_limit)

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def put(self, transition):
        """Append one transition to the buffer."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions at random and batch them.

        Returns seven tensors in transition-field order. The two state
        tensors are cast to ``torch.float``; every other field is wrapped in
        one extra list per transition, which adds a singleton axis after the
        batch axis.
        """
        picked = random.sample(self.buffer, n)

        # One bucket per transition field, filled row by row.
        buckets = tuple([] for _ in range(7))
        for transition in picked:
            state, action, reward, next_state, done_mask, cur_loc, target_loc = transition
            row = (state, [action], [reward], next_state, [done_mask], [cur_loc], [target_loc])
            for bucket, value in zip(buckets, row):
                bucket.append(value)

        states, actions, rewards, next_states, done_masks, cur_locs, target_locs = (
            np.array(bucket) for bucket in buckets
        )
        return (
            torch.tensor(states, dtype=torch.float),
            torch.tensor(actions),
            torch.tensor(rewards),
            torch.tensor(next_states, dtype=torch.float),
            torch.tensor(done_masks),
            torch.tensor(cur_locs),
            torch.tensor(target_locs),
        )

class Qnet(nn.Module):
    """Fully connected Q-network: 90 input features -> 4 action values.

    The four outputs are indexed by action id (the values that
    ``sample_action`` / ``test_action`` return, 0..3).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(90, 360)
        self.fc2 = nn.Linear(360, 720)
        self.fc3 = nn.Linear(720, 360)
        self.fc4 = nn.Linear(360, 64)
        self.fc5 = nn.Linear(64, 4)

    def forward(self, x):
        """Return the raw Q-values for ``x``.

        ``x`` is either a single state of 90 features or a batch whose last
        axis has 90 features; the output has 4 values on its last axis.

        No softmax is applied. A softmax over ``dim=0`` normalises across the
        batch axis for batched input, which makes each sample's output depend
        on the other samples in the minibatch, and it squashes the values
        into (0, 1) so they cannot match a bootstrapped TD target. For a
        single state the greedy action is unchanged because softmax is
        monotonic.
        """
        # Follow the device the network lives on, so input and weights always
        # agree.
        x = x.to(self.fc1.weight.device)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        return self.fc5(x)

    def sample_action(self, obs, epsilon):
        """Epsilon-greedy action: random with probability ``epsilon``."""
        # The network is only evaluated when the greedy branch is taken.
        if random.random() < epsilon:
            return random.randint(0, 3)
        return self.test_action(obs)

    def test_action(self, obs):
        """Greedy action for evaluation: index of the largest Q-value."""
        # Action selection never needs gradients.
        with torch.no_grad():
            return self.forward(obs).argmax().item()
    ###################################################
            
def train(q, q_target, memory, optimizer, n_updates=10):
    """Run ``n_updates`` gradient steps of Q-learning on replayed minibatches.

    q         : online network, updated in place through ``optimizer``
    q_target  : target network, only evaluated (never updated here)
    memory    : object whose ``sample(n)`` returns the 7-tuple of tensors
                ``(s, a, r, s_prime, done_mask, cur_location, target_location)``
    optimizer : optimizer over the parameters of ``q``
    n_updates : number of minibatches to draw and fit (default 10)

    Reads the module-level ``batch_size`` and ``gamma``.

    NOTE(review): the target ``r + gamma * max_a' Q_target(s', a') * done_mask``
    only bootstraps correctly when ``done_mask`` is 0 for terminal transitions
    and 1 otherwise; confirm that the caller stores it that way.
    """
    # The loss object is stateless, so it is built once instead of per step.
    loss_fn = nn.MSELoss()

    for _ in range(n_updates):
        s, a, r, s_prime, done_mask, _cur_location, _target_location = memory.sample(batch_size)

        q_out = q(s)
        # Move the remaining batch tensors to wherever the network put its output.
        dev = q_out.device
        q_a = q_out.gather(1, a.to(dev))

        # The TD target is a constant for the optimizer, so it is computed
        # without building an autograd graph through the target network.
        with torch.no_grad():
            max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
            target = r.to(dev) + gamma * max_q_prime * done_mask.to(dev)

        loss = loss_fn(q_a.to(torch.float32), target.to(torch.float32))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def main():
    """Train the Q-network on every order of the order file for 10 epochs.

    Per episode the agent acts epsilon-greedily until the simulator reports
    ``done``. Only transitions that reached a target (or ended an episode
    with a positive cumulative reward) are stored in the replay buffer. After
    each episode ``train`` is called once the buffer holds enough samples.
    Every ``print_interval`` episodes the target network is synchronised,
    progress is printed and a checkpoint is saved under ``__file__``.
    """
    tz = pytz.timezone('Asia/Seoul')

    env = Simulator()
    q = Qnet()
    q.to(device)
    q_target = Qnet()
    q_target.to(device)
    q_target.load_state_dict(q.state_dict())
    memory = ReplayBuffer()

    print_interval = 1000
    score = 0.0
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    print('len(env.files):', len(env.files))

    for epoch in range(10):
        cur_time = datetime.now(tz)
        simple_cur_time = cur_time.strftime("%H:%M:%S")
        print(f"{epoch}번째 epoch, 시작▶{simple_cur_time}")

        # Exploration rate depends on the epoch only: 1.0, 0.9, ... down to 0.1.
        epsilon = max(0.01, 1 - (0.01*(epoch*10)))

        for n_epi in range(len(env.files)):
            # reset() builds the grid and the target list of episode n_epi.
            env.grid = env.reset(n_epi)
            done = False

            # env.step() mutates the grid in place, so each state must be a
            # copy; a reshaped view would make s, s_prime and every stored
            # transition alias the same, constantly changing buffer.
            s = env.grid.reshape(-1).copy()

            while not done:
                a = q.sample_action(torch.from_numpy(s).float(), epsilon)
                grid, r, cum_reward, done, goal_ob_reward, cur_location, target_location = env.step(a)
                s_prime = grid.reshape(-1).copy()

                # Store only transitions that reached a target, or that ended
                # the episode with a positive cumulative reward.
                # NOTE(review): failed moves are never replayed, so the
                # network gets no signal about obstacles - confirm this is
                # intended.
                if (done and cum_reward > 0) or goal_ob_reward:
                    # train() multiplies the bootstrap term by this value, so
                    # it must be 0 for terminal transitions and 1 otherwise.
                    done_mask = 0.0 if done else 1.0
                    memory.put((s, a, r/100.0, s_prime, done_mask, cur_location, target_location))
                s = s_prime

                score += r

            # Wait for a few samples, and never ask for more than is stored.
            if memory.size() >= max(6, batch_size):
                train(q, q_target, memory, optimizer)

            if n_epi % print_interval == 0 and n_epi != 0:
                cur_time = datetime.now(tz)
                simple_cur_time = cur_time.strftime("%H:%M:%S")
                print(f"Epoch # {epoch}, Episode # {n_epi}, start time: {simple_cur_time}")
                q_target.load_state_dict(q.state_dict())
                print(f"score:{score/print_interval:.1f}, n_buffer:{memory.size()}, eps:{epsilon*100:.1f}")

                score = 0.0

                torch.save(q, __file__+f"/{epoch}_{n_epi}_model.pt")

            # Show the path of every episode that ended with a positive score.
            if env.cumulative_reward > 0:
                print(f"{epoch}번째 에포크로, {n_epi}번째 에피소드는 {env.cumulative_reward:.1f}점 입니다.")
                print(f"target은 {env.items} ▶▶▶ {env.local_target}입니다.")
                print(f"action은 총 {len(env.actions)}번 움직였습니다.\n{env.actions}")
                print("###############")

        # Checkpoint at the end of every epoch.
        torch.save(q, __file__+f"/{epoch}_{n_epi}_model.pt")

# if __name__ == '__main__':
#     main()

data/factory_order_train.csv used
len(env.files): 39999
0번째 epoch, 시작▶15:31:37
Epoch # 0, Episode # 1000, start time: 15:31:52
score:-11.2, n_buffer:0, eps:100.0
Epoch # 0, Episode # 2000, start time: 15:32:05
score:-11.3, n_buffer:0, eps:100.0
0번째 에포크로, 2665번째 에피소드는 62.0점 입니다.
target은 ['A', 'D', 'E', 'J', 'K', 'N', 'O'] ▶▶▶ [[2, 0], [0, 0], [0, 5], [0, 6], [2, 8], [3, 8], [9, 4]]입니다.
action은 총 30번 움직였습니다.
[(9, 4), (8, 4), (8, 3), (8, 2), (8, 3), (8, 2), (7, 2), (7, 3), (8, 3), (7, 3), (8, 3), (7, 3), (6, 3), (5, 3), (6, 3), (7, 3), (6, 3), (7, 3), (7, 2), (8, 2), (7, 2), (8, 2), (8, 1), (7, 1), (7, 2), (7, 1), (7, 0), (6, 0), (5, 0), (6, 0)]
###############
Epoch # 0, Episode # 3000, start time: 15:32:18
score:-11.3, n_buffer:2, eps:100.0
Epoch # 0, Episode # 4000, start time: 15:32:31
score:-11.3, n_buffer:2, eps:100.0
Epoch # 0, Episode # 5000, start time: 15:32:45
score:-11.2, n_buffer:2, eps:100.0
Epoch # 0, Episode # 6000, start time: 15:32:58
score:-11.2, n_buffer:2, eps:100.0
0

In [None]:
def test(max_steps=1000):
    """Evaluate a saved model greedily on every order of the test file.

    max_steps : upper bound on the moves of one episode. A greedy policy is
                deterministic and can bounce between two cells forever, so an
                episode is abandoned (and reported as failed) after this many
                moves.

    The checkpoint ``/9_39998_model.pt`` is read from the directory stored in
    ``__file__``. For every episode the chosen actions, the score, the path
    length, success or failure and the visited path are printed.
    """
    global train_mode

    model_name = "/9_39998_model.pt"
    # Evaluate the trained network that was saved with torch.save(q, ...),
    # not a freshly initialised one.
    # NOTE(review): recent PyTorch releases default torch.load to
    # weights_only=True, which rejects fully pickled modules - confirm
    # against the installed version.
    q = torch.load(__file__ + model_name, map_location=device)
    print(f"model path = {model_name}")
    q.to(device)
    q.eval()

    # Simulator.__init__ reads the module-level train_mode, so the flag is
    # switched to test mode while the environment is built and restored
    # afterwards.
    previous_mode = train_mode
    train_mode = False
    try:
        env = Simulator()
    finally:
        train_mode = previous_mode

    print_interval = 1
    score = 0.0

    for n_epi in range(len(env.files)):
        env.grid = env.reset(n_epi)

        # Show the items of this order.
        print('가져와야 할 아이템 : ' + list(env.files.iloc[n_epi])[0] + '\n')

        done = False
        goal_ob_reward = False
        grid = env.grid
        s = grid.reshape(-1)
        steps = 0

        # Ends on leaving the grid, hitting an obstacle, reaching the final
        # end point, or running out of steps.
        while not done and steps < max_steps:
            a = q.test_action(torch.from_numpy(s).float())
            print(a)
            grid, r, cum_reward, done, goal_ob_reward, cur_location, target_location = env.step(a)
            s = grid.reshape(-1)
            score += r
            steps += 1

        # The simulator sets done together with goal_ob_reward only when the
        # final end point was reached; every other ending is a failure.
        succeeded = done and goal_ob_reward

        if n_epi % print_interval == 0 and n_epi != 0:
            print('▶ Episode #', n_epi, end=' → ')
            print('Score =', score, end='...')
            print('경로 길이:', len(env.actions), end=', ')
            if succeeded:
                print('성공 여부: ', goal_ob_reward)
            else:
                print('성공 여부 : 실패', goal_ob_reward)

        score = 0.0
        print('→ pred_path:', env.actions)

        # Show the details of long episodes that ended with a positive score.
        if len(env.actions) > 15 and env.cumulative_reward > 0:
            print(f"move = {move_reward}, obs = {obs_reward}, goal = {goal_reward}")
            print(f"{n_epi}번째 에피소드는 {env.cumulative_reward:.1f}점 입니다.")
            print(f"target은 {env.items} ▶▶▶ {env.local_target}입니다.")
            print(f"action은 총 {len(env.actions)}번 움직였습니다.\n{env.actions}")
            print(grid)
            print("###############")

    # TODO: print a summary once all episodes have finished.

# Run in test mode: switch the module-level flag so that Simulator loads the
# test orders, then evaluate.
train_mode = False
test()

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
1
▶ Episode # 226 → Score = -10.0...경로 길이: 1, 성공 여부 : 실패 False
→ pred_path: [(9, 4)]
가져와야 할 아이템 : ['A', 'G', 'I', 'J', 'N', 'O', 'Q']

1
▶ Episode # 227 → Score = -10.0...경로 길이: 1, 성공 여부 : 실패 False
→ pred_path: [(9, 4)]
가져와야 할 아이템 : ['A', 'B', 'H', 'K', 'N', 'P', 'Q']

1
▶ Episode # 228 → Score = -10.0...경로 길이: 1, 성공 여부 : 실패 False
→ pred_path: [(9, 4)]
가져와야 할 아이템 : ['C', 'D', 'H', 'J', 'K', 'P', 'Q']

1
▶ Episode # 229 → Score = -10.0...경로 길이: 1, 성공 여부 : 실패 False
→ pred_path: [(9, 4)]
가져와야 할 아이템 : ['A', 'C', 'I', 'L', 'N', 'Q']

1
▶ Episode # 230 → Score = -10.0...경로 길이: 1, 성공 여부 : 실패 False
→ pred_path: [(9, 4)]
가져와야 할 아이템 : ['B', 'D', 'G', 'H', 'I', 'O']

1
▶ Episode # 231 → Score = -10.0...경로 길이: 1, 성공 여부 : 실패 False
→ pred_path: [(9, 4)]
가져와야 할 아이템 : ['D', 'E', 'I', 'J', 'L', 'M', 'Q']

1
▶ Episode # 232 → Score = -10.0...경로 길이: 1, 성공 여부 : 실패 False
→ pred_path: [(9, 4)]
가져와야 할 아이템 : ['A', 'C', 'E', 'G', 'L', 'P', 'Q']

1
▶ Episode # 2

### 기존 main(), test() 코드

In [None]:
# def main():
#     tz = pytz.timezone('Asia/Seoul')

#     env = Simulator()
#     q = Qnet()
#     q.to(device)
#     q_target = Qnet()
#     q_target.to(device)
#     q_target.load_state_dict(q.state_dict())
#     memory = ReplayBuffer()

#     print_interval = 1000
#     score = 0.0  
#     optimizer = optim.Adam(q.parameters(), lr=learning_rate)
    
#     print('len(env.files):', len(env.files))
    
#     #path_length = []

    
#     for n_epi in range(len(env.files)): # range()의 인수로 len(env.files) 사용하면 됨 (=39,999)
#     # for n_epi in range(1): # test용

#         epsilon = max(0.01, 1 - 0.01*(n_epi/200)) # 상민님 epi/40000
#         env.grid = env.reset(n_epi)  # env reset: 에피소드 번호에 해당하는 목표물 리스트 env.local_target 생성, 그리드맵 생성
#         start_point = [9,4]  # 출발지 지정
#         end_point = env.local_target[0]  # 첫 번째 목적지 지정
#         done = False
#         ###################
#         grid_temp_1 = env.grid.reshape(1,90)
#         grid_temp_1 = grid_temp_1.squeeze()
#         s = grid_temp_1
#         ###################
#         old_score = 0.0
#         # print(f"env.grid=\n{env.grid}")

#         while not done:
#             a = q.sample_action(torch.from_numpy(s).float(), epsilon)
#             grid, r, cum_reward, done, goal_ob_reward, cur_location, target_location = env.step(a)
#             ###################
#             grid_temp_2 = grid.reshape(1,90)
#             grid_temp_2 = grid_temp_2.squeeze()
#             s_prime = grid_temp_2

#             # done_mask = 0.0 if done else 1.0 # done = True일 때만 memory.put 진행한다. -> 죽는거만 메모리에 들어가지 않을까?
#             if done == True and cum_reward > 0:
#                 memory.put((s,a,r/100.0,s_prime,done,cur_location,target_location)) # 현재 agent 좌표랑 local target 좌표 추가
#             s = s_prime
            

#             score += r
#             '''
#             if score > old_score:
#                     cur_time = datetime.now(tz)
#                     simple_cur_time = cur_time.strftime("%H:%M:%S")
#                     print('▶ Episode #', n_epi, 'start time:', simple_cur_time, end='→')
#                     print('Score =', score, 'appended actions =', s, end='...')
#                     print('경로 길이:', len(env.actions))
#             '''
#             old_score = score
                       
#             if done:
#                 break

#             # while loop 종료

#         ############################################################################
#                 ## memory.size(): 2000 → 10000 → 20000 수정... 이후 훈련 시작
#         ############################################################################   
#         if memory.size() > 10000: # 20000 -> 10000
#             train(q, q_target, memory, optimizer)

#         if n_epi % print_interval == 0 and n_epi != 0:
#             cur_time = datetime.now(tz)
#             simple_cur_time = cur_time.strftime("%H:%M:%S")
#             print('▶ Episode #', n_epi, 'start time:', simple_cur_time, end='→')
#             q_target.load_state_dict(q.state_dict())
#             print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
#                                                             n_epi, score/print_interval, memory.size(), epsilon*100))
            
#             score = 0.0

#             # torch.save(q, model_path)
#             torch.save(q, __file__+f"/{n_epi}_model.pt")
#         ##### 액션 결과물 확인하기
#         if env.cumulative_reward > 0:
#             print(f"move = {move_reward}, obs = {obs_reward}, goal = {goal_reward}")
#             print(f"{n_epi}번째 에피소드는 {env.cumulative_reward:.1f}점 입니다.")
#             print(f"target은 {env.items} ▶▶▶ {env.local_target}입니다.")
#             print(f"action은 총 {len(env.actions)}번 움직였습니다.\n{env.actions}")
#             print(grid)
#             print("###############")

#     # torch.save(q, model_path)
#     torch.save(q, __file__+f"/{n_epi}_model.pt")

# if __name__ == '__main__':
#     main()

In [None]:
# def test():
#     # model = torch.load('/content/drive/MyDrive/aiffelthon/data/model.pt')
#     # model = torch.load(model_path)
#     model_name = "/5000_model.pt"
#     model = torch.load(__file__ + model_name)
#     print(f"model path = {model_name}")
#     train_mode = False  # False
#     tz = pytz.timezone('Asia/Seoul')
#     env = Simulator()
#     q = Qnet()
#     s_before = None
#     print_interval = 1
#     score = 0.0

#     ################################
#     # 액션 저장하는 txt파일 만들기
#     # my_file_name = f"{model_name}_{move_reward}_{obs_reward}_{goal_reward}"
#     # f = open(my_file_name+'.txt', 'w')
#     ################################

#     for n_epi in range(len(env.files)): # range()의 인수로 len(env.files) 사용하면 됨 (=1225)
#         env.grid = env.reset(n_epi)
#         grid_map = env.grid.reshape(-1)          # 그리드 맵

#         ################################
#         # 아이템 리스트 확인
#         items = list(env.files.iloc[n_epi])[0]
#         print('가져와야 할 아이템 : ' + list(env.files.iloc[n_epi])[0] + '\n')
#         items = env.local_target
#         # isFinished flag
#         isFinished = False
#         ################################

#         # 출발지, 목적지 좌표 지정
#         #env.x, env.y = start_point                # 출발지 좌표
#         #env.end_x, env.end_y = end_point          # 목적지 좌표
#         start_point = [9,4]  # 출발지 지정
#         end_point = env.local_target[0]  # 첫 번째 목적지 지정
#         #######################################################################
#         # 목적지 도착 flag (done) 및 인풋 데이터 (s) 초기화
#         done = False
#         # s_temp_1 = np.append(grid_map, start_point)
#         # s_temp_2 = np.append(s_temp_1, end_point)
#         # s = np.array(s_temp_2)
#         #######################
#         grid_temp_1 = env.grid.reshape(1,90)
#         grid_temp_1 = grid_temp_1.squeeze()
#         s = grid_temp_1
#         #######################

#         while not done:  # OB, 장애물, 최종 목표 도달시 종료
#             ####################################################################################
#             # # -> 무한 루프
#             # # (option) test 경우 추가: 출발점에서는 무조건 위로 올라간다 (action=0)
#             # x, y = s[-4:-2]
#             # if [x,y] == [9,4]:
#             #     a = 0
#             # else:
#             #     a = q.test_action(torch.from_numpy(s).float())  # test_action 수행 ->
#             #####################################################################################
#             a = q.test_action(torch.from_numpy(s).float())  # test_action 수행
#             #####################################################################################
#             grid, r, cum_reward, done, goal_ob_reward = env.step(a)
            
#             ######################################
#             # new_grid_map = grid.reshape(-1)
#             # s_prime_temp_1 = np.append(new_grid_map, env.curloc)
#             # s_prime_temp_2 = np.append(s_prime_temp_1, end_point)
#             # s_prime = np.array(s_prime_temp_2)
#             ######################################
#             grid_temp_2 = grid.reshape(1,90)
#             grid_temp_2 = grid_temp_2.squeeze()
#             s_prime = grid_temp_2

#             s = s_prime
#             score += r

#         # print_interval (=1) 마다 현황 디스플레이하고 q_target 업데이트 및 score 초기화
#         if n_epi % print_interval == 0 and n_epi != 0:
#             cur_time = datetime.now(tz)
#             simple_cur_time = cur_time.strftime("%H:%M:%S")
#             print('▶ Episode #', n_epi, end=' → ')
#             print('Score =', score, end='...')
#             print('경로 길이:', len(env.actions), end=', ')
#             if goal_ob_reward == 'finish':
#                 print('성공 여부: ', goal_ob_reward)
#                 isFinished = True
#             else:
#                 print('성공 여부 : 실패', goal_ob_reward)
        
#         score = 0.0
#         print('→ pred_path:', env.actions)

#         ##### 액션 결과물 확인하기
#         if len(env.actions) > 15 and env.cumulative_reward > 0:
#             print(f"move = {move_reward}, obs = {obs_reward}, goal = {goal_reward}")
#             print(f"{n_epi}번째 에피소드는 {env.cumulative_reward:.1f}점 입니다.")
#             print(f"target은 {env.items} ▶▶▶ {env.local_target}입니다.")
#             print(f"action은 총 {len(env.actions)}번 움직였습니다.\n{env.actions}")
#             print(grid)
#             print("###############")

#     #########################################    
#     #     if len(env.actions) > 5:
#     #         f.write(str(n_epi)+'/'+str(items)+'/'+str(cum_reward)+'/'+str(isFinished)+'\n')
#     #         f.write(str(env.actions))
#     #         f.write('\n')

#     # f.close()
#     #########################################

#     # 모든 에피소드 종료 후 결과 디스플레이
#     # 코드 추가할 것!!!

# # test 모드 실행
# train_mode = False
# test()