In [1]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import gym
import gym_minigrid
import matplotlib.pyplot as plt
%matplotlib inline





In [2]:
class QLearning:
    def __init__(self, actions, agent_indicator=10):
        self.actions = actions
        self.agent_indicator = agent_indicator
        self.alpha = 0.01
        self.gamma = 0.9
        self.epsilon = 0.2
        self.q_values = defaultdict(lambda: [0.0] * actions)

    def _convert_state(self, obs):
        y, x = np.where(obs == self.agent_indicator)
        return (int(y[0]), int(x[0]))  # ← 정확한 좌표 기반 상태 반환

    def update(self, state, action, reward, next_state):
        state = self._convert_state(state)
        next_state = self._convert_state(next_state)
        q_value = self.q_values[state][action]
        max_next_q = max(self.q_values[next_state])
        td_error = reward + self.gamma * max_next_q - q_value
        self.q_values[state][action] = q_value + self.alpha * td_error

    def act(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.actions)
        else:
            state = self._convert_state(state)
            q_values = self.q_values[state]
            return np.argmax(q_values)


In [3]:
from utils import gen_wrapped_env, show_video

In [4]:
from utils import gen_wrapped_env
env = gen_wrapped_env('MiniGrid-Empty-6x6-v0')  # 6x6 환경

agent = QLearning(actions=3, agent_indicator=10)

episodes = 5000  # 충분히 학습
for ep in range(episodes):
    done = False
    obs = env.reset()
    action = agent.act(obs)
    ep_rewards = 0

    while not done:
        next_obs, reward, done, info = env.step(action)
        next_action = agent.act(next_obs)
        agent.update(obs, action, reward, next_obs)
        obs = next_obs
        action = next_action
        ep_rewards += reward

    if (ep+1) % 500 == 0:
        print(f"Episode {ep+1}: Reward = {ep_rewards}")

env.close()


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
{s:np.round(q, 5).tolist() for s, q in agent.q_values.items()}

In [None]:
import matplotlib.pyplot as plt

# policy 추출
policy = {s: np.argmax(q) for s, q in agent.q_values.items()}

grid_size = 6
policy_grid = np.full((grid_size, grid_size), -1)

for (y, x), a in policy.items():
    policy_grid[y, x] = a

# MiniGrid-style 방향 매핑
arrows = {0: '⟲', 1: '⟳', 2: '↑', -1: ' '}

# 시각화
fig, ax = plt.subplots(figsize=(6, 6))
for i in range(grid_size):
    for j in range(grid_size):
        action = policy_grid[i, j]
        ax.text(j + 0.5, i + 0.5, arrows[action], ha='center', va='center', fontsize=18)

ax.set_xticks(np.arange(grid_size + 1))
ax.set_yticks(np.arange(grid_size + 1))
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.grid(True)
ax.invert_yaxis()
ax.set_title("Policy Heatmap (Start: (0,0) → Goal: (5,5))")
plt.show()


In [None]:
show_video()

In [None]:
import os
os.makedirs("./logs", exist_ok=True)

pd.Series(rewards).to_csv('./logs/rewards_qlearning_empty.csv')


In [None]:
#sarsa_logs = pd.read_csv('./logs/rewards_sarsa.csv', index_col=False).iloc[:, 1]
q_logs = pd.read_csv('./logs/rewards_qlearning_empty.csv', index_col=False).iloc[:, 1]

In [None]:
plt.figure(figsize=(16, 8))
plt.plot(q_logs.cumsum() / (pd.Series(np.arange(q_logs.shape[0])) + 1), label="Q-Learning")
plt.xlabel("Episode")
plt.ylabel("Average Reward")
plt.title("Q-Learning: Cumulative Reward")
plt.legend()
plt.grid(True)
plt.savefig("qlearning_empty.jpg", dpi=300, bbox_inches='tight')
plt.show()
