In [None]:
import torch as th
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import time

In [None]:
def q_values(model: "DQN", obs: np.ndarray) -> np.ndarray:
  """Return the Q-value of every action for a single observation.

  Args:
    model: DQN model; its ``q_net`` is evaluated and its ``device`` decides
      where the input tensor is placed.
    obs: One un-batched observation.

  Returns:
    NumPy array holding the first (and only) row of the network output,
    i.e. the Q-values for ``obs``.
  """
  # Create the tensor on the device the model actually uses instead of
  # forcing CUDA, so the function also works on CPU-only machines.
  obs_tensor = th.tensor(obs, dtype=th.float32, device=model.device)

  # Add a leading batch dimension of size 1.
  obs_batch = obs_tensor.unsqueeze(0)

  # Pure inference: no autograd graph is needed.
  with th.no_grad():
    q_batch = model.q_net(obs_batch)

  # Move to host memory and strip the batch dimension again.
  return q_batch.cpu().numpy()[0]

In [None]:
def plot_q_values(q_values_list):
  """Plot how the Q-value of each action evolves over the recorded steps.

  Args:
    q_values_list: Sequence with one entry per time step; each entry is a
      vector containing one Q-value per action.
  """
  # Stack into a 2-D array: rows are time steps, columns are actions.
  q_matrix = np.atleast_2d(np.asarray(q_values_list, dtype=float))

  fig, ax = plt.subplots(figsize=(10, 6))

  # One curve per action (column). Indexing the list directly with the action
  # index would instead draw the Q-vectors of the first few time steps
  # against the action index, contradicting the 'Time' axis label.
  for action_idx in range(q_matrix.shape[1]):
    ax.plot(q_matrix[:, action_idx], label='Q%i Values' % action_idx)

  ax.set_xlabel('Time')
  ax.set_ylabel('Q-Values')
  ax.set_title('Convergence of Q-Values over Time')
  ax.legend()
  plt.show()

In [None]:
def mean_reward(discount_factor):
  """Train a DQN agent on Taxi-v3 and report its reward before and after.

  The policy is evaluated before training, trained for 10000 timesteps,
  evaluated again, and saved to ``dqn_taxi_{discount_factor}``.

  Args:
    discount_factor: Value passed to DQN as ``gamma``.

  Returns:
    Tuple ``(env, learn_time)``: the training environment (returned open,
    the caller is responsible for closing it) and the time in seconds spent
    inside ``model.learn``.
  """
  # Training environment; it is handed back to the caller.
  env = gym.make("Taxi-v3", render_mode="rgb_array")

  model = DQN("MlpPolicy", env, verbose=1, gamma=discount_factor, learning_rate=0.001)

  # Number of episodes used for each evaluation.
  n_eval_episodes = 250

  # A single dedicated evaluation environment, closed when done so it does
  # not leak (previously two anonymous environments were never closed).
  eval_env = gym.make("Taxi-v3", render_mode="rgb_array")
  try:
    # Evaluation before training.
    reward_mean, reward_std = evaluate_policy(model, eval_env, deterministic=True, n_eval_episodes=n_eval_episodes)
    print(f"До обучения модели с discount_factor = {discount_factor}, mean_reward:{reward_mean:.2f} +/- {reward_std:.2f}")

    # Time only the training call; perf_counter is a monotonic clock meant
    # for measuring durations.
    start_time = time.perf_counter()
    model.learn(total_timesteps=10000, log_interval=100)
    learn_time = time.perf_counter() - start_time

    # Evaluation after training.
    reward_mean, reward_std = evaluate_policy(model, eval_env, deterministic=True, n_eval_episodes=n_eval_episodes)
    print(f"После обучения модели с discount_factor = {discount_factor}, mean_reward:{reward_mean:.2f} +/- {reward_std:.2f}")
  finally:
    eval_env.close()

  # Persist the trained model so it can be reloaded later by file name.
  model.save(f"dqn_taxi_{discount_factor}")

  return env, learn_time

In [None]:
def q_values_calculation(discount_factor, env, n_steps=100):
  """Roll out the saved model and record the Q-values seen at every step.

  Args:
    discount_factor: Used to build the file name ``dqn_taxi_{discount_factor}``
      of the saved model to load.
    env: Environment to interact with; it is reset at the start and whenever
      an episode ends.
    n_steps: Number of environment steps to record (default 100).

  Returns:
    List with one Q-value vector per step, in the order they were observed.
  """
  # Load the previously trained and saved model.
  model = DQN.load(f"dqn_taxi_{discount_factor}")

  # Human-readable action names, indexed by action id.
  action_str = ['down', 'up', 'right', 'left', "pick up", "drop off"]
  q_values_list = []

  obs, info = env.reset()
  for _ in range(n_steps):
    q_val = q_values(model, obs)
    q_values_list.append(q_val)

    # predict returns the action as a NumPy value; convert once to a plain
    # int for both list indexing and env.step.
    action, _states = model.predict(obs, deterministic=True)
    action = int(action)

    print(f"Q-value состояния down={q_val[0]:.2f} up={q_val[1]:.2f} right={q_val[2]:.2f} left={q_val[3]:.2f} pick up={q_val[4]:.2f} drop off={q_val[5]:.2f}")
    print(f"Действие: {action_str[action]}")

    obs, reward, terminated, truncated, _ = env.step(action)

    # Stepping a finished episode is not valid; start a new one instead so
    # the remaining steps still produce meaningful observations.
    if terminated or truncated:
      obs, info = env.reset()

  return q_values_list

In [None]:
# Discount factors (gamma) to compare, from very short-sighted to far-sighted.
discount_factors = [
  0.01,
  0.2,
  0.7,
  0.9,
]

In [None]:
# For every discount factor: train and evaluate, record Q-values during a
# rollout, plot them, and report the training time.
for discount_factor in discount_factors:
  environment, time_to_lrn = mean_reward(discount_factor)
  try:
    q_vals = q_values_calculation(discount_factor, environment)
  finally:
    # Release the environment returned by mean_reward; otherwise one
    # environment per discount factor stays open.
    environment.close()
  plot_q_values(q_vals)
  print(f'Время обучения модели при discount_factor = {discount_factor} : {time_to_lrn} секунд.')