In [1]:
import json
import numpy as np
import gymnasium as gym


def initialize_q_table(observation_space_n, action_space_n):
    """Build the tabular action-value function with every entry set to zero.

    Args:
        observation_space_n: Number of discrete states (rows of the table).
        action_space_n: Number of discrete actions (columns of the table).

    Returns:
        A float NumPy array of zeros with shape
        ``(observation_space_n, action_space_n)``.
    """
    table_shape = (observation_space_n, action_space_n)
    return np.zeros(table_shape)


def select_action_eps_greedy(Q, state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        Q: Action-value table indexed as ``Q[state]`` -> per-action values.
        state: Index of the current state.
        epsilon: Probability of taking a uniformly random action.

    Returns:
        The index of the chosen action: a uniformly random one with
        probability ``epsilon``, otherwise the arg-max of ``Q[state]``
        (the first maximal index on ties).
    """
    action_values = Q[state]
    # A single uniform draw decides between exploring and exploiting.
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(len(action_values))
    return np.argmax(action_values)


def update_Q_SARSA(Q, s, a, r, next_s, next_a, alpha, gamma):
    """Apply one SARSA temporal-difference update to ``Q`` in place.

    Args:
        Q: Action-value table, indexed as ``Q[state, action]``.
        s: State the action was taken in.
        a: Action that was taken.
        r: Reward received for the transition.
        next_s: State reached after the transition.
        next_a: Action selected for ``next_s``.
        alpha: Learning rate.
        gamma: Discount factor.

    Returns:
        None. ``Q[s, a]`` is modified in place.
    """
    # Bootstrap from the value of the action selected for the next state.
    td_target = r + gamma * Q[next_s, next_a]
    td_error = td_target - Q[s, a]
    Q[s, a] = Q[s, a] + alpha * td_error


def update_Q(Q, s, a, r, next_s, next_a, alpha, gamma):
    """Apply one Q-learning temporal-difference update to ``Q`` in place.

    Args:
        Q: Action-value table, indexed as ``Q[state, action]``.
        s: State the action was taken in.
        a: Action that was taken.
        r: Reward received for the transition.
        next_s: State reached after the transition.
        next_a: Unused; accepted so the signature matches ``update_Q_SARSA``.
        alpha: Learning rate.
        gamma: Discount factor.

    Returns:
        None. ``Q[s, a]`` is modified in place.
    """
    # Bootstrap from the best action value available in the next state.
    best_next_value = Q[next_s].max()
    td_target = r + gamma * best_next_value
    td_error = td_target - Q[s, a]
    Q[s, a] = Q[s, a] + alpha * td_error


def learn(method):
    """Train a tabular agent on ``CliffWalking-v0`` and return its Q table.

    Args:
        method: ``'SARSA'`` selects the SARSA update (``update_Q_SARSA``);
            any other value selects the Q-learning update (``update_Q``).

    Returns:
        The learned action-value table, a NumPy array with one row per
        environment state and one column per action.
    """
    env = gym.make('CliffWalking-v0')
    try:
        # Memory that stores Q(s, a).
        Q = initialize_q_table(env.observation_space.n, env.action_space.n)

        # Algorithm hyper-parameters (do not change these values).
        alpha = 0.1
        gamma = 0.9
        max_epsilon = 0.2
        episodes_number = 10000

        for episode in range(1, episodes_number + 1):
            # Epsilon decays linearly from max_epsilon (first episode) to 0
            # (last episode).
            epsilon = max_epsilon * (episodes_number - episode) / (episodes_number - 1)
            s, _ = env.reset()

            episode_reward = 0
            done = False
            a = select_action_eps_greedy(Q, s, epsilon)
            while not done:
                next_s, r, terminated, truncated, _ = env.step(a)
                done = terminated or truncated

                # The next action is chosen before the update so that SARSA
                # can bootstrap from it; it is also the action executed next.
                next_a = select_action_eps_greedy(Q, next_s, epsilon)

                # Update Q according to SARSA or Q-learning.
                # Note: Q of a terminal state is treated as 0. Only Q[s, a]
                # of the state an action was taken from is ever written, and
                # the loop stops once the episode is done, so rows of
                # terminal states keep their zero initialisation.
                if method == 'SARSA':
                    update_Q_SARSA(Q, s, a, r, next_s, next_a, alpha, gamma)
                else:
                    update_Q(Q, s, a, r, next_s, next_a, alpha, gamma)

                s = next_s
                a = next_a
                episode_reward += r
            if episode % 100 == 0:
                print(f"Episode: {episode}, Reward: {episode_reward}, Eps: {epsilon}")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()
    return Q

  "Gymnasium minimally supports python 3.6 as the python foundation not longer supports the version, please update your version to 3.7+"


In [26]:
# Train one agent per algorithm (Q-learning first, then SARSA).
Q = learn('Q')
SARSA = learn('SARSA')

# Export: the environment is only used to read the table dimensions.
env = gym.make('CliffWalking-v0')
states = env.observation_space.n
actions = env.action_space.n

Q_dict, SARSA_dict = {}, {}
pi_Q, pi_SARSA = {}, {}
for s in range(states):
    # Greedy policy: the best-valued action in each state.
    pi_Q[s] = int(np.argmax(Q[s]))
    pi_SARSA[s] = int(np.argmax(SARSA[s]))

    # Nested {state: {action: value}} mappings for both tables.
    q_row = Q_dict[s] = {}
    sarsa_row = SARSA_dict[s] = {}
    for a in range(actions):
        q_row[a] = Q[s, a]
        sarsa_row[a] = SARSA[s, a]

with open('submit.json', "w") as f:
    json.dump([Q_dict, pi_Q, SARSA_dict, pi_SARSA], f)

Episode: 100, Reward: -139, Eps: 0.19801980198019803
Episode: 200, Reward: -25, Eps: 0.19601960196019602
Episode: 300, Reward: -17, Eps: 0.19401940194019401
Episode: 400, Reward: -222, Eps: 0.192019201920192
Episode: 500, Reward: -126, Eps: 0.19001900190019003
Episode: 600, Reward: -17, Eps: 0.18801880188018802
Episode: 700, Reward: -232, Eps: 0.186018601860186
Episode: 800, Reward: -19, Eps: 0.18401840184018403
Episode: 900, Reward: -120, Eps: 0.18201820182018202
Episode: 1000, Reward: -766, Eps: 0.18001800180018002
Episode: 1100, Reward: -341, Eps: 0.178017801780178
Episode: 1200, Reward: -15, Eps: 0.17601760176017603
Episode: 1300, Reward: -16, Eps: 0.17401740174017402
Episode: 1400, Reward: -17, Eps: 0.172017201720172
Episode: 1500, Reward: -13, Eps: 0.17001700170017
Episode: 1600, Reward: -15, Eps: 0.16801680168016803
Episode: 1700, Reward: -17, Eps: 0.16601660166016602
Episode: 1800, Reward: -15, Eps: 0.164016401640164
Episode: 1900, Reward: -13, Eps: 0.162016201620162
Episode: 2