In [0]:
import numpy as np
import gym
from gym import wrappers
from tqdm import tqdm
import matplotlib.pyplot as plt

# Okruženje `Frozen Lake`

<img src="https://miro.medium.com/max/1300/1*S6CG3jyp5rGxMUGw_Bqr3Q.png" alt="frozen lake">

Mapa je veličine 4x4, pri čemu polje može biti:
- Start `S` (eng. start)
- Zamrznuto polje `F` (eng. frozen)
- Rupa `H` (eng. hole)
- Cilj `G` (eng. goal)

Agent u svakom trenutku može primeniti jednu od 4 akcije:
- Gore
- Dole
- Levo
- Desno

Važna napomena je da je jako hladno i klizavo, tako da može da se desi
da se agent oklizne i ne uspe da sprovede akciju koju je namerio, odnosno
da se pomeri u nekom drugom smeru. Nije poznato koliko često i kako se
ovo dešava.

### Parametri za Q učenje

In [0]:
# Q-learning hyperparameters: learning rate and reward discount factor.
ALPHA, GAMMA = 0.1, 0.99

# Number of training episodes and number of evaluation episodes.
MAX_EPISODES = 10 ** 5
TEST_COUNT = 10 ** 4

# Upper bound on the number of steps the agent takes in a single episode.
TURN_LIMIT = 100
# Not referenced by any of the visible cells.
IS_MONITOR = False


In [0]:
class Agent:
    """Tabular Q-learning agent for an environment with discrete states and actions.

    The agent keeps a Q-table ``q_val`` with one row per state and one column
    per action. ``learn_step`` plays one episode with uniformly sampled
    actions while updating the table; ``test_run`` plays one episode greedily
    with respect to the table without updating it.

    The environment is expected to follow the classic gym protocol used by the
    code below: ``reset()`` returns the initial state and ``step(action)``
    returns ``(next_state, reward, done, info)``.
    """

    # Table dimensions used when the environment's spaces do not expose their
    # size through an ``n`` attribute (16 states x 4 actions, as on the 4x4 map).
    DEFAULT_N_STATES = 16
    DEFAULT_N_ACTIONS = 4

    def __init__(self, env, learning_rate, gamma, max_steps):
        """Create an agent with an all-zero Q-table.

        Args:
            env: Environment the agent acts in.
            learning_rate: Step size of the Q-value update.
            gamma: Discount factor applied to the best next-state Q-value.
            max_steps: Maximum number of steps per episode.
        """
        self.env = env
        # Not read by any method of this class; kept so the attribute still exists.
        self.episode_reward = 0.0

        # Size the table from the environment when possible instead of
        # hard-coding the 4x4 map.
        n_states = getattr(
            getattr(env, "observation_space", None), "n", self.DEFAULT_N_STATES
        )
        n_actions = getattr(
            getattr(env, "action_space", None), "n", self.DEFAULT_N_ACTIONS
        )
        self.q_val = np.zeros((n_states, n_actions), dtype=np.float32)

        self.learning_rate = learning_rate
        self.gamma = gamma
        self.max_steps = max_steps

    def learn_step(self):
        """Play one training episode with random actions and update the Q-table.

        After every step the entry for the taken (state, action) pair is set to
        ``(1 - learning_rate) * Q[s, a] + learning_rate * (reward + gamma * max(Q[s']))``.

        Returns:
            The reward of the step on which the environment reported ``done``,
            or ``0.0`` if ``max_steps`` steps passed without that happening.
        """
        # Start from the initial state.
        state = self.env.reset()

        # For visualisation during learning, `self.env.render()` can be called
        # here and after each step.

        for _ in range(self.max_steps):
            # Pick a uniformly random action.
            act = self.env.action_space.sample()

            # Apply it to the environment and collect the outcome.
            next_state, reward, done, _info = self.env.step(act)

            # Move the current estimate towards the bootstrapped target.
            q_next_max = np.max(self.q_val[next_state])
            self.q_val[state, act] = (
                (1 - self.learning_rate) * self.q_val[state, act]
                + self.learning_rate * (reward + self.gamma * q_next_max)
            )

            # The episode is over (per the notebook: goal reached or fell into
            # a hole), so report its reward.
            if done:
                return reward

            # Otherwise continue from the new state.
            state = next_state

        return 0.0  # step limit exceeded

    def test_run(self, should_render=False):
        """Play one episode greedily with respect to the Q-table, without learning.

        Args:
            should_render: If true, call ``env.render()`` after the reset and
                after every step.

        Returns:
            The reward of the step on which the environment reported ``done``,
            or ``0.0`` if ``max_steps`` steps passed without that happening.
        """
        state = self.env.reset()

        if should_render:
            self.env.render()

        for _ in range(self.max_steps):
            # Greedy action: the first column with the highest Q-value.
            act = np.argmax(self.q_val[state])
            next_state, reward, done, _info = self.env.step(act)

            if should_render:
                self.env.render()

            if done:
                return reward

            state = next_state

        return 0.0  # step limit exceeded

Konstruišemo okruženje i našeg agenta.

Biblioteka `gym` pruža veliki broj različitih okruženja koja je moguće
konstruisati. String `FrozenLake-v0` je identifikator okruženja
koje nam je potrebno.

In [0]:
# Fixed seed so that Restart & Run All yields the same training and
# evaluation results on every run.
SEED = 42

env = gym.make("FrozenLake-v0")
# Seed the environment and the action space the agent samples its random
# training actions from. NOTE(review): this is the pre-0.26 gym seeding API,
# matching the `FrozenLake-v0` id used here — confirm against the installed
# gym version.
env.seed(SEED)
env.action_space.seed(SEED)

agent = Agent(env, ALPHA, GAMMA, TURN_LIMIT)

Primenjujemo Q učenje tako što puštamo agenta da odigra određeni broj epizoda.
Ono što agent nauči biće dostupno u njegovoj *q tabeli*.

In [36]:
# Train for MAX_EPISODES episodes, adding up the reward each episode returns.
reward_total = sum(
    (agent.learn_step() for _ in tqdm(range(MAX_EPISODES))),
    0.0,
)


100%|██████████| 100000/100000 [00:34<00:00, 2918.38it/s]


In [37]:
# Summary of the training phase followed by the learned Q-table.
training_report = (
    ("episodes      : {}", MAX_EPISODES),
    ("total reward  : {}", reward_total),
    ("average reward: {:.2f}", reward_total / MAX_EPISODES),
    ("Q table:\n{}", agent.q_val),
)
for template, value in training_report:
    print(template.format(value))

episodes      : 100000
total reward  : 1345.0
average reward: 0.01
Q table:
[[0.5900988  0.5792401  0.57099426 0.56243795]
 [0.3203432  0.23426443 0.27651364 0.54371655]
 [0.47778538 0.4715293  0.46326944 0.495587  ]
 [0.34143043 0.30534077 0.2388659  0.48323044]
 [0.59793204 0.34002882 0.33286148 0.39986297]
 [0.         0.         0.         0.        ]
 [0.43849084 0.19339718 0.43643847 0.11554289]
 [0.         0.         0.         0.        ]
 [0.4169602  0.48160893 0.42613843 0.6321308 ]
 [0.43762892 0.6826096  0.46429408 0.4072147 ]
 [0.68904185 0.40753657 0.5046593  0.38935733]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.5323027  0.6501748  0.8146313  0.6486586 ]
 [0.82312495 0.9155629  0.8649174  0.827996  ]
 [0.         0.         0.         0.        ]]


Možemo pustiti agenta da se kreće kroz okruženje prateći svoju Q tabelu.

In [38]:
# Evaluate: play TEST_COUNT episodes that follow the Q-table and add up the rewards.
reward_total = sum((agent.test_run() for _ in range(TEST_COUNT)), 0.0)

evaluation_report = (
    ("episodes      : {}", TEST_COUNT),
    ("total reward  : {}", reward_total),
    ("average reward: {:.2f}", reward_total / TEST_COUNT),
)
for template, value in evaluation_report:
    print(template.format(value))

episodes      : 10000
total reward  : 7369.0
average reward: 0.74


I vizuelizovati njegovo kretanje. Prisetite se da okruženje ima određenu dozu stohastičnosti, tako da postoje situacije u kojima se agent ne pomeri u
željenom smeru.

In [39]:
# Play a single episode, rendering the map after the reset and after every step.
reward = agent.test_run(should_render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH

In [40]:
# Show the reward returned by the rendered episode played in the previous cell.
print(reward)

1.0
