# 🧮 Laborator 3 — Agenți Value-Based în Reinforcement Learning

**Mediile folosite:** `GridWorld (custom)`, `FrozenLake-v1`, `CartPole-v1`, `MountainCar-v0`, `Bandit 10-Arms`

**Algoritmi:** Value Iteration, Monte Carlo, TD(0), Q-learning (ε-greedy), Multi-Armed Bandits

> Rulați secțiunile în ordine. Ajustați hiper-parametrii pentru explorare.

## 🔧 Setup & utilitare

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gymnasium as gym
from collections import defaultdict

# Print NumPy arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Seeded Generator intended for reproducible sampling.
# NOTE(review): the sections visible below draw from the legacy `np.random`
# functions rather than from `rng` — confirm whether `rng` is still needed.
rng = np.random.default_rng(42)

---
## 1️⃣ GridWorld (custom) — Value Iteration (model-based)

Grid 4×4, două stări terminale: `+1` (goal) și `-1` (hole). Fiecare pas non-terminal primește `-0.01`.

In [None]:
# GridWorld configuration: a 4x4 grid whose cells are numbered row-major.
nrow = 4
ncol = 4
S = nrow * ncol        # number of states
A = 4                  # number of actions
gamma = 0.99           # discount factor
terminal_plus = 15     # terminal state entered with reward +1 (see step_gw)
terminal_minus = 12    # terminal state entered with reward -1 (see step_gw)


def to_rc(s):
    """Convert a flat state index into its ``(row, col)`` grid coordinates."""
    row, col = divmod(s, ncol)
    return row, col


def to_s(r, c):
    """Convert ``(row, col)`` grid coordinates into a flat state index."""
    return r * ncol + c

def step_gw(s, a):
    """Apply action ``a`` in state ``s`` of the deterministic GridWorld.

    Action 0 decrements the row, 1 increments the column, 2 increments the
    row and 3 decrements the column (up / right / down / left when row 0 is
    drawn at the top). Any other action value leaves the position unchanged.
    Moves that would leave the grid are clamped to the border.

    Returns:
        ``(next_state, reward, done)``. Terminal states are absorbing and
        yield ``(s, 0.0, True)``; entering ``terminal_plus`` pays ``+1.0``,
        entering ``terminal_minus`` pays ``-1.0``, and every other step
        costs ``-0.01``.
    """
    # Terminal states are absorbing: no movement and no further reward.
    if s in (terminal_plus, terminal_minus):
        return s, 0.0, True

    row, col = to_rc(s)
    if a == 0:
        row = max(0, row - 1)
    elif a == 1:
        col = min(ncol - 1, col + 1)
    elif a == 2:
        row = min(nrow - 1, row + 1)
    elif a == 3:
        col = max(0, col - 1)

    nxt = to_s(row, col)
    if nxt == terminal_plus:
        reward, finished = +1.0, True
    elif nxt == terminal_minus:
        reward, finished = -1.0, True
    else:
        # Small step cost on every non-terminal transition.
        reward, finished = -0.01, False
    return nxt, reward, finished

def value_iteration(theta=1e-5, gamma=0.99, max_iters=1000):
    """Compute optimal state values for the GridWorld by value iteration.

    Each sweep visits every non-terminal state and replaces ``V[s]`` in place
    with the best one-step backup ``max_a [r + gamma * V[s']]`` obtained from
    ``step_gw``. Sweeping stops once the largest change within a sweep drops
    below ``theta`` or after ``max_iters`` sweeps.

    Args:
        theta: Convergence threshold on the largest per-sweep value change.
        gamma: Discount factor applied to the successor state's value.
        max_iters: Upper bound on the number of sweeps. This used to be a
            hard-coded ``it > 1000`` check, which allowed 1001 sweeps.

    Returns:
        ``(V, it)``: the value array of length ``S`` and the number of sweeps
        performed. Terminal states are never updated and keep value 0. When
        ``it == max_iters`` the threshold may not have been reached.
    """
    V = np.zeros(S)
    it = 0
    while it < max_iters:
        delta = 0.0
        for s in range(S):
            if s in (terminal_plus, terminal_minus):
                continue
            backups = []
            for a in range(A):
                s2, r, done = step_gw(s, a)
                # No bootstrapping past a transition that ends the episode.
                backups.append(r if done else r + gamma * V[s2])
            new_v = max(backups)
            delta = max(delta, abs(V[s] - new_v))
            # In-place update: later states in this sweep already see it.
            V[s] = new_v
        it += 1
        if delta < theta:
            break
    return V, it

# Run value iteration and report the number of sweeps it performed.
V_gw, iters = value_iteration()
# User-facing text restored from mis-decoded UTF-8 (Romanian diacritics).
print(f"Convergență în {iters} iterații.")
print(V_gw.reshape(nrow, ncol))

# Heat map of V(s), each cell annotated with its value.
plt.figure(figsize=(4, 4))
grid = V_gw.reshape(nrow, ncol)
plt.imshow(grid, cmap="viridis")
for r in range(nrow):
    for c in range(ncol):
        # imshow puts columns on the x axis and rows on the y axis.
        plt.text(c, r, f"{grid[r, c]:.2f}", ha="center", va="center", color="white")
plt.title("Value Iteration — V(s)")
plt.axis("off")
plt.colorbar()
plt.show()

---
## 2️⃣ FrozenLake-v1 — Monte Carlo (state-value, episodic, model-free)

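Estimăm `V(s)` ca media returnurilor observate sub o politică aleatoare. *(Notă: celula de mai jos implementează de fapt Monte Carlo control — estimează `Q(s,a)` sub o politică ε-greedy, pornind de la ε = 1, adică explorare complet aleatoare.)*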

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# FrozenLake on the 4x4 map with is_slippery=True.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True)

# Discount factor and number of training episodes.
gamma = 0.99
episodes = 5000

# Exploration schedule: initial epsilon, its floor, and the multiplicative
# decay applied after every episode.
epsilon = 1.0
min_eps = 0.01
decay = 0.999

# Action-value table: an unseen state starts with one zero per action.
Q = defaultdict(lambda: np.zeros(env.action_space.n))
# Running sum and count of returns per (state, action) pair, used to average.
returns_sum = defaultdict(float)
returns_count = defaultdict(int)

def epsilon_greedy_policy(state, eps):
    """Choose an action for ``state`` using the global table ``Q``.

    With probability ``eps`` a random action is sampled from the environment's
    action space; otherwise the action with the highest ``Q[state]`` entry is
    returned (ties resolve to the lowest index, as ``np.argmax`` does).
    """
    explore = np.random.rand() < eps
    if explore:
        return env.action_space.sample()
    greedy_action = np.argmax(Q[state])
    return greedy_action

# Undiscounted total reward of every episode, for the learning curve.
reward_history = []

for ep in range(1, episodes + 1):
    obs, _ = env.reset(seed=ep)
    episode, done = [], False

    # Roll out one full episode with the current epsilon-greedy policy.
    while not done:
        a = epsilon_greedy_policy(obs, epsilon)
        obs2, reward, terminated, truncated, _ = env.step(a)
        episode.append((obs, a, reward))
        obs = obs2
        done = terminated or truncated

    # First-visit Monte Carlo: only the EARLIEST occurrence of each
    # (state, action) pair in the episode contributes its return. The
    # previous "not yet seen" check ran on the reversed trajectory and so
    # recorded the return from the LAST occurrence instead.
    first_visit = {}
    for t, (s, a, _) in enumerate(episode):
        first_visit.setdefault((s, a), t)

    # Walk backwards so the discounted return G accumulates incrementally.
    G = 0.0
    for t, (s, a, r) in reversed(list(enumerate(episode))):
        G = gamma * G + r
        if first_visit[(s, a)] == t:
            returns_sum[(s, a)] += G
            returns_count[(s, a)] += 1
            Q[s][a] = returns_sum[(s, a)] / returns_count[(s, a)]

    # Decay exploration once per episode, never below the floor.
    epsilon = max(min_eps, epsilon * decay)
    reward_history.append(sum(r for _, _, r in episode))

env.close()

# === Final policy ===
# Greedy action per state, laid out on the 4x4 map.
policy = np.array([np.argmax(Q[s]) for s in range(env.observation_space.n)]).reshape(4, 4)
# FrozenLake action order is 0=left, 1=down, 2=right, 3=up. The glyphs are
# written as escapes so each one is exactly one code point: the previous
# mis-decoded literals were three characters long and were silently truncated
# by the "<U1" grid below.
symbols = np.array(["\u2190", "\u2193", "\u2192", "\u2191"])
policy_grid = np.full((4, 4), "\u2022", dtype="<U1")
for i in range(4):
    for j in range(4):
        idx = i * 4 + j
        # A state whose action values are all zero (terminal, or never
        # credited with a return) has no learned preference; keep the bullet
        # placeholder there instead of argmax's arbitrary action 0.
        if np.any(Q[idx]):
            policy_grid[i, j] = symbols[policy[i, j]]

plt.imshow(policy, cmap="Blues")
for i in range(4):
    for j in range(4):
        plt.text(j, i, policy_grid[i, j], ha="center", va="center", color="black", fontsize=14)
plt.title("Politica finală (Monte Carlo Control ε-Greedy)")
plt.axis("off")
plt.show()

# === Reward evolution ===
# 50-episode moving average of the undiscounted episode reward.
plt.plot(np.convolve(reward_history, np.ones(50) / 50, mode="valid"))
# Title restored from mis-decoded UTF-8.
plt.title("Evoluția reward-ului mediu (MC Control)")
plt.xlabel("Episod")
plt.ylabel("Reward mediu")
plt.show()



---
## 3️⃣ CartPole-v1 — Temporal Difference (TD(0))

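Discretizăm observațiile pentru a aproxima `V(s)` tabular. Politică aleatoare (demonstrație). *(Notă: celula de mai jos învață de fapt `Q(s,a)` cu SARSA — TD(0) on-policy — sub o politică ε-greedy cu ε descrescător.)*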

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# CartPole environment and learning hyper-parameters.
env = gym.make("CartPole-v1")

alpha = 0.1    # learning rate
gamma = 0.99   # discount factor

# Exploration schedule: initial epsilon, its floor, and the multiplicative
# decay applied after every episode.
epsilon = 1.0
eps_min = 0.05
eps_decay = 0.995

# Discretization: 10 evenly spaced bin edges for each of the four observation
# components (cart position, cart velocity, pole angle, pole angular velocity
# in the Gymnasium CartPole-v1 layout).
bins = [
    np.linspace(low, high, 10)
    for low, high in ((-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.5, 3.5))
]


def discretize(obs):
    """Map a 4-component observation to a tuple of bin indices.

    Each component is located among its edges with ``np.digitize``, so every
    index lies in ``0..10``; values outside the edge range fall into the first
    or last bin.
    """
    cell = []
    for dim in range(4):
        cell.append(np.digitize(obs[dim], bins[dim]))
    return tuple(cell)

# Action-value table Q(s, a): an unseen state starts with one zero per action.
Q = defaultdict(lambda: np.zeros(env.action_space.n))


def epsilon_greedy(s, eps):
    """Choose an action for discretized state ``s`` using the global ``Q``.

    With probability ``eps`` a random action is sampled from the environment;
    otherwise the index of the largest ``Q[s]`` entry is returned as an int.
    """
    explore = np.random.rand() < eps
    if not explore:
        return int(np.argmax(Q[s]))
    return env.action_space.sample()

# Total reward collected in each episode, for the learning curve.
rewards = []

for ep in range(500):  # raise to 1000 for better convergence
    obs, _ = env.reset(seed=ep)
    s = discretize(obs)
    a = epsilon_greedy(s, epsilon)
    done, total = False, 0

    while not done:
        obs2, r, terminated, truncated, _ = env.step(a)
        s2 = discretize(obs2)
        # On-policy (SARSA): the next action is drawn from the same
        # epsilon-greedy policy and is the one actually executed next.
        a2 = epsilon_greedy(s2, epsilon)
        # A terminal state has value 0, so do not bootstrap from it. The old
        # update always added gamma * Q[s2][a2]; because terminal observations
        # share bins with non-terminal ones, that leaked future value into the
        # failing transition. A time-limit truncation still bootstraps.
        target = r if terminated else r + gamma * Q[s2][a2]
        Q[s][a] += alpha * (target - Q[s][a])
        s, a = s2, a2
        done = terminated or truncated
        total += r

    # Decay exploration once per episode, never below the floor.
    epsilon = max(eps_min, epsilon * eps_decay)
    rewards.append(total)

env.close()

# === Plot: performance over time ===
# 20-episode moving average of the episode reward.
plt.figure(figsize=(7, 4))
plt.plot(np.convolve(rewards, np.ones(20) / 20, mode="valid"))
# Title restored from mis-decoded UTF-8 (em dash).
plt.title("CartPole — TD(0)")
plt.xlabel("Episod")
plt.ylabel("Reward mediu")
plt.grid(True, alpha=0.3)
plt.show()


---
## 4️⃣ MountainCar-v0 — Q-learning cu ε-greedy (ε decay)

In [None]:

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

env = gym.make("MountainCar-v0")

# Hyper-parameters.
alpha = 0.2    # learning rate
gamma = 0.99   # discount factor

# Exploration schedule with a slow decay: initial epsilon, its floor, and the
# multiplicative factor applied after every episode.
epsilon = 1.0
eps_min = 0.01
eps_decay = 0.999

episodes = 5000

# Finer discretization: 40 evenly spaced bin edges per observation component
# (first the car position, then its velocity).
bins = [
    np.linspace(low, high, 40)
    for low, high in ((-1.2, 0.6), (-0.07, 0.07))
]


def discretize(obs):
    """Map a 2-component observation to a tuple of bin indices.

    Each component is located among its edges with ``np.digitize``, so every
    index lies in ``0..40``; values outside the edge range fall into the first
    or last bin.
    """
    cell = []
    for dim in range(2):
        cell.append(np.digitize(obs[dim], bins[dim]))
    return tuple(cell)

# Q-table indexed by (position bin, velocity bin, action). np.digitize yields
# indices 0..len(edges), hence the +1 on each state axis.
Q = np.zeros((len(bins[0]) + 1, len(bins[1]) + 1, env.action_space.n))
# Total (shaped) reward of each episode, for the learning curve.
returns = []

for ep in range(episodes):
    obs, _ = env.reset(seed=ep)
    s = discretize(obs)
    done = False
    total = 0

    while not done:
        # Epsilon-greedy action selection.
        if np.random.rand() < epsilon:
            a = np.random.randint(env.action_space.n)
        else:
            a = int(np.argmax(Q[s]))

        obs2, reward, term, trunc, _ = env.step(a)
        s2 = discretize(obs2)

        # Light reward shaping: a bonus proportional to speed, plus a bonus
        # once the car is close to the hilltop.
        reward += abs(obs2[1]) * 0.5
        if obs2[0] > 0.45:
            reward += 5.0

        # Q-learning target. A terminal state has value 0, so do not
        # bootstrap from it. The old update always used max Q[s2]; terminal
        # positions share a bin with non-terminal ones, which biased the
        # target. A time-limit truncation still bootstraps.
        best_next = 0.0 if term else np.max(Q[s2])
        Q[s][a] += alpha * (reward + gamma * best_next - Q[s][a])
        s = s2
        # NOTE: `total` accumulates the shaped reward, not the raw env reward.
        total += reward
        done = term or trunc

    # Decay exploration once per episode, never below the floor.
    epsilon = max(eps_min, epsilon * eps_decay)
    returns.append(total)

env.close()

# === Visualisation ===
# Rolling mean of the per-episode return over `window` episodes.
window = 100
plt.figure(figsize=(7, 4))
plt.plot(pd.Series(returns).rolling(window).mean())
# Title restored from mis-decoded UTF-8; the f-prefix was dropped because the
# string has no placeholders.
plt.title("MountainCar-v0 — Q-learning")
plt.xlabel("Episod")
plt.ylabel("Reward mediu")
plt.grid(alpha=0.3)
plt.show()


---
## 5️⃣ Multi-Armed Bandit — ε-greedy

In [None]:
# 10-armed Gaussian bandit solved with a fixed-epsilon greedy agent.
np.random.seed(123)

arms = 10
# True mean payoff of each arm, drawn once from a standard normal.
true_q = np.random.normal(0, 1, arms)
Qb = np.zeros(arms)   # running sample-mean estimate per arm
Nb = np.zeros(arms)   # number of pulls per arm
eps = 0.1
T = 1000

rewards = []
for _ in range(T):
    # Explore with probability eps, otherwise pull the best-looking arm.
    explore = np.random.rand() < eps
    arm = np.random.randint(arms) if explore else int(np.argmax(Qb))
    # Noisy payoff: unit-variance Gaussian around the arm's true mean.
    payoff = np.random.normal(true_q[arm], 1.0)
    Nb[arm] += 1
    # Incremental sample-mean update.
    Qb[arm] += (payoff - Qb[arm]) / Nb[arm]
    rewards.append(payoff)

# 20-step rolling mean of the rewards collected by the bandit agent.
plt.plot(pd.Series(rewards).rolling(20).mean())
# Title restored from mis-decoded UTF-8 (em dash).
plt.title("Bandit 10-Arms — reward")
plt.xlabel("Pas")
plt.ylabel("Reward")
plt.show()