![Graph](./seven_states_directed_graph.png)

In [None]:
# SARSA implementation for a 7-state directed graph
import numpy as np
import random

In [None]:

# Reward matrix R for the 7-state directed graph.
# R[s, a] is the immediate reward for moving from state s to state a:
#    -1 : there is no edge s -> a (choose_action never selects it)
#     0 : the edge exists but gives no immediate reward
#   100 : the edge leads into state 6, where a training episode ends
R = np.full((7, 7), -1)
for _src, _dst, _reward in [
    (0, 3, 0),
    (1, 2, 0),
    (2, 1, 0), (2, 3, 0), (2, 5, 0),
    (3, 0, 0), (3, 2, 0), (3, 4, 0),
    (4, 3, 0), (4, 5, 0), (4, 6, 100),
    (5, 2, 0), (5, 4, 0), (5, 6, 100),
    (6, 6, 100),
]:
    R[_src, _dst] = _reward

In [None]:
# One state per row of the reward matrix.
n_states = len(R)
# Q-table with the same shape as R, all action values starting at 0.0.
Q = np.zeros(R.shape, dtype=float)

In [None]:
# Hyperparameters (module-level; read by choose_action and the training loop)
alpha = 0.9      # Learning rate: fraction of the TD error applied in each SARSA update
gamma = 0.8      # Discount factor: weight of the next state-action value in the target
epsilon = 0.1    # Probability that choose_action picks a random valid action (exploration)
episodes = 5000  # Number of training episodes; SARSA is slow to converge

In [None]:
# Epsilon-greedy action selection

def choose_action(state):
    """Pick the next move from ``state`` with an epsilon-greedy rule.

    A move ``a`` is allowed when ``R[state, a] >= 0``. With probability
    ``epsilon`` one allowed move is drawn uniformly at random; otherwise
    the allowed move with the largest ``Q[state, a]`` is returned, with
    ties broken uniformly at random.

    Reads the module-level ``R``, ``Q``, ``n_states`` and ``epsilon``.
    Raises ``IndexError`` (from ``random.choice``) when ``state`` has no
    allowed move.
    """
    allowed = [nxt for nxt in range(n_states) if R[state, nxt] >= 0]

    # Explore: any allowed move, uniformly.
    if random.random() < epsilon:
        return random.choice(allowed)

    # Exploit: best-valued allowed move; collect every tie so the choice
    # among equally good moves stays random.
    row = Q[state]
    top = np.max(row[allowed]) if allowed else -np.inf
    greedy = [nxt for nxt in allowed if row[nxt] == top]
    return random.choice(greedy)

In [None]:

# SARSA training loop.
# Every episode starts in a uniformly random state and follows the
# epsilon-greedy policy until state 6 is reached.
for _ in range(episodes):
    state = random.randrange(n_states)
    action = choose_action(state)

    while state != 6:
        # An action is the index of the state being moved to.
        next_state = action
        next_action = choose_action(next_state)

        # On-policy TD update: the target bootstraps from the action that
        # will actually be taken next, not from the greedy one.
        td_target = R[state, action] + gamma * Q[next_state, next_action]
        td_error = td_target - Q[state, action]
        Q[state, action] += alpha * td_error

        state, action = next_state, next_action

In [None]:
# Normalize Q-table for easier interpretation: rescale so the largest entry
# becomes 100. When nothing has been learned yet (largest entry <= 0, e.g.
# zero training episodes) the division would fill the table with NaN/inf,
# so the raw values are shown instead.
q_max = Q.max()
Q_normalized = Q / q_max * 100 if q_max > 0 else Q.copy()

# Print normalized Q-table
import pandas as pd
pd.set_option("display.precision", 2)
print("\n✅ Learned SARSA Q-table (normalized):\n")
print(pd.DataFrame(Q_normalized))

In [None]:


# Print optimal policy derived from Q-table.
# Only moves allowed by R (reward >= 0) compete in the argmax. Q entries for
# missing edges are never updated and stay at 0, so an unmasked argmax can
# report a move that does not exist in the graph -- in particular the row of
# state 6 is never updated (all zeros) and would always yield "state 0".
print("\n📌 Optimal policy from each state:")
for s in range(n_states):
    masked_q = np.where(R[s] >= 0, Q[s], -np.inf)
    best_a = np.argmax(masked_q)
    print(f"From state {s} ➜ go to state {best_a}")
