![Graph](./seven_states_directed_graph.png)


In [3]:
# Q-Learning implementation for a 7-state directed graph
import numpy as np
import random


In [4]:

# Reward matrix R: shape (7, 7)
R = np.array([
    [-1, -1, -1,  0, -1, -1, -1],
    [-1, -1,  0, -1, -1, -1, -1],
    [-1,  0, -1,  0, -1,  0, -1],
    [ 0, -1,  0, -1,  0, -1, -1],
    [-1, -1, -1,  0, -1,  0, 100],
    [-1, -1,  0, -1,  0, -1, 100],
    [-1, -1, -1, -1, -1, -1, 100],
])

In [5]:
# Initialize Q table
Q = np.zeros_like(R, dtype=float)

In [6]:

gamma = 0.8      # Discount factor
alpha = 0.9      # Learning rate
n_states = R.shape[0]
episodes = 1000  # Number of training episodes

In [7]:
# Training process
for i in range(episodes):
    state = random.randint(0, n_states - 1)
    while state != 6:
        # Get all possible actions
        possible_actions = [a for a in range(n_states) if R[state, a] >= 0]
        if not possible_actions:
            break
        next_state = random.choice(possible_actions)
        Q[state, next_state] = Q[state, next_state] + alpha * (
            R[state, next_state] + gamma * Q[next_state].max() - Q[state, next_state]
        )
        state = next_state

In [8]:
# Normalize Q table for readability
Q_norm = Q / Q.max() * 100

# Print the final Q table
print("Learned Q-Table (Normalized):")
np.set_printoptions(precision=1, suppress=True)
print(Q_norm)

# Derive optimal policy
print("\nOptimal Policy from each state:")
for s in range(n_states):
    best_action = np.argmax(Q[s])
    print(f"From state {s} -> Go to {best_action}")

Learned Q-Table (Normalized):
[[  0.    0.    0.   64.    0.    0.    0. ]
 [  0.    0.   64.    0.    0.    0.    0. ]
 [  0.   51.2   0.   64.    0.   80.    0. ]
 [ 51.2   0.   64.    0.   80.    0.    0. ]
 [  0.    0.    0.   64.    0.   80.  100. ]
 [  0.    0.   64.    0.   80.    0.  100. ]
 [  0.    0.    0.    0.    0.    0.    0. ]]

Optimal Policy from each state:
From state 0 -> Go to 3
From state 1 -> Go to 2
From state 2 -> Go to 5
From state 3 -> Go to 4
From state 4 -> Go to 6
From state 5 -> Go to 6
From state 6 -> Go to 0


Learned Q-Table (Normalized):
[[  0.    0.    0.   64.    0.    0.    0. ]
 [  0.    0.   64.    0.    0.    0.    0. ]
 [  0.   51.2   0.   64.    0.   80.    0. ]
 [ 51.2   0.   64.    0.   80.    0.    0. ]
 [  0.    0.    0.   64.    0.   80.  100. ]
 [  0.    0.   64.    0.   80.    0.  100. ]
 [  0.    0.    0.    0.    0.    0.    0. ]]

Optimal Policy from each state:
From state 0 -> Go to 3
From state 1 -> Go to 2
From state 2 -> Go to 5
From state 3 -> Go to 4
From state 4 -> Go to 6
From state 5 -> Go to 6
From state 6 -> Go to 0
