In [6]:
import numpy as np
import random

# 🛠 Step 1: Setup the Environment
# Here, we define the "world" where the agent moves.
# The world is a one-dimensional corridor of five positions, numbered 0 to 4.
states = list(range(5))
# Action codes: 0 steps one position to the left, 1 steps one to the right.
actions = list(range(2))
# The treasure sits in the last position; reaching it ends an episode.
treasure_state = states[-1]

# 📄 Step 2: Initialize the Q-Table
# One row per state, one column per action. Entry [s, a] holds the agent's
# current estimate of how good it is to take action `a` while in state `s`.
# Everything starts at zero because nothing has been learned yet.
q_table = np.zeros(shape=(len(states), len(actions)), dtype=float)
print(q_table)  # Show the blank table before any training happens.

# 🔧 Hyperparameters that control learning
# Step size of each Q-value update: the weight given to the new estimate.
learning_rate = 0.1
# Weight applied to the best estimated value of the next state.
discount_factor = 0.9
# Chance, on every step, of picking a random action instead of the greedy one.
epsilon = 0.2
# Number of start-to-treasure runs used for training.
episodes = 1000

# 🎯 Step 3: Define How Rewards Are Given
# Rewards motivate the agent! Good actions should give high rewards, bad actions low or negative rewards.
def get_reward(state):
    """Return the reward earned for arriving in ``state``.

    Arriving at ``treasure_state`` pays 10. Arriving anywhere else costs 1,
    so shorter routes to the treasure accumulate a higher total reward.
    """
    return 10 if state == treasure_state else -1

# 🧠 Step 4: The Main Q-Learning Algorithm (Learning happens here)
for episode in range(episodes):  # One pass per training episode.
    state = 0  # Every episode begins at the left end of the corridor.

    # An episode lasts until the agent steps onto the treasure.
    while state != treasure_state:
        # Epsilon-greedy choice: with probability `epsilon` take a random
        # action, otherwise take the action with the highest Q-value so far.
        explore = random.uniform(0, 1) < epsilon
        action = random.choice(actions) if explore else np.argmax(q_table[state])

        # Apply the move; both ends of the corridor act as walls.
        next_state = (
            max(0, state - 1) if action == 0 else min(treasure_state, state + 1)
        )

        # The reward depends only on the state the agent lands in.
        reward = get_reward(next_state)

        # Q-learning update: blend the previous estimate with the freshly
        # observed reward plus the discounted best value of the next state.
        td_target = reward + discount_factor * np.max(q_table[next_state])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action] + learning_rate * td_target
        )

        # Continue the episode from where the agent landed.
        state = next_state

# 📊 Step 5: After all episodes, print the Learned Q-Table
# Header and table go out in one call; `sep` puts each on its own line.
print("Learned Q-Table:", q_table, sep="\n")

# 🧪 Step 6: Test Run: Let's See How the Agent Moves Now
print("\nTest Run:")
state = 0  # Start from the beginning.
path = [state]  # Every state visited, including the starting one.
taken_actions = []  # Action chosen for each move; taken_actions[i] led to path[i + 1].

# The greedy policy is deterministic, so a successful run never revisits a
# state and needs fewer than len(states) moves. Capping the rollout there
# keeps an under-trained table (e.g. one that keeps choosing "left" at
# state 0) from looping forever.
max_steps = len(states)

while state != treasure_state and len(taken_actions) < max_steps:
    action = int(np.argmax(q_table[state]))  # Best known action, no exploration.
    if action == 0:
        state = max(0, state - 1)
    else:
        state = min(treasure_state, state + 1)
    path.append(state)
    taken_actions.append(action)

# 🎉 Show the path the agent learned to follow to reach the goal!
print("\nPath to treasure: ")
action_names = {0: "left", 1: "right"}
# The starting state was not reached by any move, so it gets its own label
# rather than being reported as a "left" step.
steps = [f"start {path[0]}"]
steps.extend(
    f"{action_names[move]} {landed}" for move, landed in zip(taken_actions, path[1:])
)
reached_treasure = state == treasure_state
print(", ".join(steps) + (" 🎉" if reached_treasure else ""), end="")
if not reached_treasure:
    print(f"\nStopped after {max_steps} moves without reaching the treasure.")


[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
Learned Q-Table:
[[ 3.12193797  4.58      ]
 [ 3.12196903  6.2       ]
 [ 4.57999575  8.        ]
 [ 6.19996566 10.        ]
 [ 0.          0.        ]]

Test Run:

Path to treasure: 
left 0, right 1, right 2, right 3, right 4 🎉