In [1]:
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

# Step 1: Define the simple environment
# A one-dimensional track of five cells; the agent starts at cell 0 (see the
# training loop) and the treasure sits in the right-most cell.
states = list(range(5))
actions = [0, 1]  # 0 = step left, 1 = step right
treasure_state = states[-1]

# Step 2: Build the Neural Network
def build_q_network():
    """Create the Q-value approximator used by the agent.

    The network takes a single scalar (the state index), passes it through
    one 16-unit ReLU hidden layer, and emits two linear outputs: one
    Q-value per action (left, right).

    Returns:
        An uncompiled Keras ``Sequential`` model; the caller is responsible
        for the optimizer and loss.
    """
    network_layers = [
        layers.Input(shape=(1,)),             # one value: the state
        layers.Dense(16, activation='relu'),  # hidden layer
        layers.Dense(2),                      # one Q-value per action
    ]
    return models.Sequential(network_layers)

# Hyperparameters
episodes = 1000  # Number of training episodes
epsilon = 0.2    # Probability of picking a random action (exploration)
gamma = 0.9      # Discount factor

# The Q-network plus the loss and optimizer used for its manual gradient steps
q_network = build_q_network()
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = optimizers.Adam(learning_rate=0.01)

# Helper: get reward
def get_reward(state):
    """Return the reward for arriving in ``state``.

    Reaching the treasure pays 10; every other state costs -1.
    """
    return 10 if state == treasure_state else -1

# Step 3: DQN Training Loop
# Plain online Q-learning with a neural network: there is no replay buffer and
# no separate target network, so every environment step is followed by one
# gradient update on that single transition.
for episode in range(episodes):
    state = 0  # Start at state 0

    while state != treasure_state:
        state_tensor = np.array([[state]], dtype=np.float32)

        # A single forward pass serves both the greedy choice and the base of
        # the training target. The model is called directly because
        # Model.predict() is meant for batch inference over many inputs and
        # adds considerable overhead when invoked once per step.
        q_values = q_network(state_tensor).numpy()

        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)
        else:
            action = int(np.argmax(q_values[0]))

        # Take action: 0 moves left, otherwise move right; both are clamped
        # to the ends of the track.
        if action == 0:
            next_state = max(0, state - 1)
        else:
            next_state = min(treasure_state, state + 1)

        reward = get_reward(next_state)

        # Bellman target for the action taken. The treasure state ends the
        # episode and is never trained on, so the network's output there is
        # arbitrary; bootstrapping from it would inject noise into the
        # target. A terminal transition is therefore worth just its reward.
        if next_state == treasure_state:
            td_target = reward
        else:
            next_state_tensor = np.array([[next_state]], dtype=np.float32)
            next_q_values = q_network(next_state_tensor).numpy()
            td_target = reward + gamma * np.max(next_q_values)

        # Only the chosen action's entry differs from the current prediction,
        # so the other output contributes zero error to the loss.
        target_q = q_values.copy()
        target_q[0, action] = td_target

        # Train step
        with tf.GradientTape() as tape:
            q_pred = q_network(state_tensor, training=True)
            loss = loss_fn(target_q, q_pred)

        grads = tape.gradient(loss, q_network.trainable_variables)
        optimizer.apply_gradients(zip(grads, q_network.trainable_variables))

        state = next_state

print("\nTraining Done!")

# Step 4: Test the agent
# Follow the greedy policy (no exploration) from state 0 and record the path.
state = 0
path = [state]

# A poorly trained greedy policy can oscillate or push against the left wall
# forever, so the rollout is capped instead of looping unconditionally.
max_test_steps = 10 * len(states)
steps_taken = 0

while state != treasure_state and steps_taken < max_test_steps:
    state_tensor = np.array([[state]], dtype=np.float32)
    # Direct call instead of Model.predict(): cheaper for a single sample.
    q_values = q_network(state_tensor).numpy()
    action = int(np.argmax(q_values[0]))
    if action == 0:
        state = max(0, state - 1)
    else:
        state = min(treasure_state, state + 1)
    path.append(state)
    steps_taken += 1

if state == treasure_state:
    print(f"Path to treasure: {path}")
else:
    print(
        f"Agent did not reach the treasure within {max_test_steps} steps; "
        f"path so far: {path}"
    )


2025-04-30 11:23:33.123114: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



Training Done!
Path to treasure: [0, 1, 2, 3, 4]
