<a href="https://colab.research.google.com/github/elangbijak4/LLM-SLM-Examples/blob/main/Demo_GPU_Reinforcement_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
# Memastikan TensorFlow menggunakan GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate

# Environment setup (simple example)
state_space_size = 2
action_space_size = 1  # For simplicity, we consider a single action

# Neural network model
model = Sequential([
    Dense(2, input_dim=state_space_size, activation='relu'),  # Hidden layer
    Dense(action_space_size)  # Output layer
])
model.compile(optimizer=Adam(learning_rate=alpha), loss='mean_squared_error')

# Function to choose action based on epsilon-greedy policy
def choose_action(state):
    if np.random.rand() < epsilon:
        return np.random.randint(action_space_size)
    q_values = model.predict(state)
    return np.argmax(q_values[0])

# Function to update Q-values using the ANN
def update_q_values(state, action, reward, next_state):
    target_q = reward + gamma * np.max(model.predict(next_state)[0])
    q_values = model.predict(state)
    q_values[0][action] = target_q
    model.fit(state, q_values, epochs=1, verbose=0)

# Training loop (simple example)
num_episodes = 5

for episode in range(num_episodes):
    state = np.random.rand(1, state_space_size)  # Initialize state randomly
    total_reward = 0

    for step in range(100):  # Assuming max steps per episode is 100
        action = choose_action(state)
        next_state = np.random.rand(1, state_space_size)  # Random next state
        reward = np.random.rand()  # Random reward

        update_q_values(state, action, reward, next_state)

        state = next_state
        total_reward += reward

        if step == 99:  # End of episode
            break

    if episode % 100 == 0:
        print(f"Episode {episode}: Total Reward: {total_reward}")

print("Training completed.")

Num GPUs Available:  1
Episode 0: Total Reward: 52.114006701491384
