In [67]:
import numpy as np

In [68]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class DQN(nn.Module):
    """Two-layer fully connected network mapping a state to one Q-value per action.

    The defaults reproduce the original fixed architecture (1 input, 10 hidden
    units, 2 outputs), so ``DQN()`` behaves exactly as before.

    Args:
        state_dim: Number of input features describing a state.
        hidden_dim: Number of units in the single hidden layer.
        n_actions: Number of actions, i.e. number of Q-values produced.
    """

    def __init__(self, state_dim: int = 1, hidden_dim: int = 10, n_actions: int = 2):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)  # state -> hidden features
        self.fc2 = nn.Linear(hidden_dim, n_actions)  # hidden features -> Q-value per action

    def forward(self, x):
        """Return the Q-values for ``x``; the last dimension of ``x`` must be ``state_dim``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)


In [108]:
# I wrote this function (not from ChatGPT)
def take_action(state, action):
    """Advance the 5-cell corridor environment by one step.

    Args:
        state: Single-element tensor holding the current cell index.
        action: 1 moves one cell to the right, 0 moves one cell to the left.

    Returns:
        A ``(new_state, reward, done)`` tuple where ``new_state`` is a float32
        1D tensor with the new cell index (kept inside 0..4), ``reward`` is 10
        for landing on cells 0-3 and 25 for landing on cell 4, and ``done`` is
        1 when the new cell is 4, otherwise 0.

    NOTE(review): every non-terminal step also pays 10, so with a discount
    factor close to 1 an agent may collect more by never reaching cell 4 --
    confirm this is the intended incentive.
    """
    cell_rewards = (10, 10, 10, 10, 25)
    goal_cell = len(cell_rewards) - 1

    current_cell = int(state.item())  # Pull the plain integer out of the tensor

    # action 1 -> +1, action 0 -> -1; then clamp to the corridor bounds
    target_cell = current_cell + (2 * action - 1)
    target_cell = max(0, min(goal_cell, target_cell))

    return (
        torch.tensor([target_cell], dtype=torch.float32),
        cell_rewards[target_cell],
        1 if target_cell == goal_cell else 0,
    )

In [111]:
# Seed both RNGs so that Restart Kernel -> Run All reproduces the same training run.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

policy_net = DQN()
optimizer = optim.Adam(policy_net.parameters())
gamma = 0.99  # Discount factor
num_episodes = 500
epsilon = 0.1  # Exploration rate for epsilon-greedy action selection
max_steps = 100  # Limit on the number of steps per episode

for episode in range(num_episodes):
    state = torch.tensor([np.random.randint(5)], dtype=torch.float32)  # Start at a random state
    for t in range(max_steps):
        # Select action using epsilon-greedy
        if np.random.rand() < epsilon:
            action = np.random.randint(2)
        else:
            with torch.no_grad():
                # policy_net(state) is a 1D tensor with one Q-value per action.
                # max(0) reduces over that single dimension and returns (value, index);
                # [1] picks the index (the greedy action) and .item() turns it into a Python int.
                action = policy_net(state).max(0)[1].item()

        # Take action and observe next_state, reward, done.
        # take_action already returns next_state as a float32 1D tensor, so it is used as-is.
        next_state, reward, done = take_action(state, action)

        # Bootstrapped target: reward + gamma * max_a Q(next_state, a), with the
        # bootstrap term dropped on terminal transitions. Evaluated under no_grad so
        # no gradient flows through the target and no autograd graph is built for it.
        with torch.no_grad():
            next_state_value = policy_net(next_state).max(0)[0]
        expected_q_value = reward + gamma * next_state_value * (1 - done)

        # Q-value of the action that was actually taken (this one carries gradients)
        q_value = policy_net(state)[action]

        # Compute loss and update policy_net
        loss = F.smooth_l1_loss(q_value, expected_q_value)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if done:
            break
        state = next_state


In [113]:
# Print the network's Q-value pair for each of the five states (0 through 4).
print(*(policy_net(torch.tensor([cell], dtype=torch.float32)) for cell in range(5)))

tensor([1001.3024, 1001.3151], grad_fn=<AddBackward0>) tensor([1001.0605, 1000.9422], grad_fn=<AddBackward0>) tensor([1000.8186, 1000.5692], grad_fn=<AddBackward0>) tensor([1000.5767, 1000.1962], grad_fn=<AddBackward0>) tensor([1000.3348,  999.8232], grad_fn=<AddBackward0>)
