In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

# Define the PGN Network
class PGN(nn.Module):
    def __init__(self):
        super(PGN, self).__init__()
        self.fc1 = nn.Linear(4, 24)  # CartPole state is 4-dimensional
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, 2)  # Two actions: push cart left or right

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.softmax(self.fc3(x), dim=1) # Need softmax for PG to output probabilities 
        
        return x

In [None]:
# Initialize environment, model, and optimizer
env = gym.make('CartPole-v1')
model = PGN()
learning_rate = 0.0001
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

# Training parameters
num_episodes = 10000
gamma = 0.90 # Discount factor

for episode in range(num_episodes):
    state = env.reset()[0]
    
    last_episode = []
    rewards = [] 
    actions = []
    
    # Simulate a full episode
    for t in range(1, 10000):  
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        last_episode.append(state_tensor)

        with torch.no_grad():
            # Select action from action probability dist. 
            action = torch.multinomial(model(state_tensor), 1).item()
            actions.append(action)
        
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        
        if done:
            reward = -5
        
        state = next_state

        ############
        rewards.append(reward)
        ############

        if done:
            break
            
    # Train on past episode 
    rev_cumul_reward = 0
    episode_loss = 0
    for ridx, state in enumerate(reversed(last_episode)): # traverse list in reverse
        rev_cumul_reward *= gamma
        rev_cumul_reward += rewards[-ridx-1]

        # This line below is crucial need action to be same as one from step in episode (to boost or reduce its prob)
        # use target model so that action probs are same as at initial eval time! (nope just treat full episode as minibatch)
        # weird target model makes it much worse even with target_update = 1
        action_probability = model(last_episode[-ridx-1])[0][actions[-ridx-1]]  # should make this more readable

        log_policy = torch.log(action_probability) # Will always be negative 
        
        loss = -1 * log_policy * rev_cumul_reward
        episode_loss += loss

    episode_loss.backward()
    optimizer.step()
    episode_loss = 0

    if episode % 50 == 0:
        print(t)
        
        
        

env.close()

16
10
13
21
31
25
18
53
13
13
23
22
19
19
48
41
23
39
18
39
42
27
19
25
28
14
35
12
28
18
12
16
21
17
16
32
13
11
43
31
12
24
32
30
31
102
15
20
36
20
17
17
76
15
18
29
47
19
42
11
46
25
20
48
58
29
29
32
22
22
80
35
16
23
20
54
14
21
41
26
106
14
45
40
61
21
28
45
59
22
32
17
33
28
57
43
51
56
124
13
50
68
19
153
73
36
110
28
63
103
58
28
57
35
57
58
29
62
55
74
106
38
87
43
47
77
38
109
103
76
164
79
109
78
175
45
44
146
60
139
118
144
48
144
182
144
136
60
40
195
132
88
70
146
31
138
73
130
131
113
144
103
191
217
127
94
134
158
119
128
161
205
141
157
93
304
270
109
107
185
112
239


In [19]:
print(episode_loss)

tensor(266.2072, grad_fn=<AddBackward0>)
