In [40]:
!pip install gymnasium




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [41]:

import os
import glob
import time

import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal #dùng cho môi trường liên tục
from torch.distributions import Categorical #Lựa chọn hành động từ phân phối rời rạc.


import numpy as np

import gymnasium as gym


In [42]:

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [43]:

class RolloutBuffer:
    """Plain container for one rollout of experience.

    Holds five parallel lists (states, actions, log-probabilities, rewards and
    terminal flags) that callers append to directly.
    """

    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        """Empty every list in place, so existing references to them stay valid."""
        for storage in (
            self.states,
            self.actions,
            self.logprobs,
            self.rewards,
            self.is_terminals,
        ):
            storage.clear()


In [44]:

class ActorCritic(nn.Module):
    """Actor and critic networks for a Gaussian policy over continuous actions.

    The actor maps a state to the mean of a multivariate normal whose
    covariance is fixed and diagonal (``action_std ** 2`` on every action
    dimension). The critic maps a state to a single value estimate.

    Args:
        state_dim: Size of the state vector fed to both networks.
        action_dim: Size of the action vector produced by the actor.
        action_std: Standard deviation shared by all action dimensions.
    """

    def __init__(self, state_dim, action_dim, action_std):
        super(ActorCritic, self).__init__()

        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_dim)
        )

        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

        # Registered as a buffer so it follows the module through .to()/.cuda()/
        # .double() instead of being pinned to whatever device was global at
        # construction time. persistent=False keeps it out of state_dict(), so
        # checkpoints keep exactly the same keys as before.
        self.register_buffer(
            "action_var",
            torch.full((action_dim,), float(action_std * action_std)),
            persistent=False,
        )

    def act(self, state):
        """Sample an action for a single state.

        Args:
            state: Array-like state of length ``state_dim``.

        Returns:
            Tuple ``(action, logprob)``: the sampled action as a NumPy array and
            its log-probability as a detached tensor.
        """
        # Place the input on the same device/dtype as the module itself.
        state = torch.as_tensor(
            state, dtype=self.action_var.dtype, device=self.action_var.device
        )
        action_mean = self.actor(state)
        cov_mat = torch.diag(self.action_var)

        dist = MultivariateNormal(action_mean, cov_mat)
        action = dist.sample()
        action_logprob = dist.log_prob(action)

        return action.detach().cpu().numpy(), action_logprob.detach()

    def evaluate(self, state, action):
        """Score a batch of (state, action) pairs under the current policy.

        Args:
            state: Tensor of states, one row per sample.
            action: Tensor of actions, one row per sample.

        Returns:
            Tuple ``(action_logprobs, state_values, dist_entropy)``;
            ``state_values`` keeps the critic's trailing dimension of size 1.
        """
        action_mean = self.actor(state)
        # One diagonal covariance matrix per row of the batch.
        cov_mat = torch.diag_embed(self.action_var.expand_as(action_mean))

        dist = MultivariateNormal(action_mean, cov_mat)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy


In [45]:

class PPO:
    """PPO agent with a clipped surrogate objective for continuous actions.

    Keeps two copies of the network: ``policy`` is trained, ``policy_old`` is
    the frozen copy used to collect experience and is re-synchronised at the
    end of every ``update()``.

    Args:
        state_dim: Size of the state vector.
        action_dim: Size of the action vector.
        lr_actor: Adam learning rate for the actor parameters.
        lr_critic: Adam learning rate for the critic parameters.
        gamma: Discount factor used when computing returns.
        K_epochs: Number of optimisation passes per ``update()``.
        eps_clip: Clipping range for the probability ratio.
        action_std: Standard deviation of the Gaussian policy.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])

        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from ``policy_old`` and record it in the buffer.

        Appends the state, the action and its log-probability to the buffer;
        the caller is expected to append the matching reward and terminal flag.

        Returns:
            The sampled action, as returned by ``ActorCritic.act``.
        """
        with torch.no_grad():
            action, action_logprob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)

        return action

    def update(self):
        """Run ``K_epochs`` PPO optimisation passes over the buffered rollout.

        Afterwards ``policy_old`` is synchronised with ``policy`` and the buffer
        is cleared. Does nothing when no rewards have been recorded.
        """
        buffer = self.buffer
        if not buffer.rewards:
            return

        # Discounted returns, accumulated backwards and reset at every terminal
        # flag. Append-then-reverse keeps this linear (insert(0, ...) is O(n^2)).
        returns = []
        discounted_reward = 0.0
        for reward, is_terminal in zip(reversed(buffer.rewards), reversed(buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0.0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32, device=device)
        # std() of a single element is NaN, so only normalise real batches.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-7)

        # Stack the per-step arrays once; building a tensor straight from a
        # list of ndarrays is very slow.
        states = torch.as_tensor(np.array(buffer.states), dtype=torch.float32, device=device)
        actions = torch.as_tensor(np.array(buffer.actions), dtype=torch.float32, device=device)
        # The log-probs are already tensors (possibly on the GPU): stack them
        # instead of round-tripping through a Python list of scalars.
        old_logprobs = torch.stack(buffer.logprobs).detach().to(device)

        for _ in range(self.K_epochs):
            logprobs_new, state_values, dist_entropy = self.policy.evaluate(states, actions)
            # squeeze(-1) drops only the value dimension, so a one-sample batch
            # keeps its batch axis.
            state_values = state_values.squeeze(-1)

            ratios = torch.exp(logprobs_new - old_logprobs)
            advantages = returns - state_values.detach()

            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            # Clipped policy loss + value loss - entropy bonus.
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, returns) - 0.01 * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        self.policy_old.load_state_dict(self.policy.state_dict())
        self.buffer.clear()


In [46]:
env = gym.make("Pendulum-v1")

# Hyperparameters
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_std = 0.5
lr_actor = 0.0003
lr_critic = 0.001
gamma = 0.99
K_epochs = 80
eps_clip = 0.2
max_timesteps = 200
max_episodes = 1500
update_timestep = 4000

ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std)

time_step = 0
best_reward = -float('inf')  # best single-episode total reward seen so far
print_running_reward = 0

for episode in range(1, max_episodes + 1):
    state, _ = env.reset()
    current_ep_reward = 0

    for t in range(max_timesteps):
        time_step += 1
        action = ppo_agent.select_action(state)
        state, reward, terminated, truncated, _ = env.step(action)

        # An episode ends on termination, on time-limit truncation, or when the
        # step cap of this loop is reached (the env is reset right after).
        # All three must be recorded, otherwise discounted returns computed in
        # update() leak across episode boundaries.
        done = terminated or truncated or t == max_timesteps - 1

        ppo_agent.buffer.rewards.append(reward)
        ppo_agent.buffer.is_terminals.append(done)

        current_ep_reward += reward

        if time_step % update_timestep == 0:
            ppo_agent.update()
            time_step = 0

        if done:
            break

    print(f"Episode {episode} \t Reward: {current_ep_reward:.2f}")

    # Save only when this episode beats the best reward seen so far
    if current_ep_reward > best_reward:
        best_reward = current_ep_reward
        print(f" Saving best model at episode {episode} with reward {current_ep_reward:.2f}")
        torch.save(ppo_agent.policy.state_dict(), "PPO_Pendulum_best.pth")


Episode 1 	 Reward: -1504.28
 Saving best model at episode 1 with reward -1504.28
Episode 2 	 Reward: -1292.81
 Saving best model at episode 2 with reward -1292.81
Episode 3 	 Reward: -1005.65
 Saving best model at episode 3 with reward -1005.65
Episode 4 	 Reward: -953.27
 Saving best model at episode 4 with reward -953.27
Episode 5 	 Reward: -1020.75
Episode 6 	 Reward: -1579.12
Episode 7 	 Reward: -881.90
 Saving best model at episode 7 with reward -881.90
Episode 8 	 Reward: -1470.22
Episode 9 	 Reward: -854.10
 Saving best model at episode 9 with reward -854.10
Episode 10 	 Reward: -898.08
Episode 11 	 Reward: -1142.29
Episode 12 	 Reward: -1077.40
Episode 13 	 Reward: -967.42
Episode 14 	 Reward: -741.13
 Saving best model at episode 14 with reward -741.13
Episode 15 	 Reward: -1792.03
Episode 16 	 Reward: -1133.32
Episode 17 	 Reward: -961.12
Episode 18 	 Reward: -1621.42
Episode 19 	 Reward: -1162.87
Episode 20 	 Reward: -1078.44
Episode 21 	 Reward: -881.75
Episode 22 	 Reward

In [54]:
import time
import torch

def evaluate_policy(env, policy, episodes=1, render=True):
    """Roll out the actor's mean action and return the average episode reward.

    No sampling is done: the action sent to the environment is the actor's
    output itself. Every step reward and every episode total is printed.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step()`` returns ``(state, reward, terminated, truncated, info)``.
        policy: Object exposing an ``actor`` module that maps a batch of states
            to a batch of actions.
        episodes: Number of episodes to run.
        render: When true, call ``env.render()`` and pause after every step.

    Returns:
        The total reward divided by ``episodes``.
    """
    # Feed states on whatever device the actor lives on (CPU if it has no
    # parameters), so evaluation also works when the model is on the GPU.
    first_param = next(policy.actor.parameters(), None)
    actor_device = first_param.device if first_param is not None else torch.device("cpu")

    total_reward = 0.0

    for ep in range(episodes):
        state = env.reset()[0]
        ep_reward = 0.0
        done = False
        print(f"\n--- Evaluation Episode {ep + 1} ---")

        while not done:
            state_tensor = torch.as_tensor(
                state, dtype=torch.float32, device=actor_device
            ).unsqueeze(0)
            with torch.no_grad():
                action_mean = policy.actor(state_tensor)
            action = action_mean.cpu().numpy().flatten()

            state, reward, terminated, truncated, _ = env.step(action)
            # Stop on truncation as well as termination: an environment that
            # only ever truncates would otherwise keep this loop running forever.
            done = terminated or truncated
            ep_reward += reward

            # Print the reward of every step
            print(f"Step reward: {reward:.2f}")

            if render:
                env.render()
                time.sleep(0.1)  # pause 100 ms so the motion is visible

        print(f"Episode {ep + 1} Total Reward: {ep_reward:.2f}")
        total_reward += ep_reward

    avg_reward = total_reward / episodes
    return avg_reward


In [None]:
# Load the best checkpoint. map_location lets a checkpoint saved on the GPU be
# loaded on a CPU-only machine (and vice versa).
ppo_agent.policy.load_state_dict(torch.load("PPO_Pendulum_best.pth", map_location=device))
env = gym.make("Pendulum-v1", render_mode="human")
# Evaluate with on-screen rendering; always close the window, even on interrupt.
try:
    avg_reward = evaluate_policy(env, ppo_agent.policy, episodes=1, render=True)
finally:
    env.close()
avg_reward



--- Evaluation Episode 1 ---
Step reward: -0.10
Step reward: -0.12
Step reward: -0.14
Step reward: -0.16
Step reward: -0.19
Step reward: -0.24
Step reward: -0.29
Step reward: -0.37
Step reward: -0.48
Step reward: -0.63
Step reward: -0.83
Step reward: -1.13
Step reward: -1.56
Step reward: -2.18
Step reward: -3.04
Step reward: -4.23
Step reward: -5.81
Step reward: -7.80
Step reward: -10.17
Step reward: -12.79
Step reward: -15.51
Step reward: -13.41
Step reward: -11.18
Step reward: -8.99
Step reward: -6.97
Step reward: -5.23
Step reward: -3.82
Step reward: -2.74
Step reward: -1.93
Step reward: -1.34
Step reward: -0.92
Step reward: -0.63
Step reward: -0.42
Step reward: -0.28
Step reward: -0.19
Step reward: -0.13
Step reward: -0.08
Step reward: -0.06
Step reward: -0.04
Step reward: -0.02
Step reward: -0.02
Step reward: -0.01
Step reward: -0.01
Step reward: -0.01
Step reward: -0.00
Step reward: -0.00
Step reward: -0.00
Step reward: -0.00
Step reward: -0.00
Step reward: -0.00
Step reward: -0

KeyboardInterrupt: 

: 