# Version 6

In [1]:
# Jerry Jiang

# V6 reward: Confidence-driven reward with entropy penalty
# Reward = (1.0 + 0.1 × confidence) if correct, else 0
# Then subtract 0.05 × entropy (applied to all samples)

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm
import numpy as np
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Global Variable
Version = "V6"
bert_model_path = "../Model/sentiment_bert"
train_data_path = "../Dataset/train_preprocessed.csv"
supervised_model_path = "../Model/policy_net_supervised.pt"
save_model_path = f"../Model/{Version}"
logs_path = f"../Logs/{Version}"

In [4]:
from transformers import BertTokenizer, BertModel, BertConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using BERT model from: {bert_model_path}")

tokenizer = BertTokenizer.from_pretrained(str(bert_model_path), local_files_only=True)
config = BertConfig.from_pretrained(str(bert_model_path), output_hidden_states=True, local_files_only=True)
bert = BertModel.from_pretrained(str(bert_model_path), config=config, local_files_only=True).to(device)
bert.eval()


Using BERT model from: ../Model/sentiment_bert


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.3, inplace=False

In [5]:
train_data = pd.read_csv(train_data_path)
texts = train_data["Phrase"].astype(str).tolist()
labels = train_data["Sentiment"].tolist()

encodings = tokenizer(
    texts,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)

In [6]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = SentimentDataset(encodings, labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [7]:
# Policy (Actor) network
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=128, output_dim=5):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # logits

# Value (Critic) network
class ValueNetwork(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=128):
        super(ValueNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x).squeeze()

In [8]:
# === Step 1: Initialize policy network from supervised model ===
policy_net = PolicyNetwork().to(device)
policy_net.load_state_dict(torch.load(supervised_model_path))
policy_net.train()

# === Step 2: Evaluate initial accuracy and loss before RL training ===
from sklearn.metrics import accuracy_score

policy_net.eval()

all_preds = []
all_labels = []
total_loss = 0

with torch.no_grad():
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]

        logits = policy_net(cls_embeddings)
        loss = F.cross_entropy(logits, labels)
        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

loss_before = total_loss / len(train_loader)
acc_before = accuracy_score(all_labels, all_preds)

print(f"[Before RL] Accuracy: {acc_before:.4f} | CrossEntropy Loss: {loss_before:.4f}")

policy_net.train()

# === Step 3: Initialize value network and optimizers ===
value_net = ValueNetwork().to(device)
actor_optimizer = optim.Adam(policy_net.parameters(), lr=1e-5)
critic_optimizer = optim.Adam(value_net.parameters(), lr=1e-5)


  policy_net.load_state_dict(torch.load(supervised_model_path))


[Before RL] Accuracy: 0.7262 | CrossEntropy Loss: 0.6818


In [9]:
# V6 reward: Confidence-driven reward with entropy penalty
# Reward = (1.0 + 0.1 × confidence) if correct, else 0
# Then subtract 0.05 × entropy (applied to all samples)

def compute_reward(preds, labels):
    probs = torch.softmax(preds, dim=1)
    pred_labels = torch.argmax(probs, dim=1)
    correct = (pred_labels == labels).float()

    confidence = probs[range(len(labels)), pred_labels]
    entropy = -torch.sum(probs * torch.log(probs + 1e-8), dim=1)

    reward = correct * (1.0 + 0.1 * confidence) - 0.05 * entropy
    return reward


def compute_entropy(logits):
    prob = torch.softmax(logits, dim=1)
    entropy = -torch.sum(prob * torch.log(prob + 1e-8), dim=1)
    return entropy.mean().item()

# 1. A2C Begin Training

In [10]:
epochs = 7
train_logs = {
    "loss": [],
    "reward": [],
    "accuracy": [],
    "entropy": []
}

for epoch in range(epochs):
    total_loss = 0
    total_reward = 0
    total_entropy = 0
    correct = 0
    total = 0

    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            output = bert(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = output.last_hidden_state[:, 0, :]

        # ---- Actor forward
        logits = policy_net(cls_embeddings)
        log_probs = torch.log_softmax(logits, dim=1)
        probs = torch.exp(log_probs)
        sampled_action = torch.multinomial(probs, num_samples=1).squeeze()
        log_prob = log_probs[range(len(sampled_action)), sampled_action]

        # ---- Critic forward
        value = value_net(cls_embeddings)  # [B]
        reward = compute_reward(logits, labels)
        advantage = reward - value.detach()

        # ---- Losses
        policy_loss = - (log_prob * advantage).mean()
        value_loss = F.mse_loss(value, reward)
        total_batch_loss = policy_loss + value_loss

        # ---- Accuracy and entropy
        pred = torch.argmax(logits, dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)
        entropy = compute_entropy(logits)

        # ---- Backprop
        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        total_batch_loss.backward()
        actor_optimizer.step()
        critic_optimizer.step()

        total_loss += total_batch_loss.item()
        total_reward += reward.mean().item()
        total_entropy += entropy

    epoch_acc = correct / total
    epoch_loss = total_loss
    epoch_reward = total_reward / len(train_loader)
    epoch_entropy = total_entropy / len(train_loader)

    train_logs["loss"].append(epoch_loss)
    train_logs["reward"].append(epoch_reward)
    train_logs["accuracy"].append(epoch_acc)
    train_logs["entropy"].append(epoch_entropy)

    print(f"[Epoch {epoch+1}] Loss: {epoch_loss:.4f} | Reward: {epoch_reward:.4f} | Accuracy: {epoch_acc:.4f} | Entropy: {epoch_entropy:.4f}")

# Save A2C model and value
torch.save(policy_net.state_dict(), os.path.join(save_model_path, "policy_net_rl_a2c_" + Version + ".pt"))
torch.save(value_net.state_dict(), os.path.join(save_model_path, "value_net_rl_a2c_" + Version + ".pt"))

with open(os.path.join(logs_path, "a2c_" + Version + ".json"), "w") as f:
    json.dump(train_logs, f, indent=2)

print("Saved A2C policy model to:", os.path.join(save_model_path, "policy_net_rl_a2c_" + Version + ".pt"))
print("Saved A2C value model to:", os.path.join(save_model_path, "value_net_rl_a2c_" + Version + ".pt"))
print("Saved A2C logs to:", os.path.join(logs_path, "a2c_" + Version + ".json"))

# compare final result
acc_after = train_logs["accuracy"][-1]
acc_change = acc_after - acc_before
acc_pct = (acc_change / acc_before) * 100 if acc_before > 0 else 0

print(f"[Comparison to Supervised]")
print(f"Accuracy Before: {acc_before:.4f} | After: {acc_after:.4f} | Δ: {acc_change:+.4f} ({acc_pct:+.2f}%)")

100%|██████████| 5853/5853 [05:16<00:00, 18.47it/s]


[Epoch 1] Loss: 1218.0549 | Reward: 0.4116 | Accuracy: 0.4501 | Entropy: 1.2381


100%|██████████| 5853/5853 [05:14<00:00, 18.61it/s]


[Epoch 2] Loss: 336.3214 | Reward: -0.0128 | Accuracy: 0.0652 | Entropy: 1.5919


100%|██████████| 5853/5853 [05:13<00:00, 18.64it/s]


[Epoch 3] Loss: 274.0104 | Reward: -0.0279 | Accuracy: 0.0510 | Entropy: 1.6003


100%|██████████| 5853/5853 [05:12<00:00, 18.70it/s]


[Epoch 4] Loss: 242.7623 | Reward: -0.0342 | Accuracy: 0.0450 | Entropy: 1.6028


100%|██████████| 5853/5853 [05:13<00:00, 18.67it/s]


[Epoch 5] Loss: 272.3822 | Reward: -0.0276 | Accuracy: 0.0514 | Entropy: 1.6041


100%|██████████| 5853/5853 [05:13<00:00, 18.67it/s]


[Epoch 6] Loss: 269.4605 | Reward: -0.0283 | Accuracy: 0.0509 | Entropy: 1.6051


100%|██████████| 5853/5853 [05:14<00:00, 18.64it/s]

[Epoch 7] Loss: 215.7213 | Reward: -0.0402 | Accuracy: 0.0393 | Entropy: 1.6059
Saved A2C policy model to: ../Model/V6\policy_net_rl_a2c_V6.pt
Saved A2C value model to: ../Model/V6\value_net_rl_a2c_V6.pt
Saved A2C logs to: ../Logs/V6\a2c_V6.json
[Comparison to Supervised]
Accuracy Before: 0.7262 | After: 0.0393 | Δ: -0.6869 (-94.59%)





# 2. REINFORCE Begin

In [11]:
# REINFORCE
policy_net = PolicyNetwork().to(device)
policy_net.load_state_dict(torch.load(supervised_model_path))
policy_net.train()

value_net = None  # REINFORCE does not use value network
actor_optimizer = torch.optim.Adam(policy_net.parameters(), lr=2e-5)

  policy_net.load_state_dict(torch.load(supervised_model_path))


In [12]:
train_logs = {"loss": [], "reward": [], "accuracy": [], "entropy": []}
epochs = 7

for epoch in range(epochs):
    total_loss, total_reward, total_entropy, correct, total = 0, 0, 0, 0, 0

    for batch in tqdm(train_loader, desc=f"REINFORCE Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = bert(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeds = outputs.last_hidden_state[:, 0, :]

        logits = policy_net(cls_embeds)
        log_probs = torch.log_softmax(logits, dim=1)
        probs = torch.exp(log_probs)
        sampled_action = torch.multinomial(probs, num_samples=1).squeeze()
        log_prob = log_probs[range(len(sampled_action)), sampled_action]

        reward = compute_reward(logits, labels)
        entropy = compute_entropy(logits)

        loss = - (log_prob * reward.detach()).mean()
        actor_optimizer.zero_grad()
        loss.backward()
        actor_optimizer.step()

        total_loss += loss.item()
        total_reward += reward.mean().item()
        total_entropy += entropy
        pred = torch.argmax(logits, dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)

    acc = correct / total
    train_logs["loss"].append(total_loss)
    train_logs["reward"].append(total_reward / len(train_loader))
    train_logs["accuracy"].append(acc)
    train_logs["entropy"].append(total_entropy / len(train_loader))

    print(f"[REINFORCE][Epoch {epoch+1}] Loss: {total_loss:.4f} | Reward: {train_logs['reward'][-1]:.4f} | Acc: {acc:.4f}")

# Save REINFORCE policy only
torch.save(policy_net.state_dict(), os.path.join(save_model_path, "policy_net_rl_reinforce_" + Version + ".pt"))

with open(os.path.join(logs_path, "reinforce_" + Version + ".json"), "w") as f:
    json.dump(train_logs, f, indent=2)

print("Saved REINFORCE policy model to:", os.path.join(save_model_path, "policy_net_rl_reinforce_" + Version + ".pt"))
print("Saved REINFORCE logs to:", os.path.join(logs_path, "reinforce_" + Version + ".json"))

# compare final result
acc_after = train_logs["accuracy"][-1]
acc_change = acc_after - acc_before
acc_pct = (acc_change / acc_before) * 100 if acc_before > 0 else 0

print(f"[Comparison to Supervised]")
print(f"Accuracy Before: {acc_before:.4f} | After: {acc_after:.4f} | Δ: {acc_change:+.4f} ({acc_pct:+.2f}%)")

REINFORCE Epoch 1: 100%|██████████| 5853/5853 [05:11<00:00, 18.81it/s]


[REINFORCE][Epoch 1] Loss: 2432.5046 | Reward: 0.7387 | Acc: 0.7143


REINFORCE Epoch 2: 100%|██████████| 5853/5853 [05:17<00:00, 18.44it/s]


[REINFORCE][Epoch 2] Loss: 2158.9013 | Reward: 0.7276 | Acc: 0.7005


REINFORCE Epoch 3: 100%|██████████| 5853/5853 [05:16<00:00, 18.49it/s]


[REINFORCE][Epoch 3] Loss: 1996.8835 | Reward: 0.6805 | Acc: 0.6564


REINFORCE Epoch 4: 100%|██████████| 5853/5853 [05:12<00:00, 18.73it/s]


[REINFORCE][Epoch 4] Loss: 2469.4907 | Reward: 0.7109 | Acc: 0.6904


REINFORCE Epoch 5: 100%|██████████| 5853/5853 [05:09<00:00, 18.90it/s]


[REINFORCE][Epoch 5] Loss: 2485.3098 | Reward: 0.7163 | Acc: 0.6958


REINFORCE Epoch 6: 100%|██████████| 5853/5853 [05:10<00:00, 18.86it/s]


[REINFORCE][Epoch 6] Loss: 2951.1070 | Reward: 0.7003 | Acc: 0.6888


REINFORCE Epoch 7: 100%|██████████| 5853/5853 [05:10<00:00, 18.83it/s]

[REINFORCE][Epoch 7] Loss: 2382.9561 | Reward: 0.6731 | Acc: 0.6574
Saved REINFORCE policy model to: ../Model/V6\policy_net_rl_reinforce_V6.pt
Saved REINFORCE logs to: ../Logs/V6\reinforce_V6.json
[Comparison to Supervised]
Accuracy Before: 0.7262 | After: 0.6574 | Δ: -0.0687 (-9.47%)





# 3. REINFORCE_Baseline Begin

In [13]:
# REINFORCE_Baseline
policy_net = PolicyNetwork().to(device)
policy_net.load_state_dict(torch.load(supervised_model_path))
policy_net.train()

value_net = ValueNetwork().to(device)
actor_optimizer = torch.optim.Adam(policy_net.parameters(), lr=2e-5)
critic_optimizer = torch.optim.Adam(value_net.parameters(), lr=2e-5)

  policy_net.load_state_dict(torch.load(supervised_model_path))


In [14]:
train_logs = {"loss": [], "reward": [], "accuracy": [], "entropy": []}
epochs = 7

for epoch in range(epochs):
    total_loss, total_reward, total_entropy, correct, total = 0, 0, 0, 0, 0

    for batch in tqdm(train_loader, desc=f"REINFORCE_Baseline Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)


        with torch.no_grad():
            outputs = bert(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeds = outputs.last_hidden_state[:, 0, :]

        logits = policy_net(cls_embeds)
        log_probs = torch.log_softmax(logits, dim=1)
        probs = torch.exp(log_probs)
        sampled_action = torch.multinomial(probs, num_samples=1).squeeze()
        log_prob = log_probs[range(len(sampled_action)), sampled_action]

        reward = compute_reward(logits, labels)
        entropy = compute_entropy(logits)

        value = value_net(cls_embeds)
        advantage = reward - value.detach()

        policy_loss = - (log_prob * advantage).mean()
        value_loss = F.mse_loss(value, reward)
        loss = policy_loss + value_loss

        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        loss.backward()
        actor_optimizer.step()
        critic_optimizer.step()

        total_loss += loss.item()
        total_reward += reward.mean().item()
        total_entropy += entropy
        pred = torch.argmax(logits, dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)

    acc = correct / total
    train_logs["loss"].append(total_loss)
    train_logs["reward"].append(total_reward / len(train_loader))
    train_logs["accuracy"].append(acc)
    train_logs["entropy"].append(total_entropy / len(train_loader))

    print(f"[REINFORCE_Baseline][Epoch {epoch+1}] Loss: {total_loss:.4f} | Reward: {train_logs['reward'][-1]:.4f} | Acc: {acc:.4f}")

# Save REINFORCE_Baseline policy and value
torch.save(policy_net.state_dict(), os.path.join(save_model_path, "policy_net_rl_reinforce_baseline_" + Version + ".pt"))
torch.save(value_net.state_dict(), os.path.join(save_model_path, "value_net_rl_reinforce_baseline_" + Version + ".pt"))

with open(os.path.join(logs_path, "reinforce_baseline_" + Version + ".json"), "w") as f:
    json.dump(train_logs, f, indent=2)

print("Saved REINFORCE_Baseline policy model to:", os.path.join(save_model_path, "policy_net_rl_reinforce_baseline_" + Version + ".pt"))
print("Saved REINFORCE_Baseline value model to:", os.path.join(save_model_path, "value_net_rl_reinforce_baseline_" + Version + ".pt"))
print("Saved REINFORCE_Baseline logs to:", os.path.join(logs_path, "reinforce_baseline_" + Version + ".json"))

# compare final result
acc_after = train_logs["accuracy"][-1]
acc_change = acc_after - acc_before
acc_pct = (acc_change / acc_before) * 100 if acc_before > 0 else 0

print(f"[Comparison to Supervised]")
print(f"Accuracy Before: {acc_before:.4f} | After: {acc_after:.4f} | Δ: {acc_change:+.4f} ({acc_pct:+.2f}%)")

REINFORCE_Baseline Epoch 1: 100%|██████████| 5853/5853 [05:20<00:00, 18.24it/s]


[REINFORCE_Baseline][Epoch 1] Loss: 812.9989 | Reward: 0.2210 | Acc: 0.2782


REINFORCE_Baseline Epoch 2: 100%|██████████| 5853/5853 [05:20<00:00, 18.28it/s]


[REINFORCE_Baseline][Epoch 2] Loss: 284.4854 | Reward: -0.0250 | Acc: 0.0538


REINFORCE_Baseline Epoch 3: 100%|██████████| 5853/5853 [05:17<00:00, 18.43it/s]


[REINFORCE_Baseline][Epoch 3] Loss: 237.1368 | Reward: -0.0346 | Acc: 0.0446


REINFORCE_Baseline Epoch 4: 100%|██████████| 5853/5853 [05:15<00:00, 18.53it/s]


[REINFORCE_Baseline][Epoch 4] Loss: 335.6375 | Reward: -0.0116 | Acc: 0.0672


REINFORCE_Baseline Epoch 5: 100%|██████████| 5853/5853 [05:16<00:00, 18.49it/s]


[REINFORCE_Baseline][Epoch 5] Loss: 291.6787 | Reward: -0.0209 | Acc: 0.0582


REINFORCE_Baseline Epoch 6: 100%|██████████| 5853/5853 [05:16<00:00, 18.47it/s]


[REINFORCE_Baseline][Epoch 6] Loss: 235.9269 | Reward: -0.0336 | Acc: 0.0458


REINFORCE_Baseline Epoch 7: 100%|██████████| 5853/5853 [05:16<00:00, 18.50it/s]

[REINFORCE_Baseline][Epoch 7] Loss: 197.0830 | Reward: -0.0418 | Acc: 0.0377
Saved REINFORCE_Baseline policy model to: ../Model/V6\policy_net_rl_reinforce_baseline_V6.pt
Saved REINFORCE_Baseline value model to: ../Model/V6\value_net_rl_reinforce_baseline_V6.pt
Saved REINFORCE_Baseline logs to: ../Logs/V6\reinforce_baseline_V6.json
[Comparison to Supervised]
Accuracy Before: 0.7262 | After: 0.0377 | Δ: -0.6885 (-94.80%)





# 4. SCST Begin

In [15]:
# SCST
policy_net = PolicyNetwork().to(device)
policy_net.load_state_dict(torch.load(supervised_model_path))
policy_net.train()

value_net = ValueNetwork().to(device)
actor_optimizer = torch.optim.Adam(policy_net.parameters(), lr=2e-5)
critic_optimizer = torch.optim.Adam(value_net.parameters(), lr=2e-5)

  policy_net.load_state_dict(torch.load(supervised_model_path))


In [16]:
train_logs = {"loss": [], "reward": [], "accuracy": [], "entropy": []}
epochs = 7

for epoch in range(epochs):
    total_loss, total_reward, total_entropy, correct, total = 0, 0, 0, 0, 0

    for batch in tqdm(train_loader, desc=f"SCST Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)


        with torch.no_grad():
            outputs = bert(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeds = outputs.last_hidden_state[:, 0, :]

        logits = policy_net(cls_embeds)
        log_probs = torch.log_softmax(logits, dim=1)
        probs = torch.exp(log_probs)
        sampled_action = torch.multinomial(probs, num_samples=1).squeeze()
        log_prob = log_probs[range(len(sampled_action)), sampled_action]

        reward = compute_reward(logits, labels)
        entropy = compute_entropy(logits)

        value = value_net(cls_embeds)
        advantage = reward - value.detach()

        policy_loss = - (log_prob * advantage).mean()
        value_loss = F.mse_loss(value, reward)
        loss = policy_loss + value_loss

        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        loss.backward()
        actor_optimizer.step()
        critic_optimizer.step()

        total_loss += loss.item()
        total_reward += reward.mean().item()
        total_entropy += entropy
        pred = torch.argmax(logits, dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)

    acc = correct / total
    train_logs["loss"].append(total_loss)
    train_logs["reward"].append(total_reward / len(train_loader))
    train_logs["accuracy"].append(acc)
    train_logs["entropy"].append(total_entropy / len(train_loader))

    print(f"[SCST][Epoch {epoch+1}] Loss: {total_loss:.4f} | Reward: {train_logs['reward'][-1]:.4f} | Acc: {acc:.4f}")

# save SCST policy and value
torch.save(policy_net.state_dict(), os.path.join(save_model_path, "policy_net_rl_scst_" + Version + ".pt"))
torch.save(value_net.state_dict(), os.path.join(save_model_path, "value_net_rl_scst_" + Version + ".pt"))

with open(os.path.join(logs_path, "scst_" + Version + ".json"), "w") as f:
    json.dump(train_logs, f, indent=2)

print("Saved SCST policy model to:", os.path.join(save_model_path, "policy_net_rl_scst_" + Version + ".pt"))
print("Saved SCST value model to:", os.path.join(save_model_path, "value_net_rl_scst_" + Version + ".pt"))
print("Saved SCST logs to:", os.path.join(logs_path, "scst_" + Version + ".json"))

# compare final result
acc_after = train_logs["accuracy"][-1]
acc_change = acc_after - acc_before
acc_pct = (acc_change / acc_before) * 100 if acc_before > 0 else 0

print(f"[Comparison to Supervised]")
print(f"Accuracy Before: {acc_before:.4f} | After: {acc_after:.4f} | Δ: {acc_change:+.4f} ({acc_pct:+.2f}%)")

SCST Epoch 1: 100%|██████████| 5853/5853 [05:14<00:00, 18.64it/s]


[SCST][Epoch 1] Loss: 823.3102 | Reward: 0.2173 | Acc: 0.2744


SCST Epoch 2: 100%|██████████| 5853/5853 [05:13<00:00, 18.68it/s]


[SCST][Epoch 2] Loss: 313.8985 | Reward: -0.0196 | Acc: 0.0590


SCST Epoch 3: 100%|██████████| 5853/5853 [05:13<00:00, 18.65it/s]


[SCST][Epoch 3] Loss: 260.5779 | Reward: -0.0303 | Acc: 0.0488


SCST Epoch 4: 100%|██████████| 5853/5853 [05:14<00:00, 18.63it/s]


[SCST][Epoch 4] Loss: 253.3416 | Reward: -0.0325 | Acc: 0.0467


SCST Epoch 5: 100%|██████████| 5853/5853 [05:13<00:00, 18.67it/s]


[SCST][Epoch 5] Loss: 273.9247 | Reward: -0.0258 | Acc: 0.0534


SCST Epoch 6: 100%|██████████| 5853/5853 [05:13<00:00, 18.66it/s]


[SCST][Epoch 6] Loss: 198.6841 | Reward: -0.0427 | Acc: 0.0369


SCST Epoch 7: 100%|██████████| 5853/5853 [05:16<00:00, 18.52it/s]

[SCST][Epoch 7] Loss: 260.8303 | Reward: -0.0282 | Acc: 0.0511
Saved SCST policy model to: ../Model/V6\policy_net_rl_scst_V6.pt
Saved SCST value model to: ../Model/V6\value_net_rl_scst_V6.pt
Saved SCST logs to: ../Logs/V6\scst_V6.json
[Comparison to Supervised]
Accuracy Before: 0.7262 | After: 0.0511 | Δ: -0.6751 (-92.96%)





# 5. PPO Begin

In [17]:
# PPO
policy_net = PolicyNetwork().to(device)
policy_net.load_state_dict(torch.load(supervised_model_path))
policy_net.train()

value_net = ValueNetwork().to(device)
actor_optimizer = torch.optim.Adam(policy_net.parameters(), lr=2e-5)
critic_optimizer = torch.optim.Adam(value_net.parameters(), lr=2e-5)

  policy_net.load_state_dict(torch.load(supervised_model_path))


In [None]:
train_logs = {"loss": [], "reward": [], "accuracy": [], "entropy": []}
epochs = 7

for epoch in range(epochs):
    total_loss, total_reward, total_entropy, correct, total = 0, 0, 0, 0, 0

    for batch in tqdm(train_loader, desc=f"PPO Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)


        with torch.no_grad():
            outputs = bert(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeds = outputs.last_hidden_state[:, 0, :]

        logits = policy_net(cls_embeds)
        log_probs = torch.log_softmax(logits, dim=1)
        probs = torch.exp(log_probs)
        sampled_action = torch.multinomial(probs, num_samples=1).squeeze()
        log_prob = log_probs[range(len(sampled_action)), sampled_action]

        reward = compute_reward(logits, labels)
        entropy = compute_entropy(logits)

        value = value_net(cls_embeds)
        advantage = reward - value.detach()

        old_log_prob = log_prob.detach()
        new_logits = policy_net(cls_embeds)
        new_log_probs = torch.log_softmax(new_logits, dim=1)
        new_log_prob = new_log_probs[range(len(sampled_action)), sampled_action]

        ratio = torch.exp(new_log_prob - old_log_prob)
        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 0.8, 1.2) * advantage

        policy_loss = -torch.min(surr1, surr2).mean()
        value_loss = F.mse_loss(value, reward)
        loss = policy_loss + value_loss

        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        loss.backward()
        actor_optimizer.step()
        critic_optimizer.step()

        total_loss += loss.item()
        total_reward += reward.mean().item()
        total_entropy += entropy
        pred = torch.argmax(logits, dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)

    acc = correct / total
    train_logs["loss"].append(total_loss)
    train_logs["reward"].append(total_reward / len(train_loader))
    train_logs["accuracy"].append(acc)
    train_logs["entropy"].append(total_entropy / len(train_loader))

    print(f"[PPO][Epoch {epoch+1}] Loss: {total_loss:.4f} | Reward: {train_logs['reward'][-1]:.4f} | Acc: {acc:.4f}")

# Save PPO policy and value
torch.save(policy_net.state_dict(), os.path.join(save_model_path, "policy_net_rl_ppo_" + Version + ".pt"))
torch.save(value_net.state_dict(), os.path.join(save_model_path, "value_net_rl_ppo_" + Version + ".pt"))

with open(os.path.join(logs_path, "ppo_" + Version + ".json"), "w") as f:
    json.dump(train_logs, f, indent=2)

print("Saved PPO policy model to:", os.path.join(save_model_path, "policy_net_rl_ppo_" + Version + ".pt"))
print("Saved PPO value model to:", os.path.join(save_model_path, "value_net_rl_ppo_" + Version + ".pt"))
print("Saved PPO logs to:", os.path.join(logs_path, "ppo_" + Version + ".json"))

# compare final result
acc_after = train_logs["accuracy"][-1]
acc_change = acc_after - acc_before
acc_pct = (acc_change / acc_before) * 100 if acc_before > 0 else 0

print(f"[Comparison to Supervised]")
print(f"Accuracy Before: {acc_before:.4f} | After: {acc_after:.4f} | Δ: {acc_change:+.4f} ({acc_pct:+.2f}%)")

PPO Epoch 1: 100%|██████████| 5853/5853 [05:22<00:00, 18.15it/s]


[PPO][Epoch 1] Loss: 1217.9293 | Reward: 0.7587 | Acc: 0.7137


PPO Epoch 2: 100%|██████████| 5853/5853 [05:21<00:00, 18.19it/s]


[PPO][Epoch 2] Loss: 1255.6847 | Reward: 0.7539 | Acc: 0.6953


PPO Epoch 3: 100%|██████████| 5853/5853 [05:22<00:00, 18.15it/s]


[PPO][Epoch 3] Loss: 1253.6914 | Reward: 0.7490 | Acc: 0.6874


PPO Epoch 4: 100%|██████████| 5853/5853 [05:21<00:00, 18.20it/s]


[PPO][Epoch 4] Loss: 1251.6453 | Reward: 0.7211 | Acc: 0.6606


PPO Epoch 5: 100%|██████████| 5853/5853 [05:21<00:00, 18.21it/s]


[PPO][Epoch 5] Loss: 1226.3196 | Reward: 0.6918 | Acc: 0.6328


PPO Epoch 6: 100%|██████████| 5853/5853 [05:21<00:00, 18.21it/s]


[PPO][Epoch 6] Loss: 1212.2298 | Reward: 0.6752 | Acc: 0.6170


PPO Epoch 7: 100%|██████████| 5853/5853 [05:21<00:00, 18.23it/s]

[PPO][Epoch 7] Loss: 1150.2770 | Reward: 0.6370 | Acc: 0.5814
Saved PPO policy model to: ../Model/V6\policy_net_rl_ppo_V6.pt
Saved PPO value model to: ../Model/V6\value_net_rl_ppo_V6.pt
Saved PPO logs to: ../Logs/V6\ppo_V6.json
[Comparison to Supervised]
Accuracy Before: 0.7262 | After: 0.5814 | Δ: -0.1447 (-19.93%)



