In [None]:
!pip install trl==0.11.3 -U datasets


Collecting trl==0.11.3
  Downloading trl-0.11.3-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl==0.11.3)
  Downloading tyro-0.9.28-py3-none-any.whl.metadata (11 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl==0.11.3)
  Downloading shtab-1.7.2-py3-none-any.whl.metadata (7.4 kB)
Downloading trl-0.11.3-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-0.9.28-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.2/129.2 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.2-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, trl
Successfully installed shtab-1.7.2 trl-0.11.3 tyro-0.9.28


In [None]:
import torch
import json
import os
import numpy as np
import random
import re
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForSequenceClassification
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from datasets import Dataset as HFDataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import wandb
import time
from datetime import datetime

In [None]:
# Authenticate this kernel with Weights & Biases before any run is started.
# As the captured output below shows, this prompts for an API key when no
# stored credentials are found.
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mericzhangez1006[0m ([33mericzhangez1006-ucl[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import torch
import torch.nn as nn
import numpy as np
import json
import os
import re
import time
import random
import wandb
from datetime import datetime
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from datasets import Dataset as HFDataset
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from sentence_transformers import SentenceTransformer, util

# ====================== Custom PPO Trainer ======================
class CustomPPOTrainer(PPOTrainer):
    """PPOTrainer variant whose dataloader yields the dataset in its stored order.

    Keeping the order fixed lets callers line batches up with metadata that is
    held outside the dataloader by using plain positional offsets.
    """

    def prepare_dataloader(self, dataset, data_collator):
        """Build an order-preserving ``DataLoader`` over ``dataset``.

        Args:
            dataset: Dataset to iterate over.
            data_collator: Callable used as the loader's ``collate_fn``.

        Returns:
            A ``DataLoader`` with batch size ``self.config.batch_size`` that
            never shuffles and keeps the final (possibly smaller) batch.
        """
        loader_options = dict(
            batch_size=self.config.batch_size,
            shuffle=False,      # order must stay aligned with external metadata
            collate_fn=data_collator,
            drop_last=False,    # the trailing partial batch is still yielded
            pin_memory=True,
            num_workers=0,      # load in the main process
        )
        return DataLoader(dataset, **loader_options)

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """Cut trailing annotation sentences off an SFT utterance.

    The text is truncated at the first occurrence (case-insensitive) of a
    sentence starting with "The speaker" or "The emotion state" that contains
    at least one period; everything from the whitespace preceding that
    sentence onward is dropped.

    Args:
        text: Raw utterance string.

    Returns:
        The utterance without the annotation tail, stripped of surrounding
        whitespace. Text with no such sentence is returned stripped.
    """
    annotation = re.search(
        r'\s*(The (?:speaker|emotion state)[^.]*\.(?:[^.]*\.)*)',
        text,
        re.IGNORECASE | re.DOTALL,
    )
    kept = text if annotation is None else text[:annotation.start()]
    return kept.strip()

# ====================== Emotion Extraction ======================
def extract_emotion_and_text_v4(full_response):
    """Split a generated reply into its text and its emotion label.

    Expected layout: ``therapist_text <therapist_emotion> emotion_word <eos>``.
    When the tag occurs several times, the LAST occurrence is the separator.

    Args:
        full_response: Decoded model output, special tokens included.

    Returns:
        ``(therapist_text, emotion_word, has_emotion_tag)``. ``emotion_word``
        is the first lower-cased word after the tag (``""`` if nothing
        follows it). The word is NOT checked against the known emotion set
        here, so the flag is ``True`` whenever the tag is present. Without a
        tag the result is ``(stripped_response, "", False)``.
    """
    response = full_response.strip()

    # Drop one trailing end-of-sequence marker (5 characters) if present.
    if response.endswith('<eos>'):
        response = response[:-5].strip()

    before_tag, tag, after_tag = response.rpartition('<therapist_emotion>')
    if not tag:
        return response, "", False

    # Discard anything from an embedded <eos> onward, then keep the first word.
    label = re.sub(r'<eos>.*$', '', after_tag.strip()).strip().lower()
    label_words = label.split()
    if label_words:
        label = label_words[0]

    return before_tag.strip(), label, True

# ====================== Advanced Reward Calculator ======================
class TherapyRewardCalculator:
    """Scores a text with two pretrained sequence classifiers (sentiment, empathy).

    Each scorer returns the softmax probability of class index 1 for a single
    text, or ``FALLBACK_SCORE`` when scoring fails.
    """

    # Neutral value returned when a classifier cannot score a text.
    FALLBACK_SCORE = 0.5

    def __init__(self, device):
        """Load both classifiers and move them to ``device``.

        Args:
            device: Torch device (or device string) used for the models and
                for the tokenized inputs.
        """
        self.device = device

        # 1. Sentiment classifier (DistilBERT fine-tuned on SST-2)
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
        self.sentiment_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english").to(device)

        # 2. Empathy classifier (bert_empathy)
        self.empathy_tokenizer = AutoTokenizer.from_pretrained("paragon-analytics/bert_empathy")
        self.empathy_model = AutoModelForSequenceClassification.from_pretrained("paragon-analytics/bert_empathy").to(device)

    def _class_one_probability(self, tokenizer, model, text, label):
        """Return the softmax probability of class index 1 for ``text``.

        Ordinary failures are reported and mapped to ``FALLBACK_SCORE`` so a
        single bad sample cannot abort a training run. ``KeyboardInterrupt``
        and ``SystemExit`` are deliberately not caught, so the run can still
        be interrupted from the notebook.

        Args:
            tokenizer: Tokenizer callable matching ``model``.
            model: Sequence classifier whose output exposes ``.logits``.
            text: Text to score (truncated to 512 tokens).
            label: Short name used in the failure message.
        """
        try:
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            ).to(self.device)

            with torch.no_grad():
                logits = model(**inputs).logits

            return torch.softmax(logits, dim=-1)[0][1].item()
        except Exception as exc:
            print(f"Error in {label} scoring: {exc}")
            return self.FALLBACK_SCORE

    def compute_empathy_reward(self, text):
        """Empathy score: class-1 probability from the bert_empathy classifier.

        NOTE(review): assumes class index 1 is the "empathetic" label --
        confirm against the model's ``id2label`` mapping.
        """
        return self._class_one_probability(
            self.empathy_tokenizer, self.empathy_model, text, "empathy"
        )

    def compute_sentiment_reward(self, text):
        """Sentiment score: class-1 probability from the SST-2 DistilBERT classifier.

        NOTE(review): assumes class index 1 is the positive label -- confirm
        against the model's ``id2label`` mapping.
        """
        return self._class_one_probability(
            self.sentiment_tokenizer, self.sentiment_model, text, "sentiment"
        )

# ====================== Dataset Class ======================
class TherapyDatasetV4(Dataset):
    """Dataset of (query, therapist reply) pairs built from dialog JSON.

    Every therapist ("sys") turn that follows at least one user turn yields
    one example. The query concatenates the optional problem type, the user
    text accumulated since the previous therapist turn, the most recent
    non-empty user emotion and a closing ``<therapist>`` tag.
    """

    def __init__(self, json_path, tokenizer_path=None):
        """Load the conversations and tokenizer, then preprocess everything.

        Args:
            json_path: Path of the JSON file holding a list of conversations.
            tokenizer_path: Directory of a saved GPT-2 tokenizer.

        Raises:
            ValueError: If ``tokenizer_path`` is empty or does not exist.
        """
        with open(json_path, 'r', encoding='utf-8') as handle:
            self._data = json.load(handle)

        tokenizer_available = bool(tokenizer_path) and os.path.exists(tokenizer_path)
        if not tokenizer_available:
            raise ValueError(f"Tokenizer not found at {tokenizer_path}")
        self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
        print(f"Loaded tokenizer from {tokenizer_path}")

        # Cache the ids of the standard and dialog-specific special tokens.
        tok = self.tokenizer
        self.bos_id = tok.bos_token_id
        self.eos_id = tok.eos_token_id
        self.pad_id = tok.pad_token_id
        self.problem_id = tok.convert_tokens_to_ids("<problem>")
        self.user_id = tok.convert_tokens_to_ids("<user>")
        self.user_emotion_id = tok.convert_tokens_to_ids("<user_emotion>")
        self.therapist_id = tok.convert_tokens_to_ids("<therapist>")
        self.therapist_emotion_id = tok.convert_tokens_to_ids("<therapist_emotion>")

        self.max_length = 128  # queries are padded/truncated to this many tokens
        self.processed_data = []
        self._preprocess_data_v4()

    def _build_example(self, problem_type, user_texts, user_emotions, reply_text, reply_emotion):
        """Assemble and tokenize one example for a therapist turn."""
        combined_user_text = " ".join(user_texts)
        last_user_emotion = user_emotions[-1] if user_emotions else ""

        segments = [f"<problem>{problem_type}"] if problem_type else []
        segments.append(f"<user>{combined_user_text}")
        if last_user_emotion:
            segments.append(f"<user_emotion>{last_user_emotion}")
        segments.append("<therapist>")
        input_text = " ".join(segments)

        encoded = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'query': input_text,
            'target_text': reply_text,
            'target_emotion': reply_emotion,
            'user_input': combined_user_text,
            'user_emotion': last_user_emotion
        }

    def _preprocess_data_v4(self):
        """Walk every dialog and append one example per answered therapist turn."""
        for conversation in tqdm(self._data, desc="Processing data"):
            problem_type = conversation.get("problem_type", "").strip()
            pending_texts, pending_emotions = [], []

            for turn in conversation.get("dialog", []):
                turn_text = clean_therapy_text(turn.get("text", ""))
                turn_emotion = turn.get("emotion", "").strip()

                # Any speaker other than "sys" is treated as the user.
                if turn.get("speaker", "") != "sys":
                    if turn_text:
                        pending_texts.append(turn_text)
                    if turn_emotion:
                        pending_emotions.append(turn_emotion)
                    continue

                # A therapist turn with no preceding user text has no query.
                if not pending_texts:
                    continue

                # The text was cleaned above already; the cleaner is applied
                # again here so the stored target is produced exactly as before.
                reply_text = clean_therapy_text(turn_text)
                self.processed_data.append(
                    self._build_example(
                        problem_type, pending_texts, pending_emotions, reply_text, turn_emotion
                    )
                )

                # Start accumulating user context for the next therapist turn.
                pending_texts, pending_emotions = [], []

        print(f"Processed {len(self.processed_data)} therapist responses")

    def __len__(self):
        """Number of preprocessed examples."""
        return len(self.processed_data)

    def __getitem__(self, idx):
        """Return the preprocessed example dict stored at position ``idx``."""
        return self.processed_data[idx]

    def to_hf_dataset(self):
        """Export every example as a column-oriented HuggingFace ``Dataset``.

        Tensor fields are converted to plain Python lists; all other fields
        are copied unchanged.
        """
        tensor_columns = ('input_ids', 'attention_mask')
        plain_columns = ('query', 'target_text', 'target_emotion', 'user_input', 'user_emotion')

        columns = {}
        for name in tensor_columns:
            columns[name] = [record[name].tolist() for record in self.processed_data]
        for name in plain_columns:
            columns[name] = [record[name] for record in self.processed_data]
        return HFDataset.from_dict(columns)

# ====================== Generation Function ======================
def generate_therapy_response(model, tokenizer, query_tensor, device, max_new_tokens=128):
    """Sample one continuation for a single query and parse it.

    Generation samples (``do_sample=True``) with ``top_p=1.0`` and
    ``top_k=0.0``.
    NOTE(review): those values presumably disable nucleus/top-k filtering so
    sampling covers the full distribution -- confirm for the installed
    transformers version.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer providing pad/eos ids and ``decode``.
        query_tensor: 1-D tensor of query token ids (a batch dimension is
            added before generation).
        device: Device to generate on and to place the returned tokens on.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        ``(response_tokens, therapist_text, therapist_emotion, has_emotion)``
        where ``response_tokens`` holds only the newly generated ids (on
        ``device``) and the rest comes from ``extract_emotion_and_text_v4``.
    """
    prompt_batch = query_tensor.unsqueeze(0).to(device)

    with torch.no_grad():
        sequences = model.generate(
            prompt_batch,
            do_sample=True,
            top_p=1.0,
            top_k=0.0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
        )

    # Everything past the prompt length is newly generated.
    prompt_length = len(query_tensor)
    new_tokens = sequences[0].cpu()[prompt_length:]

    # Special tokens are kept in the decoded text because the emotion tag is
    # located by its literal marker.
    decoded = tokenizer.decode(
        new_tokens.tolist(),
        skip_special_tokens=False,
        clean_up_tokenization_spaces=True
    )
    therapist_text, therapist_emotion, has_emotion = extract_emotion_and_text_v4(decoded.strip())

    return new_tokens.to(device), therapist_text, therapist_emotion, has_emotion

# ====================== ENHANCED Reward Functions ======================
def detect_local_ngram_repetition(text, n=2, window_size=10, max_repeats=1, weight=3.0):
    """Penalize n-grams that repeat inside a sliding window of words.

    A window of ``window_size`` whitespace-separated words slides over the
    text one word at a time. Inside each window, every n-gram occurring more
    than ``max_repeats`` times adds ``(count - max_repeats) ** 1.5 * weight``.
    Windows overlap, so a repetition seen by several windows is counted once
    per window.

    Texts shorter than ``window_size`` are scanned as a single window
    covering the whole text. Previously such texts produced no window at all,
    so short degenerate outputs (e.g. "you you you") were never penalized.

    Args:
        text: Text to inspect.
        n: N-gram size in words.
        window_size: Number of words per window.
        max_repeats: Occurrences of one n-gram tolerated per window.
        weight: Multiplier applied to each repetition penalty.

    Returns:
        Non-negative float penalty; ``0.0`` when the text has fewer than
        ``n`` words or nothing repeats.
    """
    words = text.split()
    if len(words) < n:
        return 0.0

    # Clamp the window so short texts still get exactly one full-text window.
    span = min(window_size, len(words))

    penalty = 0.0
    for start in range(len(words) - span + 1):
        window = words[start:start + span]

        ngram_counts = {}
        for offset in range(len(window) - n + 1):
            ngram = tuple(window[offset:offset + n])
            ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1

        for count in ngram_counts.values():
            if count > max_repeats:
                penalty += (count - max_repeats) ** 1.5 * weight

    return penalty

def compute_text_quality_score_enhanced(response):
    """Score the surface quality of a generated response.

    Empty or "meaningless" responses score ``-10.0`` outright. Every other
    response starts from ``2.0`` and loses points for locally repeated
    bigrams/trigrams and for words (longer than 3 characters) used more than
    4 times. The result is clamped to ``[-10.0, 8.0]``.

    A response counts as meaningless when it:
      * contains no ASCII letters or consists only of non-word characters,
      * is a single character repeated 5 or more times,
      * contains a run of 3 or more non-ASCII characters,
      * consists solely of a one-word reply (yes / no / ok / thank / thanks),
        optionally surrounded by punctuation,
      * contains 100 or more characters on a single line.

    The one-word-reply rule is anchored to the whole response. It used to
    match those words anywhere, which gave ``-10.0`` to any reply that merely
    contained "no" or "thank".

    Args:
        response: Generated therapist text.

    Returns:
        Float quality score in ``[-10.0, 8.0]``.
    """
    if not response or not response.strip():
        return -10.0  # Stricter penalty for empty responses

    response_clean = response.strip()

    meaningless_patterns = [
        r'^[^a-zA-Z]*$',                    # No letters
        r'^\W+$',                           # All symbols
        r'^(.)\1{4,}$',                     # One character repeated 5+ times
        r'[^\x00-\x7F]{3,}',                # Run of 3+ non-ASCII characters
        r'^\W*(?:yes|no|ok|thanks?)\W*$',   # Response is only a one-word reply
        r'.{100,}'                          # 100+ characters on one line
    ]

    # Checked before the repetition scan: a hit makes all other work moot.
    if any(re.search(pattern, response_clean, re.IGNORECASE) for pattern in meaningless_patterns):
        return -10.0  # Stricter penalty for meaningless responses

    base_score = 2.0  # Baseline score

    bigram_penalty = detect_local_ngram_repetition(response_clean, n=2, window_size=6, max_repeats=1, weight=4.0)
    trigram_penalty = detect_local_ngram_repetition(response_clean, n=3, window_size=5, max_repeats=1, weight=6.0)

    # Count words by their lower-cased, punctuation-free form; words of 3 or
    # fewer characters are ignored.
    word_counts = {}
    for word in response_clean.split():
        clean_word = re.sub(r'[^\w]', '', word.lower())
        if len(clean_word) > 3:
            word_counts[clean_word] = word_counts.get(clean_word, 0) + 1

    # Power-law penalty capped at 8.0 per word. With the ">4" threshold the
    # cap is always reached, so each over-used word costs exactly 8.0.
    word_penalty = 0.0
    for count in word_counts.values():
        if count > 4:
            word_penalty -= min((count - 1) ** 1.7 * 1.5, 8.0)

    quality_score = base_score - bigram_penalty - trigram_penalty + word_penalty
    return max(min(quality_score, 8.0), -10.0)  # Extended negative range

def compute_emotion_score_enhanced(has_emotion_token, generated_emotion, target_emotion):
    """Score the generated emotion label against the target label.

    Outcomes, checked in this order:
      * no emotion tag in the response            -> -7.0
      * generated label not a known emotion       -> -6.0
      * no target label available                 ->  1.0
      * exact match, non-neutral / neutral        ->  7.0 / 2.0
      * same group, different label               ->  4.0 (only the negative
        group has several members)
      * positive vs. negative conflict            -> -5.0
      * any other mismatch                        ->  0.0

    Args:
        has_emotion_token: Whether the response contained an emotion tag.
        generated_emotion: Emotion word produced by the model.
        target_emotion: Reference emotion word; may be empty or ``None``.

    Returns:
        Float score as listed above.
    """
    if not has_emotion_token:
        return -7.0  # Enhanced penalty for missing emotion tokens

    # Every known emotion mapped to its polarity group.
    group_of = {
        "sadness": "negative",
        "depression": "negative",
        "anger": "negative",
        "disgust": "negative",
        "fear": "negative",
        "joy": "positive",
        "neutral": "neutral",
    }

    produced = generated_emotion.lower()
    if produced not in group_of:
        return -6.0  # Stricter penalty for invalid emotions

    base = 1.0  # Credit for emitting any valid emotion
    if not target_emotion:
        return base

    expected = target_emotion.lower()
    produced_group = group_of[produced]
    expected_group = group_of.get(expected)  # None for an unknown target label

    neutral_involved = produced_group == "neutral"

    if produced == expected:
        return base + (1.0 if neutral_involved else 6.0)

    if produced_group == expected_group:
        return base + (1.0 if neutral_involved else 3.0)

    # Opposite polarities are penalized harder than a plain mismatch.
    if {produced_group, expected_group} == {"positive", "negative"}:
        return -5.0

    return base - 1.0  # Base penalty for mismatches

def compute_contextual_relevance_enhanced(generated_text, user_input, similarity_model, device):
    """Map the semantic similarity of response and user input to a reward.

    Both texts are lower-cased and stripped of punctuation, embedded with
    ``similarity_model`` and compared by cosine similarity. The similarity is
    translated to a reward between ``-10.0`` and ``8.0`` by a fixed step
    ladder.

    Args:
        generated_text: Model response.
        user_input: User text the response should relate to.
        similarity_model: Encoder exposing
            ``encode(texts, convert_to_tensor=True)``.
        device: Device the embeddings are moved to before comparison.

    Returns:
        ``-8.0`` if either text is empty, ``-5.0`` if either text is empty
        after cleaning, ``-2.0`` if scoring raises, otherwise the ladder
        reward.
    """
    if not generated_text or not user_input:
        return -8.0  # Enhanced penalty for missing inputs

    # (exclusive upper bound on similarity, reward), in ascending order.
    reward_ladder = (
        (0.03, -10.0),
        (0.05, -8.0),
        (0.08, -6.0),
        (0.12, -3.0),
        (0.20, 0.0),
        (0.30, 2.0),
        (0.50, 4.0),
        (0.70, 6.0),
    )

    try:
        # Lower-case and replace punctuation with spaces before embedding.
        normalized = [
            re.sub(r'[^\w\s]', ' ', raw.lower()).strip()
            for raw in (generated_text, user_input)
        ]
        if not all(normalized):
            return -5.0

        vectors = similarity_model.encode(normalized, convert_to_tensor=True).to(device)
        similarity = float(util.pytorch_cos_sim(vectors[0], vectors[1]).item())

        for upper_bound, reward in reward_ladder:
            if similarity < upper_bound:
                return reward
        return 8.0  # similarity of 0.70 or above

    except Exception as e:
        print(f"Error in relevance calculation: {e}")
        return -2.0

def compute_comprehensive_rewards_enhanced(
    text_responses,
    emotion_responses,
    has_emotion_flags,
    target_texts,
    target_emotions,
    user_inputs,
    user_emotions,
    similarity_model,
    reward_calculator,
    device
):
    """Combine all reward components into one scalar reward per response.

    Per sample the reward is::

        1.1 * quality + 1.2 * emotion + 1.1 * relevance
        + 0.7 * sentiment + 0.7 * empathy + penalty

    clamped to ``[-10.0, 10.0]``. Sentiment and empathy are the classifier
    scores rescaled as ``score * 1.1 - 0.5`` (``[-0.5, 0.6]`` for scores in
    ``[0, 1]``); both are ``0.0`` when no reward calculator is given or the
    text is blank. ``penalty`` is ``-1.5`` when the user's emotion is
    negative but the response emotion is "joy".

    Args:
        text_responses: Generated therapist texts.
        emotion_responses: Generated emotion words.
        has_emotion_flags: Whether each response carried an emotion tag.
        target_texts: Reference texts (only used to bound the iteration).
        target_emotions: Reference emotion words.
        user_inputs: User texts, indexed like the responses.
        user_emotions: User emotion words, indexed like the responses.
        similarity_model: Encoder used for relevance scoring.
        reward_calculator: Object with ``compute_sentiment_reward`` and
            ``compute_empathy_reward``, or ``None``.
        device: Device forwarded to relevance scoring.

    Returns:
        List of scalar ``torch.float32`` tensors, one per response.
    """
    negative_user_emotions = ["sadness", "depression", "anger", "disgust", "fear"]
    rewards = []

    samples = zip(text_responses, emotion_responses, has_emotion_flags, target_texts, target_emotions)
    for i, (text, emotion, has_emotion, target_text, target_emotion) in enumerate(samples):
        quality_score = compute_text_quality_score_enhanced(text)
        emotion_score = compute_emotion_score_enhanced(has_emotion, emotion, target_emotion)

        # Tolerate a user_inputs list shorter than the responses.
        user_input = user_inputs[i] if i < len(user_inputs) else ""
        relevance_score = compute_contextual_relevance_enhanced(text, user_input, similarity_model, device)

        # Classifier-based terms default to zero and are only filled in when
        # a calculator exists and there is text to score.
        sentiment_score = 0.0
        empathy_score = 0.0
        if reward_calculator and text.strip():
            sentiment_score = reward_calculator.compute_sentiment_reward(text) * 1.1 - 0.5
            empathy_score = reward_calculator.compute_empathy_reward(text) * 1.1 - 0.5

        # A joyful reply to a user in a negative emotional state is penalized.
        penalty = 0.0
        if user_emotions and i < len(user_emotions):
            user_emo = user_emotions[i].lower()
            if user_emo in negative_user_emotions and emotion.lower() in ["joy"]:
                penalty -= 1.5

        total_reward = (
            quality_score * 1.1 +
            emotion_score * 1.2 +
            relevance_score * 1.1 +
            sentiment_score * 0.7 +
            empathy_score * 0.7 +
            penalty
        )

        clipped = max(min(total_reward, 10.0), -10.0)
        rewards.append(torch.tensor(clipped, dtype=torch.float32))

    return rewards

# ====================== Legacy Function Names (for compatibility) ======================
def compute_text_quality_score_fixed(response):
    """Old name kept for existing callers; delegates to compute_text_quality_score_enhanced."""
    quality = compute_text_quality_score_enhanced(response)
    return quality

def compute_emotion_score(has_emotion_token, generated_emotion, target_emotion):
    """Old name kept for existing callers; delegates to compute_emotion_score_enhanced."""
    emotion_result = compute_emotion_score_enhanced(
        has_emotion_token,
        generated_emotion,
        target_emotion,
    )
    return emotion_result

def compute_contextual_relevance_score_progressive(generated_text, user_input, similarity_model, device):
    """Old name kept for existing callers; delegates to compute_contextual_relevance_enhanced."""
    relevance = compute_contextual_relevance_enhanced(
        generated_text,
        user_input,
        similarity_model,
        device,
    )
    return relevance

# ====================== Data Collator ======================
def ppo_collator(batch):
    """Collate dataset rows into stacked ``input_ids`` / ``attention_mask`` tensors.

    Args:
        batch: Sequence of row dicts, each with equal-length ``input_ids``
            and ``attention_mask`` sequences. Other keys are ignored.

    Returns:
        Dict with the two fields stacked along a new leading batch dimension
        as ``torch.long`` tensors.
    """
    def _stack(field):
        # One long tensor per row, stacked into a (rows, length) tensor.
        rows = []
        for example in batch:
            rows.append(torch.tensor(example[field], dtype=torch.long))
        return torch.stack(rows)

    collated = {}
    for field in ('input_ids', 'attention_mask'):
        collated[field] = _stack(field)
    return collated

# ====================== Weights & Biases Logging ======================
def log_sample_conversations(text_responses, user_inputs, batch_idx, epoch, max_samples=3):
    """Log the first few (user input, therapist response) pairs of a batch to W&B.

    Args:
        text_responses: Generated therapist texts for the batch.
        user_inputs: User texts, indexed like ``text_responses``.
        batch_idx: Batch index recorded in every row.
        epoch: Epoch recorded in every row.
        max_samples: Maximum number of rows to log.

    The rows are logged as a ``wandb.Table`` under the key
    ``"sample_conversations"``.
    """
    column_names = ["epoch", "batch", "sample_id", "user_input", "therapist_response"]

    sample_count = min(len(text_responses), max_samples)
    rows = [
        [epoch, batch_idx, sample_id, user_inputs[sample_id], text_responses[sample_id]]
        for sample_id in range(sample_count)
    ]

    table = wandb.Table(columns=column_names, data=rows)
    wandb.log({"sample_conversations": table})

def log_score_distributions(scores_dict, step):
    """Log a histogram, mean and standard deviation for each score series to W&B.

    Args:
        scores_dict: Mapping of series name to a sequence of numeric scores.
            Empty or missing (``None``) series are skipped.
        step: Step value passed to every ``wandb.log`` call.

    Each non-empty series produces one ``wandb.log`` call with the keys
    ``distributions/<name>``, ``distributions/<name>_mean`` and
    ``distributions/<name>_std``.
    """
    for series_name, values in scores_dict.items():
        if not values or len(values) == 0:
            continue

        prefix = f"distributions/{series_name}"
        payload = {
            prefix: wandb.Histogram(values),
            f"{prefix}_mean": np.mean(values),
            f"{prefix}_std": np.std(values),
        }
        wandb.log(payload, step=step)

# ====================== Main Training Function ======================
def run_ppo_training_with_wandb(
    train_data_path,
    val_data_path,
    sft_model_path,
    tokenizer_path,
    num_epochs=3,
    batch_size=8,
    save_dir="ppo_model_with_wandb",
    project_name="therapy-ppo-training",
    run_name=None
):
    """Run PPO training with comprehensive W&B logging"""

    print("Starting PPO training with Weights & Biases integration...")

    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)

    os.makedirs(save_dir, exist_ok=True)

    if run_name is None:
        run_name = f"8m18therapy_ppo_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    wandb.init(
        project=project_name,
        name=run_name,
        config={
            "architecture": "GPT2",
            "dataset": "therapy_conversations",
            "num_epochs": num_epochs,
            "batch_size": batch_size,
            "max_length": 128,
            "max_new_tokens": 128,
            "relevance_function": "enhanced_semantic_scoring"
        }
    )

    print(f"W&B run: {wandb.run.name}")
    print(f"W&B url: {wandb.run.url}")

    global_step = 0

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    print(f"Tokenizer vocabulary size: {len(tokenizer)}")

    # Load base model
    print("Loading base model...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.resize_token_embeddings(len(tokenizer))

    # Load SFT checkpoint
    print(f"Loading SFT checkpoint from {sft_model_path}...")
    try:
        checkpoint = torch.load(sft_model_path, map_location='cpu', weights_only=False)

        if 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        elif 'state_dict' in checkpoint:
            state_dict = checkpoint['state_dict']
        else:
            state_dict = checkpoint

        cleaned_state_dict = {}
        for key, value in state_dict.items():
            if key.startswith('model.'):
                cleaned_state_dict[key[6:]] = value
            else:
                cleaned_state_dict[key] = value

        model.load_state_dict(cleaned_state_dict, strict=False)
        model.to(device)
        print("SFT checkpoint loaded successfully")

    except Exception as e:
        print(f"Error loading SFT checkpoint: {e}")
        print("Continuing with base model...")

    # Initialize similarity model (REQUIRED)
    print("Loading similarity model...")
    try:
        similarity_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)
        print("Similarity model loaded successfully")
    except Exception as e:
        print(f"CRITICAL ERROR: Failed to load similarity model: {e}")
        print("Similarity model is required for relevance scoring. Training cannot continue.")
        raise RuntimeError("Similarity model is required but failed to load")

    # Initialize enhanced reward calculator with sentiment and empathy models
    print("Loading sentiment and empathy models...")
    try:
        reward_calculator = TherapyRewardCalculator(device)
        print("Sentiment and empathy models loaded successfully")
    except Exception as e:
        print(f"Error loading sentiment/empathy models: {e}")
        reward_calculator = None

    # Create PPO configuration
    print("Creating PPO configuration...")

    ppo_config = PPOConfig(
        model_name='gpt2',
        whiten_rewards=True,
        learning_rate=1e-6,
        batch_size=batch_size,
        mini_batch_size=batch_size,
        gradient_accumulation_steps=1,
        seed=42,
    )

    # Log PPO hyperparameters to W&B
    wandb.config.update({
        "ppo_learning_rate": ppo_config.learning_rate,
        "ppo_batch_size": ppo_config.batch_size,
        "ppo_whiten_rewards": ppo_config.whiten_rewards,
        "generation_top_p": 1.0,
        "generation_top_k": 0.0,
        "enhanced_rewards": "comprehensive_enhanced_scoring",
        "quality_function": "enhanced_repetition_detection",
        "relevance_function": "enhanced_semantic_similarity",
        "data_alignment": "custom_trainer_no_shuffle"
    })

    # Create PPO models
    print("Creating PPO models...")
    ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
        model,
        torch_dtype=torch.float32
    )
    ppo_model.to(device)

    ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
        model,
        torch_dtype=torch.float32
    )
    ref_model.to(device)

    # Load datasets
    print("Loading datasets...")
    train_therapy_dataset = TherapyDatasetV4(train_data_path, tokenizer_path)
    val_therapy_dataset = TherapyDatasetV4(val_data_path, tokenizer_path)

    full_train_dataset = train_therapy_dataset.to_hf_dataset()
    full_val_dataset = val_therapy_dataset.to_hf_dataset()

    # Extract metadata
    train_target_texts = full_train_dataset['target_text']
    train_target_emotions = full_train_dataset['target_emotion']
    train_queries = full_train_dataset['query']
    train_user_inputs = full_train_dataset['user_input']
    train_user_emotions = full_train_dataset['user_emotion']

    val_target_texts = full_val_dataset['target_text']
    val_target_emotions = full_val_dataset['target_emotion']
    val_user_inputs = full_val_dataset['user_input']
    val_user_emotions = full_val_dataset['user_emotion']

    # Create PPO datasets
    train_dataset = full_train_dataset.remove_columns([
        'target_text', 'target_emotion', 'query', 'user_input', 'user_emotion'
    ])
    val_dataset = full_val_dataset.remove_columns([
        'target_text', 'target_emotion', 'query', 'user_input', 'user_emotion'
    ])

    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")

    # Log dataset info to W&B
    wandb.config.update({
        "train_dataset_size": len(train_dataset),
        "val_dataset_size": len(val_dataset),
        "total_parameters": sum(p.numel() for p in ppo_model.parameters()),
        "trainable_parameters": sum(p.numel() for p in ppo_model.parameters() if p.requires_grad)
    })

    # Initialize CUSTOM PPO trainer (prevents data shuffling)
    print("Initializing Custom PPO trainer...")
    ppo_trainer = CustomPPOTrainer(
        config=ppo_config,
        model=ppo_model,
        ref_model=ref_model,
        tokenizer=tokenizer,
        dataset=train_dataset,
        data_collator=ppo_collator
    )

    # Training loop
    print("Starting PPO training...")
    start_time = time.time()

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        epoch_start_time = time.time()

        # Epoch metrics
        epoch_rewards = []
        epoch_quality_scores = []
        epoch_emotion_scores = []
        epoch_relevance_scores = []
        epoch_sentiment_scores = []
        epoch_empathy_scores = []
        epoch_emotion_success = []

        for batch_idx, batch in enumerate(tqdm(ppo_trainer.dataloader, desc="Training batches")):
            try:
                query_tensors = [item.to(device) for item in batch["input_ids"]]

                # Get current batch data with perfect alignment (no shuffling)
                start_idx = batch_idx * batch_size
                end_idx = min(start_idx + batch_size, len(train_target_texts))
                current_batch_size = end_idx - start_idx

                # Skip if we're beyond the data bounds
                if start_idx >= len(train_target_texts):
                    continue

                # Extract metadata for current batch
                batch_target_texts = train_target_texts[start_idx:end_idx]
                batch_target_emotions = train_target_emotions[start_idx:end_idx]
                batch_user_inputs = train_user_inputs[start_idx:end_idx]
                batch_user_emotions = train_user_emotions[start_idx:end_idx]

                # Ensure tensor batch size matches metadata batch size
                query_tensors = query_tensors[:current_batch_size]

                # Generate responses
                response_tensors = []
                text_responses = []
                emotion_responses = []
                has_emotion_flags = []

                for query_tensor in query_tensors:
                    response_tokens, therapist_text, therapist_emotion, has_emotion = generate_therapy_response(
                        ppo_model.pretrained_model,
                        tokenizer,
                        query_tensor,
                        device
                    )
                    response_tensors.append(response_tokens)
                    text_responses.append(therapist_text)
                    emotion_responses.append(therapist_emotion)
                    has_emotion_flags.append(has_emotion)

                # Compute rewards with ENHANCED scoring
                rewards = compute_comprehensive_rewards_enhanced(
                    text_responses,
                    emotion_responses,
                    has_emotion_flags,
                    batch_target_texts,
                    batch_target_emotions,
                    batch_user_inputs,
                    batch_user_emotions,
                    similarity_model,
                    reward_calculator,
                    device
                )

                # Compute individual component scores for detailed logging
                batch_quality_scores = [compute_text_quality_score_enhanced(text) for text in text_responses]
                batch_emotion_scores = [
                    compute_emotion_score_enhanced(has_emotion, emotion, target_emotion)
                    for has_emotion, emotion, target_emotion in zip(has_emotion_flags, emotion_responses, batch_target_emotions)
                ]
                batch_relevance_scores = [
                    compute_contextual_relevance_enhanced(text, user_input, similarity_model, device)
                    for text, user_input in zip(text_responses, batch_user_inputs)
                ]

                # Track sentiment and empathy scores
                batch_sentiment_scores = [
                    reward_calculator.compute_sentiment_reward(text) if reward_calculator and text.strip() else 0.0
                    for text in text_responses
                ]
                batch_empathy_scores = [
                    reward_calculator.compute_empathy_reward(text) if reward_calculator and text.strip() else 0.0
                    for text in text_responses
                ]

                # Track epoch metrics
                epoch_rewards.extend([r.item() for r in rewards])
                epoch_quality_scores.extend(batch_quality_scores)
                epoch_emotion_scores.extend(batch_emotion_scores)
                epoch_relevance_scores.extend(batch_relevance_scores)
                epoch_sentiment_scores.extend(batch_sentiment_scores)
                epoch_empathy_scores.extend(batch_empathy_scores)
                epoch_emotion_success.extend(has_emotion_flags)

                # PPO training step
                stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

                # W&B logging for this batch
                batch_metrics = {
                    "train/batch_reward": np.mean([r.item() for r in rewards]),
                    "train/batch_quality": np.mean(batch_quality_scores),
                    "train/batch_emotion": np.mean(batch_emotion_scores),
                    "train/batch_relevance": np.mean(batch_relevance_scores),
                    "train/batch_sentiment": np.mean(batch_sentiment_scores),
                    "train/batch_empathy": np.mean(batch_empathy_scores),
                    "train/batch_emotion_success_rate": np.mean(has_emotion_flags),
                    "train/epoch": epoch,
                    "train/batch": batch_idx,
                }

                # Log PPO stats if available
                if stats:
                    for key, value in stats.items():
                        if isinstance(value, (int, float, torch.Tensor)):
                            if isinstance(value, torch.Tensor):
                                value = value.item()
                            batch_metrics[f"ppo/{key}"] = value

                wandb.log(batch_metrics, step=global_step)

                # Log sample responses periodically
                if batch_idx % 10 == 0:
                    log_sample_conversations(text_responses, batch_user_inputs, batch_idx, epoch)

                # Progress logging
                if batch_idx % 5 == 0:
                    avg_reward = np.mean([r.item() for r in rewards])
                    avg_quality = np.mean(batch_quality_scores)
                    avg_emotion = np.mean(batch_emotion_scores)
                    avg_relevance = np.mean(batch_relevance_scores)
                    avg_sentiment = np.mean(batch_sentiment_scores)
                    avg_empathy = np.mean(batch_empathy_scores)
                    emotion_success_rate = np.mean(has_emotion_flags)

                    print(f"Batch {batch_idx}: Reward={avg_reward:.3f}, Quality={avg_quality:.3f}, " +
                          f"Emotion={avg_emotion:.3f}, Relevance={avg_relevance:.3f}, " +
                          f"Sentiment={avg_sentiment:.3f}, Empathy={avg_empathy:.3f}, Success={emotion_success_rate:.2%}")

                global_step += 1

            except Exception as e:
                print(f"Error in batch {batch_idx}: {e}")
                continue

        # Epoch summary
        epoch_time = time.time() - epoch_start_time

        avg_epoch_reward = np.mean(epoch_rewards) if epoch_rewards else 0.0
        avg_epoch_quality = np.mean(epoch_quality_scores) if epoch_quality_scores else 0.0
        avg_epoch_emotion = np.mean(epoch_emotion_scores) if epoch_emotion_scores else 0.0
        avg_epoch_relevance = np.mean(epoch_relevance_scores) if epoch_relevance_scores else 0.0
        avg_epoch_sentiment = np.mean(epoch_sentiment_scores) if epoch_sentiment_scores else 0.0
        avg_epoch_empathy = np.mean(epoch_empathy_scores) if epoch_empathy_scores else 0.0
        emotion_success_rate = np.mean(epoch_emotion_success) if epoch_emotion_success else 0.0

        print(f"\nEpoch {epoch+1} Results:")
        print(f"Average total reward: {avg_epoch_reward:.4f}")
        print(f"Average quality score: {avg_epoch_quality:.4f}")
        print(f"Average emotion score: {avg_epoch_emotion:.4f}")
        print(f"Average relevance score: {avg_epoch_relevance:.4f}")
        print(f"Average sentiment score: {avg_epoch_sentiment:.4f}")
        print(f"Average empathy score: {avg_epoch_empathy:.4f}")
        print(f"Emotion generation success: {emotion_success_rate:.2%}")
        print(f"Epoch time: {epoch_time:.1f}s")

        # Log epoch metrics and distributions
        epoch_metrics = {
            "epoch/reward": avg_epoch_reward,
            "epoch/quality": avg_epoch_quality,
            "epoch/emotion": avg_epoch_emotion,
            "epoch/relevance": avg_epoch_relevance,
            "epoch/sentiment": avg_epoch_sentiment,
            "epoch/empathy": avg_epoch_empathy,
            "epoch/emotion_success_rate": emotion_success_rate,
            "epoch/time_seconds": epoch_time,
            "epoch/number": epoch + 1,
        }

        wandb.log(epoch_metrics, step=global_step)

        # Log score distributions
        log_score_distributions({
            "reward": epoch_rewards,
            "quality": epoch_quality_scores,
            "emotion": epoch_emotion_scores,
            "relevance": epoch_relevance_scores,
            "sentiment": epoch_sentiment_scores,
            "empathy": epoch_empathy_scores
        }, global_step)

        # Save checkpoint with tokenizer
        epoch_save_path = os.path.join(save_dir, f"epoch_{epoch+1}")
        ppo_trainer.save_pretrained(epoch_save_path)
        tokenizer.save_pretrained(epoch_save_path)  # Save tokenizer too
        print(f"Saved checkpoint to {epoch_save_path}")

        # Log model checkpoint as W&B artifact
        artifact = wandb.Artifact(f"model_epoch_{epoch+1}", type="model")
        artifact.add_dir(epoch_save_path)
        wandb.log_artifact(artifact)

        # Validation
        print("Running validation...")
        val_start_time = time.time()

        val_rewards = []
        val_quality_scores = []
        val_emotion_scores = []
        val_relevance_scores = []
        val_sentiment_scores = []
        val_empathy_scores = []
        val_emotion_success = []

        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=ppo_collator, shuffle=False)

        for val_batch_idx, val_batch in enumerate(tqdm(val_dataloader, desc="Validation", leave=False)):
            try:
                query_tensors = [item.to(device) for item in val_batch["input_ids"]]

                # Use safer indexing for validation too
                actual_batch_size = len(query_tensors)
                start_idx = val_batch_idx * batch_size
                end_idx = min(start_idx + batch_size, len(val_target_texts))
                current_batch_size = min(actual_batch_size, end_idx - start_idx)

                # Skip if we're beyond the data bounds
                if start_idx >= len(val_target_texts):
                    continue

                # Use only the valid portion
                query_tensors = query_tensors[:current_batch_size]
                batch_target_texts = val_target_texts[start_idx:start_idx + current_batch_size]
                batch_target_emotions = val_target_emotions[start_idx:start_idx + current_batch_size]
                batch_user_inputs = val_user_inputs[start_idx:start_idx + current_batch_size]
                batch_user_emotions = val_user_emotions[start_idx:start_idx + current_batch_size]

                text_responses = []
                emotion_responses = []
                has_emotion_flags = []

                for query_tensor in query_tensors:
                    _, therapist_text, therapist_emotion, has_emotion = generate_therapy_response(
                        ppo_model.pretrained_model,
                        tokenizer,
                        query_tensor,
                        device
                    )
                    text_responses.append(therapist_text)
                    emotion_responses.append(therapist_emotion)
                    has_emotion_flags.append(has_emotion)

                rewards = compute_comprehensive_rewards_enhanced(
                    text_responses,
                    emotion_responses,
                    has_emotion_flags,
                    batch_target_texts,
                    batch_target_emotions,
                    batch_user_inputs,
                    batch_user_emotions,
                    similarity_model,
                    reward_calculator,
                    device
                )

                # Compute component scores
                batch_quality_scores = [compute_text_quality_score_enhanced(text) for text in text_responses]
                batch_emotion_scores = [
                    compute_emotion_score_enhanced(has_emotion, emotion, target_emotion)
                    for has_emotion, emotion, target_emotion in zip(has_emotion_flags, emotion_responses, batch_target_emotions)
                ]
                batch_relevance_scores = [
                    compute_contextual_relevance_enhanced(text, user_input, similarity_model, device)
                    for text, user_input in zip(text_responses, batch_user_inputs)
                ]
                batch_sentiment_scores = [
                    reward_calculator.compute_sentiment_reward(text) if reward_calculator and text.strip() else 0.0
                    for text in text_responses
                ]
                batch_empathy_scores = [
                    reward_calculator.compute_empathy_reward(text) if reward_calculator and text.strip() else 0.0
                    for text in text_responses
                ]

                val_rewards.extend([r.item() for r in rewards])
                val_quality_scores.extend(batch_quality_scores)
                val_emotion_scores.extend(batch_emotion_scores)
                val_relevance_scores.extend(batch_relevance_scores)
                val_sentiment_scores.extend(batch_sentiment_scores)
                val_empathy_scores.extend(batch_empathy_scores)
                val_emotion_success.extend(has_emotion_flags)

            except Exception as e:
                print(f"Error in validation batch: {e}")
                continue

        val_time = time.time() - val_start_time

        val_avg_reward = np.mean(val_rewards) if val_rewards else 0.0
        val_avg_quality = np.mean(val_quality_scores) if val_quality_scores else 0.0
        val_avg_emotion = np.mean(val_emotion_scores) if val_emotion_scores else 0.0
        val_avg_relevance = np.mean(val_relevance_scores) if val_relevance_scores else 0.0
        val_avg_sentiment = np.mean(val_sentiment_scores) if val_sentiment_scores else 0.0
        val_avg_empathy = np.mean(val_empathy_scores) if val_empathy_scores else 0.0
        val_emotion_success_rate = np.mean(val_emotion_success) if val_emotion_success else 0.0

        print(f"Validation Results:")
        print(f"Average reward: {val_avg_reward:.4f}")
        print(f"Average quality: {val_avg_quality:.4f}")
        print(f"Average emotion: {val_avg_emotion:.4f}")
        print(f"Average relevance: {val_avg_relevance:.4f}")
        print(f"Average sentiment: {val_avg_sentiment:.4f}")
        print(f"Average empathy: {val_avg_empathy:.4f}")
        print(f"Emotion generation success: {val_emotion_success_rate:.2%}")
        print(f"Validation time: {val_time:.1f}s")

        # Log validation metrics
        val_metrics = {
            "val/reward": val_avg_reward,
            "val/quality": val_avg_quality,
            "val/emotion": val_avg_emotion,
            "val/relevance": val_avg_relevance,
            "val/sentiment": val_avg_sentiment,
            "val/empathy": val_avg_empathy,
            "val/emotion_success_rate": val_emotion_success_rate,
            "val/time_seconds": val_time,
        }

        wandb.log(val_metrics, step=global_step)

        # Log validation score distributions
        log_score_distributions({
            "val_reward": val_rewards,
            "val_quality": val_quality_scores,
            "val_emotion": val_emotion_scores,
            "val_relevance": val_relevance_scores,
            "val_sentiment": val_sentiment_scores,
            "val_empathy": val_empathy_scores
        }, global_step)

    # Save final model with tokenizer
    final_save_path = os.path.join(save_dir, "final_model")
    ppo_trainer.save_pretrained(final_save_path)
    tokenizer.save_pretrained(final_save_path)  # Save tokenizer too

    # Log final model as artifact
    final_artifact = wandb.Artifact("final_model", type="model")
    final_artifact.add_dir(final_save_path)
    wandb.log_artifact(final_artifact)

    total_time = time.time() - start_time
    print(f"\nTraining completed! Total time: {total_time:.1f}s")
    print(f"Final model saved to {final_save_path}")
    print(f"W&B run url: {wandb.run.url}")

    # Log final summary
    wandb.log({
        "training/total_time_seconds": total_time,
        "training/total_steps": global_step,
        "training/epochs_completed": num_epochs
    })

    # Finish W&B run
    wandb.finish()

    return ppo_trainer

if __name__ == "__main__":
    # Entry point: launch PPO fine-tuning of the SFT therapy model with W&B logging.
    #
    # All locations below are hard-coded absolute paths under /content/drive
    # (presumably a mounted Google Drive in Colab -- confirm the mount exists
    # before running). Despite their names, `train_dataset`, `val_dataset` and
    # `sft_model` hold *paths* (strings), not loaded objects.
    train_dataset = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/train.json"
    val_dataset = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/val.json"
    sft_model = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt"
    tokenizer_path = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer"

    try:
        # NOTE(review): the training routine visible above ends with
        # `return ppo_trainer`, so `ppo_model` most likely holds the trainer
        # object rather than a bare model. The name is kept unchanged because
        # later notebook cells may reference it.
        ppo_model = run_ppo_training_with_wandb(
            train_data_path=train_dataset,
            val_data_path=val_dataset,
            sft_model_path=sft_model,
            tokenizer_path=tokenizer_path,
            num_epochs=8,
            batch_size=16,
            save_dir="/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model",
            project_name="8m19therapy-chatbot-ppo-enhanced",
            run_name="8m19therapy_ppo_enhanced_rewards"
        )

        print("PPO training with enhanced reward components completed successfully!")

    except Exception as e:
        print(f"Training failed: {e}")
        import traceback
        traceback.print_exc()

        # The training routine only calls wandb.finish() on its success path.
        # If it raised, the run is still active in this kernel; close it here
        # and mark it as failed so it does not linger as "running" or get
        # mixed up with the next wandb.init() in the same session.
        if wandb.run is not None:
            wandb.finish(exit_code=1)

        # Re-raise so the notebook cell is still reported as failed.
        raise

Starting PPO training with Weights & Biases integration...


0,1
ppo/objective/entropy,▁▁▁▁▁▂▁▁▁▁▁▁▂▂▂▂▂███▇▇▆▆▄▄▄▃▃▃▃▂▃▂▃▂▃▂▃▂
ppo/objective/kl,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ppo/objective/kl_coef,█████▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁
ppo/ppo/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ppo/ppo/loss/policy,▇▃▃▃▃▁▂▁▃▂▂▂▂▃▂▁▁▁▁▂▄▂▃▃▃▄▅▅▆▅█▆▆▆▆▅▅▄▆▆
ppo/ppo/loss/total,█▇▅▄▄▃▃▃▃▂▃▅▃▂▂▂▃▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▂▁▂▁▁▁▂▁
ppo/ppo/loss/value,▂▇█▆▄▃▃▃▃▅▂▂▂▂▂▂▃▂▂▂▂▂▂▂▁▁▁▂▁▂▁▂▂▁▂▁▁▂▁▂
ppo/ppo/mean_non_score_reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ppo/ppo/mean_scores,▅▅▃▄▅▆▇█▇▆▆▆▇▆▇▅▇▂▂▁▁▁▂▁▂▂▁▂▂▁▂▃▂▂▂▂▃▂▃▃
ppo/ppo/policy/advantages_mean,▁▃▄▁▅▅▄▅▅▅▄▅▃▅▄▄▄▅▅▅▄█▅▅▅▅▅▄▄▄▄▄▄▅▄▄▄▄▄▄

0,1
ppo/objective/entropy,108.64667
ppo/objective/kl,0.0
ppo/objective/kl_coef,0.1932
ppo/ppo/learning_rate,2e-05
ppo/ppo/loss/policy,-0.00957
ppo/ppo/loss/total,0.20326
ppo/ppo/loss/value,2.12825
ppo/ppo/mean_non_score_reward,0.0
ppo/ppo/mean_scores,-3.32949
ppo/ppo/policy/advantages_mean,0.0


W&B run: 8m19therapy_ppo_enhanced_rewards
W&B url: https://wandb.ai/ericzhangez1006-ucl/8m19therapy-chatbot-ppo-enhanced/runs/agsy9y6e
Loading tokenizer...
Tokenizer vocabulary size: 50269
Loading base model...
Loading SFT checkpoint from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt...
SFT checkpoint loaded successfully
Loading similarity model...
Similarity model loaded successfully
Loading sentiment and empathy models...
Sentiment and empathy models loaded successfully
Creating PPO configuration...
Creating PPO models...
Loading datasets...




Loaded tokenizer from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer


Processing data: 100%|██████████| 815/815 [00:02<00:00, 335.42it/s]


Processed 3721 therapist responses
Loaded tokenizer from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer


Processing data: 100%|██████████| 102/102 [00:00<00:00, 317.15it/s]


Processed 445 therapist responses




Training dataset size: 3721
Validation dataset size: 445
Initializing Custom PPO trainer...
Starting PPO training...

Epoch 1/8


Training batches:   0%|          | 1/233 [00:04<17:33,  4.54s/it]

Batch 0: Reward=6.349, Quality=2.000, Emotion=3.000, Relevance=0.562, Sentiment=0.348, Empathy=0.853, Success=100.00%


Training batches:   3%|▎         | 6/233 [00:30<19:54,  5.26s/it]

Batch 5: Reward=1.391, Quality=1.250, Emotion=2.438, Relevance=-2.812, Sentiment=0.394, Empathy=0.756, Success=100.00%


Training batches:   5%|▍         | 11/233 [00:57<20:23,  5.51s/it]

Batch 10: Reward=2.429, Quality=-1.000, Emotion=2.188, Relevance=-0.688, Sentiment=0.357, Empathy=0.704, Success=93.75%


Training batches:   7%|▋         | 16/233 [01:24<19:16,  5.33s/it]

Batch 15: Reward=3.454, Quality=-0.250, Emotion=2.375, Relevance=0.750, Sentiment=0.345, Empathy=0.832, Success=93.75%


Training batches:   9%|▉         | 21/233 [01:51<18:21,  5.19s/it]

Batch 20: Reward=4.879, Quality=-0.250, Emotion=3.000, Relevance=1.375, Sentiment=0.377, Empathy=0.852, Success=100.00%


Training batches:  11%|█         | 26/233 [02:15<17:32,  5.08s/it]

Batch 25: Reward=2.210, Quality=-1.000, Emotion=1.750, Relevance=1.125, Sentiment=0.229, Empathy=0.841, Success=87.50%


Training batches:  13%|█▎        | 31/233 [02:41<16:55,  5.03s/it]

Batch 30: Reward=3.498, Quality=0.500, Emotion=1.312, Relevance=1.375, Sentiment=0.432, Empathy=0.859, Success=100.00%


Training batches:  15%|█▌        | 36/233 [03:08<17:24,  5.30s/it]

Batch 35: Reward=3.673, Quality=-0.250, Emotion=1.875, Relevance=1.438, Sentiment=0.550, Empathy=0.847, Success=100.00%


Training batches:  18%|█▊        | 41/233 [03:34<16:21,  5.11s/it]

Batch 40: Reward=3.049, Quality=0.500, Emotion=2.375, Relevance=-0.375, Sentiment=0.169, Empathy=0.832, Success=93.75%


Training batches:  20%|█▉        | 46/233 [03:58<15:41,  5.03s/it]

Batch 45: Reward=3.684, Quality=-1.000, Emotion=2.812, Relevance=1.125, Sentiment=0.379, Empathy=0.853, Success=100.00%


Training batches:  22%|██▏       | 51/233 [04:24<15:29,  5.11s/it]

Batch 50: Reward=4.778, Quality=0.500, Emotion=2.250, Relevance=1.188, Sentiment=0.441, Empathy=0.825, Success=100.00%


Training batches:  24%|██▍       | 56/233 [04:48<14:08,  4.79s/it]

Batch 55: Reward=3.532, Quality=0.500, Emotion=2.375, Relevance=-0.500, Sentiment=0.588, Empathy=0.862, Success=93.75%


Training batches:  26%|██▌       | 61/233 [05:13<14:50,  5.18s/it]

Batch 60: Reward=2.409, Quality=0.500, Emotion=0.562, Relevance=0.750, Sentiment=0.580, Empathy=0.870, Success=81.25%


Training batches:  28%|██▊       | 66/233 [05:37<13:38,  4.90s/it]

Batch 65: Reward=2.598, Quality=-0.500, Emotion=1.750, Relevance=0.500, Sentiment=0.199, Empathy=0.838, Success=87.50%


Training batches:  30%|███       | 71/233 [06:01<13:13,  4.90s/it]

Batch 70: Reward=-0.009, Quality=-0.250, Emotion=1.188, Relevance=-1.500, Sentiment=0.249, Empathy=0.861, Success=87.50%


Training batches:  33%|███▎      | 76/233 [06:26<12:43,  4.86s/it]

Batch 75: Reward=1.683, Quality=-0.250, Emotion=1.750, Relevance=-0.062, Sentiment=0.263, Empathy=0.841, Success=87.50%


Training batches:  35%|███▍      | 81/233 [06:49<11:59,  4.73s/it]

Batch 80: Reward=3.734, Quality=0.500, Emotion=0.750, Relevance=0.938, Sentiment=0.122, Empathy=0.795, Success=81.25%


Training batches:  37%|███▋      | 86/233 [07:16<12:55,  5.27s/it]

Batch 85: Reward=1.326, Quality=-1.000, Emotion=1.750, Relevance=0.062, Sentiment=0.463, Empathy=0.798, Success=87.50%


Training batches:  39%|███▉      | 91/233 [07:38<10:55,  4.62s/it]

Batch 90: Reward=1.348, Quality=2.000, Emotion=0.750, Relevance=-2.375, Sentiment=0.393, Empathy=0.849, Success=81.25%


Training batches:  41%|████      | 96/233 [08:03<11:32,  5.06s/it]

Batch 95: Reward=3.984, Quality=-1.000, Emotion=2.375, Relevance=0.875, Sentiment=0.357, Empathy=0.757, Success=93.75%


Training batches:  43%|████▎     | 101/233 [08:28<10:57,  4.98s/it]

Batch 100: Reward=2.407, Quality=-1.000, Emotion=2.188, Relevance=-0.250, Sentiment=0.097, Empathy=0.810, Success=93.75%


Training batches:  45%|████▌     | 106/233 [08:51<10:12,  4.82s/it]

Batch 105: Reward=3.099, Quality=-1.000, Emotion=3.000, Relevance=0.500, Sentiment=0.315, Empathy=0.839, Success=100.00%


Training batches:  48%|████▊     | 111/233 [09:15<10:33,  5.19s/it]

Batch 110: Reward=1.135, Quality=-0.250, Emotion=1.125, Relevance=-1.625, Sentiment=0.395, Empathy=0.849, Success=81.25%


Training batches:  50%|████▉     | 116/233 [09:39<08:52,  4.55s/it]

Batch 115: Reward=1.594, Quality=1.500, Emotion=1.750, Relevance=-2.875, Sentiment=0.224, Empathy=0.848, Success=87.50%


Training batches:  52%|█████▏    | 121/233 [10:03<08:49,  4.73s/it]

Batch 120: Reward=1.358, Quality=-0.250, Emotion=0.625, Relevance=0.625, Sentiment=0.379, Empathy=0.833, Success=87.50%


Training batches:  54%|█████▍    | 126/233 [10:28<08:36,  4.83s/it]

Batch 125: Reward=4.212, Quality=1.250, Emotion=2.188, Relevance=0.250, Sentiment=0.388, Empathy=0.863, Success=93.75%


Training batches:  56%|█████▌    | 131/233 [10:53<08:18,  4.89s/it]

Batch 130: Reward=4.280, Quality=2.000, Emotion=1.125, Relevance=0.500, Sentiment=0.210, Empathy=0.836, Success=81.25%


Training batches:  58%|█████▊    | 136/233 [11:15<07:05,  4.39s/it]

Batch 135: Reward=1.638, Quality=0.500, Emotion=0.500, Relevance=-0.312, Sentiment=0.288, Empathy=0.827, Success=75.00%


Training batches:  61%|██████    | 141/233 [11:36<06:53,  4.50s/it]

Batch 140: Reward=3.032, Quality=1.250, Emotion=0.500, Relevance=0.688, Sentiment=0.071, Empathy=0.871, Success=75.00%


Training batches:  63%|██████▎   | 146/233 [11:59<06:40,  4.60s/it]

Batch 145: Reward=4.411, Quality=0.500, Emotion=1.750, Relevance=0.562, Sentiment=0.191, Empathy=0.820, Success=87.50%


Training batches:  65%|██████▍   | 151/233 [12:23<06:34,  4.81s/it]

Batch 150: Reward=6.027, Quality=1.250, Emotion=2.438, Relevance=1.625, Sentiment=0.277, Empathy=0.858, Success=100.00%


Training batches:  67%|██████▋   | 156/233 [12:49<06:34,  5.12s/it]

Batch 155: Reward=2.865, Quality=0.500, Emotion=1.812, Relevance=0.000, Sentiment=0.352, Empathy=0.851, Success=93.75%


Training batches:  69%|██████▉   | 161/233 [13:16<06:16,  5.23s/it]

Batch 160: Reward=2.200, Quality=0.500, Emotion=1.750, Relevance=-0.562, Sentiment=0.289, Empathy=0.867, Success=87.50%


Training batches:  71%|███████   | 166/233 [13:38<05:08,  4.60s/it]

Batch 165: Reward=2.275, Quality=1.250, Emotion=0.938, Relevance=-0.750, Sentiment=0.297, Empathy=0.833, Success=81.25%


Training batches:  73%|███████▎  | 171/233 [14:00<04:28,  4.33s/it]

Batch 170: Reward=-0.000, Quality=0.500, Emotion=0.500, Relevance=-2.250, Sentiment=0.187, Empathy=0.844, Success=75.00%


Training batches:  76%|███████▌  | 176/233 [14:23<04:23,  4.62s/it]

Batch 175: Reward=2.571, Quality=0.500, Emotion=1.750, Relevance=-0.188, Sentiment=0.131, Empathy=0.848, Success=87.50%


Training batches:  78%|███████▊  | 181/233 [14:46<04:05,  4.71s/it]

Batch 180: Reward=1.501, Quality=0.500, Emotion=1.125, Relevance=-0.875, Sentiment=0.216, Empathy=0.844, Success=81.25%


Training batches:  80%|███████▉  | 186/233 [15:09<03:39,  4.67s/it]

Batch 185: Reward=4.697, Quality=0.500, Emotion=2.375, Relevance=1.000, Sentiment=0.413, Empathy=0.800, Success=93.75%


Training batches:  82%|████████▏ | 191/233 [15:32<03:21,  4.79s/it]

Batch 190: Reward=3.858, Quality=1.250, Emotion=2.375, Relevance=-0.688, Sentiment=0.422, Empathy=0.878, Success=93.75%


Training batches:  84%|████████▍ | 196/233 [15:57<02:57,  4.81s/it]

Batch 195: Reward=2.534, Quality=-0.250, Emotion=1.750, Relevance=0.625, Sentiment=0.147, Empathy=0.846, Success=87.50%


Training batches:  86%|████████▋ | 201/233 [16:22<02:42,  5.08s/it]

Batch 200: Reward=1.385, Quality=-0.250, Emotion=0.500, Relevance=-0.312, Sentiment=0.196, Empathy=0.866, Success=75.00%


Training batches:  88%|████████▊ | 206/233 [16:46<02:06,  4.70s/it]

Batch 205: Reward=4.140, Quality=2.000, Emotion=1.125, Relevance=0.438, Sentiment=0.219, Empathy=0.857, Success=81.25%


Training batches:  91%|█████████ | 211/233 [17:09<01:44,  4.74s/it]

Batch 210: Reward=2.245, Quality=-0.250, Emotion=1.750, Relevance=-1.312, Sentiment=0.539, Empathy=0.809, Success=87.50%


Training batches:  93%|█████████▎| 216/233 [17:36<01:29,  5.25s/it]

Batch 215: Reward=5.296, Quality=1.250, Emotion=3.000, Relevance=0.312, Sentiment=0.409, Empathy=0.853, Success=100.00%


Training batches:  95%|█████████▍| 221/233 [18:01<01:02,  5.21s/it]

Batch 220: Reward=2.159, Quality=0.500, Emotion=1.750, Relevance=-0.438, Sentiment=0.236, Empathy=0.870, Success=87.50%


Training batches:  97%|█████████▋| 226/233 [18:27<00:35,  5.01s/it]

Batch 225: Reward=3.891, Quality=2.000, Emotion=0.500, Relevance=0.750, Sentiment=0.334, Empathy=0.851, Success=75.00%


Training batches:  99%|█████████▉| 231/233 [18:54<00:10,  5.42s/it]

Batch 230: Reward=5.720, Quality=0.500, Emotion=3.000, Relevance=1.188, Sentiment=0.422, Empathy=0.845, Success=100.00%


Training batches: 100%|██████████| 233/233 [19:01<00:00,  4.90s/it]

Error in batch 232: Batch size (16) does not match number of examples - but got 9 for: queries

Epoch 1 Results:
Average total reward: 2.8942
Average quality score: 0.2940
Average emotion score: 1.8492
Average relevance score: -0.0992
Average sentiment score: 0.3238
Average empathy score: 0.8349
Emotion generation success: 90.08%
Epoch time: 1141.8s



[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_1)... 

Saved checkpoint to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_1


Done. 1.6s


Running validation...




Validation Results:
Average reward: 3.9118
Average quality: 0.3371
Average emotion: 2.3640
Average relevance: 0.3865
Average sentiment: 0.3459
Average empathy: 0.8476
Emotion generation success: 94.38%
Validation time: 124.1s

Epoch 2/8


Training batches:   0%|          | 1/233 [00:05<21:52,  5.66s/it]

Batch 0: Reward=7.820, Quality=2.000, Emotion=3.000, Relevance=2.125, Sentiment=0.077, Empathy=0.861, Success=100.00%


Training batches:   3%|▎         | 6/233 [00:33<20:01,  5.29s/it]

Batch 5: Reward=3.235, Quality=1.250, Emotion=1.750, Relevance=-0.375, Sentiment=0.241, Empathy=0.843, Success=87.50%


Training batches:   5%|▍         | 11/233 [00:58<19:09,  5.18s/it]

Batch 10: Reward=3.082, Quality=-0.250, Emotion=1.688, Relevance=0.812, Sentiment=0.344, Empathy=0.846, Success=100.00%


Training batches:   7%|▋         | 16/233 [01:25<19:29,  5.39s/it]

Batch 15: Reward=1.122, Quality=-2.500, Emotion=2.375, Relevance=-0.312, Sentiment=0.334, Empathy=0.852, Success=93.75%


Training batches:   9%|▉         | 21/233 [01:51<18:41,  5.29s/it]

Batch 20: Reward=5.787, Quality=1.250, Emotion=2.375, Relevance=1.438, Sentiment=0.142, Empathy=0.841, Success=93.75%


Training batches:  11%|█         | 26/233 [02:19<19:16,  5.59s/it]

Batch 25: Reward=1.060, Quality=-1.000, Emotion=1.188, Relevance=0.125, Sentiment=0.418, Empathy=0.865, Success=87.50%


Training batches:  13%|█▎        | 31/233 [02:44<16:47,  4.99s/it]

Batch 30: Reward=4.142, Quality=2.000, Emotion=3.000, Relevance=-1.625, Sentiment=0.327, Empathy=0.835, Success=100.00%


Training batches:  15%|█▌        | 36/233 [03:10<16:44,  5.10s/it]

Batch 35: Reward=4.696, Quality=0.500, Emotion=2.375, Relevance=1.250, Sentiment=0.330, Empathy=0.848, Success=93.75%


Training batches:  18%|█▊        | 41/233 [03:35<16:49,  5.26s/it]

Batch 40: Reward=2.124, Quality=-1.000, Emotion=1.812, Relevance=-0.562, Sentiment=0.245, Empathy=0.823, Success=93.75%


Training batches:  20%|█▉        | 46/233 [04:05<17:29,  5.61s/it]

Batch 45: Reward=1.910, Quality=-0.750, Emotion=2.375, Relevance=-0.875, Sentiment=0.320, Empathy=0.801, Success=93.75%


Training batches:  22%|██▏       | 51/233 [04:33<17:08,  5.65s/it]

Batch 50: Reward=2.770, Quality=-1.250, Emotion=2.188, Relevance=0.875, Sentiment=0.310, Empathy=0.865, Success=93.75%


Training batches:  24%|██▍       | 56/233 [04:57<15:05,  5.11s/it]

Batch 55: Reward=6.062, Quality=1.250, Emotion=3.000, Relevance=1.125, Sentiment=0.326, Empathy=0.861, Success=100.00%


Training batches:  26%|██▌       | 61/233 [05:23<14:29,  5.05s/it]

Batch 60: Reward=6.137, Quality=1.250, Emotion=3.000, Relevance=1.188, Sentiment=0.297, Empathy=0.756, Success=100.00%


Training batches:  28%|██▊       | 66/233 [05:55<16:56,  6.09s/it]

Batch 65: Reward=1.659, Quality=-1.000, Emotion=2.375, Relevance=-0.625, Sentiment=0.299, Empathy=0.864, Success=93.75%


Training batches:  30%|███       | 71/233 [06:19<13:55,  5.16s/it]

Batch 70: Reward=3.528, Quality=0.500, Emotion=3.000, Relevance=-0.938, Sentiment=0.500, Empathy=0.849, Success=100.00%


Training batches:  33%|███▎      | 76/233 [06:47<14:22,  5.49s/it]

Batch 75: Reward=1.240, Quality=-1.500, Emotion=3.000, Relevance=-1.125, Sentiment=0.315, Empathy=0.855, Success=100.00%


Training batches:  35%|███▍      | 81/233 [07:13<12:58,  5.12s/it]

Batch 80: Reward=4.069, Quality=1.250, Emotion=1.625, Relevance=0.312, Sentiment=0.151, Empathy=0.826, Success=93.75%


Training batches:  37%|███▋      | 86/233 [07:39<12:48,  5.23s/it]

Batch 85: Reward=4.276, Quality=0.500, Emotion=3.000, Relevance=-0.062, Sentiment=0.491, Empathy=0.852, Success=100.00%


Training batches:  39%|███▉      | 91/233 [08:06<12:32,  5.30s/it]

Batch 90: Reward=6.188, Quality=2.000, Emotion=2.625, Relevance=1.500, Sentiment=0.194, Empathy=0.868, Success=100.00%


Training batches:  41%|████      | 96/233 [08:33<12:05,  5.29s/it]

Batch 95: Reward=1.309, Quality=0.500, Emotion=0.500, Relevance=0.062, Sentiment=0.222, Empathy=0.854, Success=75.00%


Training batches:  43%|████▎     | 101/233 [09:00<12:08,  5.52s/it]

Batch 100: Reward=3.235, Quality=1.250, Emotion=1.562, Relevance=-0.500, Sentiment=0.247, Empathy=0.867, Success=87.50%


Training batches:  45%|████▌     | 106/233 [09:25<10:49,  5.12s/it]

Batch 105: Reward=4.102, Quality=-0.250, Emotion=3.000, Relevance=0.375, Sentiment=0.239, Empathy=0.857, Success=100.00%


Training batches:  48%|████▊     | 111/233 [09:49<10:11,  5.02s/it]

Batch 110: Reward=6.782, Quality=2.000, Emotion=2.375, Relevance=1.750, Sentiment=0.265, Empathy=0.851, Success=93.75%


Training batches:  50%|████▉     | 116/233 [10:16<10:00,  5.13s/it]

Batch 115: Reward=3.588, Quality=-0.250, Emotion=3.000, Relevance=-0.062, Sentiment=0.336, Empathy=0.835, Success=100.00%


Training batches:  52%|█████▏    | 121/233 [10:43<10:29,  5.62s/it]

Batch 120: Reward=4.270, Quality=0.500, Emotion=1.875, Relevance=1.125, Sentiment=0.380, Empathy=0.831, Success=100.00%


Training batches:  54%|█████▍    | 126/233 [11:08<09:11,  5.15s/it]

Batch 125: Reward=3.950, Quality=0.250, Emotion=2.188, Relevance=1.188, Sentiment=0.304, Empathy=0.867, Success=93.75%


Training batches:  56%|█████▌    | 131/233 [11:33<08:26,  4.97s/it]

Batch 130: Reward=4.047, Quality=2.000, Emotion=1.188, Relevance=-0.125, Sentiment=0.393, Empathy=0.846, Success=87.50%


Training batches:  58%|█████▊    | 136/233 [11:57<07:59,  4.94s/it]

Batch 135: Reward=5.882, Quality=0.500, Emotion=3.000, Relevance=1.500, Sentiment=0.521, Empathy=0.851, Success=100.00%


Training batches:  61%|██████    | 141/233 [12:22<07:23,  4.82s/it]

Batch 140: Reward=4.821, Quality=2.000, Emotion=2.375, Relevance=-0.312, Sentiment=0.392, Empathy=0.813, Success=93.75%


Training batches:  63%|██████▎   | 146/233 [12:48<07:38,  5.28s/it]

Batch 145: Reward=7.377, Quality=1.250, Emotion=3.000, Relevance=2.500, Sentiment=0.259, Empathy=0.851, Success=100.00%


Training batches:  65%|██████▍   | 151/233 [13:16<07:32,  5.52s/it]

Batch 150: Reward=6.678, Quality=1.250, Emotion=3.000, Relevance=1.375, Sentiment=0.407, Empathy=0.862, Success=100.00%


Training batches:  67%|██████▋   | 156/233 [13:42<06:59,  5.45s/it]

Batch 155: Reward=4.491, Quality=-1.000, Emotion=3.000, Relevance=1.750, Sentiment=0.316, Empathy=0.863, Success=100.00%


Training batches:  69%|██████▉   | 161/233 [14:13<06:54,  5.76s/it]

Batch 160: Reward=3.795, Quality=0.000, Emotion=2.375, Relevance=0.875, Sentiment=0.315, Empathy=0.861, Success=93.75%


Training batches:  71%|███████   | 166/233 [14:39<06:17,  5.63s/it]

Batch 165: Reward=3.820, Quality=-0.250, Emotion=1.562, Relevance=0.500, Sentiment=0.369, Empathy=0.807, Success=87.50%


Training batches:  73%|███████▎  | 171/233 [15:06<05:14,  5.07s/it]

Batch 170: Reward=6.399, Quality=2.000, Emotion=2.375, Relevance=1.250, Sentiment=0.286, Empathy=0.859, Success=93.75%


Training batches:  76%|███████▌  | 176/233 [15:33<05:20,  5.62s/it]

Batch 175: Reward=1.765, Quality=-0.750, Emotion=1.750, Relevance=-0.625, Sentiment=0.473, Empathy=0.857, Success=87.50%


Training batches:  78%|███████▊  | 181/233 [16:00<04:47,  5.53s/it]

Batch 180: Reward=3.897, Quality=-0.250, Emotion=2.375, Relevance=0.875, Sentiment=0.598, Empathy=0.869, Success=93.75%


Training batches:  80%|███████▉  | 186/233 [16:24<03:42,  4.74s/it]

Batch 185: Reward=6.024, Quality=1.250, Emotion=3.000, Relevance=1.000, Sentiment=0.461, Empathy=0.859, Success=100.00%


Training batches:  82%|████████▏ | 191/233 [16:51<03:42,  5.29s/it]

Batch 190: Reward=3.725, Quality=1.250, Emotion=2.438, Relevance=-0.812, Sentiment=0.499, Empathy=0.857, Success=100.00%


Training batches:  84%|████████▍ | 196/233 [17:16<03:08,  5.08s/it]

Batch 195: Reward=5.059, Quality=-0.250, Emotion=3.000, Relevance=1.562, Sentiment=0.257, Empathy=0.865, Success=100.00%


Training batches:  86%|████████▋ | 201/233 [17:43<02:56,  5.52s/it]

Batch 200: Reward=6.576, Quality=0.500, Emotion=3.000, Relevance=2.250, Sentiment=0.133, Empathy=0.881, Success=100.00%


Training batches:  88%|████████▊ | 206/233 [18:07<02:15,  5.00s/it]

Batch 205: Reward=6.505, Quality=1.250, Emotion=3.000, Relevance=1.375, Sentiment=0.379, Empathy=0.868, Success=100.00%


Training batches:  91%|█████████ | 211/233 [18:33<01:54,  5.20s/it]

Batch 210: Reward=6.527, Quality=0.500, Emotion=3.000, Relevance=2.375, Sentiment=0.145, Empathy=0.868, Success=100.00%


Training batches:  93%|█████████▎| 216/233 [19:02<01:36,  5.68s/it]

Batch 215: Reward=3.106, Quality=-1.000, Emotion=2.438, Relevance=1.375, Sentiment=0.319, Empathy=0.791, Success=100.00%


Training batches:  95%|█████████▍| 221/233 [19:31<01:13,  6.13s/it]

Batch 220: Reward=3.528, Quality=0.500, Emotion=1.750, Relevance=0.250, Sentiment=0.344, Empathy=0.834, Success=87.50%


Training batches:  97%|█████████▋| 226/233 [20:01<00:41,  5.88s/it]

Batch 225: Reward=4.808, Quality=-0.250, Emotion=3.000, Relevance=1.750, Sentiment=0.268, Empathy=0.859, Success=100.00%


Training batches:  99%|█████████▉| 231/233 [20:33<00:12,  6.31s/it]

Batch 230: Reward=7.529, Quality=1.250, Emotion=3.000, Relevance=2.438, Sentiment=0.368, Empathy=0.830, Success=100.00%


Training batches: 100%|██████████| 233/233 [20:40<00:00,  5.33s/it]

Error in batch 232: Batch size (16) does not match number of examples - but got 9 for: queries

Epoch 2 Results:
Average total reward: 4.2435
Average quality score: 0.4413
Average emotion score: 2.3848
Average relevance score: 0.6426
Average sentiment score: 0.3234
Average empathy score: 0.8526
Emotion generation success: 94.92%
Epoch time: 1241.0s



[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_2)... 

Saved checkpoint to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_2


Done. 1.6s


Running validation...




Validation Results:
Average reward: 4.5232
Average quality: 0.4090
Average emotion: 2.5461
Average relevance: 0.8966
Average sentiment: 0.2832
Average empathy: 0.8568
Emotion generation success: 97.08%
Validation time: 135.7s

Epoch 3/8


Training batches:   0%|          | 1/233 [00:04<18:37,  4.82s/it]

Batch 0: Reward=8.002, Quality=2.000, Emotion=3.000, Relevance=1.938, Sentiment=0.343, Empathy=0.835, Success=100.00%


Training batches:   3%|▎         | 6/233 [00:38<25:12,  6.66s/it]

Batch 5: Reward=1.497, Quality=-1.000, Emotion=1.750, Relevance=0.688, Sentiment=0.443, Empathy=0.855, Success=87.50%


Training batches:   5%|▍         | 11/233 [01:07<22:17,  6.03s/it]

Batch 10: Reward=1.115, Quality=-1.750, Emotion=2.250, Relevance=0.000, Sentiment=0.187, Empathy=0.843, Success=100.00%


Training batches:   7%|▋         | 16/233 [01:34<20:08,  5.57s/it]

Batch 15: Reward=2.057, Quality=-2.500, Emotion=2.375, Relevance=0.688, Sentiment=0.381, Empathy=0.816, Success=93.75%


Training batches:   9%|▉         | 21/233 [02:04<20:42,  5.86s/it]

Batch 20: Reward=4.242, Quality=0.750, Emotion=1.188, Relevance=1.875, Sentiment=0.249, Empathy=0.865, Success=87.50%


Training batches:  11%|█         | 26/233 [02:31<18:46,  5.44s/it]

Batch 25: Reward=1.686, Quality=-3.000, Emotion=3.000, Relevance=1.062, Sentiment=0.457, Empathy=0.852, Success=100.00%


Training batches:  13%|█▎        | 31/233 [03:01<18:42,  5.56s/it]

Batch 30: Reward=7.818, Quality=1.250, Emotion=3.000, Relevance=2.750, Sentiment=0.567, Empathy=0.866, Success=100.00%


Training batches:  15%|█▌        | 36/233 [03:34<22:11,  6.76s/it]

Batch 35: Reward=4.406, Quality=-0.250, Emotion=1.750, Relevance=1.750, Sentiment=0.410, Empathy=0.880, Success=87.50%


Training batches:  18%|█▊        | 41/233 [04:01<17:07,  5.35s/it]

Batch 40: Reward=4.104, Quality=1.250, Emotion=1.188, Relevance=0.812, Sentiment=0.351, Empathy=0.861, Success=87.50%


Training batches:  20%|█▉        | 46/233 [04:34<17:53,  5.74s/it]

Batch 45: Reward=2.958, Quality=0.500, Emotion=2.375, Relevance=-0.500, Sentiment=0.296, Empathy=0.818, Success=93.75%


Training batches:  22%|██▏       | 51/233 [05:09<20:40,  6.82s/it]

Batch 50: Reward=2.647, Quality=-1.500, Emotion=1.562, Relevance=2.125, Sentiment=0.141, Empathy=0.866, Success=87.50%


Training batches:  24%|██▍       | 56/233 [05:39<17:23,  5.90s/it]

Batch 55: Reward=5.259, Quality=2.000, Emotion=2.438, Relevance=0.375, Sentiment=0.543, Empathy=0.846, Success=100.00%


Training batches:  26%|██▌       | 61/233 [06:13<18:38,  6.50s/it]

Batch 60: Reward=5.442, Quality=1.250, Emotion=3.000, Relevance=0.562, Sentiment=0.331, Empathy=0.844, Success=100.00%


Training batches:  28%|██▊       | 66/233 [06:46<17:13,  6.19s/it]

Batch 65: Reward=4.251, Quality=1.250, Emotion=1.750, Relevance=0.438, Sentiment=0.117, Empathy=0.851, Success=87.50%


Training batches:  30%|███       | 71/233 [07:13<14:55,  5.53s/it]

Batch 70: Reward=5.025, Quality=-0.250, Emotion=3.000, Relevance=1.750, Sentiment=0.248, Empathy=0.796, Success=100.00%


Training batches:  33%|███▎      | 76/233 [07:40<13:36,  5.20s/it]

Batch 75: Reward=3.457, Quality=0.500, Emotion=3.000, Relevance=-0.688, Sentiment=0.361, Empathy=0.845, Success=100.00%


Training batches:  35%|███▍      | 81/233 [08:09<14:19,  5.65s/it]

Batch 80: Reward=5.278, Quality=1.250, Emotion=2.188, Relevance=1.250, Sentiment=0.459, Empathy=0.856, Success=93.75%


Training batches:  37%|███▋      | 86/233 [08:40<14:24,  5.88s/it]

Batch 85: Reward=1.412, Quality=-1.250, Emotion=2.375, Relevance=-0.250, Sentiment=0.499, Empathy=0.821, Success=93.75%


Training batches:  39%|███▉      | 91/233 [09:08<13:36,  5.75s/it]

Batch 90: Reward=2.661, Quality=0.250, Emotion=1.438, Relevance=0.125, Sentiment=0.420, Empathy=0.867, Success=93.75%


Training batches:  41%|████      | 96/233 [09:35<12:09,  5.33s/it]

Batch 95: Reward=4.860, Quality=1.000, Emotion=2.375, Relevance=0.938, Sentiment=0.278, Empathy=0.837, Success=93.75%


Training batches:  43%|████▎     | 101/233 [10:03<13:34,  6.17s/it]

Batch 100: Reward=3.030, Quality=-0.750, Emotion=2.250, Relevance=1.000, Sentiment=0.238, Empathy=0.863, Success=100.00%


Training batches:  45%|████▌     | 106/233 [10:28<11:11,  5.29s/it]

Batch 105: Reward=4.806, Quality=-0.750, Emotion=2.375, Relevance=3.000, Sentiment=0.301, Empathy=0.847, Success=93.75%


Training batches:  48%|████▊     | 111/233 [11:00<13:02,  6.41s/it]

Batch 110: Reward=2.631, Quality=-1.000, Emotion=1.750, Relevance=1.125, Sentiment=0.316, Empathy=0.866, Success=87.50%


Training batches:  50%|████▉     | 116/233 [11:29<10:37,  5.45s/it]

Batch 115: Reward=6.367, Quality=1.000, Emotion=3.000, Relevance=1.438, Sentiment=0.399, Empathy=0.875, Success=100.00%


Training batches:  52%|█████▏    | 121/233 [11:53<09:15,  4.96s/it]

Batch 120: Reward=6.332, Quality=1.250, Emotion=1.875, Relevance=3.438, Sentiment=0.201, Empathy=0.864, Success=100.00%


Training batches:  54%|█████▍    | 126/233 [12:22<10:03,  5.64s/it]

Batch 125: Reward=3.604, Quality=-0.500, Emotion=2.188, Relevance=1.125, Sentiment=0.117, Empathy=0.845, Success=93.75%


Training batches:  56%|█████▌    | 131/233 [12:51<10:36,  6.24s/it]

Batch 130: Reward=4.449, Quality=-0.250, Emotion=1.750, Relevance=2.312, Sentiment=0.377, Empathy=0.871, Success=87.50%


Training batches:  58%|█████▊    | 136/233 [13:16<08:12,  5.08s/it]

Batch 135: Reward=6.202, Quality=2.000, Emotion=2.375, Relevance=1.562, Sentiment=0.258, Empathy=0.879, Success=93.75%


Training batches:  61%|██████    | 141/233 [13:43<07:46,  5.07s/it]

Batch 140: Reward=6.031, Quality=1.000, Emotion=3.000, Relevance=1.500, Sentiment=0.467, Empathy=0.819, Success=100.00%


Training batches:  63%|██████▎   | 146/233 [14:10<07:38,  5.27s/it]

Batch 145: Reward=7.858, Quality=2.000, Emotion=3.000, Relevance=1.875, Sentiment=0.442, Empathy=0.854, Success=100.00%


Training batches:  65%|██████▍   | 151/233 [14:44<08:39,  6.33s/it]

Batch 150: Reward=6.172, Quality=1.000, Emotion=2.375, Relevance=2.000, Sentiment=0.394, Empathy=0.862, Success=93.75%


Training batches:  67%|██████▋   | 156/233 [15:11<07:32,  5.87s/it]

Batch 155: Reward=3.666, Quality=-0.250, Emotion=1.750, Relevance=1.250, Sentiment=0.191, Empathy=0.874, Success=87.50%


Training batches:  69%|██████▉   | 161/233 [15:41<06:42,  5.59s/it]

Batch 160: Reward=3.916, Quality=0.500, Emotion=2.375, Relevance=0.562, Sentiment=0.281, Empathy=0.844, Success=93.75%


Training batches:  71%|███████   | 166/233 [16:11<06:51,  6.15s/it]

Batch 165: Reward=3.231, Quality=-0.250, Emotion=2.188, Relevance=0.875, Sentiment=0.364, Empathy=0.854, Success=93.75%


Training batches:  73%|███████▎  | 171/233 [16:37<05:23,  5.22s/it]

Batch 170: Reward=4.968, Quality=1.250, Emotion=2.375, Relevance=0.188, Sentiment=0.261, Empathy=0.855, Success=93.75%


Training batches:  76%|███████▌  | 176/233 [17:05<05:18,  5.58s/it]

Batch 175: Reward=4.805, Quality=0.500, Emotion=2.375, Relevance=1.375, Sentiment=0.373, Empathy=0.868, Success=93.75%


Training batches:  78%|███████▊  | 181/233 [17:30<04:45,  5.50s/it]

Batch 180: Reward=4.647, Quality=0.500, Emotion=1.750, Relevance=1.250, Sentiment=0.469, Empathy=0.870, Success=87.50%


Training batches:  80%|███████▉  | 186/233 [17:54<03:51,  4.92s/it]

Batch 185: Reward=4.166, Quality=0.500, Emotion=2.375, Relevance=0.125, Sentiment=0.392, Empathy=0.859, Success=93.75%


Training batches:  82%|████████▏ | 191/233 [18:26<04:22,  6.26s/it]

Batch 190: Reward=4.672, Quality=-0.250, Emotion=3.000, Relevance=1.562, Sentiment=0.225, Empathy=0.858, Success=100.00%


Training batches:  84%|████████▍ | 196/233 [18:50<03:08,  5.09s/it]

Batch 195: Reward=5.491, Quality=1.250, Emotion=2.375, Relevance=0.875, Sentiment=0.253, Empathy=0.862, Success=93.75%


Training batches:  86%|████████▋ | 201/233 [19:14<02:37,  4.92s/it]

Batch 200: Reward=4.191, Quality=0.500, Emotion=2.438, Relevance=0.625, Sentiment=0.280, Empathy=0.861, Success=100.00%


Training batches:  88%|████████▊ | 206/233 [19:41<02:24,  5.37s/it]

Batch 205: Reward=4.458, Quality=-0.250, Emotion=2.375, Relevance=1.750, Sentiment=0.160, Empathy=0.876, Success=93.75%


Training batches:  91%|█████████ | 211/233 [20:10<02:10,  5.91s/it]

Batch 210: Reward=5.546, Quality=0.500, Emotion=2.375, Relevance=2.188, Sentiment=0.452, Empathy=0.874, Success=93.75%


Training batches:  93%|█████████▎| 216/233 [20:38<01:37,  5.74s/it]

Batch 215: Reward=7.511, Quality=1.250, Emotion=2.375, Relevance=2.625, Sentiment=0.270, Empathy=0.859, Success=93.75%


Training batches:  95%|█████████▍| 221/233 [21:06<01:09,  5.76s/it]

Batch 220: Reward=4.046, Quality=-1.000, Emotion=2.375, Relevance=1.625, Sentiment=0.192, Empathy=0.876, Success=93.75%


Training batches:  97%|█████████▋| 226/233 [21:31<00:36,  5.27s/it]

Batch 225: Reward=4.349, Quality=0.500, Emotion=3.000, Relevance=0.125, Sentiment=0.500, Empathy=0.861, Success=100.00%


Training batches:  99%|█████████▉| 231/233 [21:56<00:10,  5.02s/it]

Batch 230: Reward=6.419, Quality=1.250, Emotion=2.438, Relevance=1.438, Sentiment=0.268, Empathy=0.855, Success=100.00%


Training batches: 100%|██████████| 233/233 [22:05<00:00,  5.69s/it]

Error in batch 232: Batch size (16) does not match number of examples - but got 9 for: queries

Epoch 3 Results:
Average total reward: 4.6823
Average quality score: 0.3714
Average emotion score: 2.4319
Average relevance score: 1.1422
Average sentiment score: 0.3169
Average empathy score: 0.8539
Emotion generation success: 95.38%
Epoch time: 1325.6s



[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_3)... 

Saved checkpoint to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_3


Done. 1.6s


Running validation...




Validation Results:
Average reward: 5.2135
Average quality: 0.6697
Average emotion: 2.4854
Average relevance: 1.2966
Average sentiment: 0.2698
Average empathy: 0.8550
Emotion generation success: 97.08%
Validation time: 122.8s

Epoch 4/8


Training batches:   0%|          | 1/233 [00:07<28:35,  7.40s/it]

Batch 0: Reward=4.475, Quality=-0.250, Emotion=1.750, Relevance=2.062, Sentiment=0.233, Empathy=0.880, Success=87.50%


Training batches:   3%|▎         | 6/233 [00:39<23:31,  6.22s/it]

Batch 5: Reward=8.122, Quality=1.750, Emotion=3.000, Relevance=2.438, Sentiment=0.523, Empathy=0.854, Success=100.00%


Training batches:   5%|▍         | 11/233 [01:05<18:58,  5.13s/it]

Batch 10: Reward=5.656, Quality=1.250, Emotion=2.812, Relevance=1.188, Sentiment=0.310, Empathy=0.873, Success=100.00%


Training batches:   7%|▋         | 16/233 [01:32<18:32,  5.13s/it]

Batch 15: Reward=6.641, Quality=2.000, Emotion=2.375, Relevance=1.312, Sentiment=0.261, Empathy=0.850, Success=93.75%


Training batches:   9%|▉         | 21/233 [02:00<19:17,  5.46s/it]

Batch 20: Reward=4.329, Quality=-0.250, Emotion=3.000, Relevance=1.062, Sentiment=0.188, Empathy=0.857, Success=100.00%


Training batches:  11%|█         | 26/233 [02:26<17:43,  5.14s/it]

Batch 25: Reward=6.910, Quality=2.000, Emotion=3.000, Relevance=1.188, Sentiment=0.299, Empathy=0.847, Success=100.00%


Training batches:  13%|█▎        | 31/233 [02:54<17:25,  5.17s/it]

Batch 30: Reward=5.701, Quality=2.000, Emotion=2.375, Relevance=0.500, Sentiment=0.493, Empathy=0.854, Success=93.75%


Training batches:  15%|█▌        | 36/233 [03:25<18:22,  5.60s/it]

Batch 35: Reward=3.629, Quality=-0.250, Emotion=2.375, Relevance=0.875, Sentiment=0.439, Empathy=0.870, Success=93.75%


Training batches:  18%|█▊        | 41/233 [03:52<17:03,  5.33s/it]

Batch 40: Reward=3.968, Quality=-0.250, Emotion=2.375, Relevance=1.375, Sentiment=0.202, Empathy=0.866, Success=93.75%


Training batches:  20%|█▉        | 46/233 [04:32<24:28,  7.85s/it]

Batch 45: Reward=2.083, Quality=-1.750, Emotion=2.438, Relevance=0.750, Sentiment=0.190, Empathy=0.866, Success=100.00%


Training batches:  22%|██▏       | 51/233 [05:06<20:24,  6.73s/it]

Batch 50: Reward=4.557, Quality=-0.250, Emotion=1.562, Relevance=2.250, Sentiment=0.366, Empathy=0.861, Success=87.50%


Training batches:  24%|██▍       | 56/233 [05:32<15:26,  5.23s/it]

Batch 55: Reward=4.775, Quality=0.500, Emotion=2.375, Relevance=1.000, Sentiment=0.281, Empathy=0.852, Success=93.75%


Training batches:  26%|██▌       | 61/233 [06:04<18:10,  6.34s/it]

Batch 60: Reward=2.958, Quality=-0.250, Emotion=1.250, Relevance=1.125, Sentiment=0.365, Empathy=0.875, Success=93.75%


Training batches:  28%|██▊       | 66/233 [06:35<17:44,  6.37s/it]

Batch 65: Reward=3.686, Quality=-1.000, Emotion=1.750, Relevance=1.562, Sentiment=0.159, Empathy=0.881, Success=87.50%


Training batches:  30%|███       | 71/233 [07:00<13:59,  5.18s/it]

Batch 70: Reward=6.178, Quality=1.250, Emotion=3.000, Relevance=1.375, Sentiment=0.277, Empathy=0.839, Success=100.00%


Training batches:  33%|███▎      | 76/233 [07:28<13:56,  5.33s/it]

Batch 75: Reward=6.570, Quality=1.250, Emotion=3.000, Relevance=1.312, Sentiment=0.341, Empathy=0.871, Success=100.00%


Training batches:  35%|███▍      | 81/233 [07:58<14:56,  5.90s/it]

Batch 80: Reward=4.331, Quality=-0.250, Emotion=2.812, Relevance=1.250, Sentiment=0.213, Empathy=0.847, Success=100.00%


Training batches:  37%|███▋      | 86/233 [08:25<13:19,  5.44s/it]

Batch 85: Reward=8.615, Quality=2.000, Emotion=3.000, Relevance=2.812, Sentiment=0.356, Empathy=0.821, Success=100.00%


Training batches:  39%|███▉      | 91/233 [08:51<12:28,  5.27s/it]

Batch 90: Reward=4.200, Quality=-0.250, Emotion=2.625, Relevance=1.062, Sentiment=0.308, Empathy=0.854, Success=100.00%


Training batches:  41%|████      | 96/233 [09:17<11:59,  5.25s/it]

Batch 95: Reward=6.398, Quality=0.250, Emotion=3.000, Relevance=1.938, Sentiment=0.378, Empathy=0.860, Success=100.00%


Training batches:  43%|████▎     | 101/233 [09:43<11:14,  5.11s/it]

Batch 100: Reward=2.849, Quality=-1.000, Emotion=2.812, Relevance=0.438, Sentiment=0.368, Empathy=0.876, Success=100.00%


Training batches:  45%|████▌     | 106/233 [10:11<11:21,  5.36s/it]

Batch 105: Reward=6.936, Quality=1.250, Emotion=3.000, Relevance=2.062, Sentiment=0.473, Empathy=0.817, Success=100.00%


Training batches:  48%|████▊     | 111/233 [10:39<11:22,  5.59s/it]

Batch 110: Reward=6.093, Quality=1.250, Emotion=1.812, Relevance=2.250, Sentiment=0.220, Empathy=0.886, Success=93.75%


Training batches:  50%|████▉     | 116/233 [11:04<10:31,  5.40s/it]

Batch 115: Reward=5.072, Quality=-0.250, Emotion=2.375, Relevance=2.375, Sentiment=0.517, Empathy=0.865, Success=93.75%


Training batches:  52%|█████▏    | 121/233 [11:29<09:34,  5.13s/it]

Batch 120: Reward=4.895, Quality=0.500, Emotion=1.625, Relevance=2.312, Sentiment=0.508, Empathy=0.850, Success=100.00%


Training batches:  54%|█████▍    | 126/233 [11:57<09:40,  5.42s/it]

Batch 125: Reward=4.892, Quality=1.250, Emotion=2.812, Relevance=0.562, Sentiment=0.177, Empathy=0.846, Success=100.00%


Training batches:  56%|█████▌    | 131/233 [12:27<10:31,  6.19s/it]

Batch 130: Reward=4.111, Quality=-1.000, Emotion=3.000, Relevance=1.375, Sentiment=0.559, Empathy=0.856, Success=100.00%


Training batches:  58%|█████▊    | 136/233 [12:55<09:14,  5.72s/it]

Batch 135: Reward=7.099, Quality=1.250, Emotion=3.000, Relevance=1.812, Sentiment=0.395, Empathy=0.877, Success=100.00%


Training batches:  61%|██████    | 141/233 [13:27<10:06,  6.59s/it]

Batch 140: Reward=5.459, Quality=1.250, Emotion=2.375, Relevance=0.438, Sentiment=0.397, Empathy=0.854, Success=93.75%


Training batches:  63%|██████▎   | 146/233 [13:56<08:35,  5.92s/it]

Batch 145: Reward=6.134, Quality=1.250, Emotion=1.812, Relevance=2.375, Sentiment=0.472, Empathy=0.866, Success=93.75%


Training batches:  65%|██████▍   | 151/233 [14:26<08:17,  6.06s/it]

Batch 150: Reward=5.888, Quality=0.500, Emotion=3.000, Relevance=1.812, Sentiment=0.275, Empathy=0.867, Success=100.00%


Training batches:  67%|██████▋   | 156/233 [14:54<07:24,  5.78s/it]

Batch 155: Reward=2.269, Quality=-1.750, Emotion=2.375, Relevance=1.125, Sentiment=0.260, Empathy=0.876, Success=93.75%


Training batches:  69%|██████▉   | 161/233 [15:25<06:59,  5.83s/it]

Batch 160: Reward=3.763, Quality=-1.750, Emotion=3.000, Relevance=1.812, Sentiment=0.329, Empathy=0.870, Success=100.00%


Training batches:  71%|███████   | 166/233 [15:54<06:49,  6.12s/it]

Batch 165: Reward=2.353, Quality=0.500, Emotion=0.938, Relevance=0.500, Sentiment=0.381, Empathy=0.857, Success=81.25%


Training batches:  73%|███████▎  | 171/233 [16:21<05:42,  5.53s/it]

Batch 170: Reward=4.077, Quality=0.500, Emotion=1.750, Relevance=1.062, Sentiment=0.272, Empathy=0.866, Success=87.50%


Training batches:  76%|███████▌  | 176/233 [16:49<05:40,  5.98s/it]

Batch 175: Reward=1.008, Quality=-3.250, Emotion=2.375, Relevance=1.188, Sentiment=0.387, Empathy=0.818, Success=93.75%


Training batches:  78%|███████▊  | 181/233 [17:18<05:07,  5.91s/it]

Batch 180: Reward=3.880, Quality=0.500, Emotion=1.562, Relevance=1.000, Sentiment=0.445, Empathy=0.887, Success=87.50%


Training batches:  80%|███████▉  | 186/233 [17:45<04:17,  5.49s/it]

Batch 185: Reward=5.546, Quality=2.000, Emotion=2.375, Relevance=0.125, Sentiment=0.343, Empathy=0.871, Success=93.75%


Training batches:  82%|████████▏ | 191/233 [18:12<03:54,  5.57s/it]

Batch 190: Reward=3.860, Quality=1.250, Emotion=1.812, Relevance=0.188, Sentiment=0.138, Empathy=0.861, Success=93.75%


Training batches:  84%|████████▍ | 196/233 [18:38<03:13,  5.24s/it]

Batch 195: Reward=5.148, Quality=1.250, Emotion=2.375, Relevance=0.875, Sentiment=0.304, Empathy=0.868, Success=93.75%


Training batches:  86%|████████▋ | 201/233 [19:10<03:19,  6.23s/it]

Batch 200: Reward=4.779, Quality=0.500, Emotion=1.188, Relevance=2.250, Sentiment=0.170, Empathy=0.843, Success=87.50%


Training batches:  88%|████████▊ | 206/233 [19:38<02:27,  5.46s/it]

Batch 205: Reward=6.866, Quality=2.000, Emotion=3.000, Relevance=1.188, Sentiment=0.225, Empathy=0.843, Success=100.00%


Training batches:  91%|█████████ | 211/233 [20:05<02:00,  5.50s/it]

Batch 210: Reward=6.449, Quality=0.500, Emotion=3.000, Relevance=2.125, Sentiment=0.297, Empathy=0.878, Success=100.00%


Training batches:  93%|█████████▎| 216/233 [20:36<01:44,  6.17s/it]

Batch 215: Reward=4.723, Quality=-1.000, Emotion=3.000, Relevance=2.125, Sentiment=0.249, Empathy=0.864, Success=100.00%


Training batches:  95%|█████████▍| 221/233 [21:02<01:06,  5.55s/it]

Batch 220: Reward=2.494, Quality=-0.250, Emotion=2.375, Relevance=-0.250, Sentiment=0.337, Empathy=0.878, Success=93.75%


Training batches:  97%|█████████▋| 226/233 [21:31<00:40,  5.78s/it]

Batch 225: Reward=4.380, Quality=-0.250, Emotion=1.875, Relevance=2.000, Sentiment=0.384, Empathy=0.856, Success=100.00%


Training batches:  99%|█████████▉| 231/233 [21:58<00:10,  5.36s/it]

Batch 230: Reward=7.803, Quality=1.250, Emotion=3.000, Relevance=3.250, Sentiment=0.187, Empathy=0.860, Success=100.00%


Training batches: 100%|██████████| 233/233 [22:07<00:00,  5.70s/it]

Error in batch 232: Batch size (16) does not match number of examples - but got 9 for: queries

Epoch 4 Results:
Average total reward: 5.2109
Average quality score: 0.4832
Average emotion score: 2.4840
Average relevance score: 1.5329
Average sentiment score: 0.3186
Average empathy score: 0.8612
Emotion generation success: 96.35%
Epoch time: 1328.0s



[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4)... 

Saved checkpoint to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4


Done. 1.6s


Running validation...




Validation Results:
Average reward: 5.4200
Average quality: 0.7865
Average emotion: 2.3213
Average relevance: 1.7393
Average sentiment: 0.2774
Average empathy: 0.8572
Emotion generation success: 96.18%
Validation time: 141.1s

Epoch 5/8


Training batches:   0%|          | 1/233 [00:05<20:16,  5.25s/it]

Batch 0: Reward=6.198, Quality=1.250, Emotion=2.375, Relevance=1.688, Sentiment=0.284, Empathy=0.863, Success=93.75%


Training batches:   3%|▎         | 6/233 [00:37<24:06,  6.37s/it]

Batch 5: Reward=4.668, Quality=0.250, Emotion=2.438, Relevance=1.125, Sentiment=0.216, Empathy=0.866, Success=100.00%


Training batches:   5%|▍         | 11/233 [01:03<20:22,  5.51s/it]

Batch 10: Reward=3.141, Quality=-1.750, Emotion=2.188, Relevance=1.000, Sentiment=0.367, Empathy=0.795, Success=93.75%


Training batches:   7%|▋         | 16/233 [01:33<22:07,  6.12s/it]

Batch 15: Reward=6.858, Quality=1.250, Emotion=2.375, Relevance=2.312, Sentiment=0.560, Empathy=0.848, Success=93.75%


Training batches:   9%|▉         | 21/233 [02:03<20:49,  5.89s/it]

Batch 20: Reward=5.111, Quality=0.500, Emotion=2.375, Relevance=1.938, Sentiment=0.211, Empathy=0.848, Success=93.75%


Training batches:  11%|█         | 26/233 [02:29<17:56,  5.20s/it]

Batch 25: Reward=8.455, Quality=1.250, Emotion=3.000, Relevance=3.438, Sentiment=0.365, Empathy=0.860, Success=100.00%


Training batches:  13%|█▎        | 31/233 [03:04<21:53,  6.50s/it]

Batch 30: Reward=5.115, Quality=0.500, Emotion=3.000, Relevance=0.812, Sentiment=0.246, Empathy=0.870, Success=100.00%


Training batches:  15%|█▌        | 36/233 [03:42<22:23,  6.82s/it]

Batch 35: Reward=4.070, Quality=-0.250, Emotion=2.375, Relevance=1.375, Sentiment=0.339, Empathy=0.866, Success=93.75%


Training batches:  18%|█▊        | 41/233 [04:10<18:50,  5.89s/it]

Batch 40: Reward=5.082, Quality=0.500, Emotion=2.438, Relevance=1.750, Sentiment=0.095, Empathy=0.879, Success=100.00%


Training batches:  20%|█▉        | 46/233 [04:46<21:49,  7.00s/it]

Batch 45: Reward=3.646, Quality=-0.750, Emotion=1.812, Relevance=1.125, Sentiment=0.291, Empathy=0.860, Success=93.75%


Training batches:  22%|██▏       | 51/233 [05:22<21:59,  7.25s/it]

Batch 50: Reward=3.430, Quality=-0.250, Emotion=1.562, Relevance=0.750, Sentiment=0.244, Empathy=0.850, Success=87.50%


Training batches:  24%|██▍       | 56/233 [05:54<19:06,  6.48s/it]

Batch 55: Reward=3.646, Quality=-1.250, Emotion=3.000, Relevance=1.250, Sentiment=0.285, Empathy=0.822, Success=100.00%


Training batches:  26%|██▌       | 61/233 [06:28<19:46,  6.90s/it]

Batch 60: Reward=5.528, Quality=-0.250, Emotion=2.438, Relevance=2.875, Sentiment=0.254, Empathy=0.859, Success=100.00%


Training batches:  28%|██▊       | 66/233 [07:06<21:10,  7.61s/it]

Batch 65: Reward=2.124, Quality=-1.000, Emotion=0.562, Relevance=1.688, Sentiment=0.265, Empathy=0.876, Success=81.25%


Training batches:  30%|███       | 71/233 [07:37<17:32,  6.50s/it]

Batch 70: Reward=5.973, Quality=0.500, Emotion=2.438, Relevance=2.500, Sentiment=0.268, Empathy=0.874, Success=100.00%


Training batches:  33%|███▎      | 76/233 [08:06<14:45,  5.64s/it]

Batch 75: Reward=7.522, Quality=2.000, Emotion=3.000, Relevance=2.062, Sentiment=0.471, Empathy=0.847, Success=100.00%


Training batches:  35%|███▍      | 81/233 [08:45<17:15,  6.81s/it]

Batch 80: Reward=5.388, Quality=1.250, Emotion=2.812, Relevance=0.688, Sentiment=0.149, Empathy=0.848, Success=100.00%


Training batches:  37%|███▋      | 86/233 [09:22<18:28,  7.54s/it]

Batch 85: Reward=3.822, Quality=-1.250, Emotion=1.812, Relevance=2.562, Sentiment=0.385, Empathy=0.852, Success=93.75%


Training batches:  39%|███▉      | 91/233 [09:52<14:00,  5.92s/it]

Batch 90: Reward=4.804, Quality=1.250, Emotion=2.000, Relevance=0.312, Sentiment=0.448, Empathy=0.805, Success=93.75%


Training batches:  41%|████      | 96/233 [10:20<12:54,  5.65s/it]

Batch 95: Reward=4.234, Quality=-0.250, Emotion=1.750, Relevance=2.375, Sentiment=0.005, Empathy=0.871, Success=87.50%


Training batches:  43%|████▎     | 101/233 [10:49<13:25,  6.10s/it]

Batch 100: Reward=3.540, Quality=0.500, Emotion=2.188, Relevance=-0.375, Sentiment=0.453, Empathy=0.882, Success=93.75%


Training batches:  45%|████▌     | 106/233 [11:21<14:28,  6.84s/it]

Batch 105: Reward=4.270, Quality=-0.250, Emotion=2.438, Relevance=1.312, Sentiment=0.190, Empathy=0.847, Success=100.00%


Training batches:  48%|████▊     | 111/233 [12:02<16:54,  8.31s/it]

Batch 110: Reward=2.598, Quality=-2.500, Emotion=1.812, Relevance=2.625, Sentiment=0.105, Empathy=0.877, Success=93.75%


Training batches:  50%|████▉     | 116/233 [12:42<14:23,  7.38s/it]

Batch 115: Reward=6.477, Quality=1.250, Emotion=3.000, Relevance=1.500, Sentiment=0.440, Empathy=0.903, Success=100.00%


Training batches:  52%|█████▏    | 121/233 [13:17<13:17,  7.12s/it]

Batch 120: Reward=7.409, Quality=1.250, Emotion=1.875, Relevance=3.562, Sentiment=0.255, Empathy=0.848, Success=100.00%


Training batches:  54%|█████▍    | 126/233 [14:01<14:27,  8.10s/it]

Batch 125: Reward=3.078, Quality=0.000, Emotion=0.125, Relevance=2.250, Sentiment=0.131, Empathy=0.874, Success=93.75%


Training batches:  56%|█████▌    | 131/233 [14:41<14:01,  8.25s/it]

Batch 130: Reward=7.589, Quality=2.000, Emotion=3.000, Relevance=1.875, Sentiment=0.285, Empathy=0.862, Success=100.00%


Training batches:  58%|█████▊    | 136/233 [15:11<10:24,  6.44s/it]

Batch 135: Reward=5.490, Quality=2.000, Emotion=2.438, Relevance=0.625, Sentiment=0.378, Empathy=0.851, Success=100.00%


Training batches:  61%|██████    | 141/233 [15:52<11:25,  7.45s/it]

Batch 140: Reward=2.524, Quality=-0.250, Emotion=0.562, Relevance=1.000, Sentiment=0.193, Empathy=0.805, Success=81.25%


Training batches:  63%|██████▎   | 146/233 [16:47<15:41, 10.83s/it]

Batch 145: Reward=3.892, Quality=-0.250, Emotion=1.250, Relevance=2.062, Sentiment=0.255, Empathy=0.884, Success=93.75%


Training batches:  65%|██████▍   | 151/233 [17:43<14:24, 10.54s/it]

Batch 150: Reward=5.628, Quality=1.250, Emotion=1.312, Relevance=2.312, Sentiment=0.319, Empathy=0.869, Success=100.00%


Training batches:  67%|██████▋   | 156/233 [18:24<11:28,  8.94s/it]

Batch 155: Reward=2.991, Quality=-1.750, Emotion=1.812, Relevance=1.438, Sentiment=0.141, Empathy=0.868, Success=93.75%


Training batches:  69%|██████▉   | 161/233 [19:11<10:25,  8.68s/it]

Batch 160: Reward=3.616, Quality=-1.000, Emotion=3.000, Relevance=1.000, Sentiment=0.164, Empathy=0.892, Success=100.00%


Training batches:  71%|███████   | 166/233 [19:59<10:27,  9.37s/it]

Batch 165: Reward=4.298, Quality=0.500, Emotion=0.375, Relevance=1.812, Sentiment=0.510, Empathy=0.877, Success=81.25%


Training batches:  73%|███████▎  | 171/233 [20:39<08:55,  8.64s/it]

Batch 170: Reward=1.796, Quality=-0.250, Emotion=0.000, Relevance=0.875, Sentiment=0.271, Empathy=0.870, Success=81.25%


Training batches:  76%|███████▌  | 176/233 [21:13<06:52,  7.23s/it]

Batch 175: Reward=4.501, Quality=-0.250, Emotion=1.812, Relevance=1.562, Sentiment=0.482, Empathy=0.879, Success=93.75%


Training batches:  78%|███████▊  | 181/233 [22:04<09:27, 10.91s/it]

Batch 180: Reward=1.660, Quality=-1.000, Emotion=1.312, Relevance=-0.438, Sentiment=0.348, Empathy=0.861, Success=100.00%


Training batches:  80%|███████▉  | 186/233 [22:51<07:28,  9.55s/it]

Batch 185: Reward=3.652, Quality=-0.250, Emotion=1.812, Relevance=1.062, Sentiment=0.317, Empathy=0.894, Success=93.75%


Training batches:  82%|████████▏ | 191/233 [23:30<05:04,  7.26s/it]

Batch 190: Reward=6.484, Quality=1.250, Emotion=2.375, Relevance=2.188, Sentiment=0.173, Empathy=0.853, Success=93.75%


Training batches:  84%|████████▍ | 196/233 [24:17<05:54,  9.57s/it]

Batch 195: Reward=0.609, Quality=-1.000, Emotion=-0.562, Relevance=0.750, Sentiment=0.257, Empathy=0.871, Success=81.25%


Training batches:  86%|████████▋ | 201/233 [25:09<05:22, 10.08s/it]

Batch 200: Reward=2.817, Quality=-0.250, Emotion=0.062, Relevance=0.750, Sentiment=0.271, Empathy=0.880, Success=87.50%


Training batches:  88%|████████▊ | 206/233 [26:00<04:47, 10.64s/it]

Batch 205: Reward=2.208, Quality=-1.000, Emotion=0.688, Relevance=1.000, Sentiment=0.135, Empathy=0.885, Success=93.75%


Training batches:  91%|█████████ | 211/233 [26:47<03:26,  9.37s/it]

Batch 210: Reward=4.033, Quality=0.500, Emotion=2.375, Relevance=0.938, Sentiment=0.086, Empathy=0.882, Success=93.75%


Training batches:  93%|█████████▎| 216/233 [27:29<02:31,  8.92s/it]

Batch 215: Reward=3.088, Quality=0.500, Emotion=0.750, Relevance=1.250, Sentiment=0.606, Empathy=0.862, Success=100.00%


Training batches:  95%|█████████▍| 221/233 [28:15<01:45,  8.82s/it]

Batch 220: Reward=2.447, Quality=1.250, Emotion=1.125, Relevance=-0.938, Sentiment=0.116, Empathy=0.867, Success=81.25%


Training batches:  97%|█████████▋| 226/233 [29:03<01:01,  8.78s/it]

Batch 225: Reward=4.921, Quality=1.250, Emotion=2.438, Relevance=0.438, Sentiment=0.526, Empathy=0.881, Success=100.00%


Training batches:  99%|█████████▉| 231/233 [29:48<00:19,  9.82s/it]

Batch 230: Reward=2.695, Quality=-1.000, Emotion=-0.562, Relevance=2.375, Sentiment=0.219, Empathy=0.867, Success=81.25%


Training batches: 100%|██████████| 233/233 [30:02<00:00,  7.73s/it]

Error in batch 232: Batch size (16) does not match number of examples - but got 9 for: queries

Epoch 5 Results:
Average total reward: 4.2183
Average quality score: 0.1328
Average emotion score: 1.7804
Average relevance score: 1.4004
Average sentiment score: 0.3084
Average empathy score: 0.8636
Emotion generation success: 94.46%
Epoch time: 1802.0s



[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_5)... 

Saved checkpoint to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_5


Done. 2.0s


Running validation...




Validation Results:
Average reward: 3.6016
Average quality: -0.1124
Average emotion: 1.1348
Average relevance: 1.5348
Average sentiment: 0.2811
Average empathy: 0.8650
Emotion generation success: 94.16%
Validation time: 239.1s

Epoch 6/8


Training batches:   0%|          | 1/233 [00:13<50:28, 13.06s/it]

Batch 0: Reward=2.353, Quality=-1.750, Emotion=0.062, Relevance=2.000, Sentiment=0.186, Empathy=0.861, Success=87.50%


Training batches:   3%|▎         | 6/233 [01:18<46:30, 12.29s/it]

Batch 5: Reward=4.165, Quality=0.500, Emotion=1.312, Relevance=1.062, Sentiment=0.323, Empathy=0.853, Success=100.00%


Training batches:   5%|▍         | 11/233 [02:05<35:26,  9.58s/it]

Batch 10: Reward=3.933, Quality=1.250, Emotion=1.688, Relevance=-0.125, Sentiment=0.436, Empathy=0.867, Success=100.00%


Training batches:   7%|▋         | 16/233 [02:53<34:29,  9.53s/it]

Batch 15: Reward=1.718, Quality=0.500, Emotion=0.000, Relevance=-0.125, Sentiment=0.307, Empathy=0.880, Success=81.25%


Training batches:   9%|▉         | 21/233 [03:46<39:44, 11.25s/it]

Batch 20: Reward=0.702, Quality=-1.750, Emotion=0.062, Relevance=1.438, Sentiment=0.078, Empathy=0.877, Success=87.50%


Training batches:  11%|█         | 26/233 [04:49<42:44, 12.39s/it]

Batch 25: Reward=0.275, Quality=-1.750, Emotion=-0.500, Relevance=2.312, Sentiment=0.230, Empathy=0.851, Success=87.50%


Training batches:  13%|█▎        | 31/233 [05:47<39:40, 11.78s/it]

Batch 30: Reward=2.304, Quality=0.000, Emotion=0.750, Relevance=0.500, Sentiment=0.519, Empathy=0.849, Success=100.00%


Training batches:  15%|█▌        | 36/233 [06:54<41:06, 12.52s/it]

Batch 35: Reward=1.287, Quality=-0.250, Emotion=-1.125, Relevance=2.000, Sentiment=0.114, Empathy=0.874, Success=81.25%


Training batches:  18%|█▊        | 41/233 [07:54<39:51, 12.46s/it]

Batch 40: Reward=3.492, Quality=-0.250, Emotion=0.750, Relevance=1.625, Sentiment=0.337, Empathy=0.873, Success=100.00%


Training batches:  20%|█▉        | 46/233 [09:03<43:59, 14.11s/it]

Batch 45: Reward=-0.306, Quality=-0.250, Emotion=-1.062, Relevance=-0.688, Sentiment=0.196, Empathy=0.887, Success=87.50%


Training batches:  22%|██▏       | 51/233 [10:03<34:54, 11.51s/it]

Batch 50: Reward=3.662, Quality=-0.500, Emotion=1.000, Relevance=1.625, Sentiment=0.244, Empathy=0.869, Success=87.50%


Training batches:  24%|██▍       | 56/233 [11:05<32:31, 11.03s/it]

Batch 55: Reward=4.255, Quality=1.250, Emotion=1.125, Relevance=1.188, Sentiment=0.591, Empathy=0.862, Success=81.25%


Training batches:  26%|██▌       | 61/233 [11:56<27:56,  9.75s/it]

Batch 60: Reward=3.492, Quality=0.500, Emotion=1.250, Relevance=1.062, Sentiment=0.429, Empathy=0.838, Success=93.75%


Training batches:  28%|██▊       | 66/233 [13:03<37:32, 13.49s/it]

Batch 65: Reward=0.214, Quality=-1.750, Emotion=-1.188, Relevance=1.062, Sentiment=0.096, Empathy=0.891, Success=75.00%


Training batches:  30%|███       | 71/233 [13:58<27:21, 10.13s/it]

Batch 70: Reward=3.241, Quality=-0.250, Emotion=1.188, Relevance=1.562, Sentiment=0.367, Empathy=0.868, Success=87.50%


Training batches:  33%|███▎      | 76/233 [14:46<21:53,  8.36s/it]

Batch 75: Reward=4.688, Quality=1.250, Emotion=2.438, Relevance=0.250, Sentiment=0.319, Empathy=0.849, Success=100.00%


Training batches:  35%|███▍      | 81/233 [15:54<28:53, 11.41s/it]

Batch 80: Reward=4.398, Quality=0.500, Emotion=1.688, Relevance=1.812, Sentiment=0.378, Empathy=0.884, Success=100.00%


Training batches:  37%|███▋      | 86/233 [16:41<23:32,  9.61s/it]

Batch 85: Reward=2.845, Quality=-0.250, Emotion=1.188, Relevance=0.250, Sentiment=0.268, Empathy=0.884, Success=87.50%


Training batches:  39%|███▉      | 91/233 [17:22<19:27,  8.22s/it]

Batch 90: Reward=4.369, Quality=1.250, Emotion=2.000, Relevance=-0.250, Sentiment=0.227, Empathy=0.875, Success=93.75%


Training batches:  41%|████      | 96/233 [18:04<17:21,  7.60s/it]

Batch 95: Reward=4.912, Quality=0.500, Emotion=2.375, Relevance=1.438, Sentiment=0.350, Empathy=0.860, Success=93.75%


Training batches:  43%|████▎     | 101/233 [18:50<19:16,  8.76s/it]

Batch 100: Reward=2.873, Quality=-0.250, Emotion=1.250, Relevance=1.000, Sentiment=0.368, Empathy=0.875, Success=93.75%


Training batches:  45%|████▌     | 106/233 [19:47<23:06, 10.92s/it]

Batch 105: Reward=3.217, Quality=-0.250, Emotion=0.000, Relevance=1.688, Sentiment=0.268, Empathy=0.863, Success=81.25%


Training batches:  48%|████▊     | 111/233 [20:43<22:21, 11.00s/it]

Batch 110: Reward=4.230, Quality=0.500, Emotion=1.125, Relevance=1.562, Sentiment=0.245, Empathy=0.859, Success=81.25%


Training batches:  50%|████▉     | 116/233 [21:42<21:28, 11.01s/it]

Batch 115: Reward=4.190, Quality=1.250, Emotion=1.188, Relevance=0.250, Sentiment=0.230, Empathy=0.838, Success=87.50%


Training batches:  52%|█████▏    | 121/233 [22:34<17:16,  9.26s/it]

Batch 120: Reward=5.942, Quality=2.000, Emotion=1.312, Relevance=2.562, Sentiment=0.192, Empathy=0.862, Success=100.00%


Training batches:  54%|█████▍    | 126/233 [23:37<21:20, 11.97s/it]

Batch 125: Reward=-2.073, Quality=-3.250, Emotion=-0.875, Relevance=-0.062, Sentiment=0.135, Empathy=0.876, Success=68.75%


Training batches:  56%|█████▌    | 131/233 [24:33<18:56, 11.14s/it]

Batch 130: Reward=5.917, Quality=0.500, Emotion=1.750, Relevance=2.438, Sentiment=0.367, Empathy=0.886, Success=87.50%


Training batches:  58%|█████▊    | 136/233 [25:09<12:48,  7.92s/it]

Batch 135: Reward=4.906, Quality=0.500, Emotion=1.875, Relevance=1.688, Sentiment=0.381, Empathy=0.860, Success=100.00%


Training batches:  61%|██████    | 141/233 [26:03<14:04,  9.18s/it]

Batch 140: Reward=5.769, Quality=1.250, Emotion=2.438, Relevance=1.375, Sentiment=0.251, Empathy=0.854, Success=100.00%


Training batches:  63%|██████▎   | 146/233 [27:05<19:18, 13.32s/it]

Batch 145: Reward=1.599, Quality=-1.750, Emotion=-1.188, Relevance=2.375, Sentiment=0.116, Empathy=0.885, Success=75.00%


Training batches:  65%|██████▍   | 151/233 [28:06<16:26, 12.03s/it]

Batch 150: Reward=3.668, Quality=0.500, Emotion=0.688, Relevance=1.562, Sentiment=0.186, Empathy=0.872, Success=93.75%


Training batches:  67%|██████▋   | 156/233 [28:53<13:03, 10.17s/it]

Batch 155: Reward=3.541, Quality=-1.000, Emotion=1.750, Relevance=2.000, Sentiment=0.119, Empathy=0.857, Success=87.50%


Training batches:  69%|██████▉   | 161/233 [29:44<12:07, 10.10s/it]

Batch 160: Reward=2.254, Quality=-0.750, Emotion=0.688, Relevance=0.625, Sentiment=0.196, Empathy=0.865, Success=93.75%


Training batches:  71%|███████   | 166/233 [30:36<12:20, 11.06s/it]

Batch 165: Reward=3.348, Quality=-0.250, Emotion=0.625, Relevance=2.188, Sentiment=0.267, Empathy=0.862, Success=87.50%


Training batches:  73%|███████▎  | 171/233 [31:13<07:51,  7.60s/it]

Batch 170: Reward=5.266, Quality=1.250, Emotion=3.000, Relevance=0.250, Sentiment=0.203, Empathy=0.868, Success=100.00%


Training batches:  76%|███████▌  | 176/233 [32:03<08:36,  9.06s/it]

Batch 175: Reward=3.167, Quality=-0.500, Emotion=1.750, Relevance=0.500, Sentiment=0.374, Empathy=0.871, Success=87.50%


Training batches:  78%|███████▊  | 181/233 [32:53<09:26, 10.90s/it]

Batch 180: Reward=2.905, Quality=-1.000, Emotion=-0.062, Relevance=0.688, Sentiment=0.476, Empathy=0.875, Success=75.00%


Training batches:  80%|███████▉  | 186/233 [33:58<09:25, 12.02s/it]

Batch 185: Reward=4.831, Quality=0.500, Emotion=1.812, Relevance=0.875, Sentiment=0.344, Empathy=0.864, Success=93.75%


Training batches:  82%|████████▏ | 191/233 [34:46<06:47,  9.70s/it]

Batch 190: Reward=4.929, Quality=0.500, Emotion=1.812, Relevance=0.312, Sentiment=0.310, Empathy=0.882, Success=93.75%


Training batches:  84%|████████▍ | 196/233 [35:41<06:16, 10.19s/it]

Batch 195: Reward=4.394, Quality=1.250, Emotion=1.812, Relevance=-0.438, Sentiment=0.239, Empathy=0.862, Success=93.75%


Training batches:  86%|████████▋ | 201/233 [36:43<06:29, 12.16s/it]

Batch 200: Reward=-0.006, Quality=-3.750, Emotion=-1.188, Relevance=3.750, Sentiment=0.008, Empathy=0.886, Success=75.00%


Training batches:  88%|████████▊ | 206/233 [37:43<05:28, 12.17s/it]

Batch 205: Reward=1.270, Quality=0.250, Emotion=-0.500, Relevance=0.000, Sentiment=0.368, Empathy=0.880, Success=87.50%


Training batches:  91%|█████████ | 211/233 [38:29<03:32,  9.67s/it]

Batch 210: Reward=4.274, Quality=-0.250, Emotion=1.250, Relevance=1.625, Sentiment=0.153, Empathy=0.825, Success=93.75%


Training batches:  93%|█████████▎| 216/233 [39:16<02:46,  9.81s/it]

Batch 215: Reward=1.727, Quality=-1.000, Emotion=0.562, Relevance=0.250, Sentiment=0.200, Empathy=0.887, Success=81.25%


Training batches:  95%|█████████▍| 221/233 [40:05<01:58,  9.86s/it]

Batch 220: Reward=4.706, Quality=1.250, Emotion=2.375, Relevance=-0.188, Sentiment=0.205, Empathy=0.869, Success=93.75%


Training batches:  97%|█████████▋| 226/233 [40:50<01:04,  9.22s/it]

Batch 225: Reward=6.203, Quality=1.000, Emotion=1.750, Relevance=2.312, Sentiment=0.247, Empathy=0.898, Success=87.50%


Training batches:  99%|█████████▉| 231/233 [41:37<00:21, 10.52s/it]

Batch 230: Reward=1.641, Quality=-1.000, Emotion=-0.562, Relevance=0.938, Sentiment=0.250, Empathy=0.901, Success=81.25%


Training batches: 100%|██████████| 233/233 [41:46<00:00, 10.76s/it]

Error in batch 232: Batch size (16) does not match number of examples - but got 9 for: queries

Epoch 6 Results:
Average total reward: 2.9685
Average quality score: -0.1360
Average emotion score: 0.6952
Average relevance score: 1.0193
Average sentiment score: 0.2865
Average empathy score: 0.8702
Emotion generation success: 87.53%
Epoch time: 2506.7s



[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_6)... 

Saved checkpoint to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_6


Done. 1.6s


Running validation...




Validation Results:
Average reward: 3.3543
Average quality: 0.0494
Average emotion: 0.9865
Average relevance: 0.9393
Average sentiment: 0.3031
Average empathy: 0.8729
Emotion generation success: 86.07%
Validation time: 238.5s

Epoch 7/8


Training batches:   0%|          | 1/233 [00:14<54:15, 14.03s/it]

Batch 0: Reward=1.263, Quality=-1.750, Emotion=-0.688, Relevance=0.812, Sentiment=0.286, Empathy=0.832, Success=68.75%


Training batches:   3%|▎         | 6/233 [01:19<49:18, 13.03s/it]

Batch 5: Reward=2.721, Quality=-1.000, Emotion=0.562, Relevance=0.062, Sentiment=0.205, Empathy=0.866, Success=81.25%


Training batches:   5%|▍         | 11/233 [02:08<36:11,  9.78s/it]

Batch 10: Reward=5.344, Quality=1.250, Emotion=2.812, Relevance=0.625, Sentiment=0.449, Empathy=0.872, Success=100.00%


Training batches:   7%|▋         | 16/233 [02:56<36:01,  9.96s/it]

Batch 15: Reward=2.770, Quality=-1.000, Emotion=0.500, Relevance=1.250, Sentiment=0.063, Empathy=0.887, Success=75.00%


Training batches:   9%|▉         | 21/233 [03:52<38:29, 10.90s/it]

Batch 20: Reward=3.610, Quality=0.500, Emotion=0.562, Relevance=1.688, Sentiment=0.287, Empathy=0.868, Success=81.25%


Training batches:  11%|█         | 26/233 [04:48<37:57, 11.00s/it]

Batch 25: Reward=4.145, Quality=0.500, Emotion=0.688, Relevance=2.312, Sentiment=0.361, Empathy=0.869, Success=93.75%


Training batches:  13%|█▎        | 31/233 [05:47<37:06, 11.02s/it]

Batch 30: Reward=3.486, Quality=1.000, Emotion=0.562, Relevance=1.125, Sentiment=0.320, Empathy=0.848, Success=81.25%


Training batches:  15%|█▌        | 36/233 [06:51<39:50, 12.14s/it]

Batch 35: Reward=2.888, Quality=-1.000, Emotion=-0.062, Relevance=1.375, Sentiment=0.245, Empathy=0.887, Success=75.00%


Training batches:  18%|█▊        | 41/233 [07:48<38:34, 12.05s/it]

Batch 40: Reward=3.038, Quality=-0.750, Emotion=0.000, Relevance=2.500, Sentiment=0.398, Empathy=0.866, Success=81.25%


Training batches:  20%|█▉        | 46/233 [08:57<45:16, 14.53s/it]

Batch 45: Reward=-2.616, Quality=-3.250, Emotion=-1.938, Relevance=-1.062, Sentiment=0.213, Empathy=0.886, Success=56.25%


Training batches:  22%|██▏       | 51/233 [10:05<41:15, 13.60s/it]

Batch 50: Reward=1.960, Quality=-1.750, Emotion=-0.250, Relevance=1.188, Sentiment=0.290, Empathy=0.906, Success=75.00%


Training batches:  24%|██▍       | 56/233 [11:02<33:10, 11.25s/it]

Batch 55: Reward=2.482, Quality=0.500, Emotion=1.125, Relevance=-0.500, Sentiment=0.266, Empathy=0.876, Success=81.25%


Training batches:  26%|██▌       | 61/233 [12:04<33:02, 11.53s/it]

Batch 60: Reward=1.640, Quality=-1.000, Emotion=1.188, Relevance=0.125, Sentiment=0.415, Empathy=0.871, Success=87.50%


Training batches:  28%|██▊       | 66/233 [13:07<36:11, 13.00s/it]

Batch 65: Reward=-1.132, Quality=-3.750, Emotion=-1.312, Relevance=-0.375, Sentiment=0.131, Empathy=0.916, Success=62.50%


Training batches:  30%|███       | 71/233 [13:54<27:35, 10.22s/it]

Batch 70: Reward=3.557, Quality=-1.000, Emotion=0.500, Relevance=1.438, Sentiment=0.322, Empathy=0.890, Success=75.00%


Training batches:  33%|███▎      | 76/233 [14:44<23:08,  8.85s/it]

Batch 75: Reward=2.125, Quality=1.250, Emotion=1.125, Relevance=-0.688, Sentiment=0.296, Empathy=0.856, Success=81.25%


Training batches:  35%|███▍      | 81/233 [15:50<27:49, 10.98s/it]

Batch 80: Reward=6.586, Quality=2.000, Emotion=2.188, Relevance=1.562, Sentiment=0.499, Empathy=0.811, Success=93.75%


Training batches:  37%|███▋      | 86/233 [16:43<25:04, 10.23s/it]

Batch 85: Reward=1.016, Quality=-0.250, Emotion=-0.062, Relevance=0.188, Sentiment=0.553, Empathy=0.892, Success=75.00%


Training batches:  39%|███▉      | 91/233 [17:41<24:10, 10.22s/it]

Batch 90: Reward=4.340, Quality=0.500, Emotion=1.375, Relevance=1.688, Sentiment=0.259, Empathy=0.884, Success=87.50%


Training batches:  41%|████      | 96/233 [18:27<21:21,  9.36s/it]

Batch 95: Reward=3.982, Quality=-0.500, Emotion=1.125, Relevance=1.562, Sentiment=0.191, Empathy=0.886, Success=81.25%


Training batches:  43%|████▎     | 101/233 [19:22<22:59, 10.45s/it]

Batch 100: Reward=2.601, Quality=-0.250, Emotion=0.375, Relevance=-0.500, Sentiment=0.515, Empathy=0.815, Success=81.25%


Training batches:  45%|████▌     | 106/233 [20:21<24:42, 11.67s/it]

Batch 105: Reward=2.992, Quality=-0.250, Emotion=0.562, Relevance=0.000, Sentiment=0.206, Empathy=0.880, Success=81.25%


Training batches:  48%|████▊     | 111/233 [21:26<25:39, 12.62s/it]

Batch 110: Reward=-0.131, Quality=-2.500, Emotion=-1.375, Relevance=-0.125, Sentiment=0.184, Empathy=0.898, Success=56.25%


Training batches:  50%|████▉     | 116/233 [22:41<31:27, 16.13s/it]

Batch 115: Reward=-2.045, Quality=-4.000, Emotion=-1.938, Relevance=-0.250, Sentiment=0.294, Empathy=0.918, Success=56.25%


Training batches:  52%|█████▏    | 121/233 [23:39<21:59, 11.78s/it]

Batch 120: Reward=3.380, Quality=-0.250, Emotion=0.875, Relevance=1.688, Sentiment=0.248, Empathy=0.884, Success=93.75%


Training batches:  54%|█████▍    | 126/233 [24:46<22:32, 12.64s/it]

Batch 125: Reward=0.706, Quality=-1.000, Emotion=-0.938, Relevance=0.250, Sentiment=0.198, Empathy=0.867, Success=62.50%


Training batches:  56%|█████▌    | 131/233 [25:48<20:41, 12.17s/it]

Batch 130: Reward=4.720, Quality=1.250, Emotion=1.875, Relevance=1.250, Sentiment=0.388, Empathy=0.871, Success=100.00%


Training batches:  58%|█████▊    | 136/233 [26:28<14:06,  8.72s/it]

Batch 135: Reward=2.389, Quality=-0.250, Emotion=-0.125, Relevance=1.312, Sentiment=0.417, Empathy=0.877, Success=68.75%


Training batches:  61%|██████    | 141/233 [27:25<16:39, 10.87s/it]

Batch 140: Reward=0.059, Quality=-0.250, Emotion=-0.062, Relevance=-1.250, Sentiment=0.307, Empathy=0.887, Success=75.00%


Training batches:  63%|██████▎   | 146/233 [28:25<17:41, 12.20s/it]

Batch 145: Reward=0.129, Quality=-3.250, Emotion=-0.750, Relevance=0.875, Sentiment=0.265, Empathy=0.892, Success=62.50%


Training batches:  65%|██████▍   | 151/233 [29:25<16:49, 12.32s/it]

Batch 150: Reward=2.197, Quality=-1.750, Emotion=-0.688, Relevance=2.188, Sentiment=0.173, Empathy=0.885, Success=68.75%


Training batches:  67%|██████▋   | 156/233 [30:10<13:03, 10.18s/it]

Batch 155: Reward=2.853, Quality=-1.000, Emotion=0.500, Relevance=1.562, Sentiment=0.165, Empathy=0.866, Success=75.00%


Training batches:  69%|██████▉   | 161/233 [31:18<15:45, 13.14s/it]

Batch 160: Reward=0.139, Quality=-1.750, Emotion=-1.312, Relevance=1.000, Sentiment=0.279, Empathy=0.895, Success=62.50%


Training batches:  71%|███████   | 166/233 [32:12<12:50, 11.51s/it]

Batch 165: Reward=1.989, Quality=-1.750, Emotion=0.375, Relevance=0.500, Sentiment=0.275, Empathy=0.842, Success=81.25%


Training batches:  73%|███████▎  | 171/233 [33:11<12:47, 12.38s/it]

Batch 170: Reward=-0.327, Quality=-1.000, Emotion=-0.062, Relevance=-1.375, Sentiment=0.290, Empathy=0.872, Success=75.00%


Training batches:  76%|███████▌  | 176/233 [34:11<11:42, 12.33s/it]

Batch 175: Reward=-0.637, Quality=-2.500, Emotion=-0.688, Relevance=-0.062, Sentiment=0.313, Empathy=0.898, Success=68.75%


Training batches:  78%|███████▊  | 181/233 [35:10<11:02, 12.74s/it]

Batch 180: Reward=1.074, Quality=-1.750, Emotion=-1.312, Relevance=1.938, Sentiment=0.340, Empathy=0.872, Success=62.50%


Training batches:  80%|███████▉  | 186/233 [36:16<10:39, 13.60s/it]

Batch 185: Reward=-1.479, Quality=-1.750, Emotion=-0.625, Relevance=-1.188, Sentiment=0.285, Empathy=0.910, Success=75.00%


Training batches:  82%|████████▏ | 191/233 [37:00<06:44,  9.63s/it]

Batch 190: Reward=5.283, Quality=1.250, Emotion=2.375, Relevance=0.375, Sentiment=0.244, Empathy=0.871, Success=93.75%


Training batches:  84%|████████▍ | 196/233 [38:04<07:45, 12.59s/it]

Batch 195: Reward=1.392, Quality=-2.250, Emotion=-0.625, Relevance=0.438, Sentiment=0.295, Empathy=0.830, Success=75.00%


Training batches:  86%|████████▋ | 201/233 [39:10<07:27, 13.99s/it]

Batch 200: Reward=-2.965, Quality=-1.000, Emotion=-3.625, Relevance=-0.938, Sentiment=0.137, Empathy=0.899, Success=56.25%


Training batches:  88%|████████▊ | 206/233 [40:21<06:36, 14.70s/it]

Batch 205: Reward=0.638, Quality=-1.750, Emotion=-0.750, Relevance=0.562, Sentiment=0.196, Empathy=0.907, Success=62.50%


Training batches:  91%|█████████ | 211/233 [41:13<03:56, 10.73s/it]

Batch 210: Reward=5.837, Quality=1.250, Emotion=1.188, Relevance=2.000, Sentiment=0.362, Empathy=0.899, Success=87.50%


Training batches:  93%|█████████▎| 216/233 [42:01<02:58, 10.50s/it]

Batch 215: Reward=0.277, Quality=-1.000, Emotion=-0.688, Relevance=0.625, Sentiment=0.199, Empathy=0.888, Success=68.75%


Training batches:  95%|█████████▍| 221/233 [43:03<02:22, 11.89s/it]

Batch 220: Reward=4.450, Quality=-0.250, Emotion=1.750, Relevance=1.562, Sentiment=0.225, Empathy=0.884, Success=87.50%


Training batches:  97%|█████████▋| 226/233 [43:48<01:07,  9.71s/it]

Batch 225: Reward=6.271, Quality=1.750, Emotion=2.438, Relevance=1.562, Sentiment=0.431, Empathy=0.878, Success=100.00%


Training batches:  99%|█████████▉| 231/233 [44:43<00:22, 11.28s/it]

Batch 230: Reward=0.653, Quality=-1.000, Emotion=-1.188, Relevance=-0.125, Sentiment=0.211, Empathy=0.840, Success=75.00%


Training batches: 100%|██████████| 233/233 [45:03<00:00, 11.60s/it]

Error in batch 232: Batch size (16) does not match number of examples - but got 9 for: queries

Epoch 7 Results:
Average total reward: 2.2426
Average quality score: -0.7229
Average emotion score: 0.2228
Average relevance score: 0.8038
Average sentiment score: 0.2801
Average empathy score: 0.8762
Emotion generation success: 78.66%
Epoch time: 2703.2s



[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_7)... 

Saved checkpoint to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_7


Done. 1.5s


Running validation...




Validation Results:
Average reward: 2.4735
Average quality: -0.5888
Average emotion: 0.2292
Average relevance: 0.6697
Average sentiment: 0.2838
Average empathy: 0.8815
Emotion generation success: 76.40%
Validation time: 285.8s

Epoch 8/8


Training batches:   0%|          | 1/233 [00:13<51:46, 13.39s/it]

Batch 0: Reward=1.975, Quality=-1.750, Emotion=-0.125, Relevance=0.562, Sentiment=0.081, Empathy=0.890, Success=68.75%


Training batches:   3%|▎         | 6/233 [01:19<49:19, 13.04s/it]

Batch 5: Reward=1.287, Quality=-1.000, Emotion=-0.125, Relevance=0.312, Sentiment=0.375, Empathy=0.845, Success=68.75%


Training batches:   5%|▍         | 11/233 [02:07<36:18,  9.81s/it]

Batch 10: Reward=4.110, Quality=0.500, Emotion=1.562, Relevance=1.375, Sentiment=0.560, Empathy=0.877, Success=87.50%


Training batches:   7%|▋         | 16/233 [03:09<43:54, 12.14s/it]

Batch 15: Reward=-0.888, Quality=-3.250, Emotion=-2.000, Relevance=-0.875, Sentiment=0.251, Empathy=0.908, Success=50.00%


Training batches:   9%|▉         | 21/233 [04:17<45:14, 12.80s/it]

Batch 20: Reward=3.618, Quality=-0.250, Emotion=1.125, Relevance=1.812, Sentiment=0.121, Empathy=0.873, Success=81.25%


Training batches:  11%|█         | 26/233 [05:15<38:44, 11.23s/it]

Batch 25: Reward=1.622, Quality=-1.000, Emotion=0.500, Relevance=1.188, Sentiment=0.300, Empathy=0.878, Success=75.00%


Training batches:  13%|█▎        | 31/233 [06:46<55:55, 16.61s/it]

Batch 30: Reward=-0.152, Quality=-2.500, Emotion=-1.375, Relevance=1.000, Sentiment=0.164, Empathy=0.877, Success=56.25%


Training batches:  15%|█▌        | 36/233 [07:48<40:53, 12.46s/it]

Batch 35: Reward=2.610, Quality=-1.000, Emotion=0.562, Relevance=1.438, Sentiment=0.208, Empathy=0.889, Success=81.25%


Training batches:  18%|█▊        | 41/233 [08:43<36:17, 11.34s/it]

Batch 40: Reward=2.305, Quality=1.250, Emotion=-1.188, Relevance=1.438, Sentiment=0.379, Empathy=0.873, Success=75.00%


Training batches:  20%|█▉        | 46/233 [09:52<41:55, 13.45s/it]

Batch 45: Reward=-1.029, Quality=-3.250, Emotion=-1.312, Relevance=-0.312, Sentiment=0.384, Empathy=0.873, Success=62.50%


Training batches:  22%|██▏       | 51/233 [11:00<41:34, 13.71s/it]

Batch 50: Reward=1.917, Quality=-1.750, Emotion=-0.875, Relevance=0.875, Sentiment=0.282, Empathy=0.903, Success=68.75%


Training batches:  24%|██▍       | 56/233 [11:53<32:34, 11.04s/it]

Batch 55: Reward=2.318, Quality=-0.250, Emotion=1.125, Relevance=-0.875, Sentiment=0.324, Empathy=0.892, Success=81.25%


Training batches:  26%|██▌       | 61/233 [12:51<30:41, 10.71s/it]

Batch 60: Reward=5.185, Quality=0.500, Emotion=1.750, Relevance=1.938, Sentiment=0.564, Empathy=0.870, Success=87.50%


Training batches:  28%|██▊       | 66/233 [14:01<39:44, 14.28s/it]

Batch 65: Reward=-0.573, Quality=-3.250, Emotion=-1.938, Relevance=0.688, Sentiment=0.119, Empathy=0.884, Success=56.25%


Training batches:  30%|███       | 71/233 [15:00<29:19, 10.86s/it]

Batch 70: Reward=3.264, Quality=-0.250, Emotion=-0.062, Relevance=2.375, Sentiment=0.268, Empathy=0.891, Success=75.00%


Training batches:  33%|███▎      | 76/233 [16:02<28:48, 11.01s/it]

Batch 75: Reward=1.694, Quality=-1.750, Emotion=1.125, Relevance=0.250, Sentiment=0.255, Empathy=0.887, Success=81.25%


Training batches:  35%|███▍      | 81/233 [17:05<29:38, 11.70s/it]

Batch 80: Reward=3.502, Quality=-0.250, Emotion=1.000, Relevance=1.250, Sentiment=0.399, Empathy=0.875, Success=87.50%


Training batches:  37%|███▋      | 86/233 [17:56<24:00,  9.80s/it]

Batch 85: Reward=1.013, Quality=-1.000, Emotion=-0.125, Relevance=0.188, Sentiment=0.260, Empathy=0.883, Success=68.75%


Training batches:  39%|███▉      | 91/233 [18:48<21:58,  9.29s/it]

Batch 90: Reward=1.983, Quality=0.500, Emotion=0.812, Relevance=-0.125, Sentiment=0.126, Empathy=0.880, Success=87.50%


Training batches:  41%|████      | 96/233 [19:38<23:27, 10.27s/it]

Batch 95: Reward=0.594, Quality=-2.500, Emotion=-0.125, Relevance=1.375, Sentiment=0.216, Empathy=0.909, Success=68.75%


Training batches:  43%|████▎     | 101/233 [20:41<24:12, 11.00s/it]

Batch 100: Reward=5.160, Quality=0.500, Emotion=1.562, Relevance=0.750, Sentiment=0.298, Empathy=0.884, Success=87.50%


Training batches:  45%|████▌     | 106/233 [21:47<26:14, 12.40s/it]

Batch 105: Reward=3.189, Quality=-1.000, Emotion=1.188, Relevance=0.375, Sentiment=0.272, Empathy=0.812, Success=87.50%


Training batches:  48%|████▊     | 111/233 [22:52<27:42, 13.63s/it]

Batch 110: Reward=-0.191, Quality=-3.250, Emotion=-2.000, Relevance=0.938, Sentiment=0.139, Empathy=0.898, Success=50.00%


Training batches:  50%|████▉     | 116/233 [23:59<26:33, 13.62s/it]

Batch 115: Reward=3.008, Quality=-0.250, Emotion=-0.062, Relevance=0.062, Sentiment=0.259, Empathy=0.867, Success=75.00%


Training batches:  52%|█████▏    | 121/233 [24:46<17:32,  9.40s/it]

Batch 120: Reward=5.881, Quality=1.250, Emotion=1.875, Relevance=2.125, Sentiment=0.360, Empathy=0.869, Success=100.00%


Training batches:  54%|█████▍    | 126/233 [25:53<21:15, 11.92s/it]

Batch 125: Reward=0.881, Quality=-1.750, Emotion=-0.250, Relevance=1.375, Sentiment=0.278, Empathy=0.891, Success=75.00%


Training batches:  56%|█████▌    | 131/233 [27:04<22:55, 13.49s/it]

Batch 130: Reward=3.541, Quality=-0.250, Emotion=-0.062, Relevance=2.125, Sentiment=0.379, Empathy=0.880, Success=75.00%


Training batches:  58%|█████▊    | 136/233 [27:51<15:54,  9.84s/it]

Batch 135: Reward=3.006, Quality=-0.250, Emotion=0.500, Relevance=1.375, Sentiment=0.134, Empathy=0.859, Success=75.00%


Training batches:  61%|██████    | 141/233 [28:40<15:03,  9.82s/it]

Batch 140: Reward=2.695, Quality=-0.250, Emotion=0.500, Relevance=-0.312, Sentiment=0.124, Empathy=0.881, Success=75.00%


Training batches:  63%|██████▎   | 146/233 [29:37<16:19, 11.26s/it]

Batch 145: Reward=0.580, Quality=-2.500, Emotion=-0.750, Relevance=0.062, Sentiment=0.183, Empathy=0.901, Success=62.50%


Training batches:  65%|██████▍   | 151/233 [30:38<17:30, 12.81s/it]

Batch 150: Reward=1.763, Quality=-1.750, Emotion=-0.750, Relevance=1.562, Sentiment=0.140, Empathy=0.888, Success=62.50%


Training batches:  67%|██████▋   | 156/233 [31:28<13:41, 10.67s/it]

Batch 155: Reward=6.058, Quality=0.500, Emotion=1.750, Relevance=3.000, Sentiment=0.079, Empathy=0.885, Success=87.50%


Training batches:  69%|██████▉   | 161/233 [32:22<11:58,  9.98s/it]

Batch 160: Reward=2.883, Quality=-0.250, Emotion=0.500, Relevance=0.188, Sentiment=0.170, Empathy=0.862, Success=75.00%


Training batches:  71%|███████   | 166/233 [33:21<12:34, 11.26s/it]

Batch 165: Reward=1.064, Quality=-0.250, Emotion=0.312, Relevance=-0.062, Sentiment=0.186, Empathy=0.880, Success=75.00%


Training batches:  73%|███████▎  | 171/233 [34:07<10:06,  9.78s/it]

Batch 170: Reward=0.333, Quality=-1.750, Emotion=0.500, Relevance=-0.938, Sentiment=0.417, Empathy=0.882, Success=75.00%


Training batches:  76%|███████▌  | 176/233 [35:00<09:49, 10.34s/it]

Batch 175: Reward=1.336, Quality=-0.250, Emotion=-0.688, Relevance=-0.188, Sentiment=0.278, Empathy=0.875, Success=68.75%


Training batches:  78%|███████▊  | 181/233 [36:00<10:49, 12.50s/it]

Batch 180: Reward=1.795, Quality=-2.000, Emotion=-0.125, Relevance=-0.125, Sentiment=0.431, Empathy=0.903, Success=68.75%


Training batches:  80%|███████▉  | 186/233 [37:01<09:36, 12.26s/it]

Batch 185: Reward=1.191, Quality=-1.750, Emotion=0.500, Relevance=-1.062, Sentiment=0.337, Empathy=0.889, Success=75.00%


Training batches:  82%|████████▏ | 191/233 [37:55<07:16, 10.39s/it]

Batch 190: Reward=3.884, Quality=1.250, Emotion=2.375, Relevance=-0.500, Sentiment=0.391, Empathy=0.864, Success=93.75%


Training batches:  84%|████████▍ | 196/233 [38:55<07:45, 12.58s/it]

Batch 195: Reward=-0.503, Quality=-1.000, Emotion=-1.375, Relevance=-1.250, Sentiment=0.134, Empathy=0.876, Success=56.25%


Training batches:  86%|████████▋ | 201/233 [40:00<07:41, 14.42s/it]

Batch 200: Reward=-2.365, Quality=-4.750, Emotion=-2.562, Relevance=1.125, Sentiment=0.242, Empathy=0.906, Success=50.00%


Training batches:  88%|████████▊ | 206/233 [41:07<06:33, 14.58s/it]

Batch 205: Reward=-0.608, Quality=-1.000, Emotion=-1.250, Relevance=-1.750, Sentiment=0.276, Empathy=0.907, Success=68.75%


Training batches:  91%|█████████ | 211/233 [42:01<04:11, 11.42s/it]

Batch 210: Reward=2.047, Quality=-0.250, Emotion=1.125, Relevance=-0.625, Sentiment=0.155, Empathy=0.881, Success=81.25%


Training batches:  93%|█████████▎| 216/233 [42:53<02:59, 10.59s/it]

Batch 215: Reward=2.903, Quality=-0.250, Emotion=0.562, Relevance=0.250, Sentiment=0.461, Empathy=0.875, Success=81.25%


Training batches:  95%|█████████▍| 221/233 [43:47<02:12, 11.06s/it]

Batch 220: Reward=3.553, Quality=1.250, Emotion=1.812, Relevance=-0.312, Sentiment=0.134, Empathy=0.865, Success=93.75%


Training batches:  97%|█████████▋| 226/233 [44:34<01:15, 10.84s/it]

Batch 225: Reward=-0.536, Quality=-3.750, Emotion=-0.125, Relevance=-0.375, Sentiment=0.282, Empathy=0.883, Success=68.75%


Training batches:  99%|█████████▉| 231/233 [45:29<00:23, 11.86s/it]

Batch 230: Reward=4.283, Quality=-0.250, Emotion=0.562, Relevance=1.688, Sentiment=0.226, Empathy=0.887, Success=81.25%


Training batches: 100%|██████████| 233/233 [45:41<00:00, 11.76s/it]

Error in batch 232: Batch size (16) does not match number of examples - but got 9 for: queries

Epoch 8 Results:
Average total reward: 1.8181
Average quality score: -0.9153
Average emotion score: -0.0516
Average relevance score: 0.4733
Average sentiment score: 0.2788
Average empathy score: 0.8840
Emotion generation success: 73.72%
Epoch time: 2741.0s



[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_8)... 

Saved checkpoint to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_8


Done. 1.6s


Running validation...




Validation Results:
Average reward: 2.1384
Average quality: -0.6427
Average emotion: -0.0180
Average relevance: 0.3618
Average sentiment: 0.2897
Average empathy: 0.8862
Emotion generation success: 73.93%
Validation time: 297.8s


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/final_model)... Done. 1.6s



Training completed! Total time: 16402.2s
Final model saved to /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/final_model
W&B run url: https://wandb.ai/ericzhangez1006-ucl/8m19therapy-chatbot-ppo-enhanced/runs/agsy9y6e


0,1
distributions/emotion_mean,▆███▆▃▂▁
distributions/emotion_std,▄▂▁▁▄▇██
distributions/empathy_mean,▁▄▄▅▅▆▇█
distributions/empathy_std,█▃▄▂▃▁▃▁
distributions/quality_mean,▇█▇█▆▅▂▁
distributions/quality_std,▂▁▂▁▃▅▇█
distributions/relevance_mean,▁▄▆█▇▆▅▃
distributions/relevance_std,▆▄▄▁▂▄▇█
distributions/reward_mean,▃▆▇█▆▃▂▁
distributions/reward_std,▄▂▂▁▃▆▇█

0,1
distributions/emotion_mean,-0.0516
distributions/emotion_std,4.53462
distributions/empathy_mean,0.88404
distributions/empathy_std,0.06453
distributions/quality_mean,-0.91535
distributions/quality_std,5.12793
distributions/relevance_mean,0.47326
distributions/relevance_std,4.17938
distributions/reward_mean,1.81805
distributions/reward_std,7.96925


PPO training with enhanced reward components completed successfully!
