In [None]:
pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... done
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e4c36f1fcd769c5a49686e1cf88964698ca4deb9bb79866c58e0cc2ca5af4791
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


Baseline GPT2


In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForSequenceClassification
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict

# Look up each required NLTK resource and download its package only when the
# lookup fails.
for _nltk_resource, _nltk_package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_nltk_resource)
    except LookupError:
        nltk.download(_nltk_package)

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """Cut descriptive annotations off the end of an utterance.

    The text is truncated at the first occurrence (case-insensitive) of
    "The speaker" or "The emotion state" that is eventually followed by a
    period; everything from that point on is discarded.

    Args:
        text: Raw utterance string.

    Returns:
        The kept part of the utterance with surrounding whitespace removed.
        When no annotation is found the whole text is returned, stripped.
    """
    annotation = re.search(
        r'\s*(The (?:speaker|emotion state)[^.]*\.(?:[^.]*\.)*)',
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    kept = text if annotation is None else text[:annotation.start()]
    return kept.strip()

# ====================== Baseline Model Functions ======================
def load_baseline_gpt2(device='cuda'):
    """Load the stock 'gpt2' checkpoint and its standard tokenizer.

    Args:
        device: Device the model is moved to.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print("Loading vanilla GPT-2 model...")

    # No custom tokens are added: both objects come straight from the hub checkpoint.
    checkpoint_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_name)
    model = GPT2LMHeadModel.from_pretrained(checkpoint_name).to(device)

    print(f"Vanilla GPT-2 loaded with vocab size: {len(tokenizer)}")

    return model, tokenizer

def load_emotion_classifier(device='cuda'):
    """Load the "SamLowe/roberta-base-go_emotions" classifier in eval mode.

    Args:
        device: Device the classifier is moved to.

    Returns:
        Tuple ``(emotion_model, emotion_tokenizer)``.
    """
    print("Loading emotion classifier...")
    hub_id = "SamLowe/roberta-base-go_emotions"
    emotion_tokenizer = AutoTokenizer.from_pretrained(hub_id)
    emotion_model = AutoModelForSequenceClassification.from_pretrained(hub_id).to(device)
    emotion_model.eval()

    return emotion_model, emotion_tokenizer

def extract_user_input_for_baseline(input_text, use_full_input=False):
    """
    Extract the prompt that is fed to baseline GPT-2.

    Args:
        input_text: Full structured input (``<problem>... <user>... <therapist>``)
        use_full_input: If True, return ``input_text`` unchanged (GPT-2 will see
                       the structural tags as ordinary text)
                       If False, return only the user text (default)

    Returns:
        The text between ``<user>`` and the next ``<user_emotion>`` or
        ``<therapist>`` tag, stripped. If no closing tag follows, everything
        after ``<user>`` is returned; if there is no ``<user>`` tag at all,
        ``input_text`` is returned as is.
    """
    if use_full_input:
        return input_text

    # re.DOTALL lets '.' cross newlines: without it a multi-line utterance
    # fails the bounded pattern and the fallback keeps only its first line.
    bounded = re.search(r'<user>(.*?)(?:<user_emotion>|<therapist>)', input_text, re.DOTALL)
    if bounded:
        return bounded.group(1).strip()

    # Fallback: no closing tag, take everything after <user>
    open_ended = re.search(r'<user>(.*)', input_text, re.DOTALL)
    if open_ended:
        return open_ended.group(1).strip()

    # Last fallback: return the input as is
    return input_text

def predict_emotion_roberta(text, emotion_model, emotion_tokenizer, device='cuda'):
    """Classify ``text`` into one of the therapy emotion buckets.

    The classifier's softmax probabilities are summed per therapy bucket via a
    fixed label mapping (labels missing from the mapping count as 'neutral').
    If 'sadness' wins, the result is upgraded to 'depression' when the combined
    probability of grief/remorse/disappointment/embarrassment exceeds 0.2 or
    the single most likely label is 'grief' or 'remorse'.

    Args:
        text: Text to classify.
        emotion_model: Sequence classifier exposing ``config.id2label``.
        emotion_tokenizer: Tokenizer matching ``emotion_model``.
        device: Device the tokenized inputs are moved to.

    Returns:
        Tuple ``(emotion, score)``. Blank text yields ``("neutral", 0.0)``.
        For 'depression' the score is the summed indicator probability,
        otherwise the summed probability of the winning bucket.
    """
    if text.strip() == "":
        return "neutral", 0.0

    # Classifier label -> therapy bucket
    go_to_therapy = {
        # Direct matches
        'anger': 'anger',
        'joy': 'joy',
        'sadness': 'sadness',
        'fear': 'fear',
        'disgust': 'disgust',
        'neutral': 'neutral',

        # Joy-related emotions
        'amusement': 'joy',
        'excitement': 'joy',
        'gratitude': 'joy',
        'love': 'joy',
        'optimism': 'joy',
        'pride': 'joy',
        'relief': 'joy',
        'admiration': 'joy',

        # Sadness-related emotions (potential depression indicators)
        'disappointment': 'sadness',
        'embarrassment': 'sadness',
        'grief': 'sadness',
        'remorse': 'sadness',

        # Anger-related emotions
        'annoyance': 'anger',
        'disapproval': 'anger',

        # Fear-related emotions
        'nervousness': 'fear',

        # Neutral-related emotions
        'approval': 'neutral',
        'caring': 'neutral',
        'confusion': 'neutral',
        'curiosity': 'neutral',
        'desire': 'neutral',
        'realization': 'neutral',
        'surprise': 'neutral'
    }

    # Bucket order matters: max() below resolves ties in favour of the earliest key.
    buckets = ['anger', 'joy', 'neutral', 'sadness', 'depression', 'disgust', 'fear']

    encoded = emotion_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        logits = emotion_model(**encoded).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)[0]

    id2label = emotion_model.config.id2label

    # Sum class probabilities into their therapy bucket
    totals = dict.fromkeys(buckets, 0.0)
    for class_idx, prob in enumerate(probs):
        bucket = go_to_therapy.get(id2label[class_idx], 'neutral')
        totals[bucket] += prob.item()

    winner = max(totals, key=totals.get)
    if winner != 'sadness':
        return winner, totals[winner]

    # Sadness won: check whether the depression indicators are strong enough
    indicators = ['grief', 'remorse', 'disappointment', 'embarrassment']
    indicator_mass = sum(
        probs[class_idx].item()
        for class_idx, name in id2label.items()
        if name in indicators
    )
    top_label = id2label.get(probs.argmax().item())

    if indicator_mass > 0.2 or top_label in ['grief', 'remorse']:
        return 'depression', indicator_mass

    return winner, totals[winner]

# ====================== Reference Data Processing ======================
class TherapyEvaluationDataset:
    """Turns a JSON file of therapy conversations into evaluation samples.

    Each sample pairs a structured prompt (problem type, accumulated user text,
    last user emotion) with the therapist reply that followed it.
    """

    def __init__(self, json_path, tokenizer_path=None):
        """Read ``json_path`` and build the samples immediately.

        Args:
            json_path: Path to a UTF-8 JSON file containing a list of conversations.
            tokenizer_path: Accepted but not used by this class.
        """
        with open(json_path, 'r', encoding='utf-8') as handle:
            self._data = json.load(handle)

        self.evaluation_data = []
        self._process_evaluation_data()

    def _process_evaluation_data(self):
        """Walk every dialog and emit one sample per therapist turn.

        Consecutive user turns are merged into a single prompt. A therapist
        turn that arrives before any user text has been collected is skipped.
        """
        for conversation in tqdm(self._data, desc="Processing evaluation data"):
            problem_type = conversation.get("problem_type", "").strip()

            pending_texts = []
            pending_emotions = []

            for turn in conversation.get("dialog", []):
                cleaned = clean_therapy_text(turn.get("text", ""))
                emotion = turn.get("emotion", "").strip()

                # Any speaker other than "sys" is treated as the user
                if turn.get("speaker", "") != "sys":
                    if cleaned:
                        pending_texts.append(cleaned)
                    if emotion:
                        pending_emotions.append(emotion)
                    continue

                # Therapist turn with nothing to respond to
                if not pending_texts:
                    continue

                user_text = " ".join(pending_texts)
                user_emotion = pending_emotions[-1] if pending_emotions else ""

                # Assemble the structured prompt
                prompt_parts = [f"<problem>{problem_type}"] if problem_type else []
                prompt_parts.append(f"<user>{user_text}")
                if user_emotion:
                    prompt_parts.append(f"<user_emotion>{user_emotion}")
                prompt_parts.append("<therapist>")

                self.evaluation_data.append({
                    'input_text': " ".join(prompt_parts),
                    'reference_text': clean_therapy_text(cleaned),
                    'reference_emotion': emotion,
                    'user_input': user_text,
                    'user_emotion': user_emotion,
                    'problem_type': problem_type
                })

                # Start collecting the next user segment
                pending_texts = []
                pending_emotions = []

        print(f"Processed {len(self.evaluation_data)} evaluation samples")

    def get_evaluation_data(self):
        """Return the list of sample dicts built at construction time."""
        return self.evaluation_data

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetrics:
    """BLEU, ROUGE and METEOR scoring for a single reference/candidate pair."""

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.smoothing = SmoothingFunction().method1

    def compute_bleu(self, reference, candidate):
        """Return smoothed sentence-level BLEU on whitespace tokens.

        Returns 0.0 when either string is empty or the candidate has no tokens.
        """
        if not (candidate and reference):
            return 0.0

        hypothesis = candidate.split()
        if not hypothesis:
            return 0.0

        return sentence_bleu([reference.split()], hypothesis, smoothing_function=self.smoothing)

    def compute_rouge(self, reference, candidate):
        """Return the ROUGE-1/2/L F-measures as a dict keyed by metric name.

        All three are 0.0 when either string is empty.
        """
        metric_names = ('rouge1', 'rouge2', 'rougeL')
        if not (candidate and reference):
            return {name: 0.0 for name in metric_names}

        scored = self.rouge_scorer.score(reference, candidate)
        return {name: scored[name].fmeasure for name in metric_names}

    def compute_meteor(self, reference, candidate):
        """Return METEOR on whitespace tokens.

        Returns 0.0 when either string is empty or the candidate has no tokens.
        """
        if not (candidate and reference):
            return 0.0

        hypothesis = candidate.split()
        if not hypothesis:
            return 0.0

        return meteor_score([reference.split()], hypothesis)

# ====================== Model Evaluation ======================
def evaluate_baseline(evaluation_dataset, device='cuda', top_p=0.8, max_new_tokens=128, use_full_input=False):
    """
    Evaluate vanilla GPT-2 model on therapy dataset with emotion detection

    For every sample a reply is sampled from GPT-2, scored against the reference
    with BLEU/ROUGE/METEOR, and its emotion is predicted with the RoBERTa
    classifier and compared (case-insensitively) to the reference emotion.

    Args:
        evaluation_dataset: TherapyEvaluationDataset instance
        device: Device to run evaluation on
        top_p: Top-p sampling parameter
        max_new_tokens: Maximum tokens to generate
        use_full_input: If True, feed full structured input to GPT-2
                       If False, extract just user text (default for fair comparison)

    Returns:
        Dictionary with per-sample score lists, the first 10 sample outputs,
        a 'summary' of averages, 'emotion_confusion_matrix' (plain nested dicts,
        reference -> predicted -> count), 'detected_format', 'model_type' and
        'generation_params'. Averages are 0.0 when the dataset is empty.
    """
    def _mean_or_zero(values):
        # np.mean([]) returns nan and emits a RuntimeWarning; report 0.0 instead.
        return np.mean(values) if len(values) > 0 else 0.0

    # Load baseline model and emotion classifier
    model, tokenizer = load_baseline_gpt2(device)
    emotion_model, emotion_tokenizer = load_emotion_classifier(device)

    metrics_computer = TherapyEvaluationMetrics()
    evaluation_data = evaluation_dataset.get_evaluation_data()

    results = {
        'bleu_scores': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'meteor_scores': [],
        'emotion_accuracy': [],
        'has_emotion_tag': [],
        'sample_outputs': []
    }

    emotion_confusion = defaultdict(lambda: defaultdict(int))

    model.eval()

    # The stock GPT-2 tokenizer defines no pad token, so pad_token_id is None;
    # use EOS explicitly rather than relying on generate()'s implicit fallback.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Prompt plus generated tokens must fit the model's position embeddings.
    context_size = getattr(model.config, 'n_positions', None)
    max_prompt_tokens = None
    if context_size is not None:
        max_prompt_tokens = max(1, context_size - max_new_tokens)

    if use_full_input:
        print("Evaluating vanilla GPT-2 with FULL structured input (will see unknown tokens)...")
    else:
        print("Evaluating vanilla GPT-2 with extracted user text only...")
    print(f"Evaluating model on {len(evaluation_data)} samples...")

    # Create progress bar
    progress_bar = tqdm(evaluation_data, desc="Evaluating Baseline",
                       bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')

    for i, sample in enumerate(progress_bar):
        input_text = sample['input_text']
        reference_text = sample['reference_text']
        reference_emotion = sample['reference_emotion']

        # Extract user input for baseline
        user_input = extract_user_input_for_baseline(input_text, use_full_input)

        # Tokenize input for vanilla GPT-2
        input_ids = tokenizer.encode(user_input, return_tensors='pt')
        if input_ids.shape[-1] == 0:
            # generate() cannot start from an empty prompt; seed it with EOS
            # (GPT-2 uses the same token as BOS).
            input_ids = torch.tensor([[tokenizer.eos_token_id]])
        if max_prompt_tokens is not None and input_ids.shape[-1] > max_prompt_tokens:
            # Keep the most recent tokens: the end of the prompt is what the
            # reply has to continue from.
            input_ids = input_ids[:, -max_prompt_tokens:]
        input_ids = input_ids.to(device)
        attention_mask = torch.ones_like(input_ids)
        prompt_length = input_ids.shape[-1]

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=top_p,
                pad_token_id=pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens
        new_tokens = outputs[0][prompt_length:]
        predicted_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Predict emotion using RoBERTa
        predicted_emotion, emotion_confidence = predict_emotion_roberta(
            predicted_text, emotion_model, emotion_tokenizer, device
        )

        # Compute text generation metrics
        bleu = metrics_computer.compute_bleu(reference_text, predicted_text)
        rouge = metrics_computer.compute_rouge(reference_text, predicted_text)
        meteor = metrics_computer.compute_meteor(reference_text, predicted_text)

        # Compute emotion accuracy
        emotion_correct = (predicted_emotion.lower() == reference_emotion.lower())
        # The classifier always yields a label, so coverage is 1.0 by construction
        has_emotion = True

        # Store results
        results['bleu_scores'].append(bleu)
        results['rouge1_scores'].append(rouge['rouge1'])
        results['rouge2_scores'].append(rouge['rouge2'])
        results['rougeL_scores'].append(rouge['rougeL'])
        results['meteor_scores'].append(meteor)
        results['emotion_accuracy'].append(emotion_correct)
        results['has_emotion_tag'].append(has_emotion)

        # Update confusion matrix
        emotion_confusion[reference_emotion.lower()][predicted_emotion.lower()] += 1

        # Store sample outputs for inspection
        if i < 10:  # Store first 10 samples
            results['sample_outputs'].append({
                'input': input_text,
                'user_input_extracted': user_input,
                'reference_text': reference_text,
                'predicted_text': predicted_text,
                'reference_emotion': reference_emotion,
                'predicted_emotion': predicted_emotion,
                'emotion_confidence': emotion_confidence,
                'full_response': predicted_text,
                'bleu': bleu,
                'rouge1': rouge['rouge1'],
                'meteor': meteor
            })

        if (i + 1) % 20 == 0 or i < 5:
            sample_info = f"Sample {i+1}: BLEU={bleu:.3f}, Emotion={'✓' if emotion_correct else '✗'}"
            progress_bar.set_description(f"Evaluating Baseline - {sample_info}")

            if (i + 1) % 50 == 0 or i < 3:
                print(f"\n" + "="*80)
                print(f"BASELINE SAMPLE {i+1}/{len(evaluation_data)}")
                print("="*80)
                print(f"Original Input: {input_text[:120]}...")
                print(f"Processed Input: {user_input[:100]}...")
                if use_full_input:
                    print(f"Note: GPT-2 sees unknown tokens like <problem>, <user>, etc.")
                print(f"Reference: {reference_text[:100]}...")
                print(f"Generated: {predicted_text[:100]}...")
                print(f"Ref Emotion: {reference_emotion} | Pred Emotion: {predicted_emotion} (conf: {emotion_confidence:.3f}) | Match: {'✓' if emotion_correct else '✗'}")
                print(f"BLEU: {bleu:.4f} | ROUGE-1: {rouge['rouge1']:.4f} | METEOR: {meteor:.4f}")
                print("="*80)
        else:
            if i > 0:
                avg_bleu = np.mean(results['bleu_scores'])
                avg_emotion_acc = np.mean(results['emotion_accuracy'])
                progress_bar.set_description(f"Evaluating Baseline - Avg BLEU: {avg_bleu:.3f}, Emotion Acc: {avg_emotion_acc:.3f}")

    # Close progress bar
    progress_bar.close()

    # Compute summary statistics
    results['summary'] = {
        'avg_bleu': _mean_or_zero(results['bleu_scores']),
        'avg_rouge1': _mean_or_zero(results['rouge1_scores']),
        'avg_rouge2': _mean_or_zero(results['rouge2_scores']),
        'avg_rougeL': _mean_or_zero(results['rougeL_scores']),
        'avg_meteor': _mean_or_zero(results['meteor_scores']),
        'emotion_accuracy': _mean_or_zero(results['emotion_accuracy']),
        'emotion_tag_coverage': _mean_or_zero(results['has_emotion_tag']),
        'total_samples': len(evaluation_data)
    }

    # Convert to plain dicts: defaultdicts built from a lambda cannot be pickled.
    results['emotion_confusion_matrix'] = {
        ref_emotion: dict(pred_counts) for ref_emotion, pred_counts in emotion_confusion.items()
    }
    results['detected_format'] = 'roberta_emotion_detection'

    # Print final progress summary
    print(f"\n{'='*60}")
    print(f"BASELINE EVALUATION COMPLETED!")
    print(f"{'='*60}")
    print(f"Processed {len(evaluation_data)} samples")
    print(f"Average BLEU: {results['summary']['avg_bleu']:.4f}")
    print(f"Average ROUGE-1: {results['summary']['avg_rouge1']:.4f}")
    print(f"Emotion Accuracy: {results['summary']['emotion_accuracy']:.4f}")
    print(f"Emotion Tag Coverage: {results['summary']['emotion_tag_coverage']:.4f}")
    print(f"{'='*60}")

    results['model_type'] = 'baseline_gpt2_roberta_emotion'
    if use_full_input:
        results['model_type'] += '_full_input'
    else:
        results['model_type'] += '_user_only'

    results['generation_params'] = {
        'top_p': top_p,
        'max_new_tokens': max_new_tokens,
        'use_full_input': use_full_input
    }

    return results

def print_evaluation_results(results):
    """Pretty-print a results dict returned by the baseline evaluation.

    Args:
        results: Dict with a 'summary' of averaged metrics and a
            'sample_outputs' list; 'model_type', 'detected_format' and
            'generation_params' are optional. Only the first three sample
            outputs are shown.
    """
    summary = results['summary']
    rule = "="*60

    print("\n" + rule)
    print("BASELINE GPT-2 EVALUATION RESULTS")
    print(rule)
    print(f"Model Type: {results.get('model_type', 'Unknown').upper()}")
    print(f"Output Format: {results.get('detected_format', 'Unknown').upper()}")

    # Generation parameters are printed only when they were recorded
    gen_params = results.get('generation_params', {})
    if gen_params:
        shown_top_p = gen_params.get('top_p', 'N/A')
        shown_max_tokens = gen_params.get('max_new_tokens', 'N/A')
        shown_full_input = gen_params.get('use_full_input', 'N/A')
        print(f"Generation Params: top_p={shown_top_p}, "
              f"max_tokens={shown_max_tokens}, "
              f"full_input={shown_full_input}")

    # (section heading, ((line prefix, summary key), ...)); values use 4 decimals
    metric_sections = (
        ("\nText Generation Metrics:", (
            ("  BLEU Score:     ", 'avg_bleu'),
            ("  ROUGE-1:        ", 'avg_rouge1'),
            ("  ROUGE-2:        ", 'avg_rouge2'),
            ("  ROUGE-L:        ", 'avg_rougeL'),
            ("  METEOR:         ", 'avg_meteor'),
        )),
        ("\nEmotion Prediction:", (
            ("  Emotion Accuracy:    ", 'emotion_accuracy'),
            ("  Emotion Tag Coverage: ", 'emotion_tag_coverage'),
        )),
    )
    for heading, rows in metric_sections:
        print(heading)
        for prefix, key in rows:
            print(f"{prefix}{summary[key]:.4f}")

    print("\nDataset Info:")
    print(f"  Total Samples:  {summary['total_samples']}")

    print("\nSample Outputs:")
    for number, entry in enumerate(results['sample_outputs'][:3], 1):
        print(f"\n  Sample {number}:")
        print(f"    Original Input: {entry['input'][:100]}...")
        print(f"    Processed Input: {entry['user_input_extracted'][:80]}...")
        print(f"    Reference: {entry['reference_text'][:80]}...")
        print(f"    Predicted: {entry['predicted_text'][:80]}...")
        print(f"    Ref Emotion: {entry['reference_emotion']}")
        print(f"    Pred Emotion: {entry['predicted_emotion']} (conf: {entry['emotion_confidence']:.3f})")
        print(f"    BLEU: {entry['bleu']:.3f}, ROUGE-1: {entry['rouge1']:.3f}")

if __name__ == "__main__":
    # Run on the GPU when one is visible, otherwise on the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    TEST_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/test.json"

    # Build the evaluation samples from the test file
    eval_dataset = TherapyEvaluationDataset(TEST_DATA_PATH)

    # Baseline run that feeds GPT-2 the full structured prompt
    print("\n=== BASELINE WITH FULL STRUCTURED INPUT ===")
    baseline_full_results = evaluate_baseline(
        eval_dataset,
        device=DEVICE,
        top_p=0.8,
        use_full_input=True,
    )
    print_evaluation_results(baseline_full_results)


SFT Hyperparameter Tuning


In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer
from datasets import Dataset as HFDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import itertools
import pickle
import os
from datetime import datetime
import random

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: Integer passed to every seeding function.
    """
    # Same seed for every RNG, in this order: Python, NumPy, torch, torch CUDA devices
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every RNG as soon as this cell runs
set_seed(42)

# Look up each required NLTK resource and download its package only when the
# lookup fails.
for _nltk_resource, _nltk_package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_nltk_resource)
    except LookupError:
        nltk.download(_nltk_package)

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """Drop trailing descriptive annotations from an utterance.

    Truncates ``text`` at the first case-insensitive occurrence of
    "The speaker" or "The emotion state" that is eventually followed by a
    period, then strips surrounding whitespace. Text without such an
    annotation is returned stripped but otherwise unchanged.
    """
    found = re.search(
        r'\s*(The (?:speaker|emotion state)[^.]*\.(?:[^.]*\.)*)',
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    if found is None:
        return text.strip()
    cut_at = found.start()
    return text[:cut_at].strip()

# ====================== Model Output Extraction ======================
def extract_model_output(response):
    """Split a decoded model response into therapist text and emotion label.

    Args:
        response: Decoded generation, possibly containing a
            ``<therapist_emotion>`` tag and a trailing ``<eos>``.

    Returns:
        Tuple ``(therapist_text, emotion, has_emotion_tag)``. ``therapist_text``
        is everything before the first ``<therapist_emotion>`` tag (or the whole
        response when the tag is absent). ``emotion`` is the lower-cased first
        word after the tag, cut at the first ``<`` so that special tokens glued
        to the label (``joy<eos><pad>``) do not corrupt it; it is "" when
        nothing usable follows the tag.
    """
    response = response.strip()

    if response.endswith('<eos>'):
        response = response[:-5].strip()

    before_tag, tag, after_tag = response.partition('<therapist_emotion>')

    if not tag:
        # No emotion tag found, return entire response as text
        return response, "", False

    # The label ends at whitespace or at the start of another tag
    label = re.match(r'\s*([^\s<]+)', after_tag)
    emotion = label.group(1).lower() if label else ""

    return before_tag.strip(), emotion, True

# ====================== Reference Data Processing ======================
class TherapyEvaluationDataset:
    """Turns a JSON file of therapy conversations into evaluation samples.

    Each sample pairs a structured prompt (problem type, accumulated user text,
    last user emotion) with the therapist reply that followed it. Processing
    can be cut short after ``subset_size`` samples.
    """

    def __init__(self, json_path, tokenizer_path=None, subset_size=None):
        """Read ``json_path``, load a tokenizer and build the samples.

        Args:
            json_path: Path to a UTF-8 JSON file containing a list of conversations.
            tokenizer_path: Location of a GPT-2 tokenizer; when falsy the stock
                'gpt2' tokenizer is loaded instead.
            subset_size: Stop once this many samples exist; a falsy value
                means no limit.
        """
        with open(json_path, 'r', encoding='utf-8') as handle:
            self._data = json.load(handle)

        # The tokenizer is stored on the instance; the methods of this class
        # do not use it themselves.
        if not tokenizer_path:
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        else:
            self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

        self.evaluation_data = []
        self.subset_size = subset_size
        self._process_evaluation_data()

    def _subset_reached(self):
        """Return True once a subset limit is set and has been met."""
        return bool(self.subset_size) and len(self.evaluation_data) >= self.subset_size

    def _process_evaluation_data(self):
        """Walk every dialog and emit one sample per therapist turn.

        Consecutive user turns are merged into a single prompt. A therapist
        turn that arrives before any user text has been collected is skipped.
        """
        print(f"Processing evaluation data (subset_size: {self.subset_size})...")

        for conversation in tqdm(self._data, desc="Processing evaluation data"):
            problem_type = conversation.get("problem_type", "").strip()

            pending_texts = []
            pending_emotions = []

            for turn in conversation.get("dialog", []):
                cleaned = clean_therapy_text(turn.get("text", ""))
                emotion = turn.get("emotion", "").strip()

                # Any speaker other than "sys" is treated as the user
                if turn.get("speaker", "") != "sys":
                    if cleaned:
                        pending_texts.append(cleaned)
                    if emotion:
                        pending_emotions.append(emotion)
                    continue

                # Therapist turn with nothing to respond to
                if not pending_texts:
                    continue

                user_text = " ".join(pending_texts)
                user_emotion = pending_emotions[-1] if pending_emotions else ""

                # Assemble the structured prompt
                prompt_parts = [f"<problem>{problem_type}"] if problem_type else []
                prompt_parts.append(f"<user>{user_text}")
                if user_emotion:
                    prompt_parts.append(f"<user_emotion>{user_emotion}")
                prompt_parts.append("<therapist>")

                self.evaluation_data.append({
                    'input_text': " ".join(prompt_parts),
                    'reference_text': clean_therapy_text(cleaned),
                    'reference_emotion': emotion,
                    'user_input': user_text,
                    'user_emotion': user_emotion,
                    'problem_type': problem_type
                })

                # Start collecting the next user segment
                pending_texts = []
                pending_emotions = []

                if self._subset_reached():
                    break

            # Leave the conversation loop as well once the limit is met
            if self._subset_reached():
                break

        print(f"Processed {len(self.evaluation_data)} evaluation samples")

    def get_evaluation_data(self):
        """Return the list of sample dicts built at construction time."""
        return self.evaluation_data

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetrics:
    """BLEU, ROUGE and METEOR scoring for a single reference/candidate pair."""

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.smoothing = SmoothingFunction().method1

    def compute_bleu(self, reference, candidate):
        """Return smoothed sentence-level BLEU on whitespace tokens.

        Returns 0.0 when either string is empty or the candidate has no tokens.
        """
        if not (candidate and reference):
            return 0.0

        hypothesis = candidate.split()
        if not hypothesis:
            return 0.0

        return sentence_bleu([reference.split()], hypothesis, smoothing_function=self.smoothing)

    def compute_rouge(self, reference, candidate):
        """Return the ROUGE-1/2/L F-measures as a dict keyed by metric name.

        All three are 0.0 when either string is empty.
        """
        metric_names = ('rouge1', 'rouge2', 'rougeL')
        if not (candidate and reference):
            return {name: 0.0 for name in metric_names}

        scored = self.rouge_scorer.score(reference, candidate)
        return {name: scored[name].fmeasure for name in metric_names}

    def compute_meteor(self, reference, candidate):
        """Return METEOR on whitespace tokens.

        Returns 0.0 when either string is empty or the candidate has no tokens.
        """
        if not (candidate and reference):
            return 0.0

        hypothesis = candidate.split()
        if not hypothesis:
            return 0.0

        return meteor_score([reference.split()], hypothesis)

# ====================== Fast Evaluation Function ======================
def evaluate_hyperparameters(model, tokenizer, evaluation_dataset, device='cuda',
                            max_new_tokens=128, top_p=0.8, top_k=0, temperature=1.0, do_sample=True,
                            verbose=False):
    """
    Fast evaluation for hyperparameter tuning

    Generates one reply per sample with the given decoding settings and averages
    BLEU/ROUGE/METEOR plus emotion accuracy over the dataset.

    Args:
        model: Trained model
        tokenizer: Model tokenizer
        evaluation_dataset: TherapyEvaluationDataset instance
        device: Device to run evaluation on
        max_new_tokens: Maximum tokens to generate
        top_p: Top-p sampling parameter
        top_k: Top-k sampling parameter
        temperature: Temperature for sampling
        do_sample: Whether to use sampling; when False the sampling parameters
                   are not passed to ``generate``
        verbose: Whether to show a progress bar

    Returns:
        Summary dictionary with the averaged metrics, 'total_samples' and the
        'top_p' / 'top_k' / 'temperature' used. Averages are 0.0 when the
        dataset is empty.
    """
    def _mean_or_zero(values):
        # np.mean([]) returns nan and emits a RuntimeWarning; report 0.0 instead.
        return np.mean(values) if len(values) > 0 else 0.0

    metrics_computer = TherapyEvaluationMetrics()
    evaluation_data = evaluation_dataset.get_evaluation_data()

    results = {
        'bleu_scores': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'meteor_scores': [],
        'emotion_accuracy': [],
        'has_emotion_tag': [],
    }

    model.eval()

    # A tokenizer without a pad token reports pad_token_id=None; fall back to
    # EOS explicitly rather than relying on generate()'s implicit fallback.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Prompt plus generated tokens must fit the model's position embeddings.
    context_size = (getattr(model.config, 'n_positions', None)
                    or getattr(model.config, 'max_position_embeddings', None))
    max_prompt_tokens = None
    if context_size:
        max_prompt_tokens = max(1, context_size - max_new_tokens)

    # Decoding arguments are identical for every sample, so build them once.
    generation_kwargs = {
        'max_new_tokens': max_new_tokens,
        'do_sample': do_sample,
        'pad_token_id': pad_token_id,
        'eos_token_id': tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling knobs have no effect on greedy decoding and only trigger warnings there.
        generation_kwargs.update(top_p=top_p, top_k=top_k, temperature=temperature)

    # Progress bar only if verbose
    progress_bar = None
    if verbose:
        progress_bar = tqdm(evaluation_data, desc=f"Eval p={top_p:.2f} k={top_k} T={temperature:.1f}")
    data_iter = progress_bar if progress_bar is not None else evaluation_data

    for sample in data_iter:
        input_text = sample['input_text']
        reference_text = sample['reference_text']
        reference_emotion = sample['reference_emotion']

        # Tokenize input
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        if max_prompt_tokens is not None and input_ids.shape[-1] > max_prompt_tokens:
            # Keep the most recent tokens: the prompt ends with the
            # <therapist> tag the reply has to continue from.
            input_ids = input_ids[:, -max_prompt_tokens:]
        input_ids = input_ids.to(device)
        attention_mask = torch.ones_like(input_ids)
        prompt_length = input_ids.shape[-1]

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                **generation_kwargs
            )

        # Decode only the new tokens; special tokens are kept because the
        # emotion tag is parsed from the decoded text.
        new_tokens = outputs[0][prompt_length:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

        # Extract model output
        predicted_text, predicted_emotion, has_emotion = extract_model_output(response)

        # Compute metrics
        bleu = metrics_computer.compute_bleu(reference_text, predicted_text)
        rouge = metrics_computer.compute_rouge(reference_text, predicted_text)
        meteor = metrics_computer.compute_meteor(reference_text, predicted_text)

        # A reply without an emotion tag counts as a wrong emotion
        emotion_correct = bool(has_emotion) and (predicted_emotion.lower() == reference_emotion.lower())

        # Store results
        results['bleu_scores'].append(bleu)
        results['rouge1_scores'].append(rouge['rouge1'])
        results['rouge2_scores'].append(rouge['rouge2'])
        results['rougeL_scores'].append(rouge['rougeL'])
        results['meteor_scores'].append(meteor)
        results['emotion_accuracy'].append(emotion_correct)
        results['has_emotion_tag'].append(has_emotion)

    if progress_bar is not None:
        progress_bar.close()

    # Compute summary statistics
    summary = {
        'avg_bleu': _mean_or_zero(results['bleu_scores']),
        'avg_rouge1': _mean_or_zero(results['rouge1_scores']),
        'avg_rouge2': _mean_or_zero(results['rouge2_scores']),
        'avg_rougeL': _mean_or_zero(results['rougeL_scores']),
        'avg_meteor': _mean_or_zero(results['meteor_scores']),
        'emotion_accuracy': _mean_or_zero(results['emotion_accuracy']),
        'emotion_tag_coverage': _mean_or_zero(results['has_emotion_tag']),
        'total_samples': len(evaluation_data),
        'top_p': top_p,
        'top_k': top_k,
        'temperature': temperature
    }

    return summary

# ====================== Model Loading Functions ======================
def load_sft_model(checkpoint_path, tokenizer_path, device='cuda'):
    """
    Rebuild the SFT GPT-2 model from a training checkpoint file.

    Args:
        checkpoint_path: Path to a checkpoint holding a 'model_state_dict' entry
        tokenizer_path: Path to the saved GPT-2 tokenizer
        device: Device the weights are mapped to and the model is moved to

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import GPT2LMHeadModel

    print(f"Loading SFT checkpoint: {checkpoint_path}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # NOTE(review): weights_only=False unpickles arbitrary objects - only load trusted checkpoints
    saved_state = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Start from base GPT-2, grow the embedding matrix to the tokenizer's
    # vocabulary size, then overwrite every weight with the fine-tuned values
    sft_model = GPT2LMHeadModel.from_pretrained('gpt2')
    vocab_size = len(tokenizer)
    sft_model.resize_token_embeddings(vocab_size)
    sft_model.load_state_dict(saved_state['model_state_dict'])
    sft_model.to(device)

    return sft_model, tokenizer

def load_rl_model(model_dir, tokenizer_path, device='cuda'):
    """
    Load an RL-trained model saved as a Hugging Face model directory.

    Args:
        model_dir: Directory passed to AutoModelForCausalLM.from_pretrained
        tokenizer_path: Path to the saved GPT-2 tokenizer
        device: Device the model is moved to

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import AutoModelForCausalLM

    print(f"Loading RL model from directory: {model_dir}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    rl_model = AutoModelForCausalLM.from_pretrained(model_dir)
    rl_model = rl_model.to(device)

    return rl_model, tokenizer

def load_model_auto(model_path, tokenizer_path, device='cuda'):
    """
    Pick the right loader from the kind of path given.

    A regular file is treated as an SFT checkpoint, a directory as an RL
    model directory.

    Returns:
        tuple: (model, tokenizer, model_type) where model_type is 'sft' or 'rl'

    Raises:
        ValueError: If model_path is neither a file nor a directory
    """
    import os

    if os.path.isfile(model_path):
        print("Detected SFT checkpoint file")
        loader, model_type = load_sft_model, 'sft'
    elif os.path.isdir(model_path):
        print("Detected RL model directory")
        loader, model_type = load_rl_model, 'rl'
    else:
        raise ValueError(f"Model path {model_path} is neither a file nor a directory")

    model, tokenizer = loader(model_path, tokenizer_path, device)
    return model, tokenizer, model_type

# ====================== Hyperparameter Tuning Functions ======================
def compute_combined_score(results, weights=None):
    """
    Collapse several evaluation metrics into one weighted score.

    Args:
        results: Dictionary mapping metric names to numeric values
        weights: Dictionary mapping metric names to weights; the built-in
            defaults are used when this is None

    Returns:
        Weighted sum over the metrics present in both dictionaries
        (higher is better). Metrics missing from results are ignored.
    """
    default_weights = {
        'avg_bleu': 0.2,
        'avg_rouge1': 0.15,
        'avg_rouge2': 0.15,
        'avg_rougeL': 0.15,
        'avg_meteor': 0.2,
        'emotion_accuracy': 0.1,
        'emotion_tag_coverage': 0.05
    }
    active_weights = default_weights if weights is None else weights

    total = 0.0
    for metric_name in active_weights:
        if metric_name not in results:
            continue
        total += results[metric_name] * active_weights[metric_name]

    return total

def hyperparameter_search(model_path, tokenizer_path, test_data_path, device='cuda',
                         top_p_values=None, top_k_values=None, temperature_values=None, subset_size=100,
                         max_new_tokens=128, output_dir=None, weights=None):
    """
    Perform grid search over top_p, top_k, and temperature values

    Every combination is evaluated on the same dataset subset, scored with
    compute_combined_score, and the highest-scoring combination is reported.

    Args:
        model_path: Path to trained model
        tokenizer_path: Path to tokenizer
        test_data_path: Path to test JSON data
        device: Device to run evaluation on
        top_p_values: List of top_p values to test
        top_k_values: List of top_k values to test
        temperature_values: List of temperature values to test
        subset_size: Number of samples to use for evaluation
        max_new_tokens: Maximum tokens to generate
        output_dir: Directory to save results (pickle + CSV); nothing is
            written when this is falsy
        weights: Weights for combining metrics; defaults are used when None

    Returns:
        Dictionary with all results and best parameters

    Raises:
        ValueError: If any of the three value lists is empty
    """
    from itertools import product

    # Default hyperparameter values
    if top_p_values is None:
        top_p_values = [0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

    if top_k_values is None:
        top_k_values = [0, 5, 10, 20, 40, 50]  # 0 means no top_k filtering

    if temperature_values is None:
        temperature_values = [0.8, 1.0, 1.1, 1.2, 1.3, 1.4]

    search_space = {
        'top_p_values': top_p_values,
        'top_k_values': top_k_values,
        'temperature_values': temperature_values
    }

    # Fail fast: an empty axis means zero combinations, which would otherwise
    # only surface as a crash after the (expensive) model load
    for axis_name, axis_values in search_space.items():
        if len(axis_values) == 0:
            raise ValueError(f"{axis_name} must contain at least one value")

    # Resolve the weights once so the weights used for scoring and the weights
    # recorded in the results can never disagree (e.g. for weights={})
    if weights is None:
        weights = {
            'avg_bleu': 0.2,
            'avg_rouge1': 0.15,
            'avg_rouge2': 0.15,
            'avg_rougeL': 0.15,
            'avg_meteor': 0.2,
            'emotion_accuracy': 0.1,
            'emotion_tag_coverage': 0.05
        }

    # Load model and tokenizer
    print(f"Loading model from {model_path}...")
    model, tokenizer, model_type = load_model_auto(model_path, tokenizer_path, device)
    print(f"Loaded {model_type.upper()} model successfully")

    # Load evaluation dataset (subset)
    print(f"Loading evaluation data subset (size: {subset_size})...")
    eval_dataset = TherapyEvaluationDataset(test_data_path, tokenizer_path, subset_size=subset_size)

    # Initialize results storage; -inf lets any real score become the first best
    all_results = []
    best_score = float('-inf')
    best_params = None
    best_result = None

    # Same iteration order as nested loops: top_p outermost, temperature innermost
    grid = list(product(top_p_values, top_k_values, temperature_values))
    total_combinations = len(grid)
    print(f"Testing {total_combinations} hyperparameter combinations...")
    print(f"top_p values: {top_p_values}")
    print(f"top_k values: {top_k_values}")
    print(f"temperature values: {temperature_values}")
    print(f"Subset size: {subset_size} samples")

    # Grid search
    for combination_count, (top_p, top_k, temperature) in enumerate(grid, 1):
        print(f"\n[{combination_count}/{total_combinations}] Testing top_p={top_p}, top_k={top_k}, temp={temperature}")

        # Evaluate with current hyperparameters
        results = evaluate_hyperparameters(
            model, tokenizer, eval_dataset, device,
            max_new_tokens=max_new_tokens,
            top_p=top_p, top_k=top_k, temperature=temperature,
            verbose=False
        )

        # Compute combined score
        combined_score = compute_combined_score(results, weights)
        results['combined_score'] = combined_score

        # Store results
        all_results.append(results)

        # Check if this is the best so far (strict '>' keeps the earliest on ties)
        if combined_score > best_score:
            best_score = combined_score
            best_params = {'top_p': top_p, 'top_k': top_k, 'temperature': temperature}
            best_result = results
            print(f"  *** NEW BEST *** Score: {combined_score:.4f}")

        # Print current results
        print(f"  BLEU: {results['avg_bleu']:.4f}, "
              f"ROUGE-1: {results['avg_rouge1']:.4f}, "
              f"ROUGE-2: {results['avg_rouge2']:.4f}, "
              f"ROUGE-L: {results['avg_rougeL']:.4f}, "
              f"METEOR: {results['avg_meteor']:.4f}, "
              f"Emotion: {results['emotion_accuracy']:.4f}, "
              f"Combined: {combined_score:.4f}")

    # One timestamp for both the stored record and the output file names
    finished_at = datetime.now()

    # Compile final results
    final_results = {
        'best_params': best_params,
        'best_score': best_score,
        'all_results': all_results,
        'model_path': model_path,
        'model_type': model_type,
        'subset_size': subset_size,
        'search_space': search_space,
        'weights': weights,
        'timestamp': finished_at.isoformat()
    }

    # Print summary
    print(f"\n{'='*60}")
    print("HYPERPARAMETER SEARCH COMPLETED")
    print(f"{'='*60}")

    if best_result is None:
        # Only reachable when no score compared greater than -inf, e.g. every
        # combined score was NaN; still save and return what was collected
        print("No combination produced a comparable combined score")
    else:
        print(f"Best parameters: top_p={best_params['top_p']}, top_k={best_params['top_k']}, temperature={best_params['temperature']}")
        print(f"Best combined score: {best_score:.4f}")

        print(f"\nBest result details:")
        print(f"  BLEU Score: {best_result['avg_bleu']:.4f}")
        print(f"  ROUGE-1: {best_result['avg_rouge1']:.4f}")
        print(f"  ROUGE-2: {best_result['avg_rouge2']:.4f}")
        print(f"  ROUGE-L: {best_result['avg_rougeL']:.4f}")
        print(f"  METEOR: {best_result['avg_meteor']:.4f}")
        print(f"  Emotion Accuracy: {best_result['emotion_accuracy']:.4f}")
        print(f"  Emotion Coverage: {best_result['emotion_tag_coverage']:.4f}")
        print(f"  Combined Score: {best_result['combined_score']:.4f}")

    # Save results if output directory specified
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        timestamp = finished_at.strftime("%Y%m%d_%H%M%S")
        results_file = os.path.join(output_dir, f"hyperparameter_search_{timestamp}.pkl")

        with open(results_file, 'wb') as f:
            pickle.dump(final_results, f)

        print(f"\nResults saved to: {results_file}")

        # Also save a summary CSV
        import pandas as pd
        df_results = pd.DataFrame(all_results)
        csv_file = os.path.join(output_dir, f"hyperparameter_results_{timestamp}.csv")
        df_results.to_csv(csv_file, index=False)
        print(f"CSV summary saved to: {csv_file}")

    return final_results

def analyze_results(results_file):
    """
    Summarise a saved hyperparameter search.

    Prints the top five combinations by combined score and, for each
    individual metric, the combination that maximised it.

    Args:
        results_file: Path to a pickle holding 'all_results', 'model_path'
            and 'timestamp' entries

    Returns:
        pandas DataFrame with one row per tested combination
    """
    # NOTE(review): pickle.load can execute arbitrary code - only open trusted result files
    with open(results_file, 'rb') as handle:
        search_output = pickle.load(handle)

    import pandas as pd

    # One row per hyperparameter combination
    df = pd.DataFrame(search_output['all_results'])

    print("Hyperparameter Search Analysis")
    print("=" * 50)
    print(f"Model: {search_output['model_path']}")
    print(f"Search completed: {search_output['timestamp']}")
    print(f"Total combinations tested: {len(df)}")

    per_metric_columns = ['avg_bleu', 'avg_rouge1', 'avg_rouge2', 'avg_rougeL',
                          'avg_meteor', 'emotion_accuracy']
    leaderboard_columns = ['top_p', 'top_k', 'temperature'] + per_metric_columns + ['combined_score']

    # Top 5 results
    print("\nTop 5 Results:")
    leaderboard = df.nlargest(5, 'combined_score')
    print(leaderboard[leaderboard_columns].to_string(index=False))

    print("\nBest parameters for each metric:")
    for metric in per_metric_columns:
        winner_index = df[metric].idxmax()
        best_row = df.loc[winner_index]
        print(f"  {metric}: top_p={best_row['top_p']}, top_k={best_row['top_k']}, temp={best_row['temperature']} (score: {best_row[metric]:.4f})")

    return df

# ====================== Usage Example ======================
if __name__ == "__main__":
    # Locations of the checkpoint, tokenizer, test data and output folder
    MODEL_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt"
    TOKENIZER_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer"
    TEST_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/test.json"
    OUTPUT_DIR = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/Evaluation2/SFThyperparameter_search1"
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Grid to explore: 8 x 8 x 4 = 256 combinations
    TOP_P_VALUES = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
    TOP_K_VALUES = [0, 5, 10, 15, 20, 25, 30, 35]
    TEMPERATURE_VALUES = [1.0, 1.1, 1.2, 1.3]

    # Number of evaluation samples scored per combination
    SUBSET_SIZE = 100

    # None selects the default metric weights
    CUSTOM_WEIGHTS = None

    # Run hyperparameter search
    search_kwargs = {
        'model_path': MODEL_PATH,
        'tokenizer_path': TOKENIZER_PATH,
        'test_data_path': TEST_DATA_PATH,
        'device': DEVICE,
        'top_p_values': TOP_P_VALUES,
        'top_k_values': TOP_K_VALUES,
        'temperature_values': TEMPERATURE_VALUES,
        'subset_size': SUBSET_SIZE,
        'output_dir': OUTPUT_DIR,
        'weights': CUSTOM_WEIGHTS,
    }
    results = hyperparameter_search(**search_kwargs)

    # Report the winning combination
    print("\nBest hyperparameters found:")
    for param_name in ('top_p', 'top_k', 'temperature'):
        print(f"{param_name}: {results['best_params'][param_name]}")
    print(f"Score: {results['best_score']:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Loading model from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt...
Detected SFT checkpoint file
Loading SFT checkpoint: /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Loaded SFT model successfully
Loading evaluation data subset (size: 100)...
Processing evaluation data (subset_size: 100)...


Processing evaluation data:  24%|██▎       | 24/102 [00:00<00:00, 5276.41it/s]

Processed 100 evaluation samples
Testing 256 hyperparameter combinations...
top_p values: [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
top_k values: [0, 5, 10, 15, 20, 25, 30, 35]
temperature values: [1.0, 1.1, 1.2, 1.3]
Subset size: 100 samples

[1/256] Testing top_p=0.6, top_k=0, temp=1.0





  *** NEW BEST *** Score: 0.2035
  BLEU: 0.0153, ROUGE-1: 0.1341, ROUGE-2: 0.0313, ROUGE-L: 0.1273, METEOR: 0.0627, Emotion: 0.9400, Combined: 0.2035

[2/256] Testing top_p=0.6, top_k=0, temp=1.1
  BLEU: 0.0148, ROUGE-1: 0.1206, ROUGE-2: 0.0162, ROUGE-L: 0.1143, METEOR: 0.0577, Emotion: 0.8900, Combined: 0.1906

[3/256] Testing top_p=0.6, top_k=0, temp=1.2
  BLEU: 0.0136, ROUGE-1: 0.1116, ROUGE-2: 0.0209, ROUGE-L: 0.1042, METEOR: 0.0576, Emotion: 0.8300, Combined: 0.1802

[4/256] Testing top_p=0.6, top_k=0, temp=1.3
  BLEU: 0.0078, ROUGE-1: 0.0800, ROUGE-2: 0.0017, ROUGE-L: 0.0761, METEOR: 0.0520, Emotion: 0.7700, Combined: 0.1571

[5/256] Testing top_p=0.6, top_k=5, temp=1.0
  BLEU: 0.0155, ROUGE-1: 0.1291, ROUGE-2: 0.0251, ROUGE-L: 0.1256, METEOR: 0.0593, Emotion: 0.9700, Combined: 0.2034

[6/256] Testing top_p=0.6, top_k=5, temp=1.1
  BLEU: 0.0130, ROUGE-1: 0.1354, ROUGE-2: 0.0187, ROUGE-L: 0.1275, METEOR: 0.0608, Emotion: 0.9300, Combined: 0.2000

[7/256] Testing top_p=0.6, top_k=5

SFT Evaluation

In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer
from datasets import Dataset as HFDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict

# Make sure the NLTK resources used by the metrics are available locally,
# downloading a package only when its lookup fails
for _resource_path, _package_name in (('tokenizers/punkt', 'punkt'),
                                      ('corpora/wordnet', 'wordnet')):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """
    Strip trailing descriptive annotations from an utterance.

    Everything from the first case-insensitive "The speaker ..." or
    "The emotion state ..." sentence onward is dropped, provided a period
    follows it; surrounding whitespace is trimmed in every case.
    """
    description = re.search(
        r'\s*(The (?:speaker|emotion state)[^.]*\.(?:[^.]*\.)*)',
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if description is None:
        return text.strip()

    # Keep only what precedes the annotation (and its leading whitespace)
    kept = text[:description.start()]
    return kept.strip()

# ====================== Model Format Detection ======================
def detect_model_format(model, tokenizer, device='cuda', test_inputs=None):
    """
    Test model output format (should be consistent since both SFT and RL use same format)

    Each probe prompt is completed once with sampling (top_p=0.8, up to 128
    new tokens) and the decoded continuation is checked for the literal
    '<therapist_emotion>' tag. Because generation is sampled, the result can
    vary between calls.

    Args:
        model: The model to test
        tokenizer: Model tokenizer
        device: Device to run test on
        test_inputs: List of test input strings, uses defaults if None

    Returns:
        str: 'standard' if at least half of the probes produced the
        <therapist_emotion> tag, 'unknown' otherwise (including when
        test_inputs is empty, since there is then no evidence either way)
    """
    if test_inputs is None:
        test_inputs = [
            "<problem>anxiety <user>I'm worried about work <user_emotion>anxiety <therapist>",
            "<problem>depression <user>I feel very sad <user_emotion>sadness <therapist>",
            "<problem>relationship <user>My partner doesn't understand me <user_emotion>anger <therapist>"
        ]

    total_tests = len(test_inputs)
    if total_tests == 0:
        # Without this guard the threshold check below is 0 >= 0 and an empty
        # probe list would be reported as 'standard'
        return 'unknown'

    emotion_tag_count = 0

    model.eval()
    with torch.no_grad():
        for test_input in test_inputs:
            # Tokenize and generate
            input_ids = tokenizer.encode(test_input, return_tensors='pt').to(device)
            outputs = model.generate(
                input_ids,
                max_new_tokens=128,
                do_sample=True,
                top_p=0.8,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode only the newly generated tokens; special tokens are kept
            # so the emotion tag survives decoding
            new_tokens = outputs[0][len(input_ids[0]):]
            response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

            # Check for emotion tag
            if '<therapist_emotion>' in response:
                emotion_tag_count += 1

    if emotion_tag_count >= total_tests * 0.5:
        return 'standard'
    return 'unknown'

# ====================== Model Output Extraction ======================
def extract_model_output(response):
    """
    Split a raw generation into therapist text and predicted emotion.

    Expected layout for both SFT and RL models:
    'text <therapist_emotion> emotion<eos>'

    Returns:
        therapist_text (str): Text before the first <therapist_emotion> tag,
            or the whole cleaned response when the tag is absent
        emotion (str): Lower-cased first word after the tag ("" if none)
        has_emotion_tag (bool): Whether the tag was found
    """
    eos_marker = '<eos>'
    emotion_tag = '<therapist_emotion>'

    cleaned = response.strip()

    # Drop a trailing end-of-sequence marker
    if cleaned.endswith(eos_marker):
        cleaned = cleaned[:-len(eos_marker)].strip()

    # Split around the first occurrence of the tag
    before_tag, found_tag, after_tag = cleaned.partition(emotion_tag)

    if not found_tag:
        # No tag: the whole response counts as therapist text
        return cleaned, "", False

    # Only the first whitespace-delimited word after the tag is the emotion
    words_after_tag = after_tag.strip().split()
    if words_after_tag:
        emotion = words_after_tag[0].lower()
    else:
        emotion = ""

    return before_tag.strip(), emotion, True

def extract_emotion_only(response):
    """Return just the predicted emotion, or None when the response has no emotion tag"""
    parsed = extract_model_output(response)
    tag_found = parsed[2]
    if not tag_found:
        return None
    return parsed[1]

# ====================== Reference Data Processing ======================
class TherapyEvaluationDataset:
    """
    Converts raw therapy conversations into prompt/reference evaluation samples.

    Each therapist ("sys") turn that follows at least one non-empty user
    utterance yields one sample: the buffered user text becomes the prompt
    and the therapist turn supplies the reference text and emotion.
    """

    def __init__(self, json_path, tokenizer_path=None):
        """
        Load the conversations from json_path and build the samples.

        The tokenizer comes from tokenizer_path when one is given, otherwise
        the stock 'gpt2' tokenizer is used.
        """
        with open(json_path, 'r', encoding='utf-8') as source:
            self._data = json.load(source)

        if not tokenizer_path:
            # Use default tokenizer if path not provided
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        else:
            self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

        self.evaluation_data = []
        self._process_evaluation_data()

    def _process_evaluation_data(self):
        """Walk every dialog and append one sample per answerable therapist turn"""
        for conversation in tqdm(self._data, desc="Processing evaluation data"):
            problem_type = conversation.get("problem_type", "").strip()
            turns = conversation.get("dialog", [])

            # User text/emotions seen since the last emitted sample
            pending_texts = []
            pending_emotions = []

            for turn in turns:
                speaker = turn.get("speaker", "")
                text = clean_therapy_text(turn.get("text", ""))
                emotion = turn.get("emotion", "").strip()

                if speaker != "sys":
                    # User turn: buffer whatever is non-empty and move on
                    if text:
                        pending_texts.append(text)
                    if emotion:
                        pending_emotions.append(emotion)
                    continue

                # Therapist turn with nothing to respond to is skipped
                if not pending_texts:
                    continue

                user_input = " ".join(pending_texts)
                user_emotion = pending_emotions[-1] if pending_emotions else ""

                # Assemble the prompt; optional segments appear only when non-empty
                prompt_parts = [f"<problem>{problem_type}"] if problem_type else []
                prompt_parts.append(f"<user>{user_input}")
                if user_emotion:
                    prompt_parts.append(f"<user_emotion>{user_emotion}")
                prompt_parts.append("<therapist>")

                self.evaluation_data.append({
                    'input_text': " ".join(prompt_parts),
                    'reference_text': clean_therapy_text(text),
                    'reference_emotion': emotion,
                    'user_input': user_input,
                    'user_emotion': user_emotion,
                    'problem_type': problem_type
                })

                # Start a fresh buffer for the next exchange
                pending_texts = []
                pending_emotions = []

        print(f"Processed {len(self.evaluation_data)} evaluation samples")

    def get_evaluation_data(self):
        """Return the list of prepared evaluation samples"""
        return self.evaluation_data

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetrics:
    """Text-overlap metrics (BLEU, ROUGE, METEOR) between a reference and a candidate"""

    def __init__(self):
        """Create the ROUGE scorer (with stemming) and the BLEU smoothing function"""
        rouge_variants = ['rouge1', 'rouge2', 'rougeL']
        self.rouge_scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)
        self.smoothing = SmoothingFunction().method1

    def compute_bleu(self, reference, candidate):
        """Smoothed sentence BLEU on whitespace tokens; 0.0 when either side is empty"""
        if not (candidate and reference):
            return 0.0

        hypothesis = candidate.split()
        if not hypothesis:
            # Candidate contained only whitespace
            return 0.0

        references = [reference.split()]
        return sentence_bleu(references, hypothesis, smoothing_function=self.smoothing)

    def compute_rouge(self, reference, candidate):
        """ROUGE-1/2/L F-measures as a dict; all 0.0 when either side is empty"""
        variants = ('rouge1', 'rouge2', 'rougeL')

        if not (candidate and reference):
            return {variant: 0.0 for variant in variants}

        scores = self.rouge_scorer.score(reference, candidate)
        return {variant: scores[variant].fmeasure for variant in variants}

    def compute_meteor(self, reference, candidate):
        """METEOR on whitespace tokens; 0.0 when either side is empty"""
        if not (candidate and reference):
            return 0.0

        hypothesis = candidate.split()
        if not hypothesis:
            # Candidate contained only whitespace
            return 0.0

        references = [reference.split()]
        return meteor_score(references, hypothesis)

# ====================== Model Evaluation ======================
def evaluate_model(model, tokenizer, evaluation_dataset, device='cuda', max_new_tokens=128,
                  top_p=0.6, top_k=30, temperature=1.0, do_sample=True):
    """
    Evaluate model on therapy dataset

    For every sample the model generates a continuation of the prompt, the
    continuation is split into therapist text and emotion, and BLEU, ROUGE,
    METEOR and emotion correctness are recorded against the reference.

    Args:
        model: Trained model
        tokenizer: Model tokenizer
        evaluation_dataset: TherapyEvaluationDataset instance
        device: Device to run evaluation on
        max_new_tokens: Maximum tokens to generate
        top_p: Top-p sampling parameter
        top_k: Top-k sampling parameter
        temperature: Sampling temperature
        do_sample: Whether to use sampling

    Returns:
        Dictionary containing evaluation results: per-sample score lists,
        'sample_outputs' (first 10 samples), 'summary' (means and sample
        count), 'emotion_confusion_matrix' and 'detected_format'
    """
    metrics_computer = TherapyEvaluationMetrics()
    evaluation_data = evaluation_dataset.get_evaluation_data()

    results = {
        'bleu_scores': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'meteor_scores': [],
        'emotion_accuracy': [],
        'has_emotion_tag': [],
        'sample_outputs': []
    }

    # reference emotion -> predicted emotion (or 'no_prediction') -> count
    emotion_confusion = defaultdict(lambda: defaultdict(int))

    model.eval()

    # Detect model output format (runs a few sampled generations of its own
    # before the evaluation loop starts)
    detected_format = detect_model_format(model, tokenizer, device)
    print(f"Output format check: {detected_format.upper()}")

    print(f"Evaluating model on {len(evaluation_data)} samples...")

    progress_bar = tqdm(evaluation_data, desc="Evaluating",
                       bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')

    for i, sample in enumerate(progress_bar):
        input_text = sample['input_text']
        reference_text = sample['reference_text']
        reference_emotion = sample['reference_emotion']

        # Tokenize input
        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode only the tokens generated after the prompt; special tokens
        # are kept so the <therapist_emotion> tag can be parsed
        new_tokens = outputs[0][len(input_ids[0]):]
        response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

        # Extract model output
        predicted_text, predicted_emotion, has_emotion = extract_model_output(response)

        # Compute text generation metrics
        bleu = metrics_computer.compute_bleu(reference_text, predicted_text)
        rouge = metrics_computer.compute_rouge(reference_text, predicted_text)
        meteor = metrics_computer.compute_meteor(reference_text, predicted_text)

        # Compute emotion accuracy (case-insensitive; a missing tag counts as wrong)
        emotion_correct = (predicted_emotion.lower() == reference_emotion.lower()) if has_emotion else False

        # Store results
        results['bleu_scores'].append(bleu)
        results['rouge1_scores'].append(rouge['rouge1'])
        results['rouge2_scores'].append(rouge['rouge2'])
        results['rougeL_scores'].append(rouge['rougeL'])
        results['meteor_scores'].append(meteor)
        results['emotion_accuracy'].append(emotion_correct)
        results['has_emotion_tag'].append(has_emotion)

        # Update the confusion counts for this reference emotion
        if has_emotion:
            emotion_confusion[reference_emotion.lower()][predicted_emotion.lower()] += 1
        else:
            emotion_confusion[reference_emotion.lower()]['no_prediction'] += 1

        # Keep full details for the first 10 samples only
        if i < 10:
            results['sample_outputs'].append({
                'input': input_text,
                'reference_text': reference_text,
                'predicted_text': predicted_text,
                'reference_emotion': reference_emotion,
                'predicted_emotion': predicted_emotion,
                'full_response': response,
                'bleu': bleu,
                'rouge1': rouge['rouge1'],
                'meteor': meteor
            })

        # Progress reporting: per-sample info on the first 5 samples and every
        # 20th sample, running averages on all other samples
        if (i + 1) % 20 == 0 or i < 5:
            sample_info = f"Sample {i+1}: BLEU={bleu:.3f}, Emotion={'✓' if emotion_correct else '✗'}"
            progress_bar.set_description(f"Evaluating - {sample_info}")

            # NOTE(review): this check is nested inside the branch above, so the
            # "% 50" condition can only fire when (i + 1) is also a multiple of
            # 20, i.e. every 100th sample - confirm that is intended
            if (i + 1) % 50 == 0 or i < 3:
                print(f"\n" + "="*80)
                print(f"LIVE SAMPLE {i+1}/{len(evaluation_data)}")
                print("="*80)
                print(f"Input: {input_text[:120]}...")
                print(f"Reference: {reference_text[:100]}...")
                print(f"Generated: {predicted_text[:100]}...")
                print(f"Full Response: {response[:150]}...")
                print(f"Ref Emotion: {reference_emotion} | Pred Emotion: {predicted_emotion} | Match: {'✓' if emotion_correct else '✗'}")
                print(f"BLEU: {bleu:.4f} | ROUGE-1: {rouge['rouge1']:.4f} | METEOR: {meteor:.4f}")
                print("="*80)
        else:
            if i > 0:
                avg_bleu = np.mean(results['bleu_scores'])
                avg_emotion_acc = np.mean(results['emotion_accuracy'])
                progress_bar.set_description(f"Evaluating - Avg BLEU: {avg_bleu:.3f}, Emotion Acc: {avg_emotion_acc:.3f}")

    progress_bar.close()

    # Compute summary statistics (means over all evaluated samples)
    results['summary'] = {
        'avg_bleu': np.mean(results['bleu_scores']),
        'avg_rouge1': np.mean(results['rouge1_scores']),
        'avg_rouge2': np.mean(results['rouge2_scores']),
        'avg_rougeL': np.mean(results['rougeL_scores']),
        'avg_meteor': np.mean(results['meteor_scores']),
        'emotion_accuracy': np.mean(results['emotion_accuracy']),
        'emotion_tag_coverage': np.mean(results['has_emotion_tag']),
        'total_samples': len(evaluation_data)
    }

    # Only the outer mapping is converted to a plain dict; the inner
    # per-reference mappings remain defaultdicts
    results['emotion_confusion_matrix'] = dict(emotion_confusion)
    results['detected_format'] = detected_format

    # Print final progress summary
    print(f"\n{'='*60}")
    print(f"EVALUATION COMPLETED!")
    print(f"{'='*60}")
    print(f"Processed {len(evaluation_data)} samples")
    print(f"Average BLEU: {results['summary']['avg_bleu']:.4f}")
    print(f"Average ROUGE-1: {results['summary']['avg_rouge1']:.4f}")
    print(f"Emotion Accuracy: {results['summary']['emotion_accuracy']:.4f}")
    print(f"Emotion Tag Coverage: {results['summary']['emotion_tag_coverage']:.4f}")
    print(f"{'='*60}")

    return results

def print_evaluation_results(results):
    """
    Pretty-print an evaluation report.

    Reads results['summary'] and results['sample_outputs']; 'model_type' and
    'detected_format' are optional and shown as UNKNOWN when absent. At most
    the first three sample outputs are printed.
    """
    summary = results['summary']
    divider = "=" * 60

    print("\n" + divider)
    print("THERAPY MODEL EVALUATION RESULTS")
    print(divider)
    print(f"Model Type: {results.get('model_type', 'Unknown').upper()}")
    print(f"Output Format: {results.get('detected_format', 'Unknown').upper()}")

    # Each entry pairs a pre-padded label with the summary key it reports
    text_metric_rows = (
        ("  BLEU Score:     ", 'avg_bleu'),
        ("  ROUGE-1:        ", 'avg_rouge1'),
        ("  ROUGE-2:        ", 'avg_rouge2'),
        ("  ROUGE-L:        ", 'avg_rougeL'),
        ("  METEOR:         ", 'avg_meteor'),
    )
    emotion_rows = (
        ("  Emotion Accuracy:    ", 'emotion_accuracy'),
        ("  Emotion Tag Coverage: ", 'emotion_tag_coverage'),
    )

    print("\nText Generation Metrics:")
    for label, key in text_metric_rows:
        print(f"{label}{summary[key]:.4f}")

    print("\nEmotion Prediction:")
    for label, key in emotion_rows:
        print(f"{label}{summary[key]:.4f}")

    print("\nDataset Info:")
    print(f"  Total Samples:  {summary['total_samples']}")

    print("\nSample Outputs:")
    shown_samples = results['sample_outputs'][:3]
    for position, sample in enumerate(shown_samples, start=1):
        print(f"\n  Sample {position}:")
        print(f"    Input: {sample['input'][:100]}...")
        print(f"    Reference: {sample['reference_text'][:80]}...")
        print(f"    Predicted: {sample['predicted_text'][:80]}...")
        print(f"    Ref Emotion: {sample['reference_emotion']}")
        print(f"    Pred Emotion: {sample['predicted_emotion']}")
        print(f"    BLEU: {sample['bleu']:.3f}, ROUGE-1: {sample['rouge1']:.3f}")

# ====================== Model Loading Functions ======================
def load_sft_model(checkpoint_path, tokenizer_path, device='cuda'):
    """
    Restore the SFT model from a checkpoint file and report its training info.

    Args:
        checkpoint_path: Path to a checkpoint holding a 'model_state_dict' entry
        tokenizer_path: Path to the saved GPT-2 tokenizer
        device: Device the weights are mapped to and the model is moved to

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import GPT2LMHeadModel

    print(f"Loading SFT checkpoint: {checkpoint_path}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # NOTE(review): weights_only=False unpickles arbitrary objects - only load trusted checkpoints
    state = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Base GPT-2 with embeddings resized to the tokenizer length, then the
    # fine-tuned weights loaded on top
    restored = GPT2LMHeadModel.from_pretrained('gpt2')
    restored.resize_token_embeddings(len(tokenizer))
    restored.load_state_dict(state['model_state_dict'])
    restored.to(device)

    # Training metadata is optional in the checkpoint
    if 'epoch' in state:
        epoch = state['epoch']
        valid_loss = state.get('valid_loss', 'N/A')
        print(f"Checkpoint info: Epoch {epoch}, Loss {valid_loss}")

    return restored, tokenizer

def load_rl_model(model_dir, tokenizer_path, device='cuda'):
    """
    Load the RL model saved as a model directory

    Args:
        model_dir: Directory passed to AutoModelForCausalLM.from_pretrained
        tokenizer_path: Path the GPT2Tokenizer is loaded from
        device: Device the model is moved onto

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import AutoModelForCausalLM

    print(f"Loading RL model from directory: {model_dir}")

    rl_tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    rl_model = AutoModelForCausalLM.from_pretrained(model_dir)
    rl_model = rl_model.to(device)

    return rl_model, rl_tokenizer

def load_model_auto(model_path, tokenizer_path, device='cuda'):
    """
    Pick the right loader from what model_path points at

    A regular file is treated as an SFT checkpoint, a directory as an RL model.

    Args:
        model_path: Path to model (file for SFT, directory for RL)
        tokenizer_path: Path to tokenizer
        device: Device to load model on

    Returns:
        tuple: (model, tokenizer, model_type) where model_type is 'sft' or 'rl'

    Raises:
        ValueError: If model_path is neither an existing file nor a directory
    """
    import os

    is_checkpoint_file = os.path.isfile(model_path)

    # Reject unusable paths up front so the two loaders below stay flat
    if not is_checkpoint_file and not os.path.isdir(model_path):
        raise ValueError(f"Model path {model_path} is neither a file nor a directory")

    if is_checkpoint_file:
        print("Detected SFT checkpoint file")
        loaded_model, loaded_tokenizer = load_sft_model(model_path, tokenizer_path, device)
        kind = 'sft'
    else:
        print("Detected RL model directory")
        loaded_model, loaded_tokenizer = load_rl_model(model_path, tokenizer_path, device)
        kind = 'rl'

    return loaded_model, loaded_tokenizer, kind

# ====================== Main Evaluation Function ======================
def run_evaluation(model_path, tokenizer_path, test_data_path, device='cuda'):
    """
    Load a model, evaluate it on the test set and print the report

    Args:
        model_path: Path to trained model (file for SFT, directory for RL)
        tokenizer_path: Path to tokenizer
        test_data_path: Path to test JSON data
        device: Device to run evaluation on

    Returns:
        The results object produced by evaluate_model, with a 'model_type'
        entry ('sft' or 'rl') added
    """
    # Step 1: model + tokenizer (loader type is auto-detected from the path)
    print(f"Loading model from {model_path}...")
    loaded = load_model_auto(model_path, tokenizer_path, device)
    model, tokenizer, model_type = loaded
    print(f"Loaded {model_type.upper()} model successfully")

    # Step 2: evaluation samples
    print(f"Loading evaluation data from {test_data_path}...")
    dataset = TherapyEvaluationDataset(test_data_path, tokenizer_path)

    # Step 3: score, tag with the model type, report
    report = evaluate_model(model, tokenizer, dataset, device=device)
    report['model_type'] = model_type
    print_evaluation_results(report)

    return report

if __name__ == "__main__":
    # Script entry point: evaluate one model and pickle the results to Drive.
    import os
    import pickle

    MODEL_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt"  # Will auto-detect type
    TOKENIZER_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer"
    TEST_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/test.json"
    RESULTS_PATH = '/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/Evaluation2/0.6SFT_evaluation_results.pkl'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    results = run_evaluation(MODEL_PATH, TOKENIZER_PATH, TEST_DATA_PATH, DEVICE)

    # Create the destination folder first so a missing directory cannot
    # discard the results of a finished evaluation run
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    with open(RESULTS_PATH, 'wb') as f:
        pickle.dump(results, f)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Loading model from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt...
Detected SFT checkpoint file
Loading SFT checkpoint: /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Checkpoint info: Epoch 7, Loss 2.237253785133362
Loaded SFT model successfully
Loading evaluation data from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/test.json...


Processing evaluation data: 100%|██████████| 102/102 [00:00<00:00, 6576.97it/s]


Processed 454 evaluation samples
Output format check: STANDARD
Evaluating model on 454 samples...


Evaluating - Sample 1: BLEU=0.000, Emotion=✓:   0%|          | 1/454 [00:02<21:26,  2.84s/it]


LIVE SAMPLE 1/454
Input: <problem>Breakups or Divorce <user>I mean, shit, don't you know that men are the new women? Obsessed with weddings and c...
Reference: But I thought you said...
Generated: What is it?...
Full Response: What is it? <therapist_emotion> neutral<eos>...
Ref Emotion: neutral | Pred Emotion: neutral | Match: ✓
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating - Sample 4: BLEU=0.000, Emotion=✓:   1%|          | 4/454 [00:03<04:00,  1.87it/s]


LIVE SAMPLE 2/454
Input: <problem>Breakups or Divorce <user>Well, we were watching tv.But before that we were having dinner and he had this look ...
Reference: So you were the one who came up with it....
Generated: Well, maybe that's the way it's going to go....
Full Response: Well, maybe that's the way it's going to go. <therapist_emotion> neutral<eos>...
Ref Emotion: neutral | Pred Emotion: neutral | Match: ✓
BLEU: 0.0215 | ROUGE-1: 0.1905 | METEOR: 0.0505

LIVE SAMPLE 3/454
Input: <problem>Breakups or Divorce <user>With what? <user_emotion>neutral <therapist>...
Reference: The ultimatum....
Generated: Well,...
Full Response: Well, <therapist_emotion> neutral<eos>...
Ref Emotion: neutral | Pred Emotion: neutral | Match: ✓
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating - Avg BLEU: 0.011, Emotion Acc: 0.922:  22%|██▏       | 102/454 [00:14<00:38,  9.16it/s]


LIVE SAMPLE 100/454
Input: <problem>Self-Assessment or Assessment of Others <user>I don't know. <user_emotion>depression <therapist>...
Reference: You know We have these two sides to our personalities,...
Generated: I think it's important to know that you're not alone....
Full Response: I think it's important to know that you're not alone. <therapist_emotion> neutral<eos>...
Ref Emotion: neutral | Pred Emotion: neutral | Match: ✓
BLEU: 0.0251 | ROUGE-1: 0.2727 | METEOR: 0.1000


Evaluating - Avg BLEU: 0.011, Emotion Acc: 0.945:  44%|████▍     | 201/454 [00:26<00:29,  8.65it/s]


LIVE SAMPLE 200/454
Input: <problem>Emotional Transference or Infidelity <user>Yeah, I guess you don't really follow that world. I don't really fol...
Reference: I remember last week that you mentioned she was wealthy....
Generated: And what about you?...
Full Response: And what about you? <therapist_emotion> neutral<eos>...
Ref Emotion: neutral | Pred Emotion: neutral | Match: ✓
BLEU: 0.0000 | ROUGE-1: 0.1429 | METEOR: 0.0000


Evaluating - Avg BLEU: 0.012, Emotion Acc: 0.954:  66%|██████▋   | 301/454 [00:38<00:15,  9.96it/s]


LIVE SAMPLE 300/454
Input: <problem>Self-punishment and disgust <user>I wanted Natalie up in the air, as far away from this as possible. Connie was...
Reference: I think I heard my father use that expression....
Generated: I don't know....
Full Response: I don't know. <therapist_emotion> neutral<eos>...
Ref Emotion: neutral | Pred Emotion: neutral | Match: ✓
BLEU: 0.0154 | ROUGE-1: 0.1538 | METEOR: 0.0595


Evaluating - Avg BLEU: 0.011, Emotion Acc: 0.950:  88%|████████▊ | 400/454 [00:50<00:07,  7.56it/s]


LIVE SAMPLE 400/454
Input: <problem>The relationship with friends and family <user>But you know what could be helpful, actually? If you could just ...
Reference: Isn't this what you used to do with Russell?...
Generated: Would you like me to write something about you?...
Full Response: Would you like me to write something about you? <therapist_emotion> neutral<eos>...
Ref Emotion: neutral | Pred Emotion: neutral | Match: ✓
BLEU: 0.0285 | ROUGE-1: 0.2105 | METEOR: 0.1111


Evaluating - Avg BLEU: 0.011, Emotion Acc: 0.954: 100%|██████████| 454/454 [00:56<00:00,  8.08it/s]



EVALUATION COMPLETED!
Processed 454 samples
Average BLEU: 0.0108
Average ROUGE-1: 0.1164
Emotion Accuracy: 0.9537
Emotion Tag Coverage: 1.0000

THERAPY MODEL EVALUATION RESULTS
Model Type: SFT
Output Format: STANDARD

Text Generation Metrics:
  BLEU Score:     0.0108
  ROUGE-1:        0.1164
  ROUGE-2:        0.0175
  ROUGE-L:        0.1076
  METEOR:         0.0485

Emotion Prediction:
  Emotion Accuracy:    0.9537
  Emotion Tag Coverage: 1.0000

Dataset Info:
  Total Samples:  454

Sample Outputs:

  Sample 1:
    Input: <problem>Breakups or Divorce <user>I mean, shit, don't you know that men are the new women? Obsessed...
    Reference: But I thought you said...
    Predicted: What is it?...
    Ref Emotion: neutral
    Pred Emotion: neutral
    BLEU: 0.000, ROUGE-1: 0.000

  Sample 2:
    Input: <problem>Breakups or Divorce <user>Well, we were watching tv.But before that we were having dinner a...
    Reference: So you were the one who came up with it....
    Predicted: Well, maybe

RL Hyperparameters


In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer
from datasets import Dataset as HFDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import itertools
import pickle
import os
from datetime import datetime
import random

def set_seed(seed=42):
    """
    Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) random
    generators and set the cuDNN flags to deterministic / non-benchmark mode

    Args:
        seed: Integer seed handed to every generator
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)

# Download each NLTK resource only when it cannot be found locally
for resource_path, package_name in (('tokenizers/punkt', 'punkt'),
                                    ('corpora/wordnet', 'wordnet')):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """
    Cut descriptive annotations off the end of an SFT utterance

    Everything from the first "The speaker ..." / "The emotion state ..."
    sentence (case-insensitive, must contain a period) onwards is dropped;
    the remainder is returned with surrounding whitespace stripped.
    """
    expression_pattern = r'\s*(The (?:speaker|emotion state)[^.]*\.(?:[^.]*\.)*)'
    found = re.search(expression_pattern, text, re.IGNORECASE | re.DOTALL)
    kept = text if found is None else text[:found.start()]
    return kept.strip()

# ====================== Model Output Extraction ======================
def extract_model_output(response):
    """
    Split a decoded generation into therapist text and predicted emotion

    Expected layout: "<text> <therapist_emotion> <emotion><eos>".

    Args:
        response: Decoded model output with special tokens kept as text

    Returns:
        tuple: (therapist_text, emotion, has_emotion_tag)
            therapist_text: Text before the <therapist_emotion> tag, or the
                whole (eos-trimmed) response when the tag is missing
            emotion: Lower-cased first word after the tag, "" if none
            has_emotion_tag: Whether the <therapist_emotion> tag was found
    """
    eos_token = '<eos>'
    emotion_tag = '<therapist_emotion>'

    # Cut at the FIRST <eos>, not only a trailing one: anything generated
    # after it would otherwise end up glued to the emotion label
    # (e.g. "joy<eos>") or appended to the therapist text.
    body = response.partition(eos_token)[0].strip()

    text_part, tag, emotion_part = body.partition(emotion_tag)
    if not tag:
        # No emotion tag found, return entire response as text
        return body, "", False

    # Only the first word after the tag is treated as the emotion label
    emotion_words = emotion_part.split()
    emotion = emotion_words[0].lower() if emotion_words else ""

    return text_part.strip(), emotion, True

# ====================== Reference Data Processing ======================
class TherapyEvaluationDataset:
    """
    Turns conversation JSON into (prompt, reference) evaluation samples

    Consecutive non-"sys" turns are merged into one user message; every "sys"
    (therapist) turn that follows at least one non-empty user turn produces
    one sample holding the prompt, the reference reply and its emotion.
    """

    def __init__(self, json_path, tokenizer_path=None, subset_size=None):
        """
        Args:
            json_path: Path to the JSON file with the list of conversations
            tokenizer_path: Path of a saved GPT2Tokenizer; the stock 'gpt2'
                tokenizer is loaded when omitted
            subset_size: Stop after this many samples (None/0 = no limit)
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            self._data = json.load(f)

        if tokenizer_path:
            self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
        else:
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained('gpt2')

        self.evaluation_data = []
        self.subset_size = subset_size
        self._process_evaluation_data()

    def _limit_reached(self):
        """True once subset_size is set and that many samples were collected"""
        return bool(self.subset_size) and len(self.evaluation_data) >= self.subset_size

    @staticmethod
    def _build_prompt(problem_type, user_text, user_emotion):
        """Assemble the tagged prompt; empty optional fields are left out"""
        parts = []
        if problem_type:
            parts.append(f"<problem>{problem_type}")
        parts.append(f"<user>{user_text}")
        if user_emotion:
            parts.append(f"<user_emotion>{user_emotion}")
        parts.append("<therapist>")
        return " ".join(parts)

    def _process_evaluation_data(self):
        """Walk every conversation and append one sample per answered therapist turn"""
        print(f"Processing evaluation data (subset_size: {self.subset_size})...")

        for conv in tqdm(self._data, desc="Processing evaluation data"):
            # The `or` fallbacks also cover keys that are present but null in
            # the JSON, which would otherwise crash on .strip() / re.search
            problem_type = (conv.get("problem_type") or "").strip()
            dialog = conv.get("dialog") or []

            pending_texts = []
            pending_emotions = []

            for turn in dialog:
                text = clean_therapy_text(turn.get("text") or "")
                emotion = (turn.get("emotion") or "").strip()

                if turn.get("speaker", "") != "sys":  # User turn
                    if text:
                        pending_texts.append(text)
                    if emotion:
                        pending_emotions.append(emotion)
                    continue

                # Therapist turn with no user text to respond to: skip it
                if not pending_texts:
                    continue

                combined_user_text = " ".join(pending_texts)
                last_user_emotion = pending_emotions[-1] if pending_emotions else ""

                self.evaluation_data.append({
                    'input_text': self._build_prompt(problem_type, combined_user_text, last_user_emotion),
                    # `text` is already cleaned above; no second cleaning pass needed
                    'reference_text': text,
                    'reference_emotion': emotion,
                    'user_input': combined_user_text,
                    'user_emotion': last_user_emotion,
                    'problem_type': problem_type
                })

                # The next sample starts with a fresh user context
                pending_texts = []
                pending_emotions = []

                # Early stopping if subset size is reached
                if self._limit_reached():
                    break

            # Early stopping if subset size is reached
            if self._limit_reached():
                break

        print(f"Processed {len(self.evaluation_data)} evaluation samples")

    def get_evaluation_data(self):
        """Return the list of processed evaluation samples"""
        return self.evaluation_data

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetrics:
    """Text-overlap metrics (BLEU, ROUGE, METEOR) between a reference and a candidate"""

    def __init__(self):
        # method1 smoothing is passed to sentence_bleu on every BLEU call
        self.smoothing = SmoothingFunction().method1
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    @staticmethod
    def _token_pair(reference, candidate):
        """
        Whitespace-tokenize both strings

        Returns None when either string is empty or the candidate contains
        no tokens, otherwise (reference_tokens, candidate_tokens).
        """
        if not (candidate and reference):
            return None

        cand_tokens = candidate.split()
        if not cand_tokens:
            return None

        return reference.split(), cand_tokens

    def compute_bleu(self, reference, candidate):
        """Sentence-level BLEU on whitespace tokens; 0.0 for empty input"""
        pair = self._token_pair(reference, candidate)
        if pair is None:
            return 0.0

        ref_tokens, cand_tokens = pair
        return sentence_bleu([ref_tokens], cand_tokens, smoothing_function=self.smoothing)

    def compute_rouge(self, reference, candidate):
        """ROUGE-1/2/L F-measures as a dict; all 0.0 for empty input"""
        variants = ('rouge1', 'rouge2', 'rougeL')
        if not (candidate and reference):
            return dict.fromkeys(variants, 0.0)

        scored = self.rouge_scorer.score(reference, candidate)
        return {variant: scored[variant].fmeasure for variant in variants}

    def compute_meteor(self, reference, candidate):
        """METEOR on whitespace tokens; 0.0 for empty input"""
        pair = self._token_pair(reference, candidate)
        if pair is None:
            return 0.0

        ref_tokens, cand_tokens = pair
        return meteor_score([ref_tokens], cand_tokens)

# ======================  Evaluation Function ======================
def evaluate_hyperparameters(model, tokenizer, evaluation_dataset, device='cuda',
                            max_new_tokens=128, top_p=0.8, top_k=0, temperature=1.0, do_sample=True,
                            verbose=False):
    """
    Score one decoding configuration on every sample of the dataset

    Each prompt is generated once with the given sampling settings, the
    output is split into text and emotion, and the per-sample metrics are
    averaged.

    Args:
        model: Trained model
        tokenizer: Model tokenizer
        evaluation_dataset: TherapyEvaluationDataset instance
        device: Device to run evaluation on
        max_new_tokens: Maximum tokens to generate
        top_p: Top-p sampling parameter
        top_k: Top-k sampling parameter
        temperature: Temperature for sampling
        do_sample: Whether to use sampling
        verbose: Whether to show a progress bar

    Returns:
        Dictionary with the averaged metrics ('avg_bleu', 'avg_rouge1',
        'avg_rouge2', 'avg_rougeL', 'avg_meteor', 'emotion_accuracy',
        'emotion_tag_coverage'), 'total_samples' and the sampling settings
    """
    scorer = TherapyEvaluationMetrics()
    samples = evaluation_dataset.get_evaluation_data()

    # One list of per-sample values for every tracked quantity
    per_sample = {
        key: []
        for key in ('bleu_scores', 'rouge1_scores', 'rouge2_scores', 'rougeL_scores',
                    'meteor_scores', 'emotion_accuracy', 'has_emotion_tag')
    }

    model.eval()

    # A progress bar exists only in verbose mode
    bar = tqdm(samples, desc=f"Eval p={top_p:.2f} k={top_k} T={temperature:.1f}") if verbose else None
    iterable = samples if bar is None else bar

    for sample in iterable:
        input_text = sample['input_text']
        reference_text = sample['reference_text']
        reference_emotion = sample['reference_emotion']

        prompt_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

        with torch.no_grad():
            generated = model.generate(
                prompt_ids,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Keep only the tokens that come after the prompt
        prompt_length = len(prompt_ids[0])
        continuation = generated[0][prompt_length:]
        response = tokenizer.decode(continuation, skip_special_tokens=False).strip()

        predicted_text, predicted_emotion, has_emotion = extract_model_output(response)

        bleu = scorer.compute_bleu(reference_text, predicted_text)
        rouge = scorer.compute_rouge(reference_text, predicted_text)
        meteor = scorer.compute_meteor(reference_text, predicted_text)

        # A missing emotion tag always counts as a wrong prediction
        if has_emotion:
            emotion_correct = predicted_emotion.lower() == reference_emotion.lower()
        else:
            emotion_correct = False

        per_sample['bleu_scores'].append(bleu)
        per_sample['rouge1_scores'].append(rouge['rouge1'])
        per_sample['rouge2_scores'].append(rouge['rouge2'])
        per_sample['rougeL_scores'].append(rouge['rougeL'])
        per_sample['meteor_scores'].append(meteor)
        per_sample['emotion_accuracy'].append(emotion_correct)
        per_sample['has_emotion_tag'].append(has_emotion)

    if bar is not None:
        bar.close()

    # (summary key, per-sample list) pairs, in the order they are reported
    averaged = (
        ('avg_bleu', 'bleu_scores'),
        ('avg_rouge1', 'rouge1_scores'),
        ('avg_rouge2', 'rouge2_scores'),
        ('avg_rougeL', 'rougeL_scores'),
        ('avg_meteor', 'meteor_scores'),
        ('emotion_accuracy', 'emotion_accuracy'),
        ('emotion_tag_coverage', 'has_emotion_tag'),
    )
    summary = {out_key: np.mean(per_sample[source]) for out_key, source in averaged}
    summary['total_samples'] = len(samples)
    summary['top_p'] = top_p
    summary['top_k'] = top_k
    summary['temperature'] = temperature

    return summary

# ====================== Model Loading Functions ======================
def load_sft_model(checkpoint_path, tokenizer_path, device='cuda'):
    """
    Rebuild the SFT GPT-2 model from a single checkpoint file

    Args:
        checkpoint_path: File readable by torch.load; must hold a dict with a
            'model_state_dict' entry
        tokenizer_path: Path the GPT2Tokenizer is loaded from
        device: Device the checkpoint is mapped to and the model is moved onto

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import GPT2LMHeadModel

    print(f"Loading SFT checkpoint: {checkpoint_path}")

    sft_tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # NOTE: weights_only=False unpickles arbitrary objects - only open trusted checkpoints
    state = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Stock 'gpt2' weights first; embeddings are resized to the tokenizer's
    # vocabulary before the saved weights are loaded
    sft_model = GPT2LMHeadModel.from_pretrained('gpt2')
    vocab_size = len(sft_tokenizer)
    sft_model.resize_token_embeddings(vocab_size)
    sft_model.load_state_dict(state['model_state_dict'])
    sft_model.to(device)

    return sft_model, sft_tokenizer

def load_rl_model(model_dir, tokenizer_path, device='cuda'):
    """
    Load the RL model saved as a model directory

    Args:
        model_dir: Directory passed to AutoModelForCausalLM.from_pretrained
        tokenizer_path: Path the GPT2Tokenizer is loaded from
        device: Device the model is moved onto

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import AutoModelForCausalLM

    print(f"Loading RL model from directory: {model_dir}")

    rl_tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    rl_model = AutoModelForCausalLM.from_pretrained(model_dir)
    rl_model = rl_model.to(device)

    return rl_model, rl_tokenizer

def load_model_auto(model_path, tokenizer_path, device='cuda'):
    """
    Pick the right loader from what model_path points at

    A regular file is loaded as an SFT checkpoint, a directory as an RL
    model; anything else raises ValueError.

    Returns:
        tuple: (model, tokenizer, model_type) with model_type 'sft' or 'rl'
    """
    import os

    is_checkpoint_file = os.path.isfile(model_path)

    # Fail fast on a path that is neither of the two supported kinds
    if not is_checkpoint_file and not os.path.isdir(model_path):
        raise ValueError(f"Model path {model_path} is neither a file nor a directory")

    if is_checkpoint_file:
        print("Detected SFT checkpoint file")
        loaded_model, loaded_tokenizer = load_sft_model(model_path, tokenizer_path, device)
        kind = 'sft'
    else:
        print("Detected RL model directory")
        loaded_model, loaded_tokenizer = load_rl_model(model_path, tokenizer_path, device)
        kind = 'rl'

    return loaded_model, loaded_tokenizer, kind

# ====================== Hyperparameter Tuning Functions ======================
def compute_combined_score(results, weights=None):
    """
    Collapse several metrics into one weighted sum

    Args:
        results: Dictionary containing metric results
        weights: Dictionary mapping metric name to weight; the built-in
            defaults are used only when this is None

    Returns:
        Combined score (higher is better); metrics listed in weights but
        absent from results contribute nothing
    """
    if weights is None:
        # Default weights - adjust based on importance
        weights = {
            'avg_bleu': 0.2,
            'avg_rouge1': 0.15,
            'avg_rouge2': 0.15,
            'avg_rougeL': 0.15,
            'avg_meteor': 0.2,
            'emotion_accuracy': 0.1,
            'emotion_tag_coverage': 0.05
        }

    total = 0.0
    for name in weights:
        if name not in results:
            continue
        total += results[name] * weights[name]

    return total

def hyperparameter_search(model_path, tokenizer_path, test_data_path, device='cuda',
                         top_p_values=None, top_k_values=None, temperature_values=None, subset_size=100,
                         max_new_tokens=128, output_dir=None, weights=None):
    """
    Grid-search the sampling settings (top_p x top_k x temperature)

    The model and an evaluation subset are loaded once; every combination is
    scored with evaluate_hyperparameters and ranked by compute_combined_score.

    Args:
        model_path: Path to trained model
        tokenizer_path: Path to tokenizer
        test_data_path: Path to test JSON data
        device: Device to run evaluation on
        top_p_values: List of top_p values to test
        top_k_values: List of top_k values to test
        temperature_values: List of temperature values to test
        subset_size: Number of samples to use for evaluation
        max_new_tokens: Maximum tokens to generate
        output_dir: Directory to save results (pickle + CSV); nothing is
            written when omitted
        weights: Weights for combining metrics

    Returns:
        Dictionary with all results and best parameters
    """
    # Fall back to the built-in grid for every axis the caller left out
    if top_p_values is None:
        top_p_values = [0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

    if top_k_values is None:
        top_k_values = [0, 5, 10, 20, 40, 50]  # 0 means no top_k filtering

    if temperature_values is None:
        temperature_values = [0.8, 1.0, 1.1, 1.2, 1.3, 1.4]

    print(f"Loading model from {model_path}...")
    model, tokenizer, model_type = load_model_auto(model_path, tokenizer_path, device)
    print(f"Loaded {model_type.upper()} model successfully")

    print(f"Loading evaluation data subset (size: {subset_size})...")
    eval_dataset = TherapyEvaluationDataset(test_data_path, tokenizer_path, subset_size=subset_size)

    # product() yields the combinations in the same order as three nested
    # loops over top_p, top_k and temperature would
    grid = list(itertools.product(top_p_values, top_k_values, temperature_values))
    total_combinations = len(grid)

    print(f"Testing {total_combinations} hyperparameter combinations...")
    print(f"top_p values: {top_p_values}")
    print(f"top_k values: {top_k_values}")
    print(f"temperature values: {temperature_values}")
    print(f"Subset size: {subset_size} samples")

    all_results = []
    best_score = -1
    best_params = None

    for combination_count, (top_p, top_k, temperature) in enumerate(grid, start=1):
        print(f"\n[{combination_count}/{total_combinations}] Testing top_p={top_p}, top_k={top_k}, temp={temperature}")

        results = evaluate_hyperparameters(
            model, tokenizer, eval_dataset, device,
            max_new_tokens=max_new_tokens,
            top_p=top_p, top_k=top_k, temperature=temperature,
            verbose=False
        )

        combined_score = compute_combined_score(results, weights)
        results['combined_score'] = combined_score
        all_results.append(results)

        # Strictly greater: on ties the earliest combination stays the best
        if combined_score > best_score:
            best_score = combined_score
            best_params = {'top_p': top_p, 'top_k': top_k, 'temperature': temperature}
            print(f"  *** NEW BEST *** Score: {combined_score:.4f}")

        print(f"  BLEU: {results['avg_bleu']:.4f}, "
              f"ROUGE-1: {results['avg_rouge1']:.4f}, "
              f"ROUGE-2: {results['avg_rouge2']:.4f}, "
              f"ROUGE-L: {results['avg_rougeL']:.4f}, "
              f"METEOR: {results['avg_meteor']:.4f}, "
              f"Emotion: {results['emotion_accuracy']:.4f}, "
              f"Combined: {combined_score:.4f}")

    # Everything needed to reproduce or analyse the search later
    final_results = {
        'best_params': best_params,
        'best_score': best_score,
        'all_results': all_results,
        'model_path': model_path,
        'model_type': model_type,
        'subset_size': subset_size,
        'search_space': {
            'top_p_values': top_p_values,
            'top_k_values': top_k_values,
            'temperature_values': temperature_values
        },
        'weights': weights or {
            'avg_bleu': 0.2,
            'avg_rouge1': 0.15,
            'avg_rouge2': 0.15,
            'avg_rougeL': 0.15,
            'avg_meteor': 0.2,
            'emotion_accuracy': 0.1,
            'emotion_tag_coverage': 0.05
        },
        'timestamp': datetime.now().isoformat()
    }

    print(f"\n{'='*60}")
    print("HYPERPARAMETER SEARCH COMPLETED")
    print(f"{'='*60}")
    print(f"Best parameters: top_p={best_params['top_p']}, top_k={best_params['top_k']}, temperature={best_params['temperature']}")
    print(f"Best combined score: {best_score:.4f}")

    best_result = max(all_results, key=lambda entry: entry['combined_score'])
    print(f"\nBest result details:")
    print(f"  BLEU Score: {best_result['avg_bleu']:.4f}")
    print(f"  ROUGE-1: {best_result['avg_rouge1']:.4f}")
    print(f"  ROUGE-2: {best_result['avg_rouge2']:.4f}")
    print(f"  ROUGE-L: {best_result['avg_rougeL']:.4f}")
    print(f"  METEOR: {best_result['avg_meteor']:.4f}")
    print(f"  Emotion Accuracy: {best_result['emotion_accuracy']:.4f}")
    print(f"  Emotion Coverage: {best_result['emotion_tag_coverage']:.4f}")
    print(f"  Combined Score: {best_result['combined_score']:.4f}")

    # Nothing to persist without an output directory
    if not output_dir:
        return final_results

    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Full results as a pickle ...
    results_file = os.path.join(output_dir, f"hyperparameter_search_{timestamp}.pkl")
    with open(results_file, 'wb') as pickle_out:
        pickle.dump(final_results, pickle_out)
    print(f"\nResults saved to: {results_file}")

    # ... and one CSV row per tested combination
    import pandas as pd
    csv_file = os.path.join(output_dir, f"hyperparameter_results_{timestamp}.csv")
    pd.DataFrame(all_results).to_csv(csv_file, index=False)
    print(f"CSV summary saved to: {csv_file}")

    return final_results

def analyze_results(results_file):
    """Print a report for a saved hyperparameter search and return its rows.

    Args:
        results_file: Path to a pickle file whose top-level object exposes
            the keys 'all_results', 'model_path' and 'timestamp'.

    Returns:
        pandas.DataFrame built from the 'all_results' entries.
    """
    # NOTE: unpickling can run arbitrary code; only open files produced by
    # this notebook's own search runs.
    with open(results_file, 'rb') as handle:
        payload = pickle.load(handle)

    import pandas as pd

    frame = pd.DataFrame(payload['all_results'])

    # Header section
    print("Hyperparameter Search Analysis")
    print("=" * 50)
    print(f"Model: {payload['model_path']}")
    print(f"Search completed: {payload['timestamp']}")
    print(f"Total combinations tested: {len(frame)}")

    # Five highest combined scores, restricted to the columns of interest
    metric_names = ['avg_bleu', 'avg_rouge1', 'avg_rouge2', 'avg_rougeL',
                    'avg_meteor', 'emotion_accuracy']
    report_columns = ['top_p', 'top_k', 'temperature'] + metric_names + ['combined_score']
    print("\nTop 5 Results:")
    leaders = frame.nlargest(5, 'combined_score')
    print(leaders[report_columns].to_string(index=False))

    # Sampling parameters of the row that maximises each individual metric
    print("\nBest parameters for each metric:")
    for name in metric_names:
        winner = frame.loc[frame[name].idxmax()]
        print(f"  {name}: top_p={winner['top_p']}, top_k={winner['top_k']}, temp={winner['temperature']} (score: {winner[name]:.4f})")

    return frame

if __name__ == "__main__":
    # ---- Paths: model checkpoint, tokenizer, test set, output directory ----
    MODEL_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4"
    TOKENIZER_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer"
    TEST_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/test.json"
    OUTPUT_DIR = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/Evaluation2/RLhyperparameter_search1"
    # Use the GPU when one is visible to torch, otherwise run on CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # ---- Search space (a full grid over these lists has 9 * 8 * 4 = 288 points) ----
    TOP_P_VALUES = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.00]
    TOP_K_VALUES = [0, 5, 10, 15, 20, 25, 30, 35]
    TEMPERATURE_VALUES = [1.0, 1.1, 1.2, 1.3]

    # Number of evaluation samples handed to the search as subset_size.
    SUBSET_SIZE = 100

    # None is passed through as `weights`; the search then uses its own defaults.
    CUSTOM_WEIGHTS = None

    search_arguments = {
        'model_path': MODEL_PATH,
        'tokenizer_path': TOKENIZER_PATH,
        'test_data_path': TEST_DATA_PATH,
        'device': DEVICE,
        'top_p_values': TOP_P_VALUES,
        'top_k_values': TOP_K_VALUES,
        'temperature_values': TEMPERATURE_VALUES,
        'subset_size': SUBSET_SIZE,
        'output_dir': OUTPUT_DIR,
        'weights': CUSTOM_WEIGHTS,
    }
    results = hyperparameter_search(**search_arguments)

    # Echo the winning configuration returned by the search.
    print(f"\nBest hyperparameters found:")
    for param_name in ('top_p', 'top_k', 'temperature'):
        print(f"{param_name}: {results['best_params'][param_name]}")
    print(f"Score: {results['best_score']:.4f}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading model from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4...
Detected RL model directory
Loading RL model from directory: /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4


Some weights of the model checkpoint at /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4 were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded RL model successfully
Loading evaluation data subset (size: 100)...
Processing evaluation data (subset_size: 100)...


Processing evaluation data:  24%|██▎       | 24/102 [00:00<00:00, 6780.50it/s]

Processed 100 evaluation samples
Testing 288 hyperparameter combinations...
top_p values: [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
top_k values: [0, 5, 10, 15, 20, 25, 30, 35]
temperature values: [1.0, 1.1, 1.2, 1.3]
Subset size: 100 samples

[1/288] Testing top_p=0.6, top_k=0, temp=1.0





  *** NEW BEST *** Score: 0.1965
  BLEU: 0.0084, ROUGE-1: 0.1265, ROUGE-2: 0.0166, ROUGE-L: 0.1221, METEOR: 0.0401, Emotion: 0.9700, Combined: 0.1965

[2/288] Testing top_p=0.6, top_k=0, temp=1.1
  *** NEW BEST *** Score: 0.1976
  BLEU: 0.0075, ROUGE-1: 0.1289, ROUGE-2: 0.0121, ROUGE-L: 0.1224, METEOR: 0.0429, Emotion: 0.9800, Combined: 0.1976

[3/288] Testing top_p=0.6, top_k=0, temp=1.2
  *** NEW BEST *** Score: 0.2052
  BLEU: 0.0144, ROUGE-1: 0.1344, ROUGE-2: 0.0189, ROUGE-L: 0.1256, METEOR: 0.0622, Emotion: 0.9800, Combined: 0.2052

[4/288] Testing top_p=0.6, top_k=0, temp=1.3
  BLEU: 0.0116, ROUGE-1: 0.1326, ROUGE-2: 0.0148, ROUGE-L: 0.1269, METEOR: 0.0570, Emotion: 0.9600, Combined: 0.2009

[5/288] Testing top_p=0.6, top_k=5, temp=1.0
  BLEU: 0.0114, ROUGE-1: 0.1402, ROUGE-2: 0.0211, ROUGE-L: 0.1321, METEOR: 0.0502, Emotion: 0.9700, Combined: 0.2023

[6/288] Testing top_p=0.6, top_k=5, temp=1.1
  BLEU: 0.0095, ROUGE-1: 0.1227, ROUGE-2: 0.0119, ROUGE-L: 0.1176, METEOR: 0.0378, Emo

RL Evaluation

In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer
from datasets import Dataset as HFDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict

def _ensure_nltk_resource(resource_path, package_name):
    """Download NLTK package `package_name` unless `resource_path` is already found."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


# Fetch the 'punkt' and 'wordnet' NLTK packages if they are not installed yet.
_ensure_nltk_resource('tokenizers/punkt', 'punkt')
_ensure_nltk_resource('corpora/wordnet', 'wordnet')

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """Cut descriptive narration off the end of an utterance.

    Everything from the first (case-insensitive) occurrence of a sentence
    starting with "The speaker" or "The emotion state" onward is dropped,
    provided that sentence contains a period. The remaining text is returned
    with surrounding whitespace removed.
    """
    narration = re.compile(
        r'\s*(The (?:speaker|emotion state)[^.]*\.(?:[^.]*\.)*)',
        re.IGNORECASE | re.DOTALL,
    )
    hit = narration.search(text)
    kept = text if hit is None else text[:hit.start()]
    return kept.strip()

# ====================== Model Format Detection ======================
def detect_model_format(model, tokenizer, device='cuda', test_inputs=None):
    """
    Probe whether the model emits the <therapist_emotion> tag.

    A sampled continuation (top_p=0.8, up to 128 new tokens) is generated for
    each probe prompt and checked for the literal tag.

    Args:
        model: Object providing eval() and generate()
        tokenizer: Tokenizer providing encode(), decode(), pad_token_id and eos_token_id
        device: Device the encoded prompt is moved to
        test_inputs: Probe prompts; three built-in prompts are used when None

    Returns:
        str: 'standard' when at least half of the continuations contain
        '<therapist_emotion>', otherwise 'unknown'
    """
    prompts = test_inputs
    if prompts is None:
        prompts = [
            "<problem>anxiety <user>I'm worried about work <user_emotion>anxiety <therapist>",
            "<problem>depression <user>I feel very sad <user_emotion>sadness <therapist>",
            "<problem>relationship <user>My partner doesn't understand me <user_emotion>anger <therapist>"
        ]

    prompt_count = len(prompts)
    tagged = 0

    model.eval()
    with torch.no_grad():
        for prompt in prompts:
            prompt_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
            generated = model.generate(
                prompt_ids,
                max_new_tokens=128,
                do_sample=True,
                top_p=0.8,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

            # Keep only what follows the prompt tokens in the output sequence.
            prompt_length = len(prompt_ids[0])
            continuation = tokenizer.decode(
                generated[0][prompt_length:], skip_special_tokens=False
            ).strip()

            if '<therapist_emotion>' in continuation:
                tagged += 1

    return 'standard' if tagged >= prompt_count * 0.5 else 'unknown'

# ====================== Model Output Extraction ======================
def extract_model_output(response):
    """
    Split a decoded generation into therapist text and emotion word.

    The expected layout is 'text <therapist_emotion> emotion<eos>'. A trailing
    '<eos>' is dropped first; the text is then split at the first
    '<therapist_emotion>' tag.

    Returns:
        therapist_text (str): Stripped text before the tag, or the whole
            response when no tag is present
        emotion (str): Lower-cased first whitespace-separated word after the
            tag, or "" when nothing follows it / no tag is present
        has_emotion_tag (bool): Whether the tag was found
    """
    eos_marker = '<eos>'
    emotion_tag = '<therapist_emotion>'

    cleaned = response.strip()
    if cleaned.endswith(eos_marker):
        cleaned = cleaned[:-len(eos_marker)].strip()

    before_tag, tag_found, after_tag = cleaned.partition(emotion_tag)
    if not tag_found:
        # No tag: the entire response counts as therapist text.
        return cleaned, "", False

    trailing_words = after_tag.split()
    emotion = trailing_words[0].lower() if trailing_words else ""
    return before_tag.strip(), emotion, True

def extract_emotion_only(response):
    """Return just the emotion word parsed by extract_model_output, or None
    when the response carries no <therapist_emotion> tag."""
    parsed = extract_model_output(response)
    tag_present = parsed[2]
    if not tag_present:
        return None
    return parsed[1]

# ====================== Reference Data Processing ======================
class TherapyEvaluationDataset:
    """Builds prompt/reference pairs from a JSON file of therapy conversations.

    Each therapist ("sys") turn that follows at least one non-empty user
    utterance becomes one evaluation sample.
    """

    def __init__(self, json_path, tokenizer_path=None):
        with open(json_path, 'r', encoding='utf-8') as source:
            self._data = json.load(source)

        # Custom tokenizer when a path is supplied, stock 'gpt2' otherwise.
        if not tokenizer_path:
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        else:
            self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

        self.evaluation_data = []
        self._process_evaluation_data()

    @staticmethod
    def _build_prompt(problem_type, user_text, user_emotion):
        """Assemble the space-joined model prompt; empty fields are left out."""
        pieces = [f"<problem>{problem_type}"] if problem_type else []
        pieces.append(f"<user>{user_text}")
        if user_emotion:
            pieces.append(f"<user_emotion>{user_emotion}")
        pieces.append("<therapist>")
        return " ".join(pieces)

    def _process_evaluation_data(self):
        """Walk every dialog and append one sample per answered therapist turn."""
        for conversation in tqdm(self._data, desc="Processing evaluation data"):
            problem_type = conversation.get("problem_type", "").strip()

            # User utterances / emotions gathered since the last emitted sample.
            pending_texts = []
            pending_emotions = []

            for turn in conversation.get("dialog", []):
                speaker = turn.get("speaker", "")
                cleaned_text = clean_therapy_text(turn.get("text", ""))
                turn_emotion = turn.get("emotion", "").strip()

                if speaker == "sys":
                    # Therapist turn with no preceding user text: nothing to
                    # condition on, so it is skipped (pending state untouched).
                    if not pending_texts:
                        continue

                    user_text = " ".join(pending_texts)
                    user_emotion = pending_emotions[-1] if pending_emotions else ""

                    self.evaluation_data.append({
                        'input_text': self._build_prompt(problem_type, user_text, user_emotion),
                        'reference_text': clean_therapy_text(cleaned_text),
                        'reference_emotion': turn_emotion,
                        'user_input': user_text,
                        'user_emotion': user_emotion,
                        'problem_type': problem_type
                    })

                    pending_texts = []
                    pending_emotions = []
                else:
                    # Any speaker other than "sys" is treated as the user.
                    if cleaned_text:
                        pending_texts.append(cleaned_text)
                    if turn_emotion:
                        pending_emotions.append(turn_emotion)

        print(f"Processed {len(self.evaluation_data)} evaluation samples")

    def get_evaluation_data(self):
        """Return the list of sample dicts built at construction time."""
        return self.evaluation_data

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetrics:
    """Sentence-level BLEU, ROUGE and METEOR between a reference and a candidate.

    Every metric returns zero when either string is empty.
    """

    _ROUGE_KEYS = ('rouge1', 'rouge2', 'rougeL')

    def __init__(self):
        self.smoothing = SmoothingFunction().method1
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def compute_bleu(self, reference, candidate):
        """Smoothed sentence BLEU over whitespace tokens."""
        if not (candidate and reference):
            return 0.0

        ref_words = reference.split()
        cand_words = candidate.split()
        if not cand_words:
            # Candidate consisted of whitespace only.
            return 0.0

        return sentence_bleu([ref_words], cand_words, smoothing_function=self.smoothing)

    def compute_rouge(self, reference, candidate):
        """F-measures for ROUGE-1, ROUGE-2 and ROUGE-L as a dict."""
        if not (candidate and reference):
            return {key: 0.0 for key in self._ROUGE_KEYS}

        scored = self.rouge_scorer.score(reference, candidate)
        return {key: scored[key].fmeasure for key in self._ROUGE_KEYS}

    def compute_meteor(self, reference, candidate):
        """METEOR over whitespace tokens."""
        if not (candidate and reference):
            return 0.0

        ref_words = reference.split()
        cand_words = candidate.split()
        if not cand_words:
            # Candidate consisted of whitespace only.
            return 0.0

        return meteor_score([ref_words], cand_words)

# ====================== Model Evaluation ======================
def evaluate_model(model, tokenizer, evaluation_dataset, device='cuda', max_new_tokens=128,
                  top_p=0.7, top_k=10, temperature=1.2, do_sample=True):
    """
    Evaluate model on therapy dataset

    For every evaluation sample one response is generated, its text part is
    scored against the reference with BLEU / ROUGE / METEOR, and its emotion
    word is compared (case-insensitively) with the reference emotion.

    Args:
        model: Trained model
        tokenizer: Model tokenizer
        evaluation_dataset: TherapyEvaluationDataset instance
        device: Device to run evaluation on
        max_new_tokens: Maximum tokens to generate
        top_p: Top-p sampling parameter
        top_k: Top-k sampling parameter
        temperature: Sampling temperature
        do_sample: Whether to use sampling

    Returns:
        Dictionary containing evaluation results: per-sample score lists,
        'sample_outputs' (first 10 samples), 'summary' (means over all
        samples), 'emotion_confusion_matrix' and 'detected_format'
    """
    metrics_computer = TherapyEvaluationMetrics()
    evaluation_data = evaluation_dataset.get_evaluation_data()

    results = {
        'bleu_scores': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'meteor_scores': [],
        'emotion_accuracy': [],
        'has_emotion_tag': [],
        'sample_outputs': []
    }

    # reference emotion -> predicted emotion (or 'no_prediction') -> count
    emotion_confusion = defaultdict(lambda: defaultdict(int))

    model.eval()

    # Detect model output format (runs a few sampled generations of its own)
    detected_format = detect_model_format(model, tokenizer, device)
    print(f"Output format check: {detected_format.upper()}")

    print(f"Evaluating model on {len(evaluation_data)} samples...")

    progress_bar = tqdm(evaluation_data, desc="Evaluating",
                       bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')

    for i, sample in enumerate(progress_bar):
        input_text = sample['input_text']
        reference_text = sample['reference_text']
        reference_emotion = sample['reference_emotion']

        # Tokenize input
        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode response (only the tokens that follow the prompt)
        new_tokens = outputs[0][len(input_ids[0]):]
        response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

        # Extract model output
        predicted_text, predicted_emotion, has_emotion = extract_model_output(response)

        # Compute text generation metrics
        bleu = metrics_computer.compute_bleu(reference_text, predicted_text)
        rouge = metrics_computer.compute_rouge(reference_text, predicted_text)
        meteor = metrics_computer.compute_meteor(reference_text, predicted_text)

        # Compute emotion accuracy (a missing tag always counts as wrong)
        emotion_correct = (predicted_emotion.lower() == reference_emotion.lower()) if has_emotion else False

        # Store results
        results['bleu_scores'].append(bleu)
        results['rouge1_scores'].append(rouge['rouge1'])
        results['rouge2_scores'].append(rouge['rouge2'])
        results['rougeL_scores'].append(rouge['rougeL'])
        results['meteor_scores'].append(meteor)
        results['emotion_accuracy'].append(emotion_correct)
        results['has_emotion_tag'].append(has_emotion)

        if has_emotion:
            emotion_confusion[reference_emotion.lower()][predicted_emotion.lower()] += 1
        else:
            emotion_confusion[reference_emotion.lower()]['no_prediction'] += 1

        # Store sample outputs for inspection
        if i < 10:
            results['sample_outputs'].append({
                'input': input_text,
                'reference_text': reference_text,
                'predicted_text': predicted_text,
                'reference_emotion': reference_emotion,
                'predicted_emotion': predicted_emotion,
                'full_response': response,
                'bleu': bleu,
                'rouge1': rouge['rouge1'],
                'meteor': meteor
            })

        # Progress-bar text: per-sample status for the first 5 samples and
        # every 20th one, running averages for all other samples.
        if (i + 1) % 20 == 0 or i < 5:
            sample_info = f"Sample {i+1}: BLEU={bleu:.3f}, Emotion={'✓' if emotion_correct else '✗'}"
            progress_bar.set_description(f"Evaluating - {sample_info}")
        else:
            avg_bleu = np.mean(results['bleu_scores'])
            avg_emotion_acc = np.mean(results['emotion_accuracy'])
            progress_bar.set_description(f"Evaluating - Avg BLEU: {avg_bleu:.3f}, Emotion Acc: {avg_emotion_acc:.3f}")

        # Full live dump for the first 3 samples and every 50th one. This
        # check is deliberately independent of the 20-sample cadence above:
        # nested inside it, the 50-sample condition could only be reached
        # when the index was also a multiple of 20, i.e. every 100th sample.
        if (i + 1) % 50 == 0 or i < 3:
            print(f"\n" + "="*80)
            print(f"LIVE SAMPLE {i+1}/{len(evaluation_data)}")
            print("="*80)
            print(f"Input: {input_text[:120]}...")
            print(f"Reference: {reference_text[:100]}...")
            print(f"Generated: {predicted_text[:100]}...")
            print(f"Full Response: {response[:150]}...")
            print(f"Ref Emotion: {reference_emotion} | Pred Emotion: {predicted_emotion} | Match: {'✓' if emotion_correct else '✗'}")
            print(f"BLEU: {bleu:.4f} | ROUGE-1: {rouge['rouge1']:.4f} | METEOR: {meteor:.4f}")
            print("="*80)

    progress_bar.close()

    # Compute summary statistics
    results['summary'] = {
        'avg_bleu': np.mean(results['bleu_scores']),
        'avg_rouge1': np.mean(results['rouge1_scores']),
        'avg_rouge2': np.mean(results['rouge2_scores']),
        'avg_rougeL': np.mean(results['rougeL_scores']),
        'avg_meteor': np.mean(results['meteor_scores']),
        'emotion_accuracy': np.mean(results['emotion_accuracy']),
        'emotion_tag_coverage': np.mean(results['has_emotion_tag']),
        'total_samples': len(evaluation_data)
    }

    # Convert the outer defaultdict to a plain dict (inner ones stay defaultdicts)
    results['emotion_confusion_matrix'] = dict(emotion_confusion)
    results['detected_format'] = detected_format

    # Print final progress summary
    print(f"\n{'='*60}")
    print(f"EVALUATION COMPLETED!")
    print(f"{'='*60}")
    print(f"Processed {len(evaluation_data)} samples")
    print(f"Average BLEU: {results['summary']['avg_bleu']:.4f}")
    print(f"Average ROUGE-1: {results['summary']['avg_rouge1']:.4f}")
    print(f"Emotion Accuracy: {results['summary']['emotion_accuracy']:.4f}")
    print(f"Emotion Tag Coverage: {results['summary']['emotion_tag_coverage']:.4f}")
    print(f"{'='*60}")

    return results

def print_evaluation_results(results):
    """Pretty-print the summary metrics and up to three stored sample outputs.

    Args:
        results: Dict with a 'summary' dict and a 'sample_outputs' list;
            'model_type' and 'detected_format' are optional and shown as
            UNKNOWN when absent.
    """
    summary = results['summary']
    rule = "=" * 60

    print("\n" + rule)
    print("THERAPY MODEL EVALUATION RESULTS")
    print(rule)
    print(f"Model Type: {results.get('model_type', 'Unknown').upper()}")
    print(f"Output Format: {results.get('detected_format', 'Unknown').upper()}")

    # (section title, [(pre-padded label, summary key), ...]) printed with 4 decimals
    float_sections = (
        ("\nText Generation Metrics:", (
            ("  BLEU Score:     ", 'avg_bleu'),
            ("  ROUGE-1:        ", 'avg_rouge1'),
            ("  ROUGE-2:        ", 'avg_rouge2'),
            ("  ROUGE-L:        ", 'avg_rougeL'),
            ("  METEOR:         ", 'avg_meteor'),
        )),
        ("\nEmotion Prediction:", (
            ("  Emotion Accuracy:    ", 'emotion_accuracy'),
            ("  Emotion Tag Coverage: ", 'emotion_tag_coverage'),
        )),
    )
    for title, rows in float_sections:
        print(title)
        for label, key in rows:
            print(f"{label}{summary[key]:.4f}")

    print("\nDataset Info:")
    print(f"  Total Samples:  {summary['total_samples']}")

    print("\nSample Outputs:")
    shown = results['sample_outputs'][:3]
    for number, entry in enumerate(shown, start=1):
        print(f"\n  Sample {number}:")
        print(f"    Input: {entry['input'][:100]}...")
        print(f"    Reference: {entry['reference_text'][:80]}...")
        print(f"    Predicted: {entry['predicted_text'][:80]}...")
        print(f"    Ref Emotion: {entry['reference_emotion']}")
        print(f"    Pred Emotion: {entry['predicted_emotion']}")
        print(f"    BLEU: {entry['bleu']:.3f}, ROUGE-1: {entry['rouge1']:.3f}")

# ====================== Model Loading Functions ======================
def load_sft_model(checkpoint_path, tokenizer_path, device='cuda'):
    """Rebuild the SFT model from a torch checkpoint file.

    A stock 'gpt2' model is created, its embeddings are resized to the
    tokenizer's vocabulary and the checkpoint's 'model_state_dict' is loaded
    into it.

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import GPT2LMHeadModel

    print(f"Loading SFT checkpoint: {checkpoint_path}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # NOTE: weights_only=False lets torch unpickle arbitrary objects; only use
    # this on checkpoints from a trusted source.
    saved_state = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # The embedding matrix must match the tokenizer size before the state
    # dict is loaded.
    vocab_size = len(tokenizer)
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.resize_token_embeddings(vocab_size)
    model.load_state_dict(saved_state['model_state_dict'])
    model.to(device)

    if 'epoch' in saved_state:
        print(f"Checkpoint info: Epoch {saved_state['epoch']}, Loss {saved_state.get('valid_loss', 'N/A')}")

    return model, tokenizer

def load_rl_model(model_dir, tokenizer_path, device='cuda'):
    """Load the RL model saved as a from_pretrained directory.

    Returns:
        tuple: (model moved to `device`, tokenizer)
    """
    from transformers import AutoModelForCausalLM

    print(f"Loading RL model from directory: {model_dir}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    rl_model = AutoModelForCausalLM.from_pretrained(model_dir)
    rl_model = rl_model.to(device)

    return rl_model, tokenizer

def load_model_auto(model_path, tokenizer_path, device='cuda'):
    """
    Pick the loader from what `model_path` is on disk and load the model.

    A regular file is treated as an SFT checkpoint, a directory as an RL
    model directory.

    Args:
        model_path: Path to model (file for SFT, directory for RL)
        tokenizer_path: Path to tokenizer
        device: Device to load model on

    Returns:
        tuple: (model, tokenizer, model_type) with model_type 'sft' or 'rl'

    Raises:
        ValueError: If `model_path` is neither a file nor a directory
    """
    import os

    if os.path.isfile(model_path):
        model_type, notice, loader = 'sft', "Detected SFT checkpoint file", load_sft_model
    elif os.path.isdir(model_path):
        model_type, notice, loader = 'rl', "Detected RL model directory", load_rl_model
    else:
        raise ValueError(f"Model path {model_path} is neither a file nor a directory")

    print(notice)
    model, tokenizer = loader(model_path, tokenizer_path, device)
    return model, tokenizer, model_type

# ====================== Main Evaluation Function ======================
def run_evaluation(model_path, tokenizer_path, test_data_path, device='cuda'):
    """
    Load the model and test set, evaluate, print the report and return it.

    Args:
        model_path: Path to trained model (file for SFT, directory for RL)
        tokenizer_path: Path to tokenizer
        test_data_path: Path to test JSON data
        device: Device to run evaluation on

    Returns:
        The dict produced by evaluate_model, with 'model_type' added
    """
    # Step 1: model + tokenizer (loader chosen from the path type)
    print(f"Loading model from {model_path}...")
    loaded = load_model_auto(model_path, tokenizer_path, device)
    model, tokenizer, model_type = loaded
    print(f"Loaded {model_type.upper()} model successfully")

    # Step 2: evaluation samples
    print(f"Loading evaluation data from {test_data_path}...")
    eval_dataset = TherapyEvaluationDataset(test_data_path, tokenizer_path)

    # Step 3: score with evaluate_model's default generation settings
    report = evaluate_model(model, tokenizer, eval_dataset, device=device)
    report['model_type'] = model_type

    print_evaluation_results(report)
    return report

# ====================== Usage Example ======================
if __name__ == "__main__":
    # Inputs for the run; MODEL_PATH may be an SFT checkpoint file or an RL
    # model directory.
    MODEL_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4"  # Will auto-detect type
    TOKENIZER_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer"
    TEST_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/test.json"
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Run evaluation (works for both SFT and RL)
    results = run_evaluation(
        model_path=MODEL_PATH,
        tokenizer_path=TOKENIZER_PATH,
        test_data_path=TEST_DATA_PATH,
        device=DEVICE,
    )

    # Persist the full results dict next to the other evaluation artifacts.
    import pickle
    results_pickle_path = '/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/Evaluation2/temp1dot3RL_evaluation_results.pkl'
    with open(results_pickle_path, 'wb') as f:
        pickle.dump(results, f)

SFT No Emotion Hyperparameter Tuning

In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer
from datasets import Dataset as HFDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import itertools
import pickle
import os
from datetime import datetime
import random

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and torch (CPU and all CUDA devices) and
    switch cuDNN to deterministic, non-benchmarking mode."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all random number generators as soon as this cell/module is executed.
set_seed(42)


def _ensure_nltk_resource(resource_path, package_name):
    """Download NLTK package `package_name` unless `resource_path` is already found."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


# Download required NLTK data ('punkt' and 'wordnet') when it is missing.
_ensure_nltk_resource('tokenizers/punkt', 'punkt')
_ensure_nltk_resource('corpora/wordnet', 'wordnet')

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """Strip appended narration from an utterance.

    The text is truncated at the first case-insensitive match of a sentence
    beginning "The speaker" / "The emotion state" (it must contain a period);
    without a match the text is kept whole. Surrounding whitespace is removed
    in both cases.
    """
    narration_regex = r'\s*(The (?:speaker|emotion state)[^.]*\.(?:[^.]*\.)*)'
    regex_flags = re.IGNORECASE | re.DOTALL

    found = re.search(narration_regex, text, regex_flags)
    if found is None:
        return text.strip()

    cut_at = found.start()
    return text[:cut_at].strip()

# ====================== Model Output Extraction ======================
def extract_model_output_no_emotion(response):
    """Return the therapist text of a generation from the no-emotion model.

    A trailing '<eos>' is dropped, anything from an unexpected
    '<therapist_emotion>' tag onward is discarded (with a warning), and a
    trailing standalone emotion word is removed. The emotion words are tried
    one after another in a fixed order, each against the current end of the
    text.
    """
    eos_marker = '<eos>'
    emotion_tag = '<therapist_emotion>'

    text = response.strip()
    if text.endswith(eos_marker):
        text = text[:-len(eos_marker)].strip()

    # The no-emotion model is not expected to emit the tag at all.
    if emotion_tag in text:
        text = text.partition(emotion_tag)[0].strip()
        print(f"WARNING: Model generated emotion tag (training issue)")

    # Order matters: each word is checked once, in this sequence.
    for label in ('anger', 'sadness', 'depression', 'disgust', 'fear', 'joy', 'neutral'):
        text = re.sub(rf'\s+{label}\s*$', '', text, flags=re.IGNORECASE)

    return text

# ====================== Reference Data Processing ======================
class TherapyEvaluationDatasetNoEmotion:
    """Dataset class for evaluation data processing - NO EMOTION VERSION

    Reads a JSON list of conversations and turns every therapist ("sys") turn
    that follows at least one non-empty user utterance into one evaluation
    sample. Processing stops once `subset_size` samples exist, when given.
    """

    def __init__(self, json_path, tokenizer_path=None, subset_size=None):
        """
        Args:
            json_path: Path to the JSON file with the conversations
            tokenizer_path: Tokenizer directory; stock 'gpt2' is used when falsy
            subset_size: Maximum number of samples to build (None/0 = no limit)
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            self._data = json.load(f)

        if tokenizer_path:
            self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
        else:
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained('gpt2')

        self.evaluation_data = []
        self.subset_size = subset_size
        self._process_evaluation_data()

    def _subset_complete(self):
        """True once a subset limit is set and that many samples were collected."""
        return bool(self.subset_size) and len(self.evaluation_data) >= self.subset_size

    def _build_prompt(self, problem_type, user_text, user_emotion):
        """Concatenate (no separators) the tokenizer's BOS token and the tagged
        fields; empty problem type / user emotion are left out."""
        pieces = [self.tokenizer.bos_token]
        if problem_type:
            pieces.append(f"<problem>{problem_type}")
        pieces.append(f"<user>{user_text}")
        if user_emotion:
            pieces.append(f"<user_emotion>{user_emotion}")
        pieces.append("<therapist>")
        return "".join(pieces)

    def _process_evaluation_data(self):
        """Process dialog data for evaluation - matches no-emotion training format"""
        print(f"Processing evaluation data (subset_size: {self.subset_size})...")

        for conv in tqdm(self._data, desc="Processing evaluation data"):
            problem_type = conv.get("problem_type", "").strip()
            dialog = conv.get("dialog", [])

            # User utterances / emotions gathered since the last emitted sample.
            user_text_parts = []
            user_emotions = []

            for turn in dialog:
                speaker = turn.get("speaker", "")
                text = clean_therapy_text(turn.get("text", ""))
                emotion = turn.get("emotion", "").strip()

                if speaker != "sys":  # User turn
                    if text:
                        user_text_parts.append(text)
                    if emotion:
                        user_emotions.append(emotion)
                    continue

                # Therapist turn without preceding user text: nothing to
                # condition on, so it is skipped.
                if not user_text_parts:
                    continue

                combined_user_text = " ".join(user_text_parts)
                last_user_emotion = user_emotions[-1] if user_emotions else ""

                self.evaluation_data.append({
                    'input_text': self._build_prompt(problem_type, combined_user_text,
                                                     last_user_emotion),
                    # `text` was already cleaned above; cleaning it a second
                    # time cannot change it.
                    'reference_text': text,
                    'user_input': combined_user_text,
                    'user_emotion': last_user_emotion,
                    'problem_type': problem_type
                })

                user_text_parts = []
                user_emotions = []

                # Early stopping if subset size is reached
                if self._subset_complete():
                    break

            # Propagate the early stop out of the conversation loop as well
            if self._subset_complete():
                break

        print(f"Processed {len(self.evaluation_data)} evaluation samples")

    def get_evaluation_data(self):
        """Return the list of sample dicts built at construction time."""
        return self.evaluation_data

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetricsNoEmotion:
    """Reference-based text metrics (BLEU, ROUGE, METEOR) - NO EMOTION VERSION.

    Every method scores one generated ``candidate`` string against one
    ``reference`` string. BLEU and METEOR tokenise by whitespace splitting.
    """

    _ROUGE_VARIANTS = ('rouge1', 'rouge2', 'rougeL')

    def __init__(self):
        self.smoothing = SmoothingFunction().method1
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def compute_bleu(self, reference, candidate):
        """Return smoothed sentence-level BLEU, or 0.0 when either text is empty."""
        if not (candidate and reference):
            return 0.0

        ref_words, hyp_words = reference.split(), candidate.split()
        if not hyp_words:
            # Candidate consisted of whitespace only.
            return 0.0

        return sentence_bleu([ref_words], hyp_words, smoothing_function=self.smoothing)

    def compute_rouge(self, reference, candidate):
        """Return the ROUGE-1/2/L F-measures as a dict (all 0.0 when either text is empty)."""
        if not (candidate and reference):
            return dict.fromkeys(self._ROUGE_VARIANTS, 0.0)

        scored = self.rouge_scorer.score(reference, candidate)
        return {variant: scored[variant].fmeasure for variant in self._ROUGE_VARIANTS}

    def compute_meteor(self, reference, candidate):
        """Return the METEOR score, or 0.0 when either text is empty."""
        if not (candidate and reference):
            return 0.0

        ref_words, hyp_words = reference.split(), candidate.split()
        if not hyp_words:
            # Candidate consisted of whitespace only.
            return 0.0

        return meteor_score([ref_words], hyp_words)

# ====================== Fast Evaluation Function ======================
def evaluate_hyperparameters_no_emotion(model, tokenizer, evaluation_dataset, device='cuda',
                            max_new_tokens=64, top_p=0.8, top_k=0, temperature=1.0, do_sample=True,
                            verbose=False):
    """
    Fast evaluation for hyperparameter tuning - NO EMOTION VERSION

    Generates one response per evaluation sample with the given decoding
    settings and averages BLEU / ROUGE / METEOR against the reference text.

    Args:
        model: Trained model (must provide ``eval()`` and ``generate()``)
        tokenizer: Model tokenizer
        evaluation_dataset: Object whose ``get_evaluation_data()`` returns a
            sequence of dicts with ``'input_text'`` and ``'reference_text'``
        device: Device to run evaluation on
        max_new_tokens: Maximum tokens to generate
        top_p: Top-p sampling parameter
        top_k: Top-k sampling parameter
        temperature: Temperature for sampling
        do_sample: Whether to use sampling
        verbose: Whether to show a progress bar

    Returns:
        Dictionary with the averaged metrics (``avg_bleu``, ``avg_rouge1``,
        ``avg_rouge2``, ``avg_rougeL``, ``avg_meteor``), ``total_samples`` and
        the decoding settings used. With no samples every average is 0.0.
    """
    metrics_computer = TherapyEvaluationMetricsNoEmotion()
    evaluation_data = evaluation_dataset.get_evaluation_data()

    results = {
        'bleu_scores': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'meteor_scores': []
    }

    def _average(values):
        # np.mean([]) yields NaN plus a RuntimeWarning; a NaN combined score
        # can never win a ``>`` comparison, so report 0.0 for "no samples".
        return np.mean(values) if values else 0.0

    model.eval()

    # ``disable`` turns tqdm into a plain pass-through iterator when not verbose.
    progress_bar = tqdm(evaluation_data,
                        desc=f"Eval p={top_p:.2f} k={top_k} T={temperature:.1f}",
                        disable=not verbose)

    try:
        for sample in progress_bar:
            input_text = sample['input_text']
            reference_text = sample['reference_text']

            # Tokenize input
            input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
            # Single un-padded prompt: every position is a real token.
            attention_mask = torch.ones_like(input_ids)

            # Generate response
            with torch.no_grad():
                outputs = model.generate(
                    input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=max_new_tokens,
                    do_sample=do_sample,
                    top_p=top_p,
                    top_k=top_k,
                    temperature=temperature,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )

            # Keep only the continuation; the output sequence starts with the prompt.
            prompt_length = input_ids.shape[-1]
            new_tokens = outputs[0][prompt_length:]
            # Special tokens are kept so the extraction helper can see them.
            response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

            # Extract model output
            predicted_text = extract_model_output_no_emotion(response)

            # Compute metrics
            bleu = metrics_computer.compute_bleu(reference_text, predicted_text)
            rouge = metrics_computer.compute_rouge(reference_text, predicted_text)
            meteor = metrics_computer.compute_meteor(reference_text, predicted_text)

            # Store results
            results['bleu_scores'].append(bleu)
            results['rouge1_scores'].append(rouge['rouge1'])
            results['rouge2_scores'].append(rouge['rouge2'])
            results['rougeL_scores'].append(rouge['rougeL'])
            results['meteor_scores'].append(meteor)
    finally:
        progress_bar.close()

    # Compute summary statistics
    summary = {
        'avg_bleu': _average(results['bleu_scores']),
        'avg_rouge1': _average(results['rouge1_scores']),
        'avg_rouge2': _average(results['rouge2_scores']),
        'avg_rougeL': _average(results['rougeL_scores']),
        'avg_meteor': _average(results['meteor_scores']),
        'total_samples': len(evaluation_data),
        'top_p': top_p,
        'top_k': top_k,
        'temperature': temperature
    }

    return summary

# ====================== Model Loading Functions ======================
def load_no_emotion_model(checkpoint_path, tokenizer_path, device='cuda'):
    """Load no-emotion SFT model from checkpoint file.

    Args:
        checkpoint_path: Path to a ``torch.save`` checkpoint containing a
            ``'model_state_dict'`` entry.
        tokenizer_path: Path of the saved GPT-2 tokenizer used for training.
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        ``(model, tokenizer)`` with the model on ``device`` in eval mode.
    """
    from transformers import GPT2LMHeadModel

    print(f"Loading no-emotion checkpoint: {checkpoint_path}")

    # Load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # Load checkpoint
    # NOTE(security): weights_only=False runs the full pickle machinery and can
    # execute arbitrary code; only load checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Initialize model
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    # The embedding matrix must match the tokenizer size before loading weights.
    model.resize_token_embeddings(len(tokenizer))
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    # Return an inference-ready model (dropout off) so callers that forget to
    # call eval() do not generate with a training-mode network.
    model.eval()

    return model, tokenizer

# ====================== Hyperparameter Tuning Functions ======================
def compute_combined_score_no_emotion(results, weights=None):
    """
    Collapse several averaged metrics into one weighted score - NO EMOTION VERSION

    Args:
        results: Dictionary mapping metric names to values
        weights: Dictionary mapping metric names to weights; when None a
            built-in BLEU/ROUGE/METEOR weighting is used

    Returns:
        Weighted sum over the metrics present in both dictionaries
        (higher is better); metrics missing from ``results`` are ignored
    """
    default_weights = {
        'avg_bleu': 0.25,
        'avg_rouge1': 0.20,
        'avg_rouge2': 0.20,
        'avg_rougeL': 0.20,
        'avg_meteor': 0.15
    }
    active = default_weights if weights is None else weights

    total = 0.0
    for name in active:
        if name not in results:
            continue
        total += results[name] * active[name]

    return total

def hyperparameter_search_no_emotion(model_path, tokenizer_path, test_data_path, device='cuda',
                         top_p_values=None, top_k_values=None, temperature_values=None, subset_size=100,
                         max_new_tokens=128, output_dir=None, weights=None):
    """
    Perform grid search over top_p, top_k, and temperature values - NO EMOTION VERSION

    Args:
        model_path: Path to trained model checkpoint
        tokenizer_path: Path to tokenizer
        test_data_path: Path to test JSON data
        device: Device to run evaluation on
        top_p_values: List of top_p values to test
        top_k_values: List of top_k values to test
        temperature_values: List of temperature values to test
        subset_size: Number of samples to use for evaluation
        max_new_tokens: Maximum tokens to generate
        output_dir: Directory to save results (pickle plus CSV when pandas
            is installed); nothing is written when falsy
        weights: Weights for combining metrics; None selects the defaults

    Returns:
        Dictionary with ``best_params``, ``best_score``, ``all_results``,
        ``model_path``, ``model_type``, ``subset_size``, ``search_space``,
        ``weights`` and ``timestamp``. ``best_params`` is None only when no
        combination produced a comparable (non-NaN) score.

    Raises:
        ValueError: If any of the three value lists is empty.
    """
    # Function-scope imports so the function does not depend on what the
    # surrounding notebook cell happens to have imported.
    import os
    import pickle
    from datetime import datetime
    from itertools import product

    if top_p_values is None:
        top_p_values = [0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

    if top_k_values is None:
        top_k_values = [0, 5, 10, 20, 40, 50]

    if temperature_values is None:
        temperature_values = [0.7, 0.8, 1.0, 1.1, 1.2]

    # Fail fast, before the expensive model load, instead of crashing on the
    # final summary with best_params still None.
    if 0 in (len(top_p_values), len(top_k_values), len(temperature_values)):
        raise ValueError("top_p_values, top_k_values and temperature_values must each be non-empty")

    # Resolve the weights once so the scoring and the saved record always agree.
    if weights is None:
        weights = {
            'avg_bleu': 0.25,
            'avg_rouge1': 0.20,
            'avg_rouge2': 0.20,
            'avg_rougeL': 0.20,
            'avg_meteor': 0.15
        }

    # Load model and tokenizer
    print(f"Loading no-emotion model from {model_path}...")
    model, tokenizer = load_no_emotion_model(model_path, tokenizer_path, device)
    print(f"Loaded no-emotion SFT model successfully")

    # Load evaluation dataset (subset)
    print(f"Loading evaluation data subset (size: {subset_size})...")
    eval_dataset = TherapyEvaluationDatasetNoEmotion(test_data_path, tokenizer_path, subset_size=subset_size)

    # Initialize results storage
    all_results = []
    # -inf rather than -1: custom negative weights can legitimately score below -1.
    best_score = float('-inf')
    best_params = None
    best_result = None

    # Total combinations
    total_combinations = len(top_p_values) * len(top_k_values) * len(temperature_values)
    print(f"Testing {total_combinations} hyperparameter combinations...")
    print(f"top_p values: {top_p_values}")
    print(f"top_k values: {top_k_values}")
    print(f"temperature values: {temperature_values}")
    print(f"Subset size: {subset_size} samples")

    # Grid search; product() iterates in the same order as three nested loops
    # (temperature varies fastest).
    grid = product(top_p_values, top_k_values, temperature_values)
    for combination_count, (top_p, top_k, temperature) in enumerate(grid, start=1):
        print(f"\n[{combination_count}/{total_combinations}] Testing top_p={top_p}, top_k={top_k}, temp={temperature}")

        # Evaluate with current hyperparameters
        results = evaluate_hyperparameters_no_emotion(
            model, tokenizer, eval_dataset, device,
            max_new_tokens=max_new_tokens,
            top_p=top_p, top_k=top_k, temperature=temperature,
            verbose=False
        )

        # Compute combined score
        combined_score = compute_combined_score_no_emotion(results, weights)
        results['combined_score'] = combined_score

        # Store results
        all_results.append(results)

        # Check if this is the best so far (a NaN score never compares greater)
        if combined_score > best_score:
            best_score = combined_score
            best_params = {'top_p': top_p, 'top_k': top_k, 'temperature': temperature}
            best_result = results
            print(f"  *** NEW BEST *** Score: {combined_score:.4f}")

        # Print current results
        print(f"  BLEU: {results['avg_bleu']:.4f}, "
              f"ROUGE-1: {results['avg_rouge1']:.4f}, "
              f"ROUGE-2: {results['avg_rouge2']:.4f}, "
              f"ROUGE-L: {results['avg_rougeL']:.4f}, "
              f"METEOR: {results['avg_meteor']:.4f}, "
              f"Combined: {combined_score:.4f}")

    # One clock reading for both the record and the file names.
    finished_at = datetime.now()

    # Compile final results
    final_results = {
        'best_params': best_params,
        'best_score': best_score,
        'all_results': all_results,
        'model_path': model_path,
        'model_type': 'no_emotion_sft',
        'subset_size': subset_size,
        'search_space': {
            'top_p_values': top_p_values,
            'top_k_values': top_k_values,
            'temperature_values': temperature_values
        },
        'weights': weights,
        'timestamp': finished_at.isoformat()
    }

    # Print summary
    print(f"\n{'='*60}")
    print("NO-EMOTION HYPERPARAMETER SEARCH COMPLETED")
    print(f"{'='*60}")
    if best_result is None:
        print("No combination produced a comparable (non-NaN) combined score.")
    else:
        print(f"Best parameters: top_p={best_params['top_p']}, top_k={best_params['top_k']}, temperature={best_params['temperature']}")
        print(f"Best combined score: {best_score:.4f}")

        print(f"\nBest result details:")
        print(f"  BLEU Score: {best_result['avg_bleu']:.4f}")
        print(f"  ROUGE-1: {best_result['avg_rouge1']:.4f}")
        print(f"  ROUGE-2: {best_result['avg_rouge2']:.4f}")
        print(f"  ROUGE-L: {best_result['avg_rougeL']:.4f}")
        print(f"  METEOR: {best_result['avg_meteor']:.4f}")
        print(f"  Combined Score: {best_result['combined_score']:.4f}")

    # Save results if output directory specified
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        timestamp = finished_at.strftime("%Y%m%d_%H%M%S")
        results_file = os.path.join(output_dir, f"no_emotion_hyperparameter_search_{timestamp}.pkl")

        with open(results_file, 'wb') as f:
            pickle.dump(final_results, f)

        print(f"\nResults saved to: {results_file}")

        # Also save a summary CSV
        try:
            import pandas as pd
            df_results = pd.DataFrame(all_results)
            csv_file = os.path.join(output_dir, f"no_emotion_hyperparameter_results_{timestamp}.csv")
            df_results.to_csv(csv_file, index=False)
            print(f"CSV summary saved to: {csv_file}")
        except ImportError:
            print("pandas not available, skipping CSV export")

    return final_results

def analyze_no_emotion_results(results_file):
    """Print a report for a saved no-emotion hyperparameter search.

    Args:
        results_file: Path to the pickle file holding the search results
            dictionary (keys read here: ``all_results``, ``model_path``,
            ``timestamp``, and for the fallback ``best_params``/``best_score``).

    Returns:
        A pandas DataFrame of all tested combinations, or the raw
        ``all_results`` list when pandas cannot be imported.
    """
    # NOTE(security): unpickling can execute arbitrary code; only open result
    # files produced by a trusted run.
    with open(results_file, 'rb') as handle:
        search = pickle.load(handle)

    try:
        import pandas as pd

        table = pd.DataFrame(search['all_results'])

        print("No-Emotion Hyperparameter Search Analysis")
        print("=" * 50)
        print(f"Model: {search['model_path']}")
        print(f"Search completed: {search['timestamp']}")
        print(f"Total combinations tested: {len(table)}")

        metric_columns = ['avg_bleu', 'avg_rouge1', 'avg_rouge2', 'avg_rougeL', 'avg_meteor']
        report_columns = ['top_p', 'top_k', 'temperature'] + metric_columns + ['combined_score']

        # Five highest combined scores
        print("\nTop 5 Results:")
        leaders = table.nlargest(5, 'combined_score')[report_columns]
        print(leaders.to_string(index=False))

        # Winning row per individual metric
        print("\nBest parameters for each metric:")
        for column in metric_columns:
            winner = table.loc[table[column].idxmax()]
            print(f"  {column}: top_p={winner['top_p']}, top_k={winner['top_k']}, temp={winner['temperature']} (score: {winner[column]:.4f})")

        return table

    except ImportError:
        print("pandas not available, showing basic analysis")
        print(f"Best parameters: {search['best_params']}")
        print(f"Best score: {search['best_score']:.4f}")
        return search['all_results']

# ====================== Usage Example ======================
# Script entry point: runs the no-emotion decoding grid search with the
# Google Drive paths below and prints the winning settings.
if __name__ == "__main__":
    MODEL_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/SFTNoemotions/therapy_checkpoints_4thFIXED_noemotions/best_model.ckpt"
    TOKENIZER_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/SFTNoemotions/train_processed_no_emotion_v1_tokenizer"
    TEST_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/val.json"
    OUTPUT_DIR = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/SFTNoemotions/hyperparameter_search"
    # Fall back to CPU when no CUDA device is available.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Hyperparameter search space
    # 9 top_p x 8 top_k x 4 temperature values = 288 combinations.
    TOP_P_VALUES = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.00]
    TOP_K_VALUES = [0, 5, 10, 15, 20, 25, 30, 35]
    TEMPERATURE_VALUES = [1.0, 1.1, 1.2, 1.3]

    # Number of evaluation samples scored for every combination.
    SUBSET_SIZE = 100

    # None makes the search use its built-in default metric weights.
    CUSTOM_WEIGHTS = None

    results = hyperparameter_search_no_emotion(
        model_path=MODEL_PATH,
        tokenizer_path=TOKENIZER_PATH,
        test_data_path=TEST_DATA_PATH,
        device=DEVICE,
        top_p_values=TOP_P_VALUES,
        top_k_values=TOP_K_VALUES,
        temperature_values=TEMPERATURE_VALUES,
        subset_size=SUBSET_SIZE,
        max_new_tokens=128,
        output_dir=OUTPUT_DIR,
        weights=CUSTOM_WEIGHTS
    )

    # Report the best decoding settings found by the search.
    print(f"\nBest hyperparameters found:")
    print(f"top_p: {results['best_params']['top_p']}")
    print(f"top_k: {results['best_params']['top_k']}")
    print(f"temperature: {results['best_params']['temperature']}")
    print(f"Score: {results['best_score']:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Loading no-emotion model from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/SFTNoemotions/therapy_checkpoints_4thFIXED_noemotions/best_model.ckpt...
Loading no-emotion checkpoint: /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/SFTNoemotions/therapy_checkpoints_4thFIXED_noemotions/best_model.ckpt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Loaded no-emotion SFT model successfully
Loading evaluation data subset (size: 100)...
Processing evaluation data (subset_size: 100)...


Processing evaluation data:  25%|██▌       | 26/102 [00:00<00:00, 4784.03it/s]

Processed 100 evaluation samples
Testing 288 hyperparameter combinations...
top_p values: [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
top_k values: [0, 5, 10, 15, 20, 25, 30, 35]
temperature values: [1.0, 1.1, 1.2, 1.3]
Subset size: 100 samples

[1/288] Testing top_p=0.6, top_k=0, temp=1.0





  *** NEW BEST *** Score: 0.0808
  BLEU: 0.0196, ROUGE-1: 0.1454, ROUGE-2: 0.0424, ROUGE-L: 0.1413, METEOR: 0.0672, Combined: 0.0808

[2/288] Testing top_p=0.6, top_k=0, temp=1.1
  BLEU: 0.0086, ROUGE-1: 0.1131, ROUGE-2: 0.0089, ROUGE-L: 0.1061, METEOR: 0.0428, Combined: 0.0542

[3/288] Testing top_p=0.6, top_k=0, temp=1.2
  BLEU: 0.0092, ROUGE-1: 0.0839, ROUGE-2: 0.0057, ROUGE-L: 0.0775, METEOR: 0.0366, Combined: 0.0412

[4/288] Testing top_p=0.6, top_k=0, temp=1.3
  BLEU: 0.0038, ROUGE-1: 0.0594, ROUGE-2: 0.0049, ROUGE-L: 0.0575, METEOR: 0.0276, Combined: 0.0294

[5/288] Testing top_p=0.6, top_k=5, temp=1.0
  BLEU: 0.0162, ROUGE-1: 0.1313, ROUGE-2: 0.0259, ROUGE-L: 0.1230, METEOR: 0.0496, Combined: 0.0675

[6/288] Testing top_p=0.6, top_k=5, temp=1.1
  BLEU: 0.0158, ROUGE-1: 0.1405, ROUGE-2: 0.0401, ROUGE-L: 0.1339, METEOR: 0.0594, Combined: 0.0758

[7/288] Testing top_p=0.6, top_k=5, temp=1.2
  BLEU: 0.0075, ROUGE-1: 0.1040, ROUGE-2: 0.0098, ROUGE-L: 0.0993, METEOR: 0.0352, Combined

SFT No Emotion Evaluation

In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForSequenceClassification
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict

# Download the punkt and wordnet NLTK data packages only when they cannot be
# found locally.
for _resource_path, _package_name in (('tokenizers/punkt', 'punkt'), ('corpora/wordnet', 'wordnet')):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """Strip a trailing descriptive annotation from an SFT utterance.

    Everything from the first case-insensitive "The speaker ..." /
    "The emotion state ..." sentence (one that ends in a period) onward is
    dropped; the remainder is returned with surrounding whitespace removed.
    Text without such a sentence is only stripped.
    """
    annotation = re.search(
        r'\s*(The (?:speaker|emotion state)[^.]*\.(?:[^.]*\.)*)',
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    kept = text if annotation is None else text[:annotation.start()]
    return kept.strip()

# ====================== Model Loading Functions ======================
def load_no_emotion_model(checkpoint_path, tokenizer_path, device='cuda'):
    """Build the no-emotion GPT-2 model from a training checkpoint.

    Loads the tokenizer from ``tokenizer_path``, resizes a pretrained
    ``gpt2`` model to that vocabulary, restores ``'model_state_dict'`` from
    the checkpoint and returns ``(model, tokenizer)`` with the model on
    ``device`` in eval mode.
    """
    print(f"Loading no-emotion model: {checkpoint_path}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # NOTE(security): weights_only=False allows arbitrary pickled objects;
    # only load checkpoints from a trusted source.
    saved = torch.load(checkpoint_path, map_location=device, weights_only=False)
    state_dict = saved['model_state_dict']

    model = GPT2LMHeadModel.from_pretrained('gpt2')
    vocab_size = len(tokenizer)
    # Embeddings must match the tokenizer size before the weights are restored.
    model.resize_token_embeddings(vocab_size)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    print(f"No-emotion model loaded with vocab size: {len(tokenizer)}")
    return model, tokenizer

def load_baseline_gpt2(device='cuda'):
    """Return ``(model, tokenizer)`` for the pretrained ``gpt2`` checkpoint.

    The model is moved to ``device`` and switched to eval mode.
    """
    print("Loading vanilla GPT-2 model...")
    checkpoint_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_name)
    model = GPT2LMHeadModel.from_pretrained(checkpoint_name)
    model.to(device)
    model.eval()
    vocab_size = len(tokenizer)
    print(f"Vanilla GPT-2 loaded with vocab size: {vocab_size}")
    return model, tokenizer

def load_emotion_classifier(device='cuda'):
    """Return ``(emotion_model, emotion_tokenizer)`` for the RoBERTa emotion classifier.

    Both are loaded from the ``SamLowe/roberta-base-go_emotions`` checkpoint;
    the model is moved to ``device`` and switched to eval mode.
    """
    print("Loading emotion classifier...")
    checkpoint_name = "SamLowe/roberta-base-go_emotions"
    emotion_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
    emotion_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)
    emotion_model.to(device)
    emotion_model.eval()
    return emotion_model, emotion_tokenizer

def predict_emotion_roberta(text, emotion_model, emotion_tokenizer, device='cuda'):
    """Classify ``text`` into one of seven therapy emotions.

    The classifier's softmax probabilities are summed per therapy category
    using a fixed label mapping (unmapped labels count as ``neutral``) and
    the highest-scoring category wins. A ``sadness`` result is promoted to
    ``depression`` when the grief/remorse/disappointment/embarrassment
    probability mass exceeds 0.2 or the single most likely label is grief
    or remorse.

    Args:
        text: Text to classify; blank text short-circuits to neutral.
        emotion_model: Sequence-classification model exposing
            ``config.id2label`` and returning ``.logits``.
        emotion_tokenizer: Tokenizer matching ``emotion_model``.
        device: Device the tokenized inputs are moved to.

    Returns:
        ``(emotion, score)`` - the therapy emotion and its summed probability
        (for ``depression`` the summed indicator probability).
    """
    if not text.strip():
        return "neutral", 0.0

    # Therapy category -> classifier labels folded into it.
    category_members = {
        'anger': ('anger', 'annoyance', 'disapproval'),
        'joy': ('joy', 'amusement', 'excitement', 'gratitude', 'love',
                'optimism', 'pride', 'relief', 'admiration'),
        'neutral': ('neutral', 'approval', 'caring', 'confusion', 'curiosity',
                    'desire', 'realization', 'surprise'),
        'sadness': ('sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'),
        'disgust': ('disgust',),
        'fear': ('fear', 'nervousness'),
    }
    label_to_category = {
        label: category
        for category, members in category_members.items()
        for label in members
    }

    encoded = emotion_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        logits = emotion_model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

    first_row = probabilities[0]
    class_probs = first_row.tolist()
    id_to_label = emotion_model.config.id2label

    # Key order fixes which category wins an exact tie.
    category_scores = dict.fromkeys(
        ('anger', 'joy', 'neutral', 'sadness', 'depression', 'disgust', 'fear'), 0.0
    )
    for class_id, probability in enumerate(class_probs):
        category = label_to_category.get(id_to_label[class_id], 'neutral')
        category_scores[category] += probability

    winner = max(category_scores, key=category_scores.get)
    winner_score = category_scores[winner]

    if winner != 'sadness':
        return winner, winner_score

    # 'depression' is never produced by the mapping; it is derived from sadness here.
    indicators = ['grief', 'remorse', 'disappointment', 'embarrassment']
    indicator_mass = sum(class_probs[class_id]
                         for class_id, label in id_to_label.items()
                         if label in indicators)

    if indicator_mass > 0.2 or (id_to_label.get(first_row.argmax().item()) in ['grief', 'remorse']):
        return 'depression', indicator_mass

    return winner, winner_score

# ====================== Reference Data Processing ======================
class TherapyEvaluationDatasetNoEmotion:
    """Turns a JSON file of conversations into prompt/reference pairs - No Emotion Version.

    Each conversation is a dict read via the keys ``problem_type`` and
    ``dialog``; each dialog turn is read via ``speaker``, ``text`` and
    ``emotion``. One evaluation sample is produced per therapist (``"sys"``)
    turn that follows at least one user turn with non-empty text.
    """

    def __init__(self, json_path):
        with open(json_path, 'r', encoding='utf-8') as f:
            self._data = json.load(f)

        self.evaluation_data = []
        self._process_evaluation_data()

    def _process_evaluation_data(self):
        """Fill ``self.evaluation_data`` from the loaded conversations."""
        for conversation in tqdm(self._data, desc="Processing evaluation data"):
            problem_type = conversation.get("problem_type", "").strip()

            # User text/emotions accumulated since the previous therapist turn.
            pending_text, pending_emotions = [], []

            for turn in conversation.get("dialog", []):
                speaker = turn.get("speaker", "")
                text = clean_therapy_text(turn.get("text", ""))
                emotion = turn.get("emotion", "").strip()

                if speaker != "sys":
                    # User turn: buffer it until the therapist answers.
                    if text:
                        pending_text.append(text)
                    if emotion:
                        pending_emotions.append(emotion)
                    continue

                # Therapist turn with nothing to answer is skipped.
                if not pending_text:
                    continue

                user_text = " ".join(pending_text)
                latest_emotion = pending_emotions[-1] if pending_emotions else ""

                # Prompt in the no-emotion training format.
                prompt = "<bos>"
                if problem_type:
                    prompt += "<problem>" + problem_type
                prompt += "<user>" + user_text
                if latest_emotion:
                    prompt += "<user_emotion>" + latest_emotion
                prompt += "<therapist>"

                self.evaluation_data.append({
                    'input_text': prompt,
                    'reference_text': clean_therapy_text(text),
                    'reference_emotion': emotion,
                    'user_input': user_text,
                    'user_emotion': latest_emotion,
                    'problem_type': problem_type
                })

                pending_text, pending_emotions = [], []

        print(f"Processed {len(self.evaluation_data)} evaluation samples")

    def get_evaluation_data(self):
        """Return the list of evaluation sample dicts."""
        return self.evaluation_data

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetrics:
    """Reference-based text metrics: BLEU, ROUGE-1/2/L and METEOR.

    Every method scores one generated ``candidate`` string against one
    ``reference`` string. BLEU and METEOR tokenise by whitespace splitting.
    """

    _ROUGE_VARIANTS = ('rouge1', 'rouge2', 'rougeL')

    def __init__(self):
        self.smoothing = SmoothingFunction().method1
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def compute_bleu(self, reference, candidate):
        """Return smoothed sentence-level BLEU, or 0.0 when either text is empty."""
        if not (candidate and reference):
            return 0.0

        ref_words, hyp_words = reference.split(), candidate.split()
        if not hyp_words:
            # Candidate consisted of whitespace only.
            return 0.0

        return sentence_bleu([ref_words], hyp_words, smoothing_function=self.smoothing)

    def compute_rouge(self, reference, candidate):
        """Return the ROUGE-1/2/L F-measures as a dict (all 0.0 when either text is empty)."""
        if not (candidate and reference):
            return dict.fromkeys(self._ROUGE_VARIANTS, 0.0)

        scored = self.rouge_scorer.score(reference, candidate)
        return {variant: scored[variant].fmeasure for variant in self._ROUGE_VARIANTS}

    def compute_meteor(self, reference, candidate):
        """Return the METEOR score, or 0.0 when either text is empty."""
        if not (candidate and reference):
            return 0.0

        ref_words, hyp_words = reference.split(), candidate.split()
        if not hyp_words:
            # Candidate consisted of whitespace only.
            return 0.0

        return meteor_score([ref_words], hyp_words)

# ====================== Model Evaluation ======================
def evaluate_model_with_emotion_detection(model, tokenizer, evaluation_dataset, device='cuda',
                                         top_p=0.6, top_k=0, temperature=1.0, max_new_tokens=64, model_name="Model"):
    """
    Generate a reply for every evaluation sample and score it.

    Text quality is measured with BLEU / ROUGE / METEOR against the reference
    reply; the emotion of the generated reply is predicted with the RoBERTa
    classifier and compared with the sample's reference emotion.

    When ``model_name`` contains "baseline" (case-insensitive) only the raw
    user text is used as the prompt; otherwise the full structured input is.

    Returns:
        Dictionary with per-sample score lists, the first five sample
        outputs, a ``summary`` of averages, the emotion confusion matrix,
        ``model_name`` and the ``generation_params`` used.
    """
    # Load emotion classifier
    emotion_model, emotion_tokenizer = load_emotion_classifier(device)
    metrics_computer = TherapyEvaluationMetrics()
    evaluation_data = evaluation_dataset.get_evaluation_data()

    result_keys = ('bleu_scores', 'rouge1_scores', 'rouge2_scores', 'rougeL_scores',
                   'meteor_scores', 'emotion_accuracy', 'sample_outputs')
    results = {key: [] for key in result_keys}

    # reference emotion -> predicted emotion -> count
    emotion_confusion = defaultdict(lambda: defaultdict(int))
    model.eval()

    print(f"Evaluating {model_name} on {len(evaluation_data)} samples...")

    progress_bar = tqdm(evaluation_data, desc=f"Evaluating {model_name}")

    for index, sample in enumerate(progress_bar):
        input_text = sample['input_text']
        reference_text = sample['reference_text']
        reference_emotion = sample['reference_emotion']

        # Trained models get the full structured input; baselines get only
        # the user's words, pulled out of the structured prompt.
        processed_input = input_text
        if "baseline" in model_name.lower():
            user_match = re.search(r'<user>(.*?)(?:<user_emotion>|<therapist>)', input_text)
            processed_input = user_match.group(1).strip() if user_match else sample['user_input']

        input_ids = tokenizer.encode(processed_input, return_tensors='pt').to(device)

        generation_kwargs = dict(
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        with torch.no_grad():
            outputs = model.generate(input_ids, **generation_kwargs)

        # Drop the prompt tokens and decode only the continuation.
        prompt_length = len(input_ids[0])
        continuation = outputs[0][prompt_length:]
        predicted_text = tokenizer.decode(continuation, skip_special_tokens=True).strip()

        predicted_emotion, emotion_confidence = predict_emotion_roberta(
            predicted_text, emotion_model, emotion_tokenizer, device
        )

        bleu = metrics_computer.compute_bleu(reference_text, predicted_text)
        rouge = metrics_computer.compute_rouge(reference_text, predicted_text)
        meteor = metrics_computer.compute_meteor(reference_text, predicted_text)

        predicted_key = predicted_emotion.lower()
        emotion_correct = (predicted_key == reference_emotion.lower())

        results['bleu_scores'].append(bleu)
        for variant in ('rouge1', 'rouge2', 'rougeL'):
            results[f'{variant}_scores'].append(rouge[variant])
        results['meteor_scores'].append(meteor)
        results['emotion_accuracy'].append(emotion_correct)

        emotion_confusion[reference_emotion.lower()][predicted_emotion.lower()] += 1

        # Keep the first five generations for manual inspection.
        if index < 5:
            results['sample_outputs'].append({
                'input': input_text,
                'processed_input': processed_input,
                'reference_text': reference_text,
                'predicted_text': predicted_text,
                'reference_emotion': reference_emotion,
                'predicted_emotion': predicted_emotion,
                'emotion_confidence': emotion_confidence,
                'bleu': bleu,
                'rouge1': rouge['rouge1'],
                'meteor': meteor
            })

        # Refresh the running averages in the bar every 20 samples.
        if (index + 1) % 20 == 0:
            avg_bleu = np.mean(results['bleu_scores'])
            avg_emotion_acc = np.mean(results['emotion_accuracy'])
            progress_bar.set_description(f"Evaluating {model_name} - BLEU: {avg_bleu:.3f}, Emotion: {avg_emotion_acc:.3f}")

    progress_bar.close()

    summary = {
        f'avg_{metric}': np.mean(results[f'{metric}_scores'])
        for metric in ('bleu', 'rouge1', 'rouge2', 'rougeL', 'meteor')
    }
    summary['emotion_accuracy'] = np.mean(results['emotion_accuracy'])
    summary['total_samples'] = len(evaluation_data)
    results['summary'] = summary

    results['emotion_confusion_matrix'] = dict(emotion_confusion)
    results['model_name'] = model_name
    results['generation_params'] = {
        'top_p': top_p,
        'top_k': top_k,
        'temperature': temperature,
        'max_new_tokens': max_new_tokens
    }

    return results

def print_evaluation_results(results):
    """Write a human-readable evaluation report to stdout.

    Args:
        results: Dict holding a 'summary' dict (averaged metrics and
            'total_samples') and a 'sample_outputs' list; 'model_name' and
            'generation_params' are optional.

    Only the first three entries of 'sample_outputs' are shown, with long
    text fields cut to a fixed number of characters.
    """
    stats = results['summary']
    label = results.get('model_name', 'Unknown')
    rule = "=" * 60

    print("\n" + rule)
    print(f"{label.upper()} EVALUATION RESULTS")
    print(rule)

    # The generation settings line is skipped when none were recorded.
    params = results.get('generation_params', {})
    if params:
        top_p_shown = params.get('top_p', 'N/A')
        max_tokens_shown = params.get('max_new_tokens', 'N/A')
        print(f"Generation Params: top_p={top_p_shown}, max_tokens={max_tokens_shown}")

    # (caption, summary key) pairs; captions are padded to a common width so
    # the values line up in one column.
    metric_rows = (
        ("BLEU Score:", 'avg_bleu'),
        ("ROUGE-1:", 'avg_rouge1'),
        ("ROUGE-2:", 'avg_rouge2'),
        ("ROUGE-L:", 'avg_rougeL'),
        ("METEOR:", 'avg_meteor'),
    )
    print("\nText Generation Metrics:")
    for caption, key in metric_rows:
        print(f"  {caption:<16}{stats[key]:.4f}")

    print("\nEmotion Prediction (via RoBERTa):")
    print(f"  Emotion Accuracy:    {stats['emotion_accuracy']:.4f}")

    print("\nDataset Info:")
    print(f"  Total Samples:  {stats['total_samples']}")

    # (caption, sample key, maximum characters shown) for the free-text fields.
    text_fields = (
        ("Input", 'input', 80),
        ("Processed", 'processed_input', 60),
        ("Reference", 'reference_text', 60),
        ("Predicted", 'predicted_text', 60),
    )
    print("\nSample Outputs:")
    for number, entry in enumerate(results['sample_outputs'][:3], 1):
        print(f"\n  Sample {number}:")
        for caption, key, limit in text_fields:
            print(f"    {caption}: {entry[key][:limit]}...")
        print(f"    Ref Emotion: {entry['reference_emotion']}")
        print(f"    Pred Emotion: {entry['predicted_emotion']} (conf: {entry['emotion_confidence']:.3f})")
        print(f"    BLEU: {entry['bleu']:.3f}, ROUGE-1: {entry['rouge1']:.3f}")

def compare_models(results_list):
    """Print one table row of headline metrics per evaluated model.

    Args:
        results_list: Iterable of result dicts, each with a 'summary' dict
            and, optionally, a 'model_name'.
    """
    # (column title, summary key, column width) for every numeric column.
    columns = (
        ('BLEU', 'avg_bleu', 8),
        ('ROUGE-1', 'avg_rouge1', 8),
        ('ROUGE-L', 'avg_rougeL', 8),
        ('METEOR', 'avg_meteor', 8),
        ('Emotion Acc', 'emotion_accuracy', 12),
    )
    banner = "=" * 80

    print("\n" + banner)
    print("MODEL COMPARISON")
    print(banner)

    header_cells = [f"{'Model':<30}"]
    header_cells.extend(f"{title:<{width}}" for title, _, width in columns)
    print(" ".join(header_cells))
    print("-" * 80)

    for entry in results_list:
        stats = entry['summary']
        row_cells = [f"{entry.get('model_name', 'Unknown'):<30}"]
        row_cells.extend(f"{stats[key]:<{width}.4f}" for _, key, width in columns)
        print(" ".join(row_cells))

# ====================== Usage Example ======================
if __name__ == "__main__":
    # Script entry point: evaluate the model trained without emotion tags on
    # the test set and print its report plus a comparison table.

    # Hard-coded input locations: model checkpoint, tokenizer directory, test-set JSON.
    NO_EMOTION_MODEL_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/SFTNoemotions/therapy_checkpoints_4thFIXED_noemotions/best_model.ckpt"
    TOKENIZER_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/SFTNoemotions/train_processed_no_emotion_v1_tokenizer"
    TEST_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/test.json"
    # Use the GPU when PyTorch can see one, otherwise fall back to CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    eval_dataset = TherapyEvaluationDatasetNoEmotion(TEST_DATA_PATH)

    # Every evaluated model's results are collected here for compare_models.
    results_list = []

    print("=== EVALUATING NO-EMOTION TRAINED MODEL ===")
    no_emotion_model, no_emotion_tokenizer = load_no_emotion_model(NO_EMOTION_MODEL_PATH, TOKENIZER_PATH, DEVICE)
    # Generation settings for this run are passed explicitly; they are echoed
    # back in the results under 'generation_params'.
    no_emotion_results = evaluate_model_with_emotion_detection(
        no_emotion_model, no_emotion_tokenizer, eval_dataset, DEVICE,
        top_p=0.6, top_k=0, temperature=1.0, max_new_tokens=128, model_name="No-Emotion Trained Model"
    )
    print_evaluation_results(no_emotion_results)
    results_list.append(no_emotion_results)


    # Only one model is in the list here, so the table has a single row.
    compare_models(results_list)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
Processing evaluation data: 100%|██████████| 102/102 [00:00<00:00, 7074.66it/s]


Processed 454 evaluation samples
=== EVALUATING NO-EMOTION TRAINED MODEL ===
Loading no-emotion model: /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/SFTNoemotions/therapy_checkpoints_4thFIXED_noemotions/best_model.ckpt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


No-emotion model loaded with vocab size: 50269
Loading emotion classifier...


tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Evaluating No-Emotion Trained Model on 454 samples...


Evaluating No-Emotion Trained Model - BLEU: 0.011, Emotion: 0.807: 100%|██████████| 454/454 [00:47<00:00,  9.56it/s]


NO-EMOTION TRAINED MODEL EVALUATION RESULTS
Generation Params: top_p=0.6, max_tokens=128

Text Generation Metrics:
  BLEU Score:     0.0106
  ROUGE-1:        0.1160
  ROUGE-2:        0.0159
  ROUGE-L:        0.1058
  METEOR:         0.0433

Emotion Prediction (via RoBERTa):
  Emotion Accuracy:    0.8084

Dataset Info:
  Total Samples:  454

Sample Outputs:

  Sample 1:
    Input: <bos><problem>Breakups or Divorce<user>I mean, shit, don't you know that men are...
    Processed: <bos><problem>Breakups or Divorce<user>I mean, shit, don't y...
    Reference: But I thought you said...
    Predicted: And what do you think it was about you that he lost touch wi...
    Ref Emotion: neutral
    Pred Emotion: neutral (conf: 0.984)
    BLEU: 0.014, ROUGE-1: 0.105

  Sample 2:
    Input: <bos><problem>Breakups or Divorce<user>Well, we were watching tv.But before that...
    Processed: <bos><problem>Breakups or Divorce<user>Well, we were watchin...
    Reference: So you were the one who came up wi




**ESCONV EVALUATION**

RL

In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer
from datasets import Dataset as HFDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict

# Make sure the NLTK data packages are present, downloading each one only
# when the local lookup fails.
for _nltk_resource, _nltk_package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_nltk_resource)
    except LookupError:
        nltk.download(_nltk_package)

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """Collapse all whitespace in *text* to single spaces and trim the ends.

    Newlines, tabs and repeated spaces each become one space. Falsy input
    (``None`` or an empty string) yields an empty string.
    """
    if text:
        # One pass over every whitespace run also covers newline runs.
        return re.sub(r'\s+', ' ', text).strip()
    return ""

# ====================== ESConv Dataset Processing ======================
class ESConvEvaluationDataset:
    """Convert ESConv-style conversations into single-turn evaluation samples.

    Each conversation is expected to be a dict with optional "problem_type"
    and "emotion_type" strings and a "dialog" list of turns, where a turn has
    a "speaker" ("seeker" or "supporter") and a "content" string. Every
    supporter turn that follows at least one seeker turn produces one sample:
    the seeker messages accumulated since the previous sample are joined into
    the prompt and the supporter turn becomes the reference response.

    Attributes:
        tokenizer: Tokenizer loaded from ``tokenizer_path`` (or stock 'gpt2').
        evaluation_data: List of sample dicts built at construction time.
    """

    def __init__(self, json_path, tokenizer_path=None):
        """Load conversations from *json_path* and build the samples.

        Args:
            json_path: Path to a JSON file holding either a list of
                conversations or a single conversation object.
            tokenizer_path: Optional path of a saved GPT-2 tokenizer; the
                stock 'gpt2' tokenizer is used when omitted.
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # A file holding one conversation object is treated as a one-item list.
        self._data = [data] if isinstance(data, dict) else data

        if tokenizer_path:
            self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
        else:
            # Use default tokenizer if path not provided
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained('gpt2')

        self.evaluation_data = []
        self._process_esconv_data()

    @staticmethod
    def _text_field(record, key):
        """Return ``record[key]`` stripped; a missing key or JSON null gives ""."""
        # `.get(key, "")` alone still returns None for an explicit null, which
        # would make `.strip()` raise, hence the `or ""`.
        return (record.get(key) or "").strip()

    @staticmethod
    def _build_input_text(problem_type, user_text, emotion_type):
        """Assemble the tagged prompt; empty problem/emotion tags are left out."""
        input_parts = []
        if problem_type:
            input_parts.append(f"<problem>{problem_type}")
        input_parts.append(f"<user>{user_text}")
        if emotion_type:
            input_parts.append(f"<user_emotion>{emotion_type}")
        input_parts.append("<therapist>")
        return " ".join(input_parts)

    def _process_esconv_data(self):
        """Process ESConv dialog data for evaluation"""
        print("Processing ESConv data...")

        for conv_idx, conv in enumerate(tqdm(self._data, desc="Processing conversations")):
            # Extract conversation metadata
            problem_type = self._text_field(conv, "problem_type")
            emotion_type = self._text_field(conv, "emotion_type")
            dialog = conv.get("dialog") or []

            if not dialog:
                continue

            if conv_idx < 3:
                print(f"\nProcessing conversation {conv_idx + 1}:")
                print(f"  Problem type: {problem_type}")
                print(f"  Emotion type: {emotion_type}")
                print(f"  Dialog length: {len(dialog)} turns")

            # Seeker messages seen since the last emitted sample.
            user_messages = []

            for turn_idx, turn in enumerate(dialog):
                speaker = self._text_field(turn, "speaker")
                content = clean_therapy_text(turn.get("content", ""))

                if not content:
                    continue

                if speaker == "seeker":
                    user_messages.append(content)
                    continue

                # Only a supporter turn that answers pending seeker text
                # yields a sample; anything else is ignored.
                if speaker != "supporter" or not user_messages:
                    continue

                combined_user_text = " ".join(user_messages)
                therapist_response = content
                input_text = self._build_input_text(problem_type, combined_user_text, emotion_type)

                # The conversation-level emotion label is stored both as the
                # user's emotion and as the reference emotion.
                self.evaluation_data.append({
                    'input_text': input_text,
                    'reference_text': therapist_response,
                    'reference_emotion': emotion_type,
                    'user_input': combined_user_text,
                    'user_emotion': emotion_type,
                    'problem_type': problem_type,
                    'conversation_id': conv_idx,
                    'turn_id': turn_idx
                })

                if len(self.evaluation_data) <= 3:
                    print(f"\n  Sample {len(self.evaluation_data)}:")
                    print(f"    Input: {input_text[:100]}...")
                    print(f"    Reference: {therapist_response[:100]}...")

                # Reset user messages for next interaction
                user_messages = []

        print(f"\nProcessed {len(self.evaluation_data)} evaluation samples from {len(self._data)} conversations")

    def get_evaluation_data(self):
        """Return the list of evaluation sample dicts."""
        return self.evaluation_data

    def print_sample_formats(self, num_samples=3):
        """Print sample input/output formats for verification"""
        print(f"\n{'='*80}")
        print("ESCONV SAMPLE INPUT/OUTPUT FORMATS")
        print('='*80)

        for i, sample in enumerate(self.evaluation_data[:num_samples]):
            print(f"\nSample {i+1}:")
            print(f"  Problem Type: {sample['problem_type']}")
            print(f"  User Emotion: {sample['user_emotion']}")
            print(f"  Input Format: {sample['input_text'][:150]}...")
            print(f"  Reference Response: {sample['reference_text'][:100]}...")
            print(f"  User Input Only: {sample['user_input'][:100]}...")

# ====================== Model Format Detection ======================
def detect_model_format(model, tokenizer, device='cuda', test_inputs=None):
    """
    Probe the model with a few prompts and report whether its output uses the
    <therapist_emotion> tag.

    Generation is sampled (top_p=0.8), so the outcome can differ between
    calls for a model that emits the tag only some of the time.

    Args:
        model: The model to test
        tokenizer: Model tokenizer
        device: Device to run test on
        test_inputs: List of test input strings, uses defaults if None

    Returns:
        str: 'standard' if at least half of the probes produced a
        <therapist_emotion> tag, 'unknown' otherwise (including when
        ``test_inputs`` is empty, since there is then no evidence either way)
    """
    if test_inputs is None:
        test_inputs = [
            "<problem>anxiety <user>I'm worried about work <user_emotion>anxiety <therapist>",
            "<problem>depression <user>I feel very sad <user_emotion>sadness <therapist>",
            "<problem>relationship <user>My partner doesn't understand me <user_emotion>anger <therapist>"
        ]

    total_tests = len(test_inputs)
    emotion_tag_count = 0

    model.eval()
    with torch.no_grad():
        for test_input in test_inputs:
            # Tokenize and generate
            input_ids = tokenizer.encode(test_input, return_tensors='pt').to(device)
            outputs = model.generate(
                input_ids,
                max_new_tokens=128,
                do_sample=True,
                top_p=0.8,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

            # Keep only the continuation; the prompt tokens come first in the output.
            new_tokens = outputs[0][len(input_ids[0]):]
            # Special tokens are kept so the emotion tag stays visible in the text.
            response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

            if '<therapist_emotion>' in response:
                emotion_tag_count += 1

    # With zero probes `0 >= 0` would otherwise claim the standard format.
    if total_tests == 0:
        return 'unknown'

    if emotion_tag_count >= total_tests * 0.5:
        return 'standard'
    return 'unknown'

# ====================== Model Output Extraction ======================
def extract_model_output(response):
    """
    Extract therapist text and emotion from model output.
    Both SFT and RL models use format: 'text <therapist_emotion> emotion<eos>'

    Everything from the first '<eos>' onward is discarded, so tokens
    generated after the end-of-sequence marker cannot leak into the text or
    the emotion word.

    Args:
        response (str): Decoded model continuation, special tokens included.

    Returns:
        therapist_text (str): Text before <therapist_emotion>
        emotion (str): Lower-cased first word after <therapist_emotion>,
            or "" when the tag is missing or nothing follows it
        has_emotion_tag (bool): Whether emotion tag was found
    """
    # Cut at the first <eos>; partition returns the whole string when absent.
    body = response.strip().partition('<eos>')[0].strip()

    therapist_text, tag, emotion_part = body.partition('<therapist_emotion>')
    if not tag:
        # No emotion tag found, return entire response as text
        return body, "", False

    # Get the first word as emotion
    emotion_words = emotion_part.split()
    emotion = emotion_words[0].lower() if emotion_words else ""

    return therapist_text.strip(), emotion, True

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetrics:
    """Sentence-level BLEU, ROUGE and METEOR between a reference and a candidate.

    BLEU and METEOR work on whitespace-split tokens; ROUGE receives the raw
    strings. Every method scores 0.0 when either text is empty or missing.
    """

    def __init__(self):
        """Create the ROUGE scorer (with stemming) and the BLEU smoothing function."""
        rouge_variants = ['rouge1', 'rouge2', 'rougeL']
        self.rouge_scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)
        self.smoothing = SmoothingFunction().method1

    def compute_bleu(self, reference, candidate):
        """Return smoothed sentence BLEU of *candidate* against *reference*."""
        if not (candidate and reference):
            return 0.0

        ref_words, hyp_words = reference.split(), candidate.split()

        # A whitespace-only candidate passes the check above but has no tokens.
        if not hyp_words:
            return 0.0

        return sentence_bleu([ref_words], hyp_words, smoothing_function=self.smoothing)

    def compute_rouge(self, reference, candidate):
        """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-measures as a dict."""
        variants = ('rouge1', 'rouge2', 'rougeL')
        if not (candidate and reference):
            return {name: 0.0 for name in variants}

        scored = self.rouge_scorer.score(reference, candidate)
        return {name: scored[name].fmeasure for name in variants}

    def compute_meteor(self, reference, candidate):
        """Return the METEOR score of *candidate* against *reference*."""
        if not (candidate and reference):
            return 0.0

        ref_words, hyp_words = reference.split(), candidate.split()

        # A whitespace-only candidate passes the check above but has no tokens.
        if not hyp_words:
            return 0.0

        return meteor_score([ref_words], hyp_words)

# ====================== Model Evaluation ======================
def evaluate_model_esconv(model, tokenizer, evaluation_dataset, device='cuda', max_new_tokens=128,
                         top_p=0.7, top_k=10, temperature=1.2, do_sample=True):
    """
    Evaluate model on ESConv therapy dataset

    For every sample the model generates a continuation of the prompt, the
    therapist text and emotion are extracted from it, and BLEU / ROUGE /
    METEOR plus emotion-match are recorded against the reference.

    Args:
        model: Trained model (SFT or RL)
        tokenizer: Model tokenizer
        evaluation_dataset: ESConvEvaluationDataset instance
        device: Device to run evaluation on
        max_new_tokens: Maximum tokens to generate
        top_p: Top-p sampling parameter (used only when do_sample is True)
        top_k: Top-k sampling parameter (used only when do_sample is True)
        temperature: Sampling temperature (used only when do_sample is True)
        do_sample: Whether to use sampling

    Returns:
        Dictionary containing evaluation results: per-sample score lists,
        the first 10 sample outputs, a 'summary' of averages, the emotion
        confusion matrix and the detected output format. The averages come
        from np.mean, so they are NaN when the dataset has no samples.
    """
    metrics_computer = TherapyEvaluationMetrics()
    evaluation_data = evaluation_dataset.get_evaluation_data()

    results = {
        'bleu_scores': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'meteor_scores': [],
        'emotion_accuracy': [],
        'has_emotion_tag': [],
        'sample_outputs': []
    }

    # reference emotion -> predicted emotion -> count
    emotion_confusion = defaultdict(lambda: defaultdict(int))

    model.eval()

    # Detect model output format
    detected_format = detect_model_format(model, tokenizer, device)
    print(f"Output format check: {detected_format.upper()}")

    print(f"Evaluating model on {len(evaluation_data)} ESConv samples...")

    # Sampling-only settings are forwarded only when sampling is enabled:
    # greedy decoding ignores them and generate() warns when they are set
    # alongside do_sample=False.
    generation_kwargs = {
        'max_new_tokens': max_new_tokens,
        'do_sample': do_sample,
        'pad_token_id': tokenizer.pad_token_id,
        'eos_token_id': tokenizer.eos_token_id,
    }
    if do_sample:
        generation_kwargs.update(top_p=top_p, top_k=top_k, temperature=temperature)

    # Create progress bar with custom format
    progress_bar = tqdm(evaluation_data, desc="Evaluating ESConv",
                       bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')

    for i, sample in enumerate(progress_bar):
        input_text = sample['input_text']
        reference_text = sample['reference_text']
        reference_emotion = sample['reference_emotion']

        # Tokenize input
        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(input_ids, **generation_kwargs)

        # Keep only the continuation; special tokens stay in the decoded text
        # because the extraction step looks for them.
        new_tokens = outputs[0][len(input_ids[0]):]
        response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

        # Extract model output
        predicted_text, predicted_emotion, has_emotion = extract_model_output(response)

        # Compute text generation metrics
        bleu = metrics_computer.compute_bleu(reference_text, predicted_text)
        rouge = metrics_computer.compute_rouge(reference_text, predicted_text)
        meteor = metrics_computer.compute_meteor(reference_text, predicted_text)

        # A missing emotion tag always counts as a wrong prediction.
        reference_key = reference_emotion.lower()
        predicted_key = predicted_emotion.lower()
        emotion_correct = (predicted_key == reference_key) if has_emotion else False

        # Store results
        results['bleu_scores'].append(bleu)
        results['rouge1_scores'].append(rouge['rouge1'])
        results['rouge2_scores'].append(rouge['rouge2'])
        results['rougeL_scores'].append(rouge['rougeL'])
        results['meteor_scores'].append(meteor)
        results['emotion_accuracy'].append(emotion_correct)
        results['has_emotion_tag'].append(has_emotion)

        # Update confusion matrix
        emotion_confusion[reference_key][predicted_key if has_emotion else 'no_prediction'] += 1

        # Store sample outputs for inspection
        if i < 10:  # Store first 10 samples
            results['sample_outputs'].append({
                'input': input_text,
                'reference_text': reference_text,
                'predicted_text': predicted_text,
                'reference_emotion': reference_emotion,
                'predicted_emotion': predicted_emotion,
                'full_response': response,
                'bleu': bleu,
                'rouge1': rouge['rouge1'],
                'meteor': meteor
            })

        # Per-sample description for the first 5 samples and every 20th one
        if (i + 1) % 20 == 0 or i < 5:
            sample_info = f"Sample {i+1}: BLEU={bleu:.3f}, Emotion={'✓' if emotion_correct else '✗'}"
            progress_bar.set_description(f"Evaluating ESConv - {sample_info}")

            # NOTE: this check sits inside the every-20 branch, so besides the
            # first 3 samples it only fires when (i + 1) is a multiple of both
            # 20 and 50, i.e. every 100 samples.
            if (i + 1) % 50 == 0 or i < 3:
                print(f"\n" + "="*80)
                print(f"ESCONV LIVE SAMPLE {i+1}/{len(evaluation_data)}")
                print("="*80)
                print(f"Input: {input_text[:120]}...")
                print(f"Reference: {reference_text[:100]}...")
                print(f"Generated: {predicted_text[:100]}...")
                print(f"Full Response: {response[:150]}...")
                print(f"Ref Emotion: {reference_emotion} | Pred Emotion: {predicted_emotion} | Match: {'✓' if emotion_correct else '✗'}")
                print(f"BLEU: {bleu:.4f} | ROUGE-1: {rouge['rouge1']:.4f} | METEOR: {meteor:.4f}")
                print("="*80)
        else:
            # Otherwise show running averages; i >= 5 on this path, so the
            # score lists are never empty here.
            avg_bleu = np.mean(results['bleu_scores'])
            avg_emotion_acc = np.mean(results['emotion_accuracy'])
            progress_bar.set_description(f"Evaluating ESConv - Avg BLEU: {avg_bleu:.3f}, Emotion Acc: {avg_emotion_acc:.3f}")

    progress_bar.close()

    # Compute summary statistics
    results['summary'] = {
        'avg_bleu': np.mean(results['bleu_scores']),
        'avg_rouge1': np.mean(results['rouge1_scores']),
        'avg_rouge2': np.mean(results['rouge2_scores']),
        'avg_rougeL': np.mean(results['rougeL_scores']),
        'avg_meteor': np.mean(results['meteor_scores']),
        'emotion_accuracy': np.mean(results['emotion_accuracy']),
        'emotion_tag_coverage': np.mean(results['has_emotion_tag']),
        'total_samples': len(evaluation_data)
    }

    # Only the outer mapping is converted to a plain dict.
    results['emotion_confusion_matrix'] = dict(emotion_confusion)
    results['detected_format'] = detected_format

    # Print final progress summary
    print(f"\n{'='*60}")
    print(f"ESCONV EVALUATION COMPLETED!")
    print(f"{'='*60}")
    print(f"Processed {len(evaluation_data)} samples")
    print(f"Average BLEU: {results['summary']['avg_bleu']:.4f}")
    print(f"Average ROUGE-1: {results['summary']['avg_rouge1']:.4f}")
    print(f"Emotion Accuracy: {results['summary']['emotion_accuracy']:.4f}")
    print(f"Emotion Tag Coverage: {results['summary']['emotion_tag_coverage']:.4f}")
    print(f"{'='*60}")

    return results

def print_evaluation_results(results):
    """Write the ESConv evaluation report to stdout.

    Args:
        results: Dict holding a 'summary' dict (averaged metrics,
            'emotion_tag_coverage', 'total_samples') and a 'sample_outputs'
            list; 'model_type', 'detected_format' and
            'emotion_confusion_matrix' are optional.

    Only the first three entries of 'sample_outputs' are shown, with long
    text fields cut to a fixed number of characters.
    """
    stats = results['summary']
    rule = "=" * 60

    print("\n" + rule)
    print("ESCONV THERAPY MODEL EVALUATION RESULTS")
    print(rule)
    for caption, key in (("Model Type", 'model_type'), ("Output Format", 'detected_format')):
        print(f"{caption}: {results.get(key, 'Unknown').upper()}")

    # (caption, summary key) pairs; captions are padded to a common width so
    # the values line up in one column.
    metric_rows = (
        ("BLEU Score:", 'avg_bleu'),
        ("ROUGE-1:", 'avg_rouge1'),
        ("ROUGE-2:", 'avg_rouge2'),
        ("ROUGE-L:", 'avg_rougeL'),
        ("METEOR:", 'avg_meteor'),
    )
    print("\nText Generation Metrics:")
    for caption, key in metric_rows:
        print(f"  {caption:<16}{stats[key]:.4f}")

    print("\nEmotion Prediction:")
    print(f"  Emotion Accuracy:    {stats['emotion_accuracy']:.4f}")
    print(f"  Emotion Tag Coverage: {stats['emotion_tag_coverage']:.4f}")

    print("\nDataset Info:")
    print(f"  Total Samples:  {stats['total_samples']}")

    # Nested listing: one heading per reference emotion, then its predictions.
    if 'emotion_confusion_matrix' in results:
        print("\nEmotion Confusion Matrix:")
        for expected, predictions in results['emotion_confusion_matrix'].items():
            print(f"  {expected}:")
            for predicted, tally in predictions.items():
                print(f"    -> {predicted}: {tally}")

    # (caption, sample key, maximum characters shown) for the free-text fields.
    text_fields = (
        ("Input", 'input', 100),
        ("Reference", 'reference_text', 80),
        ("Predicted", 'predicted_text', 80),
    )
    print("\nSample Outputs:")
    for number, entry in enumerate(results['sample_outputs'][:3], 1):
        print(f"\n  Sample {number}:")
        for caption, key, limit in text_fields:
            print(f"    {caption}: {entry[key][:limit]}...")
        print(f"    Ref Emotion: {entry['reference_emotion']}")
        print(f"    Pred Emotion: {entry['predicted_emotion']}")
        print(f"    BLEU: {entry['bleu']:.3f}, ROUGE-1: {entry['rouge1']:.3f}")

# ====================== Model Loading Functions ======================
def load_sft_model(checkpoint_path, tokenizer_path, device='cuda'):
    """Rebuild the SFT model from a training checkpoint file.

    Args:
        checkpoint_path: Path of a torch checkpoint containing a
            'model_state_dict' entry.
        tokenizer_path: Path of the saved GPT-2 tokenizer used in training.
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import GPT2LMHeadModel

    print(f"Loading SFT checkpoint: {checkpoint_path}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # NOTE: weights_only=False unpickles arbitrary objects from the file, so
    # only load checkpoints from a trusted source.
    saved_state = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Start from stock GPT-2 and resize the embeddings to the tokenizer's
    # vocabulary size before the saved weights are loaded.
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    vocab_size = len(tokenizer)
    model.resize_token_embeddings(vocab_size)
    model.load_state_dict(saved_state['model_state_dict'])
    model.to(device)

    if 'epoch' in saved_state:
        print(f"Checkpoint info: Epoch {saved_state['epoch']}, Loss {saved_state.get('valid_loss', 'N/A')}")

    return model, tokenizer

def load_rl_model(model_dir, tokenizer_path, device='cuda'):
    """Load an RL-trained causal language model saved as a model directory.

    Args:
        model_dir: Directory passed to ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_path: Path of the saved GPT-2 tokenizer.
        device: Device the model is moved to.

    Returns:
        tuple: (model, tokenizer)
    """
    from transformers import AutoModelForCausalLM

    print(f"Loading RL model from directory: {model_dir}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    rl_model = AutoModelForCausalLM.from_pretrained(model_dir)
    rl_model = rl_model.to(device)

    return rl_model, tokenizer

def load_model_auto(model_path, tokenizer_path, device='cuda'):
    """
    Pick the loader from the kind of path given and load the model.

    A regular file is treated as an SFT checkpoint, a directory as an RL
    model directory.

    Args:
        model_path: Path to model (file for SFT, directory for RL)
        tokenizer_path: Path to tokenizer
        device: Device to load model on

    Returns:
        tuple: (model, tokenizer, model_type), model_type being 'sft' or 'rl'

    Raises:
        ValueError: If model_path is neither an existing file nor a directory.
    """
    import os

    if os.path.isfile(model_path):
        notice, loader, model_type = "Detected SFT checkpoint file", load_sft_model, 'sft'
    elif os.path.isdir(model_path):
        # It's a directory - assume RL model
        notice, loader, model_type = "Detected RL model directory", load_rl_model, 'rl'
    else:
        raise ValueError(f"Model path {model_path} is neither a file nor a directory")

    print(notice)
    model, tokenizer = loader(model_path, tokenizer_path, device)
    return model, tokenizer, model_type

# ====================== Main Evaluation Function ======================
def run_esconv_evaluation(model_path, tokenizer_path, esconv_data_path, device='cuda'):
    """
    Load a model, build the ESConv evaluation dataset, evaluate, and report.

    Args:
        model_path: Path to trained model (file for SFT, directory for RL)
        tokenizer_path: Path to tokenizer
        esconv_data_path: Path to ESConv JSON data
        device: Device to run evaluation on

    Returns:
        The results object produced by evaluate_model_esconv, with
        'model_type' and 'dataset_type' entries added.
    """

    # Step 1: model + tokenizer (loader is chosen from the path type)
    print(f"Loading model from {model_path}...")
    loaded = load_model_auto(model_path, tokenizer_path, device)
    model, tokenizer, model_type = loaded
    print(f"Loaded {model_type.upper()} model successfully")

    # Step 2: evaluation data; show a few samples to verify conversion
    print(f"Loading ESConv evaluation data from {esconv_data_path}...")
    eval_dataset = ESConvEvaluationDataset(esconv_data_path, tokenizer_path)
    eval_dataset.print_sample_formats(3)

    # Step 3: evaluate and tag the results with their provenance
    results = evaluate_model_esconv(model, tokenizer, eval_dataset, device=device)
    for tag_name, tag_value in (('model_type', model_type), ('dataset_type', 'esconv')):
        results[tag_name] = tag_value

    # Step 4: report
    print_evaluation_results(results)

    return results

# ====================== Usage Example ======================
if __name__ == "__main__":
    import pickle

    # Use the GPU when one is available, otherwise fall back to CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Inputs for this run: RL model directory, tokenizer, and ESConv data.
    MODEL_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4"
    TOKENIZER_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer"
    ESCONV_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/PESConv.json"  # Update this path to your ESConv data

    results = run_esconv_evaluation(
        MODEL_PATH,
        TOKENIZER_PATH,
        ESCONV_DATA_PATH,
        DEVICE,
    )

    # Persist the results so they can be reloaded without re-running evaluation.
    with open('/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/Evaluation2/ESConv_RL_evaluation_results2.pkl', 'wb') as out_file:
        pickle.dump(results, out_file)

    print("\nEvaluation completed! Results saved to ESConv_RL_evaluation_results2.pkl")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading model from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4...
Detected RL model directory
Loading RL model from directory: /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4


Some weights of the model checkpoint at /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/8m19enhanced_ppo_model/epoch_4 were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded RL model successfully
Loading ESConv evaluation data from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/PESConv.json...
Processing ESConv data...


Processing conversations:  28%|██▊       | 365/1300 [00:00<00:00, 3649.62it/s]


Processing conversation 1:
  Problem type: job crisis
  Emotion type: anxiety
  Dialog length: 27 turns

  Sample 1:
    Input: <problem>job crisis <user>Hello <user_emotion>anxiety <therapist>...
    Reference: Hello, what would you like to talk about?...

  Sample 2:
    Input: <problem>job crisis <user>I am having a lot of anxiety about quitting my current job. It is too stre...
    Reference: What makes your job stressful for you?...

  Sample 3:
    Input: <problem>job crisis <user>I have to deal with many people in hard financial situations and it is ups...
    Reference: Do you help your clients to make it to a better financial situation?...

Processing conversation 2:
  Problem type: problems with friends
  Emotion type: anger
  Dialog length: 69 turns

Processing conversation 3:
  Problem type: job crisis
  Emotion type: fear
  Dialog length: 37 turns


Processing conversations: 100%|██████████| 1300/1300 [00:00<00:00, 3710.56it/s]



Processed 14642 evaluation samples from 1300 conversations

ESCONV SAMPLE INPUT/OUTPUT FORMATS

Sample 1:
  Problem Type: job crisis
  User Emotion: anxiety
  Input Format: <problem>job crisis <user>Hello <user_emotion>anxiety <therapist>...
  Reference Response: Hello, what would you like to talk about?...
  User Input Only: Hello...

Sample 2:
  Problem Type: job crisis
  User Emotion: anxiety
  Input Format: <problem>job crisis <user>I am having a lot of anxiety about quitting my current job. It is too stressful but pays well <user_emotion>anxiety <therapi...
  Reference Response: What makes your job stressful for you?...
  User Input Only: I am having a lot of anxiety about quitting my current job. It is too stressful but pays well...

Sample 3:
  Problem Type: job crisis
  User Emotion: anxiety
  Input Format: <problem>job crisis <user>I have to deal with many people in hard financial situations and it is upsetting <user_emotion>anxiety <therapist>...
  Reference Response: Do you

Evaluating ESConv - Sample 1: BLEU=0.029, Emotion=✗:   0%|          | 1/14642 [00:00<38:54,  6.27it/s]


ESCONV LIVE SAMPLE 1/14642
Input: <problem>job crisis <user>Hello <user_emotion>anxiety <therapist>...
Reference: Hello, what would you like to talk about?...
Generated: you're nervous about what you're gonna say to me....
Full Response: you're nervous about what you're gonna say to me. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0285 | ROUGE-1: 0.4211 | METEOR: 0.1235


Evaluating ESConv - Sample 3: BLEU=0.000, Emotion=✗:   0%|          | 2/14642 [00:00<49:48,  4.90it/s]


ESCONV LIVE SAMPLE 2/14642
Input: <problem>job crisis <user>I am having a lot of anxiety about quitting my current job. It is too stressful but pays well ...
Reference: What makes your job stressful for you?...
Generated: you're worried about what you're going to do next month. What do you think about it?...
Full Response: you're worried about what you're going to do next month. What do you think about it? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0123 | ROUGE-1: 0.1600 | METEOR: 0.2373

ESCONV LIVE SAMPLE 3/14642
Input: <problem>job crisis <user>I have to deal with many people in hard financial situations and it is upsetting <user_emotion...
Reference: Do you help your clients to make it to a better financial situation?...
Generated: you're anxious?...
Full Response: you're anxious? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1250 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.006, Emotion Acc: 0.000:   1%|          | 102/14642 [00:12<29:08,  8.32it/s]


ESCONV LIVE SAMPLE 100/14642
Input: <problem>job crisis <user>sorry about that, but that's good, people help each other Absolutely! Volunteering is a great ...
Reference: Do you feel you will be getting the vaccine when it is rolled out where you are?...
Generated: if you're going to do that, you need somebody who's going to help you,...
Full Response: if you're going to do that, you need somebody who's going to help you, <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0115 | ROUGE-1: 0.2424 | METEOR: 0.0898


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:   1%|▏         | 201/14642 [00:23<25:57,  9.27it/s]


ESCONV LIVE SAMPLE 200/14642
Input: <problem>job crisis <user>Yes, I really think you are correct! thank you so very much for your help today! <user_emotion...
Reference: The survivor rate from COVID-19 infections is around 99%, so your chances of dying from the virus ar...
Generated: I'm sorry, Alex....
Full Response: I'm sorry, Alex. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   2%|▏         | 300/14642 [00:34<22:09, 10.78it/s]


ESCONV LIVE SAMPLE 300/14642
Input: <problem>ongoing depression <user>No I think I'm good <user_emotion>depression <therapist>...
Reference: Please click "Finish and Quit" to close the chat.. thank you....
Generated: I think you're good....
Full Response: I think you're good. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1250 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   3%|▎         | 401/14642 [00:46<31:25,  7.55it/s]


ESCONV LIVE SAMPLE 400/14642
Input: <problem>breakup with partner <user>i will have to find somewhere i can properly exercise my awful vocals lol <user_emot...
Reference: Ah! I like pop songs from the 60's :-)...
Generated: you'll have to find somewhere i can adequately exercise your awful vocals?...
Full Response: you'll have to find somewhere i can adequately exercise your awful vocals? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0909 | METEOR: 0.0538


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   3%|▎         | 501/14642 [00:58<26:51,  8.77it/s]


ESCONV LIVE SAMPLE 500/14642
Input: <problem>job crisis <user>yes , you prediction was so good <user_emotion>sadness <therapist>...
Reference: That's lovely to hear. Have you tried discussing your doubts with your supervisor? There are times w...
Generated: You're gonna make me a better boxer?...
Full Response: You're gonna make me a better boxer? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0426 | METEOR: 0.0147


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   4%|▍         | 602/14642 [01:10<21:45, 10.76it/s]


ESCONV LIVE SAMPLE 600/14642
Input: <problem>job crisis <user>Merry Chirstmas. <user_emotion>anxiety <therapist>...
Reference: I'm doing good!...
Generated: you're worried about...
Full Response: you're worried about <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   5%|▍         | 701/14642 [01:21<29:37,  7.84it/s]


ESCONV LIVE SAMPLE 700/14642
Input: <problem>ongoing depression <user>We are looking at other options via business visa but we don't know how that will go <...
Reference: if you are an American, you can just go to Indonesia, get married to your beautiful fiancée and come...
Generated: You're afraid that you're gonna be stopped by us?...
Full Response: You're afraid that you're gonna be stopped by us? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1143 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   5%|▌         | 801/14642 [01:34<25:41,  8.98it/s]


ESCONV LIVE SAMPLE 800/14642
Input: <problem>job crisis <user>Bye Have a Good Day Some Report preparation <user_emotion>anger <therapist>...
Reference: Have an amazing Christmas!...
Generated: You're feeling really anxious....
Full Response: You're feeling really anxious. <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   6%|▌         | 901/14642 [01:46<27:38,  8.29it/s]


ESCONV LIVE SAMPLE 900/14642
Input: <problem>ongoing depression <user>That actually sounds like a good idea. I hope the shelter near me will take volunteers...
Reference: If you are not comfortable going out due to COVID, you could involve some activities online promotin...
Generated: You're gonna take the volunteers?...
Full Response: You're gonna take the volunteers? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0606 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   7%|▋         | 1001/14642 [01:58<28:04,  8.10it/s]


ESCONV LIVE SAMPLE 1000/14642
Input: <problem>academic pressure <user>I'll keep that in mind as I move forward. I really appreciate your help. <user_emotion>...
Reference: I commend you for doing the thesis and I also know it can get weary as time passes, but with some go...
Generated: you're helping me....
Full Response: you're helping me. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0465 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   8%|▊         | 1101/14642 [02:10<28:37,  7.88it/s]


ESCONV LIVE SAMPLE 1100/14642
Input: <problem>breakup with partner <user>With him... he was perfect (or at least I thought), but also, I'm lonely, and it's h...
Reference: Have you tried making other connections online? A lot of people are home during this pandemic, you m...
Generated: Do you feel lonely when you're alone?...
Full Response: Do you feel lonely when you're alone? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0039 | ROUGE-1: 0.1333 | METEOR: 0.0488


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   8%|▊         | 1201/14642 [02:22<30:02,  7.46it/s]


ESCONV LIVE SAMPLE 1200/14642
Input: <problem>academic pressure <user>i usually try to follow the tutorials and do revisions <user_emotion>anxiety <therapist...
Reference: Can you think of anything you can do, to help make your studying be more successful?...
Generated: you're not sure you wanna follow the drawings?...
Full Response: you're not sure you wanna follow the drawings? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0102 | ROUGE-1: 0.1600 | METEOR: 0.0658


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:   9%|▉         | 1301/14642 [02:34<25:48,  8.62it/s]


ESCONV LIVE SAMPLE 1300/14642
Input: <problem>job crisis <user>And to you You doing alright? <user_emotion>anxiety <therapist>...
Reference: How are you today? I am buried in cookie dough, lol, but happy and wwell....
Generated: I'm... I'm fine....
Full Response: I'm... I'm fine. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  10%|▉         | 1402/14642 [02:45<22:59,  9.60it/s]


ESCONV LIVE SAMPLE 1400/14642
Input: <problem>academic pressure <user>Well, counselors are hard to get ahold of so close to the Holidays. The so-called teach...
Reference: Are you confident in your abilities to succeed in those classes or you think you might lack the mate...
Generated: who does not want you to be around her?...
Full Response: who does not want you to be around her? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0054 | ROUGE-1: 0.1212 | METEOR: 0.0667


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  10%|█         | 1501/14642 [02:57<28:07,  7.79it/s]


ESCONV LIVE SAMPLE 1500/14642
Input: <problem>job crisis <user>I am having a hard time.... Ok, I will look some of them online. I need some financial planner...
Reference: If you want other suggestions, meditation is something that is extremely overlooked and will help yo...
Generated: Do you think that you're being a good mom to your son?...
Full Response: Do you think that you're being a good mom to your son? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0011 | ROUGE-1: 0.1905 | METEOR: 0.0649


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  11%|█         | 1602/14642 [03:10<26:53,  8.08it/s]


ESCONV LIVE SAMPLE 1600/14642
Input: <problem>problems with friends <user>no other questions, you have been most helpful, thank you! <user_emotion>anxiety <t...
Reference: I have done volunteer work; people want to be friends with those who care and know how to use a nail...
Generated: you've been most helpful,...
Full Response: you've been most helpful, <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0227


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  12%|█▏        | 1701/14642 [03:21<23:20,  9.24it/s]


ESCONV LIVE SAMPLE 1700/14642
Input: <problem>job crisis <user>I feel like a replaceable cog in a giant machine that doesn't give two shits whether I show up...
Reference: you are doing your best and are working really hard...
Generated: you're not getting your numbers up, you're just...
Full Response: you're not getting your numbers up, you're just <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0216 | ROUGE-1: 0.2000 | METEOR: 0.0510


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  12%|█▏        | 1801/14642 [03:33<21:05, 10.15it/s]


ESCONV LIVE SAMPLE 1800/14642
Input: <problem>job crisis <user>Do you smoke and drink alcohol? <user_emotion>depression <therapist>...
Reference: No, I don't find that to be healthy, and I don't enjoy them....
Generated: I'm just asking....
Full Response: I'm just asking. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1053 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  13%|█▎        | 1901/14642 [03:45<26:37,  7.98it/s]


ESCONV LIVE SAMPLE 1900/14642
Input: <problem>academic pressure <user>I think so? I think it is just the way the material is taught now. We have these "Zoom"...
Reference: What happened when you reached out to the teacher?...
Generated: Do you... Do you wish you could retire?...
Full Response: Do you... Do you wish you could retire? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0245 | ROUGE-1: 0.1176 | METEOR: 0.0562


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  14%|█▎        | 2001/14642 [03:57<20:38, 10.20it/s]


ESCONV LIVE SAMPLE 2000/14642
Input: <problem>ongoing depression <user>Well thank you im okay <user_emotion>depression <therapist>...
Reference: Hmm all the best...
Generated: you're fine....
Full Response: you're fine. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  14%|█▍        | 2101/14642 [04:08<24:45,  8.44it/s]


ESCONV LIVE SAMPLE 2100/14642
Input: <problem>ongoing depression <user>Hmm, I didn't know that. I'm on a waiting list for a therapist through my medical insu...
Reference: That is a good idea, especially at this time when it's probably not a good idea to see too many new ...
Generated: Do you think you could get help from somebody...
Full Response: Do you think you could get help from somebody <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0019 | ROUGE-1: 0.0952 | METEOR: 0.0337


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  15%|█▌        | 2201/14642 [04:19<22:34,  9.19it/s]


ESCONV LIVE SAMPLE 2200/14642
Input: <problem>problems with friends <user>hi <user_emotion>anxiety <therapist>...
Reference: Hello....
Generated: I'm sorry....
Full Response: I'm sorry. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  16%|█▌        | 2301/14642 [04:31<22:56,  8.96it/s]


ESCONV LIVE SAMPLE 2300/14642
Input: <problem>job crisis <user>did she start off doing that or was she employed by another company <user_emotion>fear <therap...
Reference: She has over 100 companies that are hiring...
Generated: you're not sure?...
Full Response: you're not sure? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  16%|█▋        | 2401/14642 [04:43<21:52,  9.32it/s]


ESCONV LIVE SAMPLE 2400/14642
Input: <problem>job crisis <user>I try to find a job but there is not many <user_emotion>anxiety <therapist>...
Reference: I am sorry to hear that. How long have you been looking?...
Generated: you can do that?...
Full Response: you can do that? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0109 | ROUGE-1: 0.2500 | METEOR: 0.0446


Evaluating ESConv - Sample 2500: BLEU=0.002, Emotion=✗:  17%|█▋        | 2500/14642 [04:54<23:25,  8.64it/s]


ESCONV LIVE SAMPLE 2500/14642
Input: <problem>breakup with partner <user>my dog was 13 years old. she was very close to me. i had her when she was a puppy <u...
Reference: The dog then lived a very happy and fulfilling life from what I can tell. Dogs are among the best co...
Generated: you were very close to her?...
Full Response: you were very close to her? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0024 | ROUGE-1: 0.1333 | METEOR: 0.0676


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  18%|█▊        | 2601/14642 [05:06<27:15,  7.36it/s]


ESCONV LIVE SAMPLE 2600/14642
Input: <problem>problems with friends <user>Friends not being there for me when I need emotional support. They also do not pay ...
Reference: So just to make sure I understand, you are having issues with your friends not being there for you e...
Generated: Why do you think they don't want you to be there for them?...
Full Response: Why do you think they don't want you to be there for them? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0082 | ROUGE-1: 0.2917 | METEOR: 0.1795


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  18%|█▊        | 2702/14642 [05:18<20:39,  9.63it/s]


ESCONV LIVE SAMPLE 2700/14642
Input: <problem>ongoing depression <user>That's a very optimistic view... <user_emotion>sadness <therapist>...
Reference: Like I said, I used to be sad too and not optimistic. But now I am optimistic because I know what it...
Generated: That's a very optimistic view?...
Full Response: That's a very optimistic view? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0417 | METEOR: 0.0134


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  19%|█▉        | 2800/14642 [05:30<23:42,  8.32it/s]


ESCONV LIVE SAMPLE 2800/14642
Input: <problem>ongoing depression <user>I just lost interest. I'm losing interest in everything, man I think it's time for my ...
Reference: I certainly hope you don't do that!...
Generated: You don't think you're losing interest in everything?...
Full Response: You don't think you're losing interest in everything? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0278 | ROUGE-1: 0.3333 | METEOR: 0.2641


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  20%|█▉        | 2901/14642 [05:41<23:30,  8.32it/s]


ESCONV LIVE SAMPLE 2900/14642
Input: <problem>job crisis <user>yes, thanks a lot for your time <user_emotion>anger <therapist>...
Reference: You are welcome. I hope you don't give up, and stay optimistic! It sounds like you have a great star...
Generated: You're welcome....
Full Response: You're welcome. <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1667 | METEOR: 0.0275


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  20%|██        | 3001/14642 [05:53<24:16,  7.99it/s]


ESCONV LIVE SAMPLE 3000/14642
Input: <problem>breakup with partner <user>that will be hard because i am soo used to her being around <user_emotion>sadness <t...
Reference: I can understand that but the only way you can work it out with her if both of you two just talk...
Generated: you're so used to seeing her around?...
Full Response: you're so used to seeing her around? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0039 | ROUGE-1: 0.1333 | METEOR: 0.0488


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  21%|██        | 3100/14642 [06:05<19:13, 10.01it/s]


ESCONV LIVE SAMPLE 3100/14642
Input: <problem>job crisis <user>Hi, I am doing ok, how are you? <user_emotion>fear <therapist>...
Reference: Good thank you. Why only ok? What is bothering you?...
Generated: Good. What are you doing?...
Full Response: Good. What are you doing? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0198 | ROUGE-1: 0.4000 | METEOR: 0.0526


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  22%|██▏       | 3201/14642 [06:16<22:25,  8.50it/s]


ESCONV LIVE SAMPLE 3200/14642
Input: <problem>job crisis <user>ohh 1 wow, I will definitely try that. It sounds like you're a doctor!" Whenever I get tense o...
Reference: Thank you that is very kind of you to say. I have just had alot of practice in dealing with anxiety....
Generated: I know you're not perfect,...
Full Response: I know you're not perfect, <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0727 | METEOR: 0.0112


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  23%|██▎       | 3301/14642 [06:27<21:16,  8.88it/s]


ESCONV LIVE SAMPLE 3300/14642
Input: <problem>academic pressure <user>i was mad, but now i am just upset. My professor put in grades for my class earlier thi...
Reference: Just to make sure I understand, you got a bad grade in one of your classes and your grade dropped be...
Generated: you're upset?...
Full Response: you're upset? <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1250 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  23%|██▎       | 3400/14642 [06:38<19:02,  9.84it/s]


ESCONV LIVE SAMPLE 3400/14642
Input: <problem>problems with friends <user>Hello. <user_emotion>sadness <therapist>...
Reference: Hello...
Generated: I'm sorry?...
Full Response: I'm sorry? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  24%|██▍       | 3502/14642 [06:50<19:23,  9.57it/s]


ESCONV LIVE SAMPLE 3500/14642
Input: <problem>ongoing depression <user>Hi <user_emotion>shame <therapist>...
Reference: Hello...
Generated: you're angry. What are you angry about?...
Full Response: you're angry. What are you angry about? <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  25%|██▍       | 3600/14642 [07:02<18:19, 10.04it/s]


ESCONV LIVE SAMPLE 3600/14642
Input: <problem>job crisis <user>you are so nice. i lost my job and having trouble finding a new on one <user_emotion>anxiety <...
Reference: Oh, I'm so sorry. I can empathize a bit. I lost mine earlier this year due to covid, but they did br...
Generated: you're so nice....
Full Response: you're so nice. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0001 | ROUGE-1: 0.0690 | METEOR: 0.0228


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  25%|██▌       | 3701/14642 [07:14<20:08,  9.05it/s]


ESCONV LIVE SAMPLE 3700/14642
Input: <problem>job crisis <user>I have sold wreaths for awhile but I did not have time to be as serious so I did not sell a to...
Reference: That sounds like an excellent idea to me! I think that you are definitely doing the right thing....
Generated: you're not serious?...
Full Response: you're not serious? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0909 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  26%|██▌       | 3801/14642 [07:26<19:36,  9.22it/s]


ESCONV LIVE SAMPLE 3800/14642
Input: <problem>academic pressure <user>We have gotten him tutors in the past and it helps. I am not sure I could do a tutor wi...
Reference: you are right. Have you tried helping him with his work?...
Generated: I can do a tutor with him?...
Full Response: I can do a tutor with him? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0187 | ROUGE-1: 0.2222 | METEOR: 0.0472


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  27%|██▋       | 3902/14642 [07:38<19:45,  9.06it/s]


ESCONV LIVE SAMPLE 3900/14642
Input: <problem>ongoing depression <user>oh, i dont use it much <user_emotion>depression <therapist>...
Reference: There is also proofreading that can help establish yourself...
Generated: You don't really have to use it?...
Full Response: You don't really have to use it? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  27%|██▋       | 4001/14642 [07:49<18:30,  9.58it/s]


ESCONV LIVE SAMPLE 4000/14642
Input: <problem>ongoing depression <user>Garbo hates small animals also. Especially skunks <user_emotion>anxiety <therapist>...
Reference: Oh Garbo! Hahahaha! She sounds wonderful!...
Generated: He can get you into trouble?...
Full Response: He can get you into trouble? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  28%|██▊       | 4101/14642 [08:00<19:11,  9.15it/s]


ESCONV LIVE SAMPLE 4100/14642
Input: <problem>breakup with partner <user>so sorry i had stepped away <user_emotion>sadness <therapist>...
Reference: I have been having a bit of a bad time in life lately so I can understand and empathise if you are t...
Generated: you were angry?...
Full Response: you were angry? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0001 | ROUGE-1: 0.0769 | METEOR: 0.0238


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  29%|██▊       | 4201/14642 [08:12<20:02,  8.68it/s]


ESCONV LIVE SAMPLE 4200/14642
Input: <problem>problems with friends <user>haha YES! see it is a stupid thing. like dood we hook your ass up all the tome time...
Reference: People suck....
Generated: I'm sorry?...
Full Response: I'm sorry? <therapist_emotion> neutral<eos>...
Ref Emotion: disgust | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  29%|██▉       | 4301/14642 [08:24<18:07,  9.51it/s]


ESCONV LIVE SAMPLE 4300/14642
Input: <problem>academic pressure <user>Yes, I am feeling very anxious about my school work lately. <user_emotion>anxiety <ther...
Reference: I see, so your main worry is about academic pressure?...
Generated: you're anxious?...
Full Response: you're anxious? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  30%|███       | 4402/14642 [08:36<17:22,  9.82it/s]


ESCONV LIVE SAMPLE 4400/14642
Input: <problem>job crisis <user>It was a political game where she isolated the president and made him no longer support her. I...
Reference: ok so your director played a game she was not allowed to play and she got caught playing it and quit...
Generated: You think she's acting like you?...
Full Response: You think she's acting like you? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0606 | METEOR: 0.0208


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  31%|███       | 4501/14642 [08:48<21:39,  7.80it/s]


ESCONV LIVE SAMPLE 4500/14642
Input: <problem>breakup with partner <user>What are those? I heard of them, but never used them. Most of my old friends are no ...
Reference: Social networks online, you can connect with family or old coworkers. And post pictures and see what...
Generated: You don't think that's true?...
Full Response: You don't think that's true? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0645 | METEOR: 0.0226


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  31%|███▏      | 4601/14642 [09:00<22:56,  7.29it/s]


ESCONV LIVE SAMPLE 4600/14642
Input: <problem>ongoing depression <user>No, since too much time has passed <user_emotion>sadness <therapist>...
Reference: Have you tried any local churches or agencies? findhelp.org is also a great resource....
Generated: You've got a little bit of time....
Full Response: You've got a little bit of time. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0122 | ROUGE-1: 0.1739 | METEOR: 0.0752


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  32%|███▏      | 4700/14642 [09:11<18:01,  9.20it/s]


ESCONV LIVE SAMPLE 4700/14642
Input: <problem>problems with friends <user>That makes sense but I just wish I knew why she said it. Did I do something to her,...
Reference: maybe you should let her know that you love her and say that you would never want to hurt her, so if...
Generated: you're afraid she'll say anything...
Full Response: you're afraid she'll say anything <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0784 | METEOR: 0.0134


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  33%|███▎      | 4800/14642 [09:24<22:15,  7.37it/s]


ESCONV LIVE SAMPLE 4800/14642
Input: <problem>breakup with partner <user>well, it was weird that he got so dressed up to go hang out with friends it makes se...
Reference: Ok, so this is a HUGE red flag to you. You can ignore it, or not. Women KNOW when a man is cheating,...
Generated: you didn't wanna be naked to go hang out with him?...
Full Response: you didn't wanna be naked to go hang out with him? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0012 | ROUGE-1: 0.1071 | METEOR: 0.0369


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  33%|███▎      | 4901/14642 [09:35<19:03,  8.52it/s]


ESCONV LIVE SAMPLE 4900/14642
Input: <problem>job crisis <user>i'm withdrawing from the work and that's bad. i still care about the work, but the toxic envir...
Reference: maybe seek out someone more senior in a different department who can help...
Generated: you're gonna do anything to help me?...
Full Response: you're gonna do anything to help me? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0140 | ROUGE-1: 0.0952 | METEOR: 0.0403


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  34%|███▍      | 5002/14642 [09:47<18:20,  8.76it/s]


ESCONV LIVE SAMPLE 5000/14642
Input: <problem>academic pressure <user>How are you doing today? <user_emotion>anxiety <therapist>...
Reference: I am doing well and how are you feeling today?...
Generated: I'm fine....
Full Response: I'm fine. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1538 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  35%|███▍      | 5101/14642 [10:00<17:52,  8.90it/s]


ESCONV LIVE SAMPLE 5100/14642
Input: <problem>job crisis <user>Yes. <user_emotion>fear <therapist>...
Reference: I can understand how that feels. Have you tried to talk to them about it?...
Generated: And what happened?...
Full Response: And what happened? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.000:  36%|███▌      | 5201/14642 [10:11<13:38, 11.54it/s]


ESCONV LIVE SAMPLE 5200/14642
Input: <problem>academic pressure <user>Hello! Life is treating me fair, what about you? <user_emotion>anxiety <therapist>...
Reference: Well, I've been a bit worried this year due to Covid and everything. Have you been worrying about an...
Generated: You're anxious....
Full Response: You're anxious. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0870 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  36%|███▌      | 5301/14642 [10:23<17:23,  8.95it/s]


ESCONV LIVE SAMPLE 5300/14642
Input: <problem>problems with friends <user>I agree. She can't see past her own issues to see the whole situation. <user_emotio...
Reference: If you think its a more serious problem maybe ask her to seek professional help....
Generated: She's not seeing the whole picture?...
Full Response: She's not seeing the whole picture? <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  37%|███▋      | 5400/14642 [10:34<18:17,  8.42it/s]


ESCONV LIVE SAMPLE 5400/14642
Input: <problem>breakup with partner <user>Things have been pretty rough for the last few years. I found out she was hanging ou...
Reference: So from what I am hearing, you ended things with your girlfriend because she was talking to her ex a...
Generated: you're worried that she's going to come to her...
Full Response: you're worried that she's going to come to her  <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0044 | ROUGE-1: 0.2326 | METEOR: 0.0505


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  38%|███▊      | 5501/14642 [10:46<17:45,  8.58it/s]


ESCONV LIVE SAMPLE 5500/14642
Input: <problem>breakup with partner <user>His latest obsession has been 3d printing. I have found weird sites with ' funny'' d...
Reference: goodness...
Generated: He might not like that?...
Full Response: He might not like that? <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  38%|███▊      | 5600/14642 [10:57<18:03,  8.34it/s]


ESCONV LIVE SAMPLE 5600/14642
Input: <problem>breakup with partner <user>I enjoy snowboarding as a hobby. I might try to get back on my board depending on wh...
Reference: Yes. when you are depressed it is hard to become motivated to do anything. However, it is extremely ...
Generated: Do you think you'll find something else?...
Full Response: Do you think you'll find something else? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0952 | METEOR: 0.0217


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  39%|███▉      | 5701/14642 [11:10<16:58,  8.78it/s]


ESCONV LIVE SAMPLE 5700/14642
Input: <problem>ongoing depression <user>i am depressed <user_emotion>depression <therapist>...
Reference: I'm sorry to hear that! When did you begin feeling this way?...
Generated: You're depressed?...
Full Response: You're depressed? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1250 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  40%|███▉      | 5801/14642 [11:22<14:10, 10.40it/s]


ESCONV LIVE SAMPLE 5800/14642
Input: <problem>breakup with partner <user>I will follow your suggestion. Thanks a lot. <user_emotion>sadness <therapist>...
Reference: You are so welcome! I hope you start to feel better soon....
Generated: You'll follow me?...
Full Response: You'll follow me? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1250 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  40%|████      | 5902/14642 [11:33<15:50,  9.20it/s]


ESCONV LIVE SAMPLE 5900/14642
Input: <problem>problems with friends <user>I appreciate your help! I'll try to have a productive conversation with her soon. <...
Reference: Great, the sooner the better. I have a feeling you'll both be relieved when you do....
Generated: What do you mean, when she'll be happy?...
Full Response: What do you mean, when she'll be happy? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0134 | ROUGE-1: 0.3846 | METEOR: 0.0987


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  41%|████      | 6001/14642 [11:44<15:50,  9.09it/s]


ESCONV LIVE SAMPLE 6000/14642
Input: <problem>ongoing depression <user>I've been felling quite deppressed recently, just wanted to ask for some mental suppor...
Reference: why are you feeling in that way? tell me what it's happening?...
Generated: I'd like to ask for some support?...
Full Response: I'd like to ask for some support? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  42%|████▏     | 6101/14642 [11:55<14:27,  9.85it/s]


ESCONV LIVE SAMPLE 6100/14642
Input: <problem>ongoing depression <user>It's just me and my small family. I was at college but moved home a few weeks ago. At ...
Reference: I have heard that is the norm among colleges. That has to be really tough. So much of college is in ...
Generated: You're......
Full Response: You're... <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0392 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  42%|████▏     | 6202/14642 [12:08<14:48,  9.50it/s]


ESCONV LIVE SAMPLE 6200/14642
Input: <problem>breakup with partner <user>yes i have faith in people very fast seriously its like your seeing my mind <user_em...
Reference: Maybe this could be a learning experience? I always try to see the best in people, but have learned ...
Generated: you're seeing your mind?...
Full Response: you're seeing your mind? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0556 | METEOR: 0.0177


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  43%|████▎     | 6301/14642 [12:19<16:42,  8.32it/s]


ESCONV LIVE SAMPLE 6300/14642
Input: <problem>academic pressure <user>Yes, that is true. I just can't quite seem to quiet my mind. (8) <user_emotion>anxiety ...
Reference: Well, I've tried meditating a few times before which doesn;t take it all away, it just takes the edg...
Generated: You're not quite able to......
Full Response: You're not quite able to... <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0789 | METEOR: 0.0083


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  44%|████▎     | 6401/14642 [12:31<17:05,  8.04it/s]


ESCONV LIVE SAMPLE 6400/14642
Input: <problem>job crisis <user>It might, I am not sure but I don't think my boss will like the idea <user_emotion>anxiety <th...
Reference: Sometimes it's important to remember that part of your bosses job is having this kinds of conversati...
Generated: you've brought in. What would you like me to do?...
Full Response: you've brought in. What would you like me to do? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0056 | ROUGE-1: 0.1081 | METEOR: 0.0426


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  44%|████▍     | 6502/14642 [12:43<13:55,  9.74it/s]


ESCONV LIVE SAMPLE 6500/14642
Input: <problem>ongoing depression <user>but he thinks he is always right and blah blah blah yes My real concern is for the boy...
Reference: I can tell this situation stresses you out and probably consumes a lot of your day to day energy. I ...
Generated: you think it's okay that you're always wrong?...
Full Response: you think it's okay that you're always wrong? <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0008 | ROUGE-1: 0.0870 | METEOR: 0.0151


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  45%|████▌     | 6602/14642 [12:53<13:02, 10.27it/s]


ESCONV LIVE SAMPLE 6600/14642
Input: <problem>ongoing depression <user>True That's a good idea. Thank you for your help I just have this as my mai n income r...
Reference: Okay, then one of you will have income and then he can find a job later....
Generated: Help me to continue to...
Full Response: Help me to continue to <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  46%|████▌     | 6701/14642 [13:05<14:08,  9.36it/s]


ESCONV LIVE SAMPLE 6700/14642
Input: <problem>academic pressure <user>i know thats why im so stress. stressed bye <user_emotion>depression <therapist>...
Reference: Perhaps "teacher" is the right path, buth "math" is not. good Luck to you...
Generated: I'm not gonna......
Full Response: I'm not gonna... <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1111 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  46%|████▋     | 6801/14642 [13:17<13:03, 10.01it/s]


ESCONV LIVE SAMPLE 6800/14642
Input: <problem>problems with friends <user>Hi, I need to vent about an ongoing situation. Can I talk to you? <user_emotion>ang...
Reference: Of course you can talk to me, What would you like to vent about?...
Generated: Sure. How about this?...
Full Response: Sure. How about this? <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1111 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  47%|████▋     | 6901/14642 [13:28<15:13,  8.48it/s]


ESCONV LIVE SAMPLE 6900/14642
Input: <problem>ongoing depression <user>I do. My other sister is always a big help. Maybe I should text her more often? <user_...
Reference: yes! that would always be a good idea. its always comforting to hear from a loved one....
Generated: You don't want to talk to her too much?...
Full Response: You don't want to talk to her too much? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0099 | ROUGE-1: 0.0741 | METEOR: 0.0309


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  48%|████▊     | 7001/14642 [13:41<17:33,  7.25it/s]


ESCONV LIVE SAMPLE 7000/14642
Input: <problem>breakup with partner <user>My breakup was a few months ago too. I'm in the Poconos. :) <user_emotion>depression...
Reference: I've always wanted to go to Las Vegas. My brother and his girlfriend just took a trip out there not ...
Generated: Do you think it's possible for you to...
Full Response: Do you think it's possible for you to <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0043 | ROUGE-1: 0.0606 | METEOR: 0.0233


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  48%|████▊     | 7101/14642 [13:52<12:31, 10.04it/s]


ESCONV LIVE SAMPLE 7100/14642
Input: <problem>academic pressure <user>Yeah, it can be very stressful when I don't understand though. I'm normally a very good...
Reference: Yes, I had a 100% average almost in high school and still struggled with the class. I kind of wish t...
Generated: you're not feeling well?...
Full Response: you're not feeling well? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  49%|████▉     | 7201/14642 [14:06<17:06,  7.25it/s]


ESCONV LIVE SAMPLE 7200/14642
Input: <problem>academic pressure <user>To be honest I have been a little afraid to approach them about the subject because the...
Reference: Just from chatting to you, I can sense that you are anxious, however, as many students feel this way...
Generated: You're afraid you're gonna be blamed?...
Full Response: You're afraid you're gonna be blamed? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1053 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  50%|████▉     | 7301/14642 [14:17<12:57,  9.44it/s]


ESCONV LIVE SAMPLE 7300/14642
Input: <problem>breakup with partner <user>I m not sure. why would my friend let it happen, and why would my boyfriend choose t...
Reference: you've had some time to sit with this, i understand completely that you are disgusted, but how have ...
Generated: you're angry with her?...
Full Response: you're angry with her? <therapist_emotion> neutral<eos>...
Ref Emotion: disgust | Pred Emotion: neutral | Match: ✗
BLEU: 0.0004 | ROUGE-1: 0.1290 | METEOR: 0.0218


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  51%|█████     | 7401/14642 [14:29<13:32,  8.91it/s]


ESCONV LIVE SAMPLE 7400/14642
Input: <problem>ongoing depression <user>That must have been difficult to deal with. Mental health affects so many people in so...
Reference: It was. Luckily I had my doctor to help me. Family members meant well but chalked it up to "that's w...
Generated: That's what you're saying?...
Full Response: That's what you're saying? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0001 | ROUGE-1: 0.1500 | METEOR: 0.0166


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  51%|█████     | 7501/14642 [14:41<13:58,  8.52it/s]


ESCONV LIVE SAMPLE 7500/14642
Input: <problem>problems with friends <user>feeling very shame <user_emotion>shame <therapist>...
Reference: Oh my god ! What happened?...
Generated: you're not shameful?...
Full Response: you're not shameful? <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  52%|█████▏    | 7601/14642 [14:52<12:02,  9.75it/s]


ESCONV LIVE SAMPLE 7600/14642
Input: <problem>ongoing depression <user>Yes I do hope so. <user_emotion>depression <therapist>...
Reference: I love AMC/Holiday movies as well!...
Generated: You're very happy....
Full Response: You're very happy. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  53%|█████▎    | 7701/14642 [15:04<13:23,  8.63it/s]


ESCONV LIVE SAMPLE 7700/14642
Input: <problem>ongoing depression <user>I've been better. Just stressed really bad. but I think everybody just about is at thi...
Reference: it is a stressful time. what's bothering you the most?...
Generated: Do you think everybody else is at this point?...
Full Response: Do you think everybody else is at this point? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0255 | ROUGE-1: 0.2000 | METEOR: 0.1010


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  53%|█████▎    | 7800/14642 [15:15<12:49,  8.89it/s]


ESCONV LIVE SAMPLE 7800/14642
Input: <problem>ongoing depression <user>Yeah maybe. I suppose nobody really understands but I could at least speak to them abo...
Reference: everyone has problems and it is good to talk about them because a problem shared is a problem halved...
Generated: You wanna speak to them about it?...
Full Response: You wanna speak to them about it? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0003 | ROUGE-1: 0.1961 | METEOR: 0.0635


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  54%|█████▍    | 7901/14642 [15:27<13:18,  8.44it/s]


ESCONV LIVE SAMPLE 7900/14642
Input: <problem>breakup with partner <user>yes <user_emotion>anxiety <therapist>...
Reference: You are definitely not the only person to feel this way!...
Generated: you're not gonna do that?...
Full Response: you're not gonna do that? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0162 | ROUGE-1: 0.2353 | METEOR: 0.0481


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  55%|█████▍    | 8000/14642 [15:40<13:03,  8.47it/s]


ESCONV LIVE SAMPLE 8000/14642
Input: <problem>ongoing depression <user>yea, thats what my family is telling me to not rush into thigns <user_emotion>depressi...
Reference: You have to do what's right for you...
Generated: You're gonna rush into this?...
Full Response: You're gonna rush into this? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1333 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  55%|█████▌    | 8101/14642 [15:53<13:33,  8.04it/s]


ESCONV LIVE SAMPLE 8100/14642
Input: <problem>ongoing depression <user>but anyhow, I have been conferring with my friend pretty much daily on the phone and t...
Reference: that's great. having a supportive friend/family is so much a blessing during this pandemic...
Generated: if you'd like to talk to him....
Full Response: if you'd like to talk to him. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  56%|█████▌    | 8201/14642 [16:05<13:03,  8.22it/s]


ESCONV LIVE SAMPLE 8200/14642
Input: <problem>ongoing depression <user>Hi <user_emotion>depression <therapist>...
Reference: What can I help you with today?...
Generated: You're just trying to get a clearer picture....
Full Response: You're just trying to get a clearer picture. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1250 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  57%|█████▋    | 8302/14642 [16:17<12:08,  8.70it/s]


ESCONV LIVE SAMPLE 8300/14642
Input: <problem>problems with friends <user>I'm just having a hard time dealing with hurtful things people have said to me. I f...
Reference: that is really hard. Are you able to tell these people how their words are making you feel?...
Generated: you're just having a hard time coping with hurtful things?...
Full Response: you're just having a hard time coping with hurtful things? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1379 | METEOR: 0.0291


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  57%|█████▋    | 8401/14642 [16:28<12:47,  8.13it/s]


ESCONV LIVE SAMPLE 8400/14642
Input: <problem>job crisis <user>I don't want to. If there is a vaccine in the first quarter of 2021 then the company I worked ...
Reference: you have determination! There is fire within! Tell me, what brings you joy?...
Generated: you're not sure you want to go back to your old job?...
Full Response: you're not sure you want to go back to your old job? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0157 | ROUGE-1: 0.1538 | METEOR: 0.0388


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  58%|█████▊    | 8500/14642 [16:39<09:27, 10.83it/s]


ESCONV LIVE SAMPLE 8500/14642
Input: <problem>ongoing depression <user>My oldest is sad my youngest 7 couldnt care less. He loves that he can just play roblo...
Reference: To be fair Roblox is pretty fun. Have you seen those videos on Facebook where family members visit t...
Generated: You're not sure...
Full Response: You're not sure <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0526 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  59%|█████▊    | 8602/14642 [16:53<10:44,  9.37it/s]


ESCONV LIVE SAMPLE 8600/14642
Input: <problem>ongoing depression <user>I have practiced breathing techniques and meditation before and think that is a great ...
Reference: Absolutely, and it's completely understandable, but it's very hard to wait. I hope that you are able...
Generated: you're afraid he'll become ill?...
Full Response: you're afraid he'll become ill? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0333 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  59%|█████▉    | 8701/14642 [17:05<13:00,  7.61it/s]


ESCONV LIVE SAMPLE 8700/14642
Input: <problem>problems with friends <user>I am not good . What about you? <user_emotion>depression <therapist>...
Reference: i am so sorry to hear that. what seems to be worrying you today?...
Generated: You're good. You're gonna be fine....
Full Response: You're good. You're gonna be fine. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0108 | ROUGE-1: 0.1818 | METEOR: 0.0379


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  60%|██████    | 8801/14642 [17:17<12:02,  8.09it/s]


ESCONV LIVE SAMPLE 8800/14642
Input: <problem>ongoing depression <user>Yeah, the anxiety of being trapped in my apartment has kind of gotten to me but I am d...
Reference: Yeah, socially distancing is quite the isolating experience so I can relate to how you are feeling....
Generated: Would you like me to talk to you about your anxiety?...
Full Response: Would you like me to talk to you about your anxiety? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0130 | ROUGE-1: 0.1429 | METEOR: 0.0610


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  61%|██████    | 8900/14642 [17:29<10:43,  8.93it/s]


ESCONV LIVE SAMPLE 8900/14642
Input: <problem>breakup with partner <user>It's good that you have had a similar experience and understand what I'm going throu...
Reference: Oh I see. That's a huge step to make in a relationship. So you are hesitant to break up with her bec...
Generated: You feel you're rejecting her?...
Full Response: You feel you're rejecting her? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1667 | METEOR: 0.0188


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  61%|██████▏   | 9001/14642 [17:40<10:03,  9.35it/s]


ESCONV LIVE SAMPLE 9000/14642
Input: <problem>ongoing depression <user>No! I keep hearing the negative cruel remarks being repeated in my head over and over ...
Reference: WHAT DO YOU HEAR?...
Generated: you're not telling me you're afraid?...
Full Response: you're not telling me you're afraid? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1667 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  62%|██████▏   | 9101/14642 [17:53<10:02,  9.20it/s]


ESCONV LIVE SAMPLE 9100/14642
Input: <problem>job crisis <user>Yes, sure. I work in an industry heavily impacted by COVID-19, and my company has announced pa...
Reference: Woah. Blessings to you for existing in what sounds like a stressful situation. How is self care goin...
Generated: you've been seeing this...
Full Response: you've been seeing this <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1600 | METEOR: 0.0272


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  63%|██████▎   | 9201/14642 [18:05<09:55,  9.13it/s]


ESCONV LIVE SAMPLE 9200/14642
Input: <problem>job crisis <user>Hi <user_emotion>sadness <therapist>...
Reference: Hi there, how are you today?...
Generated: You're sorry you didn't tell me?...
Full Response: You're sorry you didn't tell me? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0408 | ROUGE-1: 0.1429 | METEOR: 0.0833


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  64%|██████▎   | 9301/14642 [18:17<10:13,  8.70it/s]


ESCONV LIVE SAMPLE 9300/14642
Input: <problem>job crisis <user>I thought I had everything set up for one of my resident's at a new placement and I called tod...
Reference: I see. It seems as though you've done your best, but I understand your feelings. Is it possible you ...
Generated: You're afraid she'll let you go?...
Full Response: You're afraid she'll let you go? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0010 | ROUGE-1: 0.1081 | METEOR: 0.0194


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  64%|██████▍   | 9402/14642 [18:29<08:44, 10.00it/s]


ESCONV LIVE SAMPLE 9400/14642
Input: <problem>breakup with partner <user>I don't know if it was good. Maybe he was just being mean. It's nice of you to liste...
Reference: I know it's useless to tell you to not be sad. Go ahead and let it all out by crying or screaming bu...
Generated: You'll be back in April....
Full Response: You'll be back in April. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0004 | ROUGE-1: 0.1053 | METEOR: 0.0182


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  65%|██████▍   | 9501/14642 [18:40<09:04,  9.44it/s]


ESCONV LIVE SAMPLE 9500/14642
Input: <problem>ongoing depression <user>thanks. I hope you best of luck then with all your endeavours. I know we can all get t...
Reference: Right on, same to you. It's okay to feel anxious right now. It's okay to feel unproductive right now...
Generated: who you're traveling with....
Full Response: who you're traveling with. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0400 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  66%|██████▌   | 9600/14642 [18:52<09:19,  9.01it/s]


ESCONV LIVE SAMPLE 9600/14642
Input: <problem>ongoing depression <user>She did but not so great anymore. I sure hope so. It's been hard on everybody. especia...
Reference: Maybe she just needs some time to decompress. Being a teenager is a stressful time, especially this ...
Generated: You think she's......
Full Response: You think she's... <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0541 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  66%|██████▋   | 9701/14642 [19:04<11:27,  7.18it/s]


ESCONV LIVE SAMPLE 9700/14642
Input: <problem>breakup with partner <user>I am not feeling very positive tonight but I think you are right. I think I reached ...
Reference: Would you feel comfortable giving me a little more detail as far as you problem with self abuse? wha...
Generated: What do you think you're saying to somebody...
Full Response: What do you think you're saying to somebody <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0033 | ROUGE-1: 0.1765 | METEOR: 0.0429


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  67%|██████▋   | 9801/14642 [19:16<09:39,  8.36it/s]


ESCONV LIVE SAMPLE 9800/14642
Input: <problem>ongoing depression <user>It's fine, no need to apologize <user_emotion>anxiety <therapist>...
Reference: Have you had any chance for ME Time where husband friend or family could watch your kids and give mo...
Generated: you can just... you can just......
Full Response: you can just... you can just... <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0003 | ROUGE-1: 0.0465 | METEOR: 0.0152


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  68%|██████▊   | 9901/14642 [19:28<07:55,  9.98it/s]


ESCONV LIVE SAMPLE 9900/14642
Input: <problem>ongoing depression <user>But when i see social media and News every day it increasing my stress level <user_emo...
Reference: I live alone as well and also need to take care of myself. It can be so hard at times....
Generated: You're worried about your Stress?...
Full Response: You're worried about your Stress? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  68%|██████▊   | 10001/14642 [19:40<08:44,  8.85it/s]


ESCONV LIVE SAMPLE 10000/14642
Input: <problem>ongoing depression <user>I feel somewhat better just thin I feel a little better now. I look forward to talking...
Reference: Be creative! Get silly! most important have fun! this is a hard year but its still a chance to make ...
Generated: You wanna talk about Zoom?...
Full Response: You wanna talk about Zoom? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0246


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  69%|██████▉   | 10102/14642 [19:51<08:14,  9.18it/s]


ESCONV LIVE SAMPLE 10100/14642
Input: <problem>breakup with partner <user>I just need to schedule an appointment with someone and go in and meet them face to ...
Reference: It is great that you can identify what needs to be done in this situation. It sounds like you are re...
Generated: What would you like to meet them face to face?...
Full Response: What would you like to meet them face to face? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0013 | ROUGE-1: 0.1961 | METEOR: 0.0660


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  70%|██████▉   | 10201/14642 [20:03<08:17,  8.93it/s]


ESCONV LIVE SAMPLE 10200/14642
Input: <problem>ongoing depression <user>I'm so sorry that it's a situation that you can relate to, its such a hard situation w...
Reference: Wow, that really is a lot to deal with. It seems as though you have a good understanding of what you...
Generated: What do you wanna say to that?...
Full Response: What do you wanna say to that? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0941 | METEOR: 0.0214


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  70%|███████   | 10301/14642 [20:15<11:19,  6.39it/s]


ESCONV LIVE SAMPLE 10300/14642
Input: <problem>breakup with partner <user>Thanks' so much , you lifted my mood , you being really helpful <user_emotion>depres...
Reference: No problem at all! Sending positive vibes your way!! You can get through this!!...
Generated: You're really glad I didn't feel like you lifted my mood....
Full Response: You're really glad I didn't feel like you lifted my mood. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0741 | METEOR: 0.0365


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  71%|███████   | 10402/14642 [20:27<06:59, 10.10it/s]


ESCONV LIVE SAMPLE 10400/14642
Input: <problem>ongoing depression <user>Because prior to the pandemic, I was pretty regimented with set schedules and can clea...
Reference: Is there anything else I can help you with?...
Generated: you're not feeling this...
Full Response: you're not feeling this <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1429 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  72%|███████▏  | 10501/14642 [20:39<09:18,  7.42it/s]


ESCONV LIVE SAMPLE 10500/14642
Input: <problem>academic pressure <user>I think that's pretty good advice. I know which room I can do this, but just away from ...
Reference: Of course! Is anything else on your mind?...
Generated: you're quite nervous...
Full Response: you're quite nervous <therapist_emotion> neutral<eos>...
Ref Emotion: nervousness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  72%|███████▏  | 10601/14642 [20:51<08:55,  7.54it/s]


ESCONV LIVE SAMPLE 10600/14642
Input: <problem>ongoing depression <user>Well the boyfriend for awhile the other thing for a day or two I got a hold of a thera...
Reference: that's very good that you are going to talk to a therapist. you showed great strength by reaching ou...
Generated: You didn't want him to talk to you for two days?...
Full Response: You didn't want him to talk to you for two days? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0484 | ROUGE-1: 0.3125 | METEOR: 0.2451


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  73%|███████▎  | 10701/14642 [21:03<07:04,  9.29it/s]


ESCONV LIVE SAMPLE 10700/14642
Input: <problem>Alcohol Abuse <user>Yes, I sure hope so. Thank you have a good day! <user_emotion>shame <therapist>...
Reference: You as well!...
Generated: you're late....
Full Response: you're late. <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.3333 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  74%|███████▍  | 10801/14642 [21:16<07:56,  8.06it/s]


ESCONV LIVE SAMPLE 10800/14642
Input: <problem>Sleep Problems <user>I'm looking forward to start a own business. Yes i hope so. i'll start planning <user_emot...
Reference: Oh that's exciting! What sort of business?...
Generated: You're going to start a business?...
Full Response: You're going to start a business? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0346 | ROUGE-1: 0.1333 | METEOR: 0.0725


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  74%|███████▍  | 10901/14642 [21:28<07:57,  7.84it/s]


ESCONV LIVE SAMPLE 10900/14642
Input: <problem>Procrastination <user>OK, so with, well, kinf0f like baby steps....right <user_emotion>anxiety <therapist>...
Reference: It is about making small adjustments to feed a good habit....
Generated: You're afraid that you're going to kang...
Full Response: You're afraid that you're going to kang  <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0187 | ROUGE-1: 0.1000 | METEOR: 0.0472


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  75%|███████▌  | 11001/14642 [21:39<07:13,  8.40it/s]


ESCONV LIVE SAMPLE 11000/14642
Input: <problem>Sleep Problems <user>Bye <user_emotion>anxiety <therapist>...
Reference: (please click the quit button)...
Generated: So you're afraid she's gonna......
Full Response: So you're afraid she's gonna... <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  76%|███████▌  | 11101/14642 [21:52<06:30,  9.08it/s]


ESCONV LIVE SAMPLE 11100/14642
Input: <problem>Alcohol Abuse <user>Work has been stressful, but I feel like was before things got bad. Most nights I come home...
Reference: What do you do for work?...
Generated: Do you drink?...
Full Response: Do you drink? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0418 | ROUGE-1: 0.4444 | METEOR: 0.1754


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  76%|███████▋  | 11201/14642 [22:03<06:34,  8.73it/s]


ESCONV LIVE SAMPLE 11200/14642
Input: <problem>Sleep Problems <user>That's a good idea. I could try go out for a walk either before or after work as a startin...
Reference: That sounds like a good plan! It also helps to make sure that it’s dark and very quiet. Surprisingly...
Generated: Would you like to walk?...
Full Response: Would you like to walk? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0001 | ROUGE-1: 0.0889 | METEOR: 0.0281


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  77%|███████▋  | 11301/14642 [22:15<05:55,  9.40it/s]


ESCONV LIVE SAMPLE 11300/14642
Input: <problem>Sleep Problems <user>I'm looking forward to a fulls night rest. <user_emotion>anxiety <therapist>...
Reference: That is good, lavender baths or showers will also ease and relax the troubled mind....
Generated: What do you wanna do?...
Full Response: What do you wanna do? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  78%|███████▊  | 11401/14642 [22:27<06:36,  8.16it/s]


ESCONV LIVE SAMPLE 11400/14642
Input: <problem>Appearance Anxiety <user>mind u i was 125 and fit for the 10 years we were together before we had kids like a d...
Reference: I talked to a nutritionist and it really helped. They are professionals who can look into your diet ...
Generated: You're not in our lives?...
Full Response: You're not in our lives? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  79%|███████▊  | 11502/14642 [22:39<05:26,  9.61it/s]


ESCONV LIVE SAMPLE 11500/14642
Input: <problem>Appearance Anxiety <user>Yes. I mentioned it to them at my 6 week pp check up. I was prescribed to a med to hel...
Reference: Antidepressants can take a couple of weeks to kick in, I am on them too; I hope you will find them h...
Generated: Do you think you've gotten the hang of it yet?...
Full Response: Do you think you've gotten the hang of it yet? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0038 | ROUGE-1: 0.1463 | METEOR: 0.0554


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  79%|███████▉  | 11600/14642 [22:50<05:16,  9.60it/s]


ESCONV LIVE SAMPLE 11600/14642
Input: <problem>Appearance Anxiety <user>I hate everything when I see my reflection. I just feel fat and ugly <user_emotion>dis...
Reference: Well, I have been overweight and I know it sucks. Perhaps you could join a weight loss group?...
Generated: You're so fat and ugly?...
Full Response: You're so fat and ugly? <therapist_emotion> neutral<eos>...
Ref Emotion: disgust | Pred Emotion: neutral | Match: ✗
BLEU: 0.0040 | ROUGE-1: 0.1667 | METEOR: 0.0299


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  80%|███████▉  | 11701/14642 [23:03<05:37,  8.72it/s]


ESCONV LIVE SAMPLE 11700/14642
Input: <problem>job crisis <user>Yes that is a very good summary of what is going on. And I am glad to see you can relate. Whil...
Reference: Are there other jobs available in your area that you could apply for that are more up your alley?...
Generated: you're so depressed....
Full Response: you're so depressed. <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0870 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  81%|████████  | 11801/14642 [23:14<05:51,  8.08it/s]


ESCONV LIVE SAMPLE 11800/14642
Input: <problem>job crisis <user>SOMEWHERE SCARED AND CRYING IN THE RAIN <user_emotion>fear <therapist>...
Reference: Is it cold up there?...
Generated: You're so scared?...
Full Response: You're so scared? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  81%|████████▏ | 11901/14642 [23:26<04:57,  9.22it/s]


ESCONV LIVE SAMPLE 11900/14642
Input: <problem>problems with friends <user>i was having an affair and i confided in my best friend about it. i found out that ...
Reference: Your friend violated a confidence. That sounds difficult....
Generated: what did you find out?...
Full Response: what did you find out? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  82%|████████▏ | 12001/14642 [23:38<05:31,  7.97it/s]


ESCONV LIVE SAMPLE 12000/14642
Input: <problem>ongoing depression <user>that sounds weird not what you said I referring to what I said I think it was weird ri...
Reference: What do you think sounds weird?...
Generated: you're right....
Full Response: you're right. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.2222 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  83%|████████▎ | 12101/14642 [23:50<05:34,  7.59it/s]


ESCONV LIVE SAMPLE 12100/14642
Input: <problem>problems with friends <user>Today those things have gone and all are fine. When we happen to think of the old e...
Reference: So you are angry about what happen in the past...
Generated: you're not sure you want to feel happy?...
Full Response: you're not sure you want to feel happy? <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0216 | ROUGE-1: 0.1053 | METEOR: 0.0510


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  83%|████████▎ | 12202/14642 [24:02<04:46,  8.52it/s]


ESCONV LIVE SAMPLE 12200/14642
Input: <problem>job crisis <user>thanks for trying to understand it just seems pretty hopeless <user_emotion>anxiety <therapist...
Reference: No problem. I am encountering a similar situation, looking for a well suited job....
Generated: you're really struggling?...
Full Response: you're really struggling? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  84%|████████▍ | 12301/14642 [24:14<05:38,  6.91it/s]


ESCONV LIVE SAMPLE 12300/14642
Input: <problem>breakup with partner <user>Thank you for listening and giving good advice. You are really good at giving suppor...
Reference: I am glad that you find it helpful. I wish I can be around to hear more about it....
Generated: You're really good at listening to your patients?...
Full Response: You're really good at listening to your patients? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0070 | ROUGE-1: 0.1429 | METEOR: 0.0559


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  85%|████████▍ | 12402/14642 [24:26<03:28, 10.74it/s]


ESCONV LIVE SAMPLE 12400/14642
Input: <problem>academic pressure <user>I just don't feel motivated to study <user_emotion>shame <therapist>...
Reference: from today please feel yourself that you know everything about your studies , i think then you fell ...
Generated: you. You're not motivated to study?...
Full Response: you. You're not motivated to study? <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.2400 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  85%|████████▌ | 12501/14642 [24:37<04:05,  8.71it/s]


ESCONV LIVE SAMPLE 12500/14642
Input: <problem>breakup with partner <user>We met on AIM, Her ex is a registered sex offender. and it makes me sad that she lef...
Reference: Sometimes we just have to learn to let go *nods sententiously*...
Generated: You think she was trying to get you?...
Full Response: You think she was trying to get you? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0191 | ROUGE-1: 0.1053 | METEOR: 0.0935


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  86%|████████▌ | 12600/14642 [24:49<03:48,  8.94it/s]


ESCONV LIVE SAMPLE 12600/14642
Input: <problem>job crisis <user>We, I and my wife, also think the same way. Even if takes some time, better to wait. But we we...
Reference: That's great! I'm glad I was able to help....
Generated: We'd like to talk about this?...
Full Response: We'd like to talk about this? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0248 | ROUGE-1: 0.1111 | METEOR: 0.0575


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  87%|████████▋ | 12701/14642 [25:01<04:17,  7.55it/s]


ESCONV LIVE SAMPLE 12700/14642
Input: <problem>breakup with partner <user>sure, you too <user_emotion>anxiety <therapist>...
Reference: thank you...
Generated: you're anxious. You're anxious about your sister?...
Full Response: you're anxious. You're anxious about your sister? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1818 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  87%|████████▋ | 12801/14642 [25:13<03:07,  9.82it/s]


ESCONV LIVE SAMPLE 12800/14642
Input: <problem>breakup with partner <user>its hard to plan a future when i am grieving....i don't know how to make myself happ...
Reference: don't be sad because of past...
Generated: you're......
Full Response: you're... <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  88%|████████▊ | 12901/14642 [25:26<03:57,  7.32it/s]


ESCONV LIVE SAMPLE 12900/14642
Input: <problem>job crisis <user>No I haven't yet. But I was thinking about it actually. Do you think that would be weird to do...
Reference: I think that would be a good next step. Communication is always helpful. I think your work will appr...
Generated: I don't think it's unusual....
Full Response: I don't think it's unusual. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0014 | ROUGE-1: 0.1290 | METEOR: 0.0452


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  89%|████████▉ | 13000/14642 [25:38<03:28,  7.88it/s]


ESCONV LIVE SAMPLE 13000/14642
Input: <problem>academic pressure <user>Thanks, that will be helpful to decide on the programs I want to apply to Are you based...
Reference: You welcome. and yes...
Generated: I'm not sure that you want me to be your doctor....
Full Response: I'm not sure that you want me to be your doctor. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1250 | METEOR: 0.1064


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  89%|████████▉ | 13101/14642 [25:49<02:46,  9.24it/s]


ESCONV LIVE SAMPLE 13100/14642
Input: <problem>academic pressure <user>What kind of advise should I be seeking from the counsellor? <user_emotion>anxiety <the...
Reference: Well, perhaps they can at least alleviate your anxiety somewhat. Or maybe even get you a sick note t...
Generated: You're anxious....
Full Response: You're anxious. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0606 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  90%|█████████ | 13201/14642 [26:01<02:31,  9.49it/s]


ESCONV LIVE SAMPLE 13200/14642
Input: <problem>ongoing depression <user>You met with your friends? I've tried meeting with my friends. I feel ok when I'm with...
Reference: yes, you have to be positive at all time and always do something that makes you happy...
Generated: You wanna talk about yourself?...
Full Response: You wanna talk about yourself? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0909 | METEOR: 0.0316


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  91%|█████████ | 13301/14642 [26:13<03:04,  7.27it/s]


ESCONV LIVE SAMPLE 13300/14642
Input: <problem>breakup with partner <user>Thanks! You’ve been really helpful today. I can’t thank you enough for your encourag...
Reference: It was no problem! Thank you for sharing and talking it out!...
Generated: You've been very kind to me....
Full Response: You've been very kind to me. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1053 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  92%|█████████▏| 13402/14642 [26:25<02:21,  8.78it/s]


ESCONV LIVE SAMPLE 13400/14642
Input: <problem>ongoing depression <user>I actually do yoga every evening. She hates it. I'm living in her home without a lease...
Reference: As long as you have property there she would have to formally evict you. That is the good news....
Generated: You're afraid that she'll just leave you?...
Full Response: You're afraid that she'll just leave you? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.2857 | METEOR: 0.0562


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  92%|█████████▏| 13501/14642 [26:37<02:21,  8.07it/s]


ESCONV LIVE SAMPLE 13500/14642
Input: <problem>academic pressure <user>should i go to my school councellor <user_emotion>anxiety <therapist>...
Reference: Yes, that is a great idea. Have you ever joined a study group? A group of peers can also be helpful....
Generated: who you wanna go to?...
Full Response: who you wanna go to? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0022 | ROUGE-1: 0.0769 | METEOR: 0.0258


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  93%|█████████▎| 13601/14642 [26:49<01:58,  8.82it/s]


ESCONV LIVE SAMPLE 13600/14642
Input: <problem>job crisis <user>At this point, I don't know. I have thought about doing art commissions and selling them onlin...
Reference: That sounds like a fantastic idea, and even if there is competition, there are still a lot of people...
Generated: I know you're anxious....
Full Response: I know you're anxious. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  94%|█████████▎| 13701/14642 [27:01<01:36,  9.77it/s]


ESCONV LIVE SAMPLE 13700/14642
Input: <problem>ongoing depression <user>Yes you are correct the problems with the bank how do i solve not to sink into depress...
Reference: So, I have also struggled with that in the past. The way that really helped me put things into persp...
Generated: you......
Full Response: you... <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0588 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  94%|█████████▍| 13802/14642 [27:14<01:26,  9.71it/s]


ESCONV LIVE SAMPLE 13800/14642
Input: <problem>academic pressure <user>Kind of, but the schedule is a bit weird, like every other day I only have an afternoon...
Reference: it sounds like your schedule is different almost everyday. I know I find schedule changes like that ...
Generated: You've been in this situation before?...
Full Response: You've been in this situation before? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0245


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  95%|█████████▍| 13900/14642 [27:26<01:39,  7.43it/s]


ESCONV LIVE SAMPLE 13900/14642
Input: <problem>breakup with partner <user>We talked about it briefly, but havent spoke about in great detail. I'm just scared,...
Reference: How much of this distance between you do you think is related to COVID measures?...
Generated: What do you think we're gonna do if you do?...
Full Response: What do you think we're gonna do if you do? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0681 | ROUGE-1: 0.3077 | METEOR: 0.1379


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  96%|█████████▌| 14001/14642 [27:37<01:09,  9.22it/s]


ESCONV LIVE SAMPLE 14000/14642
Input: <problem>job crisis <user>Yes. i plan on doing that this afternoon. Thank you so much. I am feeling way more at ease. I ...
Reference: I hope your day goes great as well...
Generated: I appreciate it....
Full Response: I appreciate it. <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0215 | ROUGE-1: 0.1818 | METEOR: 0.0667


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  96%|█████████▋| 14101/14642 [27:50<00:55,  9.82it/s]


ESCONV LIVE SAMPLE 14100/14642
Input: <problem>breakup with partner <user>its gone, she left me <user_emotion>sadness <therapist>...
Reference: That's awful. I'm so sorry to hear that. How did it happen?...
Generated: You've left her?...
Full Response: You've left her? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  97%|█████████▋| 14202/14642 [28:02<00:49,  8.90it/s]


ESCONV LIVE SAMPLE 14200/14642
Input: <problem>academic pressure <user>Thank you so much for your insight. I will be praying for a good outcome. <user_emotion...
Reference: you will get it for sure! is there anything else I can help you with?...
Generated: I'll be praying for a good outcome....
Full Response: I'll be praying for a good outcome. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0105 | ROUGE-1: 0.1739 | METEOR: 0.0352


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  98%|█████████▊| 14302/14642 [28:14<00:37,  9.17it/s]


ESCONV LIVE SAMPLE 14300/14642
Input: <problem>problems with friends <user>That's something you would have to ask him, but I feel like I am at peace for now. ...
Reference: Yeah that a good way to look at it. Maybe take your mind of it by occupying yourself...
Generated: Do you feel that you're at peace for now?...
Full Response: Do you feel that you're at peace for now? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0105 | ROUGE-1: 0.1429 | METEOR: 0.0877


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  98%|█████████▊| 14400/14642 [28:25<00:24, 10.02it/s]


ESCONV LIVE SAMPLE 14400/14642
Input: <problem>academic pressure <user>I'm feeling really anxious these days. I'm finding the COVID online learning experience...
Reference: I understand your frustration. All of us are challenged due to COVID....
Generated: you're feeling really anxious?...
Full Response: you're feeling really anxious? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000:  99%|█████████▉| 14501/14642 [28:37<00:18,  7.59it/s]


ESCONV LIVE SAMPLE 14500/14642
Input: <problem>breakup with partner <user>Yeah she was a little violent. I was honestly a little scared for my life for a seco...
Reference: Have you talked since then?...
Generated: Did she say she was a little scared for your life?...
Full Response: Did she say she was a little scared for your life? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000: 100%|█████████▉| 14601/14642 [28:49<00:04, 10.07it/s]


ESCONV LIVE SAMPLE 14600/14642
Input: <problem>job crisis <user>Yes It is frustrating. When I think about the financial issues I feel depressed <user_emotion>...
Reference: I compleyely understand I have also lost my career due to covid. Dont give up, staying positive and ...
Generated: You're depressed?...
Full Response: You're depressed? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0645 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.000: 100%|██████████| 14642/14642 [28:53<00:00,  8.45it/s]


ESCONV EVALUATION COMPLETED!
Processed 14642 samples
Average BLEU: 0.0042
Average ROUGE-1: 0.1110
Emotion Accuracy: 0.0000
Emotion Tag Coverage: 0.9992

ESCONV THERAPY MODEL EVALUATION RESULTS
Model Type: RL
Output Format: STANDARD

Text Generation Metrics:
  BLEU Score:     0.0042
  ROUGE-1:        0.1110
  ROUGE-2:        0.0092
  ROUGE-L:        0.1002
  METEOR:         0.0290

Emotion Prediction:
  Emotion Accuracy:    0.0000
  Emotion Tag Coverage: 0.9992

Dataset Info:
  Total Samples:  14642

Emotion Confusion Matrix:
  anxiety:
    -> neutral: 3907
    -> emotional: 2
    -> you're: 14
    -> of: 1
    -> anger?: 1
    -> no_prediction: 4
    -> parent?: 1
    -> you: 1
    -> if: 1
    -> anxiety.: 1
  anger:
    -> neutral: 1255
    -> you're: 1
    -> anger?: 1
  fear:
    -> neutral: 1048
    -> <therapist_emotion>: 1
  depression:
    -> neutral: 3816
    -> no_prediction: 4
    -> cause: 2
    -> s?: 1
    -> <therapist_emotion>: 1
    -> anger.: 1
    -> will: 1
    -> 




SFT

In [None]:
import json
import re
import torch
from transformers import GPT2Tokenizer
from datasets import Dataset as HFDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import numpy as np
from collections import defaultdict

# Ensure the punkt tokenizer data and the WordNet corpus are available
# locally; each one is downloaded only when the local lookup fails.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

# ====================== Text Cleaning Functions ======================
def clean_therapy_text(text):
    """Normalize the whitespace of a text snippet.

    Falsy input (``None``, an empty string, ...) yields ``""``. Otherwise
    every run of whitespace, newlines included, is collapsed to a single
    space and leading/trailing whitespace is removed.
    """
    if text:
        # Newlines are whitespace too, so one pass over whitespace runs
        # flattens line breaks and squeezes repeated spaces at once.
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""

# ====================== ESConv Dataset Processing ======================
class ESConvEvaluationDataset:
    """Turn ESConv-style conversations into (prompt, reference) evaluation samples.

    The JSON file holds either one conversation dict or a list of them. Each
    conversation is read for ``problem_type``, ``emotion_type`` and a
    ``dialog`` list whose turns carry ``speaker`` and ``content``. Consecutive
    seeker turns are merged into one user message, and every supporter turn
    that follows at least one seeker turn produces one evaluation sample.
    Supporter turns with no pending seeker text are skipped.
    """

    def __init__(self, json_path, tokenizer_path=None):
        """Load the conversations, attach a tokenizer and build the samples.

        Args:
            json_path: Path to the ESConv JSON file (a dict or a list of dicts).
            tokenizer_path: Argument for ``GPT2Tokenizer.from_pretrained``;
                when falsy, the stock ``'gpt2'`` tokenizer is loaded instead.
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # A single conversation object is wrapped so the rest of the class can
        # always iterate over a list.
        self._data = [data] if isinstance(data, dict) else data

        if tokenizer_path:
            self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
        else:
            # Use default tokenizer if path not provided
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained('gpt2')

        self.evaluation_data = []
        self._process_esconv_data()

    @staticmethod
    def _text_field(mapping, key):
        """Return ``mapping[key]`` stripped; a missing or null value becomes ''.

        ``dict.get(key, "")`` only falls back for *missing* keys, so an
        explicit JSON ``null`` would otherwise reach ``.strip()`` as ``None``.
        """
        return (mapping.get(key) or "").strip()

    @staticmethod
    def _build_input_text(problem_type, user_text, emotion_type):
        """Assemble the tagged prompt; empty problem/emotion tags are omitted."""
        input_parts = []
        if problem_type:
            input_parts.append(f"<problem>{problem_type}")

        input_parts.append(f"<user>{user_text}")

        if emotion_type:
            input_parts.append(f"<user_emotion>{emotion_type}")

        input_parts.append("<therapist>")
        return " ".join(input_parts)

    def _process_esconv_data(self):
        """Walk every dialog and append one sample per answered seeker message."""
        print("Processing ESConv data...")

        for conv_idx, conv in enumerate(tqdm(self._data, desc="Processing conversations")):
            problem_type = self._text_field(conv, "problem_type")
            emotion_type = self._text_field(conv, "emotion_type")
            dialog = conv.get("dialog", [])

            if not dialog:
                continue

            # Echo the metadata of the first three conversations as a sanity check.
            if conv_idx < 3:
                print(f"\nProcessing conversation {conv_idx + 1}:")
                print(f"  Problem type: {problem_type}")
                print(f"  Emotion type: {emotion_type}")
                print(f"  Dialog length: {len(dialog)} turns")

            user_messages = []  # Seeker turns waiting for a supporter reply

            for turn_idx, turn in enumerate(dialog):
                speaker = self._text_field(turn, "speaker")
                content = clean_therapy_text(turn.get("content", ""))

                if not content:
                    continue

                if speaker == "seeker":
                    user_messages.append(content)

                elif speaker == "supporter":
                    if not user_messages:
                        # Nothing from the seeker to respond to: no sample.
                        continue

                    # Combine all user messages for this interaction
                    combined_user_text = " ".join(user_messages)
                    therapist_response = content
                    input_text = self._build_input_text(
                        problem_type, combined_user_text, emotion_type
                    )

                    self.evaluation_data.append({
                        'input_text': input_text,
                        'reference_text': therapist_response,
                        'reference_emotion': emotion_type,  # Use conversation-level emotion
                        'user_input': combined_user_text,
                        'user_emotion': emotion_type,
                        'problem_type': problem_type,
                        'conversation_id': conv_idx,
                        'turn_id': turn_idx
                    })

                    # Show first few examples
                    if len(self.evaluation_data) <= 3:
                        print(f"\n  Sample {len(self.evaluation_data)}:")
                        print(f"    Input: {input_text[:100]}...")
                        print(f"    Reference: {therapist_response[:100]}...")

                    # The pending seeker text has been consumed by this reply.
                    user_messages = []

        print(f"\nProcessed {len(self.evaluation_data)} evaluation samples from {len(self._data)} conversations")

    def get_evaluation_data(self):
        """Return the list of evaluation sample dicts built at construction."""
        return self.evaluation_data

    def print_sample_formats(self, num_samples=3):
        """Print sample input/output formats for verification"""
        print(f"\n{'='*80}")
        print("ESCONV SAMPLE INPUT/OUTPUT FORMATS")
        print('='*80)

        for i, sample in enumerate(self.evaluation_data[:num_samples]):
            print(f"\nSample {i+1}:")
            print(f"  Problem Type: {sample['problem_type']}")
            print(f"  User Emotion: {sample['user_emotion']}")
            print(f"  Input Format: {sample['input_text'][:150]}...")
            print(f"  Reference Response: {sample['reference_text'][:100]}...")
            print(f"  User Input Only: {sample['user_input'][:100]}...")

# ====================== Model Format Detection ======================
def detect_model_format(model, tokenizer, device='cuda', test_inputs=None):
    """
    Probe the model with a few prompts and report whether it emits the
    ``<therapist_emotion>`` tag (both SFT and RL models are expected to).

    Generation uses sampling (``do_sample=True``), so the outcome for a
    borderline model may vary between calls.

    Args:
        model: The model to test; switched to eval mode by this function
        tokenizer: Model tokenizer
        device: Device to run test on
        test_inputs: List of test input strings, uses defaults if None

    Returns:
        str: 'standard' if at least half of the probes contain
        <therapist_emotion>, 'unknown' otherwise (including when
        ``test_inputs`` is empty, since nothing was observed)
    """
    if test_inputs is None:
        test_inputs = [
            "<problem>anxiety <user>I'm worried about work <user_emotion>anxiety <therapist>",
            "<problem>depression <user>I feel very sad <user_emotion>sadness <therapist>",
            "<problem>relationship <user>My partner doesn't understand me <user_emotion>anger <therapist>"
        ]

    emotion_tag_count = 0
    total_tests = len(test_inputs)

    model.eval()
    with torch.no_grad():
        for test_input in test_inputs:
            # Tokenize and generate
            input_ids = tokenizer.encode(test_input, return_tensors='pt').to(device)
            outputs = model.generate(
                input_ids,
                max_new_tokens=128,
                do_sample=True,
                top_p=0.6,
                top_k=30,
                temperature=1.0,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

            # Keep only the continuation: drop the echoed prompt tokens.
            new_tokens = outputs[0][len(input_ids[0]):]
            # Special tokens are kept so the emotion tag stays visible.
            response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

            if '<therapist_emotion>' in response:
                emotion_tag_count += 1

    # Without any probe there is no evidence either way; previously the
    # comparison ``0 >= 0`` reported 'standard' here.
    if total_tests == 0:
        return 'unknown'

    # At least 50% of the probes must carry the emotion tag.
    if emotion_tag_count >= total_tests * 0.5:
        return 'standard'
    return 'unknown'

# ====================== Model Output Extraction ======================
def extract_model_output(response):
    """
    Extract therapist text and emotion from model output.
    Both SFT and RL models use format: 'text <therapist_emotion> emotion<eos>'

    The emotion label sits right before ``<eos>``, so when the model emits the
    tag more than once the label is read after the LAST tag that is followed
    by a word. Reading after the first tag would return the first word of
    stray text between tags. Surrounding punctuation is stripped from the
    label ("anger?" -> "anger").

    Returns:
        therapist_text (str): Text before the first <therapist_emotion>
        emotion (str): Lower-cased emotion word, "" if none was found
        has_emotion_tag (bool): Whether emotion tag was found
    """
    eos_marker = '<eos>'
    emotion_tag = '<therapist_emotion>'

    response = response.strip()

    # Remove a trailing <eos> if present
    if response.endswith(eos_marker):
        response = response[:-len(eos_marker)].strip()

    therapist_text, found_tag, remainder = response.partition(emotion_tag)
    if not found_tag:
        return response, "", False

    emotion = ""
    # Walk the tag-separated segments from the end so the final label wins,
    # while still tolerating a dangling tag with nothing after it.
    for segment in reversed(remainder.split(emotion_tag)):
        words = segment.split()
        if words:
            emotion = words[0].lower().strip('.,;:!?"\'()[]{}')
            break

    return therapist_text.strip(), emotion, True

# ====================== Evaluation Metrics ======================
class TherapyEvaluationMetrics:
    """BLEU, ROUGE and METEOR scoring for a single reference/candidate pair.

    Every method returns zero score(s) when either string is empty, so
    callers can score raw model output without pre-checking it.
    """

    _ROUGE_KEYS = ('rouge1', 'rouge2', 'rougeL')

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        # NLTK smoothing "method1", passed to sentence_bleu on every call.
        self.smoothing = SmoothingFunction().method1

    def compute_bleu(self, reference, candidate):
        """Return sentence-level BLEU over whitespace tokens (0.0 if either side is empty)."""
        if candidate and reference:
            ref_words = reference.split()
            cand_words = candidate.split()
            # A whitespace-only candidate is truthy but has no tokens.
            if cand_words:
                return sentence_bleu([ref_words], cand_words, smoothing_function=self.smoothing)
        return 0.0

    def compute_rouge(self, reference, candidate):
        """Return a dict with the ROUGE-1 / ROUGE-2 / ROUGE-L F-measures."""
        if not (candidate and reference):
            return {key: 0.0 for key in self._ROUGE_KEYS}

        scored = self.rouge_scorer.score(reference, candidate)
        return {key: scored[key].fmeasure for key in self._ROUGE_KEYS}

    def compute_meteor(self, reference, candidate):
        """Return METEOR over whitespace tokens (0.0 if either side is empty)."""
        if candidate and reference:
            ref_words = reference.split()
            cand_words = candidate.split()
            # A whitespace-only candidate is truthy but has no tokens.
            if cand_words:
                return meteor_score([ref_words], cand_words)
        return 0.0

# ====================== Model Evaluation ======================
def evaluate_model_esconv(model, tokenizer, evaluation_dataset, device='cuda', max_new_tokens=128,
                         top_p=0.6, top_k=30, temperature=1.0, do_sample=True):
    """
    Evaluate model on ESConv therapy dataset

    For every sample the prompt is fed to ``model.generate``, the continuation
    is split into therapist text and predicted emotion, and BLEU / ROUGE /
    METEOR plus an exact-match emotion check are recorded.

    Args:
        model: Trained model (SFT or RL)
        tokenizer: Model tokenizer
        evaluation_dataset: ESConvEvaluationDataset instance
        device: Device to run evaluation on
        max_new_tokens: Maximum tokens to generate
        top_p: Top-p sampling parameter
        top_k: Top-k sampling parameter
        temperature: Sampling temperature
        do_sample: Whether to use sampling

    Returns:
        Dictionary containing evaluation results: the per-sample score lists,
        the first 10 ``sample_outputs``, a ``summary`` dict of averages,
        ``emotion_confusion_matrix`` and ``detected_format``
    """
    metrics_computer = TherapyEvaluationMetrics()
    evaluation_data = evaluation_dataset.get_evaluation_data()

    results = {
        'bleu_scores': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'meteor_scores': [],
        'emotion_accuracy': [],
        'has_emotion_tag': [],
        'sample_outputs': []
    }

    # reference emotion -> predicted emotion -> count
    emotion_confusion = defaultdict(lambda: defaultdict(int))

    # Running totals for the progress-bar averages. Re-averaging the full
    # score lists on every iteration would make the loop quadratic.
    bleu_running_total = 0.0
    emotion_correct_count = 0

    model.eval()

    # Detect model output format
    detected_format = detect_model_format(model, tokenizer, device)
    print(f"Output format check: {detected_format.upper()}")

    print(f"Evaluating model on {len(evaluation_data)} ESConv samples...")

    # Create progress bar with custom format
    progress_bar = tqdm(evaluation_data, desc="Evaluating ESConv",
                       bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')

    for i, sample in enumerate(progress_bar):
        input_text = sample['input_text']
        reference_text = sample['reference_text']
        reference_emotion = sample['reference_emotion']

        # Tokenize input
        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode only the continuation; special tokens are kept so the
        # emotion tag can be parsed out of the response.
        new_tokens = outputs[0][len(input_ids[0]):]
        response = tokenizer.decode(new_tokens, skip_special_tokens=False).strip()

        # Extract model output
        predicted_text, predicted_emotion, has_emotion = extract_model_output(response)

        # Compute text generation metrics
        bleu = metrics_computer.compute_bleu(reference_text, predicted_text)
        rouge = metrics_computer.compute_rouge(reference_text, predicted_text)
        meteor = metrics_computer.compute_meteor(reference_text, predicted_text)

        # Compute emotion accuracy (case-insensitive exact match)
        emotion_correct = (predicted_emotion.lower() == reference_emotion.lower()) if has_emotion else False

        # Store results
        results['bleu_scores'].append(bleu)
        results['rouge1_scores'].append(rouge['rouge1'])
        results['rouge2_scores'].append(rouge['rouge2'])
        results['rougeL_scores'].append(rouge['rougeL'])
        results['meteor_scores'].append(meteor)
        results['emotion_accuracy'].append(emotion_correct)
        results['has_emotion_tag'].append(has_emotion)

        bleu_running_total += bleu
        emotion_correct_count += emotion_correct

        # Update confusion matrix
        if has_emotion:
            emotion_confusion[reference_emotion.lower()][predicted_emotion.lower()] += 1
        else:
            emotion_confusion[reference_emotion.lower()]['no_prediction'] += 1

        # Store sample outputs for inspection
        if i < 10:  # Store first 10 samples
            results['sample_outputs'].append({
                'input': input_text,
                'reference_text': reference_text,
                'predicted_text': predicted_text,
                'reference_emotion': reference_emotion,
                'predicted_emotion': predicted_emotion,
                'full_response': response,
                'bleu': bleu,
                'rouge1': rouge['rouge1'],
                'meteor': meteor
            })

        if (i + 1) % 20 == 0 or i < 5:
            sample_info = f"Sample {i+1}: BLEU={bleu:.3f}, Emotion={'✓' if emotion_correct else '✗'}"
            progress_bar.set_description(f"Evaluating ESConv - {sample_info}")

            # NOTE: this check is nested inside the "% 20" branch, so after the
            # first three samples the detailed dump fires only when (i + 1) is a
            # multiple of both 20 and 50, i.e. every 100 samples.
            if (i + 1) % 50 == 0 or i < 3:
                print("\n" + "="*80)
                print(f"ESCONV LIVE SAMPLE {i+1}/{len(evaluation_data)}")
                print("="*80)
                print(f"Input: {input_text[:120]}...")
                print(f"Reference: {reference_text[:100]}...")
                print(f"Generated: {predicted_text[:100]}...")
                print(f"Full Response: {response[:150]}...")
                print(f"Ref Emotion: {reference_emotion} | Pred Emotion: {predicted_emotion} | Match: {'✓' if emotion_correct else '✗'}")
                print(f"BLEU: {bleu:.4f} | ROUGE-1: {rouge['rouge1']:.4f} | METEOR: {meteor:.4f}")
                print("="*80)
        else:
            # Just update progress bar with running averages
            if i > 0:
                seen = i + 1
                avg_bleu = bleu_running_total / seen
                avg_emotion_acc = emotion_correct_count / seen
                progress_bar.set_description(f"Evaluating ESConv - Avg BLEU: {avg_bleu:.3f}, Emotion Acc: {avg_emotion_acc:.3f}")

    progress_bar.close()

    # Compute summary statistics
    results['summary'] = {
        'avg_bleu': np.mean(results['bleu_scores']),
        'avg_rouge1': np.mean(results['rouge1_scores']),
        'avg_rouge2': np.mean(results['rouge2_scores']),
        'avg_rougeL': np.mean(results['rougeL_scores']),
        'avg_meteor': np.mean(results['meteor_scores']),
        'emotion_accuracy': np.mean(results['emotion_accuracy']),
        'emotion_tag_coverage': np.mean(results['has_emotion_tag']),
        'total_samples': len(evaluation_data)
    }

    # Only the outer mapping is converted to a plain dict here.
    results['emotion_confusion_matrix'] = dict(emotion_confusion)
    results['detected_format'] = detected_format

    print(f"\n{'='*60}")
    print("ESCONV EVALUATION COMPLETED!")
    print(f"{'='*60}")
    print(f"Processed {len(evaluation_data)} samples")
    print(f"Average BLEU: {results['summary']['avg_bleu']:.4f}")
    print(f"Average ROUGE-1: {results['summary']['avg_rouge1']:.4f}")
    print(f"Emotion Accuracy: {results['summary']['emotion_accuracy']:.4f}")
    print(f"Emotion Tag Coverage: {results['summary']['emotion_tag_coverage']:.4f}")
    print(f"{'='*60}")

    return results

def print_evaluation_results(results):
    """Write a human-readable report of an evaluation ``results`` dict to stdout.

    Reads ``results['summary']`` and ``results['sample_outputs']`` (first
    three entries only); ``model_type`` and ``detected_format`` fall back to
    'Unknown', and the confusion matrix section is skipped when its key is
    absent.
    """
    summary = results['summary']
    rule = "=" * 60

    text_metric_rows = (
        ("  BLEU Score:     ", 'avg_bleu'),
        ("  ROUGE-1:        ", 'avg_rouge1'),
        ("  ROUGE-2:        ", 'avg_rouge2'),
        ("  ROUGE-L:        ", 'avg_rougeL'),
        ("  METEOR:         ", 'avg_meteor'),
    )
    emotion_metric_rows = (
        ("  Emotion Accuracy:    ", 'emotion_accuracy'),
        ("  Emotion Tag Coverage: ", 'emotion_tag_coverage'),
    )

    # Header
    print("\n" + rule)
    print("ESCONV THERAPY MODEL EVALUATION RESULTS")
    print(rule)
    print(f"Model Type: {results.get('model_type', 'Unknown').upper()}")
    print(f"Output Format: {results.get('detected_format', 'Unknown').upper()}")

    # Averaged scores, four decimals each
    print("\nText Generation Metrics:")
    for label, key in text_metric_rows:
        print(f"{label}{summary[key]:.4f}")

    print("\nEmotion Prediction:")
    for label, key in emotion_metric_rows:
        print(f"{label}{summary[key]:.4f}")

    print("\nDataset Info:")
    print(f"  Total Samples:  {summary['total_samples']}")

    # Confusion matrix: one line per (reference, predicted) pair
    if 'emotion_confusion_matrix' in results:
        print("\nEmotion Confusion Matrix:")
        for ref_emotion, predictions in results['emotion_confusion_matrix'].items():
            print(f"  {ref_emotion}:")
            for pred_emotion, count in predictions.items():
                print(f"    -> {pred_emotion}: {count}")

    print("\nSample Outputs:")
    for number, entry in enumerate(results['sample_outputs'][:3], 1):
        print(f"\n  Sample {number}:")
        print(f"    Input: {entry['input'][:100]}...")
        print(f"    Reference: {entry['reference_text'][:80]}...")
        print(f"    Predicted: {entry['predicted_text'][:80]}...")
        print(f"    Ref Emotion: {entry['reference_emotion']}")
        print(f"    Pred Emotion: {entry['predicted_emotion']}")
        print(f"    BLEU: {entry['bleu']:.3f}, ROUGE-1: {entry['rouge1']:.3f}")

# ====================== Model Loading Functions ======================
def load_sft_model(checkpoint_path, tokenizer_path, device='cuda'):
    """Rebuild the SFT model from a training checkpoint file.

    The checkpoint must hold the weights under ``'model_state_dict'``;
    ``'epoch'`` and ``'valid_loss'`` are optional and only used for logging.

    Returns:
        tuple: (model moved to ``device``, tokenizer)
    """
    from transformers import GPT2LMHeadModel

    print(f"Loading SFT checkpoint: {checkpoint_path}")

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # SECURITY: weights_only=False allows arbitrary pickled objects to be
    # deserialised, so only load checkpoints from a trusted source.
    ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Start from stock GPT-2 and resize the embedding matrix to the
    # tokenizer's vocabulary size before loading the checkpoint weights.
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    vocab_size = len(tokenizer)
    model.resize_token_embeddings(vocab_size)
    state_dict = ckpt['model_state_dict']
    model.load_state_dict(state_dict)
    model.to(device)

    if 'epoch' in ckpt:
        print(f"Checkpoint info: Epoch {ckpt['epoch']}, Loss {ckpt.get('valid_loss', 'N/A')}")

    return model, tokenizer

def load_rl_model(model_dir, tokenizer_path, device='cuda'):
    """Load the RL model saved in ``model_dir`` together with its tokenizer.

    Returns:
        tuple: (model moved to ``device``, tokenizer)
    """
    from transformers import AutoModelForCausalLM

    print(f"Loading RL model from directory: {model_dir}")

    # The tokenizer comes from its own path, not from the model directory.
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    loaded = AutoModelForCausalLM.from_pretrained(model_dir)
    model = loaded.to(device)

    return model, tokenizer

def load_model_auto(model_path, tokenizer_path, device='cuda'):
    """
    Pick the right loader from what ``model_path`` points at.

    A regular file is treated as an SFT checkpoint, a directory as an RL
    model directory.

    Args:
        model_path: Path to model (file for SFT, directory for RL)
        tokenizer_path: Path to tokenizer
        device: Device to load model on

    Returns:
        tuple: (model, tokenizer, model_type) with model_type 'sft' or 'rl'

    Raises:
        ValueError: If ``model_path`` is neither a file nor a directory
    """
    import os

    if os.path.isfile(model_path):
        print("Detected SFT checkpoint file")
        loader, model_type = load_sft_model, 'sft'
    elif os.path.isdir(model_path):
        print("Detected RL model directory")
        loader, model_type = load_rl_model, 'rl'
    else:
        raise ValueError(f"Model path {model_path} is neither a file nor a directory")

    model, tokenizer = loader(model_path, tokenizer_path, device)
    return model, tokenizer, model_type

# ====================== Main Evaluation Function ======================
def run_esconv_evaluation(model_path, tokenizer_path, esconv_data_path, device='cuda'):
    """
    End-to-end ESConv evaluation: load the model, build the dataset,
    score it and print the report.

    Args:
        model_path: Path to trained model (file for SFT, directory for RL)
        tokenizer_path: Path to tokenizer
        esconv_data_path: Path to ESConv JSON data
        device: Device to run evaluation on

    Returns:
        The results dict from ``evaluate_model_esconv`` with ``model_type``
        and ``dataset_type`` added
    """
    # Step 1: model + tokenizer (SFT file vs. RL directory is auto-detected)
    print(f"Loading model from {model_path}...")
    model, tokenizer, model_type = load_model_auto(model_path, tokenizer_path, device)
    print(f"Loaded {model_type.upper()} model successfully")

    # Step 2: evaluation samples, with a short preview
    print(f"Loading ESConv evaluation data from {esconv_data_path}...")
    eval_dataset = ESConvEvaluationDataset(esconv_data_path, tokenizer_path)
    eval_dataset.print_sample_formats(3)

    # Step 3: score and tag the results with their provenance
    results = evaluate_model_esconv(model, tokenizer, eval_dataset, device=device)
    results.update(model_type=model_type, dataset_type='esconv')

    # Step 4: report
    print_evaluation_results(results)
    return results

# ====================== Usage Example ======================
if __name__ == "__main__":
    import os
    import pickle

    MODEL_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt"
    TOKENIZER_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/train_processed_4thFIXED_tokenizer"
    ESCONV_DATA_PATH = "/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/PESConv.json"  # Update this path to your ESConv data
    # Single source of truth for the output file, so the final message always
    # names the file that was actually written (it used to say "..._RL_...").
    RESULTS_PATH = '/content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/Evaluation2/ESConv_SFT_evaluation_results.pkl'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    results = run_esconv_evaluation(MODEL_PATH, TOKENIZER_PATH, ESCONV_DATA_PATH, DEVICE)

    # Create the output folder if needed so a long evaluation run is not lost
    # to a missing directory at save time.
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    with open(RESULTS_PATH, 'wb') as f:
        pickle.dump(results, f)

    print(f"\nEvaluation completed! Results saved to {RESULTS_PATH}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading model from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt...
Detected SFT checkpoint file
Loading SFT checkpoint: /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/therapy_model_4thFIXED_epoch_7_loss_2.2373.ckpt


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Checkpoint info: Epoch 7, Loss 2.237253785133362
Loaded SFT model successfully
Loading ESConv evaluation data from /content/drive/MyDrive/RL-SFT-GPT2 MentalHealth/RL<token>8 6/PESConv.json...
Processing ESConv data...


Processing conversations:  28%|██▊       | 370/1300 [00:00<00:00, 3696.80it/s]


Processing conversation 1:
  Problem type: job crisis
  Emotion type: anxiety
  Dialog length: 27 turns

  Sample 1:
    Input: <problem>job crisis <user>Hello <user_emotion>anxiety <therapist>...
    Reference: Hello, what would you like to talk about?...

  Sample 2:
    Input: <problem>job crisis <user>I am having a lot of anxiety about quitting my current job. It is too stre...
    Reference: What makes your job stressful for you?...

  Sample 3:
    Input: <problem>job crisis <user>I have to deal with many people in hard financial situations and it is ups...
    Reference: Do you help your clients to make it to a better financial situation?...

Processing conversation 2:
  Problem type: problems with friends
  Emotion type: anger
  Dialog length: 69 turns

Processing conversation 3:
  Problem type: job crisis
  Emotion type: fear
  Dialog length: 37 turns


Processing conversations: 100%|██████████| 1300/1300 [00:00<00:00, 3745.70it/s]



Processed 14642 evaluation samples from 1300 conversations

ESCONV SAMPLE INPUT/OUTPUT FORMATS

Sample 1:
  Problem Type: job crisis
  User Emotion: anxiety
  Input Format: <problem>job crisis <user>Hello <user_emotion>anxiety <therapist>...
  Reference Response: Hello, what would you like to talk about?...
  User Input Only: Hello...

Sample 2:
  Problem Type: job crisis
  User Emotion: anxiety
  Input Format: <problem>job crisis <user>I am having a lot of anxiety about quitting my current job. It is too stressful but pays well <user_emotion>anxiety <therapi...
  Reference Response: What makes your job stressful for you?...
  User Input Only: I am having a lot of anxiety about quitting my current job. It is too stressful but pays well...

Sample 3:
  Problem Type: job crisis
  User Emotion: anxiety
  Input Format: <problem>job crisis <user>I have to deal with many people in hard financial situations and it is upsetting <user_emotion>anxiety <therapist>...
  Reference Response: Do you

Evaluating ESConv - Sample 2: BLEU=0.000, Emotion=✗:   0%|          | 2/14642 [00:00<35:58,  6.78it/s]


ESCONV LIVE SAMPLE 1/14642
Input: <problem>job crisis <user>Hello <user_emotion>anxiety <therapist>...
Reference: Hello, what would you like to talk about?...
Generated: I'm sorry. I just wanted to let you know that you're not worried....
Full Response: I'm sorry. I just wanted to let you know that you're not worried. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0185 | ROUGE-1: 0.1739 | METEOR: 0.1176

ESCONV LIVE SAMPLE 2/14642
Input: <problem>job crisis <user>I am having a lot of anxiety about quitting my current job. It is too stressful but pays well ...
Reference: What makes your job stressful for you?...
Generated: You have to live with it....
Full Response: You have to live with it. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1538 | METEOR: 0.0725


Evaluating ESConv - Sample 4: BLEU=0.000, Emotion=✗:   0%|          | 3/14642 [00:00<46:31,  5.24it/s]


ESCONV LIVE SAMPLE 3/14642
Input: <problem>job crisis <user>I have to deal with many people in hard financial situations and it is upsetting <user_emotion...
Reference: Do you help your clients to make it to a better financial situation?...
Generated: you can't control your emotions, you can control your emotions...
Full Response: you can't control your emotions, you can control your emotions  <therapist_emotion> but it's not your fault. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: but | Match: ✗
BLEU: 0.0186 | ROUGE-1: 0.1667 | METEOR: 0.0787


Evaluating ESConv - Sample 100: BLEU=0.000, Emotion=✗:   1%|          | 100/14642 [00:13<32:33,  7.44it/s]


ESCONV LIVE SAMPLE 100/14642
Input: <problem>job crisis <user>sorry about that, but that's good, people help each other Absolutely! Volunteering is a great ...
Reference: Do you feel you will be getting the vaccine when it is rolled out where you are?...
Generated: but......
Full Response: but...  <therapist_emotion> what do you mean, what do you mean? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: what | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.025:   1%|▏         | 200/14642 [00:26<23:48, 10.11it/s]


ESCONV LIVE SAMPLE 200/14642
Input: <problem>job crisis <user>Yes, I really think you are correct! thank you so very much for your help today! <user_emotion...
Reference: The survivor rate from COVID-19 infections is around 99%, so your chances of dying from the virus ar...
Generated: Thank you....
Full Response: Thank you. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.023:   2%|▏         | 301/14642 [00:39<25:38,  9.32it/s]


ESCONV LIVE SAMPLE 300/14642
Input: <problem>ongoing depression <user>No I think I'm good <user_emotion>depression <therapist>...
Reference: Please click "Finish and Quit" to close the chat.. thank you....
Generated: So what does that mean?...
Full Response: So what does that mean? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.017:   3%|▎         | 400/14642 [00:50<20:42, 11.47it/s]


ESCONV LIVE SAMPLE 400/14642
Input: <problem>breakup with partner <user>i will have to find somewhere i can properly exercise my awful vocals lol <user_emot...
Reference: Ah! I like pop songs from the 60's :-)...
Generated: What?...
Full Response: What? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.016:   3%|▎         | 500/14642 [01:03<32:02,  7.36it/s]


ESCONV LIVE SAMPLE 500/14642
Input: <problem>job crisis <user>yes , you prediction was so good <user_emotion>sadness <therapist>...
Reference: That's lovely to hear. Have you tried discussing your doubts with your supervisor? There are times w...
Generated: I'm not sure you can do it again....
Full Response: I'm not sure you can do it again. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0007 | ROUGE-1: 0.0833 | METEOR: 0.0147


Evaluating ESConv - Avg BLEU: 0.004, Emotion Acc: 0.013:   4%|▍         | 601/14642 [01:16<25:40,  9.11it/s]


ESCONV LIVE SAMPLE 600/14642
Input: <problem>job crisis <user>Merry Chirstmas. <user_emotion>anxiety <therapist>...
Reference: I'm doing good!...
Generated: I think you're having trouble sleeping....
Full Response: I think you're having trouble sleeping. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1818 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.011:   5%|▍         | 702/14642 [01:28<23:08, 10.04it/s]


ESCONV LIVE SAMPLE 700/14642
Input: <problem>ongoing depression <user>We are looking at other options via business visa but we don't know how that will go <...
Reference: if you are an American, you can just go to Indonesia, get married to your beautiful fiancée and come...
Generated: So what do you think is the best route to go?...
Full Response: So what do you think is the best route to go? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0075 | ROUGE-1: 0.1714 | METEOR: 0.0917


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.019:   5%|▌         | 801/14642 [01:41<33:54,  6.80it/s]


ESCONV LIVE SAMPLE 800/14642
Input: <problem>job crisis <user>Bye Have a Good Day Some Report preparation <user_emotion>anger <therapist>...
Reference: Have an amazing Christmas!...
Generated: I'm sorry. I just... I don't know....
Full Response: I'm sorry. I just... I don't know. <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.018:   6%|▌         | 901/14642 [01:53<28:41,  7.98it/s]


ESCONV LIVE SAMPLE 900/14642
Input: <problem>ongoing depression <user>That actually sounds like a good idea. I hope the shelter near me will take volunteers...
Reference: If you are not comfortable going out due to COVID, you could involve some activities online promotin...
Generated: And how would you like them to go about packing up?...
Full Response: And how would you like them to go about packing up? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0052 | ROUGE-1: 0.2105 | METEOR: 0.0984


Evaluating ESConv - Sample 1000: BLEU=0.000, Emotion=✗:   7%|▋         | 999/14642 [02:06<25:03,  9.08it/s] 


ESCONV LIVE SAMPLE 1000/14642
Input: <problem>academic pressure <user>I'll keep that in mind as I move forward. I really appreciate your help. <user_emotion>...
Reference: I commend you for doing the thesis and I also know it can get weary as time passes, but with some go...
Generated: I appreciate your help....
Full Response: I appreciate your help. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0465 | METEOR: 0.0141


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.015:   8%|▊         | 1101/14642 [02:18<32:58,  6.84it/s]


ESCONV LIVE SAMPLE 1100/14642
Input: <problem>breakup with partner <user>With him... he was perfect (or at least I thought), but also, I'm lonely, and it's h...
Reference: Have you tried making other connections online? A lot of people are home during this pandemic, you m...
Generated: It's hard to process that you're in such a state?...
Full Response: It's hard to process that you're in such a state? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1176 | METEOR: 0.0240


Evaluating ESConv - Sample 1200: BLEU=0.000, Emotion=✗:   8%|▊         | 1200/14642 [02:29<28:43,  7.80it/s]


ESCONV LIVE SAMPLE 1200/14642
Input: <problem>academic pressure <user>i usually try to follow the tutorials and do revisions <user_emotion>anxiety <therapist...
Reference: Can you think of anything you can do, to help make your studying be more successful?...
Generated: but you've been doing that for a while now....
Full Response: but you've been doing that for a while now. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1538 | METEOR: 0.0654


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.015:   9%|▉         | 1300/14642 [02:43<24:38,  9.02it/s]


ESCONV LIVE SAMPLE 1300/14642
Input: <problem>job crisis <user>And to you You doing alright? <user_emotion>anxiety <therapist>...
Reference: How are you today? I am buried in cookie dough, lol, but happy and wwell....
Generated: I think that's true....
Full Response: I think that's true. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0051 | ROUGE-1: 0.1000 | METEOR: 0.0360


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.014:  10%|▉         | 1401/14642 [02:56<25:06,  8.79it/s]


ESCONV LIVE SAMPLE 1400/14642
Input: <problem>academic pressure <user>Well, counselors are hard to get ahold of so close to the Holidays. The so-called teach...
Reference: Are you confident in your abilities to succeed in those classes or you think you might lack the mate...
Generated: I know....
Full Response: I know. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.015:  10%|█         | 1501/14642 [03:10<29:42,  7.37it/s]


ESCONV LIVE SAMPLE 1500/14642
Input: <problem>job crisis <user>I am having a hard time.... Ok, I will look some of them online. I need some financial planner...
Reference: If you want other suggestions, meditation is something that is extremely overlooked and will help yo...
Generated: So you think that you are struggling,...
Full Response: So you think that you are struggling, <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0001 | ROUGE-1: 0.1053 | METEOR: 0.0328


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.014:  11%|█         | 1601/14642 [03:23<25:29,  8.53it/s]


ESCONV LIVE SAMPLE 1600/14642
Input: <problem>problems with friends <user>no other questions, you have been most helpful, thank you! <user_emotion>anxiety <t...
Reference: I have done volunteer work; people want to be friends with those who care and know how to use a nail...
Generated: I appreciate it....
Full Response: I appreciate it. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0001 | ROUGE-1: 0.0769 | METEOR: 0.0228


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.014:  12%|█▏        | 1701/14642 [03:35<31:32,  6.84it/s]


ESCONV LIVE SAMPLE 1700/14642
Input: <problem>job crisis <user>I feel like a replaceable cog in a giant machine that doesn't give two shits whether I show up...
Reference: you are doing your best and are working really hard...
Generated: So I think you're not really retired,...
Full Response: So I think you're not really retired, <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0215 | ROUGE-1: 0.2222 | METEOR: 0.0515


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.014:  12%|█▏        | 1801/14642 [03:48<24:23,  8.77it/s]


ESCONV LIVE SAMPLE 1800/14642
Input: <problem>job crisis <user>Do you smoke and drink alcohol? <user_emotion>depression <therapist>...
Reference: No, I don't find that to be healthy, and I don't enjoy them....
Generated: No, I don't....
Full Response: No, I don't. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0086 | ROUGE-1: 0.4211 | METEOR: 0.0833


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.013:  13%|█▎        | 1901/14642 [04:00<30:33,  6.95it/s]


ESCONV LIVE SAMPLE 1900/14642
Input: <problem>academic pressure <user>I think so? I think it is just the way the material is taught now. We have these "Zoom"...
Reference: What happened when you reached out to the teacher?...
Generated: I'm sure that you feel that you're losing your passion....
Full Response: I'm sure that you feel that you're losing your passion. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0211 | ROUGE-1: 0.0952 | METEOR: 0.0549


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.013:  14%|█▎        | 2001/14642 [04:12<21:15,  9.91it/s]


ESCONV LIVE SAMPLE 2000/14642
Input: <problem>ongoing depression <user>Well thank you im okay <user_emotion>depression <therapist>...
Reference: Hmm all the best...
Generated: I'm fine....
Full Response: I'm fine. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.013:  14%|█▍        | 2102/14642 [04:26<24:36,  8.49it/s]


ESCONV LIVE SAMPLE 2100/14642
Input: <problem>ongoing depression <user>Hmm, I didn't know that. I'm on a waiting list for a therapist through my medical insu...
Reference: That is a good idea, especially at this time when it's probably not a good idea to see too many new ...
Generated: I think it's possible that you might be able to get a therapist who is......
Full Response: I think it's possible that you might be able to get a therapist who is... <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0060 | ROUGE-1: 0.2857 | METEOR: 0.0825


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.012:  15%|█▌        | 2200/14642 [04:38<21:04,  9.84it/s]


ESCONV LIVE SAMPLE 2200/14642
Input: <problem>problems with friends <user>hi <user_emotion>anxiety <therapist>...
Reference: Hello....
Generated: you know,...
Full Response: you know, <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.012:  16%|█▌        | 2301/14642 [04:51<25:45,  7.98it/s]


ESCONV LIVE SAMPLE 2300/14642
Input: <problem>job crisis <user>did she start off doing that or was she employed by another company <user_emotion>fear <therap...
Reference: She has over 100 companies that are hiring...
Generated: you said that she was not employed by another company?...
Full Response: you said that she was not employed by another company? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0211 | ROUGE-1: 0.3333 | METEOR: 0.1829


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.012:  16%|█▋        | 2400/14642 [05:04<20:41,  9.86it/s]


ESCONV LIVE SAMPLE 2400/14642
Input: <problem>job crisis <user>I try to find a job but there is not many <user_emotion>anxiety <therapist>...
Reference: I am sorry to hear that. How long have you been looking?...
Generated: no one is here to help you....
Full Response: no one is here to help you. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0162 | ROUGE-1: 0.2105 | METEOR: 0.0870


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.011:  17%|█▋        | 2502/14642 [05:15<20:35,  9.83it/s]


ESCONV LIVE SAMPLE 2500/14642
Input: <problem>breakup with partner <user>my dog was 13 years old. she was very close to me. i had her when she was a puppy <u...
Reference: The dog then lived a very happy and fulfilling life from what I can tell. Dogs are among the best co...
Generated: So you had a very close relationship with her when she was a puppy....
Full Response: So you had a very close relationship with her when she was a puppy. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0148 | ROUGE-1: 0.1053 | METEOR: 0.0652


Evaluating ESConv - Sample 2600: BLEU=0.000, Emotion=✗:  18%|█▊        | 2600/14642 [05:27<23:19,  8.61it/s]


ESCONV LIVE SAMPLE 2600/14642
Input: <problem>problems with friends <user>Friends not being there for me when I need emotional support. They also do not pay ...
Reference: So just to make sure I understand, you are having issues with your friends not being there for you e...
Generated: So why don't you ask them?...
Full Response: So why don't you ask them? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0005 | ROUGE-1: 0.1463 | METEOR: 0.0481


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.011:  18%|█▊        | 2701/14642 [05:40<24:17,  8.19it/s]


ESCONV LIVE SAMPLE 2700/14642
Input: <problem>ongoing depression <user>That's a very optimistic view... <user_emotion>sadness <therapist>...
Reference: Like I said, I used to be sad too and not optimistic. But now I am optimistic because I know what it...
Generated: It's not....
Full Response: It's not. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0889 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.011:  19%|█▉        | 2802/14642 [05:53<21:51,  9.03it/s]


ESCONV LIVE SAMPLE 2800/14642
Input: <problem>ongoing depression <user>I just lost interest. I'm losing interest in everything, man I think it's time for my ...
Reference: I certainly hope you don't do that!...
Generated: That's not what I'm saying....
Full Response: That's not what I'm saying. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.2667 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.012:  20%|█▉        | 2901/14642 [06:06<36:07,  5.42it/s]


ESCONV LIVE SAMPLE 2900/14642
Input: <problem>job crisis <user>yes, thanks a lot for your time <user_emotion>anger <therapist>...
Reference: You are welcome. I hope you don't give up, and stay optimistic! It sounds like you have a great star...
Generated: I'm just wondering if you're aware of what's going on with your family...
Full Response: I'm just wondering if you're aware of what's going on with your family  <therapist_emotion> anger <therapist_emotion> Well, I've spoken to your mother...
Ref Emotion: anger | Pred Emotion: anger | Match: ✓
BLEU: 0.0000 | ROUGE-1: 0.1081 | METEOR: 0.0259


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.012:  21%|██        | 3002/14642 [06:20<20:49,  9.32it/s]


ESCONV LIVE SAMPLE 3000/14642
Input: <problem>breakup with partner <user>that will be hard because i am soo used to her being around <user_emotion>sadness <t...
Reference: I can understand that but the only way you can work it out with her if both of you two just talk...
Generated: And what do you think she's going to do with that?...
Full Response: And what do you think she's going to do with that? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0082 | ROUGE-1: 0.1765 | METEOR: 0.0718


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.012:  21%|██        | 3101/14642 [06:32<16:49, 11.43it/s]


ESCONV LIVE SAMPLE 3100/14642
Input: <problem>job crisis <user>Hi, I am doing ok, how are you? <user_emotion>fear <therapist>...
Reference: Good thank you. Why only ok? What is bothering you?...
Generated: Good....
Full Response: Good.  <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1818 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.012:  22%|██▏       | 3201/14642 [06:44<28:23,  6.72it/s]


ESCONV LIVE SAMPLE 3200/14642
Input: <problem>job crisis <user>ohh 1 wow, I will definitely try that. It sounds like you're a doctor!" Whenever I get tense o...
Reference: Thank you that is very kind of you to say. I have just had alot of practice in dealing with anxiety....
Generated: You're just... I'm not sure I know what you mean....
Full Response: You're just... I'm not sure I know what you mean. <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0005 | ROUGE-1: 0.1639 | METEOR: 0.0222


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.011:  23%|██▎       | 3301/14642 [06:57<29:29,  6.41it/s]


ESCONV LIVE SAMPLE 3300/14642
Input: <problem>academic pressure <user>i was mad, but now i am just upset. My professor put in grades for my class earlier thi...
Reference: Just to make sure I understand, you got a bad grade in one of your classes and your grade dropped be...
Generated: Why did you drop out?...
Full Response: Why did you drop out? <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0004 | ROUGE-1: 0.1176 | METEOR: 0.0564


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.011:  23%|██▎       | 3402/14642 [07:11<22:25,  8.36it/s]


ESCONV LIVE SAMPLE 3400/14642
Input: <problem>problems with friends <user>Hello. <user_emotion>sadness <therapist>...
Reference: Hello...
Generated: So what's your reaction when I ask you about this?...
Full Response: So what's your reaction when I ask you about this? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.011:  24%|██▍       | 3501/14642 [07:23<27:09,  6.84it/s]


ESCONV LIVE SAMPLE 3500/14642
Input: <problem>ongoing depression <user>Hi <user_emotion>shame <therapist>...
Reference: Hello...
Generated: I'm sorry. I just... I'm not sure I understand what you're trying to say....
Full Response: I'm sorry. I just... I'm not sure I understand what you're trying to say. <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  25%|██▍       | 3601/14642 [07:37<21:42,  8.48it/s]


ESCONV LIVE SAMPLE 3600/14642
Input: <problem>job crisis <user>you are so nice. i lost my job and having trouble finding a new on one <user_emotion>anxiety <...
Reference: Oh, I'm so sorry. I can empathize a bit. I lost mine earlier this year due to covid, but they did br...
Generated: what?...
Full Response: what? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  25%|██▌       | 3701/14642 [07:49<22:02,  8.27it/s]


ESCONV LIVE SAMPLE 3700/14642
Input: <problem>job crisis <user>I have sold wreaths for awhile but I did not have time to be as serious so I did not sell a to...
Reference: That sounds like an excellent idea to me! I think that you are definitely doing the right thing....
Generated: You seem to have a lot of energy....
Full Response: You seem to have a lot of energy. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0080 | ROUGE-1: 0.1538 | METEOR: 0.0588


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  26%|██▌       | 3801/14642 [08:01<19:20,  9.34it/s]


ESCONV LIVE SAMPLE 3800/14642
Input: <problem>academic pressure <user>We have gotten him tutors in the past and it helps. I am not sure I could do a tutor wi...
Reference: you are right. Have you tried helping him with his work?...
Generated: What is anxiety?...
Full Response: What is anxiety? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  27%|██▋       | 3901/14642 [08:14<20:25,  8.76it/s]


ESCONV LIVE SAMPLE 3900/14642
Input: <problem>ongoing depression <user>oh, i dont use it much <user_emotion>depression <therapist>...
Reference: There is also proofreading that can help establish yourself...
Generated: I think you have to stop using it....
Full Response: I think you have to stop using it. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  27%|██▋       | 4002/14642 [08:26<18:37,  9.52it/s]


ESCONV LIVE SAMPLE 4000/14642
Input: <problem>ongoing depression <user>Garbo hates small animals also. Especially skunks <user_emotion>anxiety <therapist>...
Reference: Oh Garbo! Hahahaha! She sounds wonderful!...
Generated: Why don't you tell me what...
Full Response: Why don't you tell me what  <therapist_emotion> <therapist_emotion> anger <therapist_emotion> <therapist_emotion> <therapist_emotion> <therapist_emoti...
Ref Emotion: anxiety | Pred Emotion: <therapist_emotion> | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  28%|██▊       | 4101/14642 [08:38<20:34,  8.54it/s]


ESCONV LIVE SAMPLE 4100/14642
Input: <problem>breakup with partner <user>so sorry i had stepped away <user_emotion>sadness <therapist>...
Reference: I have been having a bit of a bad time in life lately so I can understand and empathise if you are t...
Generated: You didn't have a choice....
Full Response: You didn't have a choice. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0017 | ROUGE-1: 0.2069 | METEOR: 0.0708


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  29%|██▊       | 4202/14642 [08:51<21:40,  8.03it/s]


ESCONV LIVE SAMPLE 4200/14642
Input: <problem>problems with friends <user>haha YES! see it is a stupid thing. like dood we hook your ass up all the tome time...
Reference: People suck....
Generated: So what do you think...
Full Response: So what do you think  <therapist_emotion> <therapist_emotion> <therapist_emotion> I think it is  <therapist_emotion> neutral<eos>...
Ref Emotion: disgust | Pred Emotion: <therapist_emotion> | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  29%|██▉       | 4301/14642 [09:02<22:12,  7.76it/s]


ESCONV LIVE SAMPLE 4300/14642
Input: <problem>academic pressure <user>Yes, I am feeling very anxious about my school work lately. <user_emotion>anxiety <ther...
Reference: I see, so your main worry is about academic pressure?...
Generated: It's not like I'm anxious about anything....
Full Response: It's not like I'm anxious about anything. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0215 | ROUGE-1: 0.2105 | METEOR: 0.0515


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  30%|███       | 4401/14642 [09:15<21:15,  8.03it/s]


ESCONV LIVE SAMPLE 4400/14642
Input: <problem>job crisis <user>It was a political game where she isolated the president and made him no longer support her. I...
Reference: ok so your director played a game she was not allowed to play and she got caught playing it and quit...
Generated: You think that's why you left?...
Full Response: You think that's why you left? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0606 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  31%|███       | 4501/14642 [09:27<20:02,  8.44it/s]


ESCONV LIVE SAMPLE 4500/14642
Input: <problem>breakup with partner <user>What are those? I heard of them, but never used them. Most of my old friends are no ...
Reference: Social networks online, you can connect with family or old coworkers. And post pictures and see what...
Generated: What do you mean?...
Full Response: What do you mean? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0005 | ROUGE-1: 0.2143 | METEOR: 0.0682


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  31%|███▏      | 4601/14642 [09:40<16:22, 10.22it/s]


ESCONV LIVE SAMPLE 4600/14642
Input: <problem>ongoing depression <user>No, since too much time has passed <user_emotion>sadness <therapist>...
Reference: Have you tried any local churches or agencies? findhelp.org is also a great resource....
Generated: we haven't been sleeping....
Full Response: we haven't been sleeping. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  32%|███▏      | 4702/14642 [09:52<17:18,  9.57it/s]


ESCONV LIVE SAMPLE 4700/14642
Input: <problem>problems with friends <user>That makes sense but I just wish I knew why she said it. Did I do something to her,...
Reference: maybe you should let her know that you love her and say that you would never want to hurt her, so if...
Generated: I don't think so....
Full Response: I don't think so. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0408 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  33%|███▎      | 4801/14642 [10:05<19:59,  8.21it/s]


ESCONV LIVE SAMPLE 4800/14642
Input: <problem>breakup with partner <user>well, it was weird that he got so dressed up to go hang out with friends it makes se...
Reference: Ok, so this is a HUGE red flag to you. You can ignore it, or not. Women KNOW when a man is cheating,...
Generated: He was dressed up for a photoshoot?...
Full Response: He was dressed up for a photoshoot? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0002 | ROUGE-1: 0.0784 | METEOR: 0.0248


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  33%|███▎      | 4901/14642 [10:19<26:46,  6.06it/s]


ESCONV LIVE SAMPLE 4900/14642
Input: <problem>job crisis <user>i'm withdrawing from the work and that's bad. i still care about the work, but the toxic envir...
Reference: maybe seek out someone more senior in a different department who can help...
Generated: anger?...
Full Response: anger?  <therapist_emotion> Well, that's true. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: well, | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  34%|███▍      | 5001/14642 [10:32<17:49,  9.02it/s]


ESCONV LIVE SAMPLE 5000/14642
Input: <problem>academic pressure <user>How are you doing today? <user_emotion>anxiety <therapist>...
Reference: I am doing well and how are you feeling today?...
Generated: Well, I'm fine....
Full Response: Well, I'm fine. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.2857 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  35%|███▍      | 5101/14642 [10:45<18:11,  8.74it/s]


ESCONV LIVE SAMPLE 5100/14642
Input: <problem>job crisis <user>Yes. <user_emotion>fear <therapist>...
Reference: I can understand how that feels. Have you tried to talk to them about it?...
Generated: But you're a musician....
Full Response: But you're a musician. <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  36%|███▌      | 5201/14642 [10:58<19:14,  8.18it/s]


ESCONV LIVE SAMPLE 5200/14642
Input: <problem>academic pressure <user>Hello! Life is treating me fair, what about you? <user_emotion>anxiety <therapist>...
Reference: Well, I've been a bit worried this year due to Covid and everything. Have you been worrying about an...
Generated: You're not angry, I'm just......
Full Response: You're not angry, I'm just... <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1481 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  36%|███▌      | 5301/14642 [11:11<19:40,  7.92it/s]


ESCONV LIVE SAMPLE 5300/14642
Input: <problem>problems with friends <user>I agree. She can't see past her own issues to see the whole situation. <user_emotio...
Reference: If you think its a more serious problem maybe ask her to seek professional help....
Generated: So you're trying to justify yourself...
Full Response: So you're trying to justify yourself  <therapist_emotion> anger<eos>...
Ref Emotion: anger | Pred Emotion: anger | Match: ✓
BLEU: 0.0091 | ROUGE-1: 0.1818 | METEOR: 0.0355


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  37%|███▋      | 5401/14642 [11:24<20:58,  7.34it/s]


ESCONV LIVE SAMPLE 5400/14642
Input: <problem>breakup with partner <user>Things have been pretty rough for the last few years. I found out she was hanging ou...
Reference: So from what I am hearing, you ended things with your girlfriend because she was talking to her ex a...
Generated: You know what else?...
Full Response: You know what else?  <therapist_emotion> <therapist_emotion> She's upset about it. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: <therapist_emotion> | Match: ✗
BLEU: 0.0001 | ROUGE-1: 0.1111 | METEOR: 0.0342


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  38%|███▊      | 5502/14642 [11:38<16:39,  9.15it/s]


ESCONV LIVE SAMPLE 5500/14642
Input: <problem>breakup with partner <user>His latest obsession has been 3d printing. I have found weird sites with ' funny'' d...
Reference: goodness...
Generated: You're not suggesting that he's attracted to women....
Full Response: You're not suggesting that he's attracted to women. <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  38%|███▊      | 5600/14642 [11:51<20:59,  7.18it/s]


ESCONV LIVE SAMPLE 5600/14642
Input: <problem>breakup with partner <user>I enjoy snowboarding as a hobby. I might try to get back on my board depending on wh...
Reference: Yes. when you are depressed it is hard to become motivated to do anything. However, it is extremely ...
Generated: I don't think that's necessarily a problem....
Full Response: I don't think that's necessarily a problem. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0471 | METEOR: 0.0072


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  39%|███▉      | 5701/14642 [12:03<17:07,  8.70it/s]


ESCONV LIVE SAMPLE 5700/14642
Input: <problem>ongoing depression <user>i am depressed <user_emotion>depression <therapist>...
Reference: I'm sorry to hear that! When did you begin feeling this way?...
Generated: You've been depressed?...
Full Response: You've been depressed? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1176 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  40%|███▉      | 5802/14642 [12:16<14:11, 10.38it/s]


ESCONV LIVE SAMPLE 5800/14642
Input: <problem>breakup with partner <user>I will follow your suggestion. Thanks a lot. <user_emotion>sadness <therapist>...
Reference: You are so welcome! I hope you start to feel better soon....
Generated: I'm glad you're here....
Full Response: I'm glad you're here. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.2222 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  40%|████      | 5901/14642 [12:29<23:02,  6.32it/s]


ESCONV LIVE SAMPLE 5900/14642
Input: <problem>problems with friends <user>I appreciate your help! I'll try to have a productive conversation with her soon. <...
Reference: Great, the sooner the better. I have a feeling you'll both be relieved when you do....
Generated: I'm sorry, Mia, I didn't mean to offend you....
Full Response: I'm sorry, Mia, I didn't mean to offend you. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0110 | ROUGE-1: 0.1429 | METEOR: 0.0327


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  41%|████      | 6001/14642 [12:42<16:34,  8.69it/s]


ESCONV LIVE SAMPLE 6000/14642
Input: <problem>ongoing depression <user>I've been felling quite deppressed recently, just wanted to ask for some mental suppor...
Reference: why are you feeling in that way? tell me what it's happening?...
Generated: Do you know how long ago you started falling?...
Full Response: Do you know how long ago you started falling? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0172 | ROUGE-1: 0.0909 | METEOR: 0.0427


Evaluating ESConv - Sample 6100: BLEU=0.000, Emotion=✗:  42%|████▏     | 6100/14642 [12:53<16:33,  8.60it/s]


ESCONV LIVE SAMPLE 6100/14642
Input: <problem>ongoing depression <user>It's just me and my small family. I was at college but moved home a few weeks ago. At ...
Reference: I have heard that is the norm among colleges. That has to be really tough. So much of college is in ...
Generated: So it's not that you don't like it....
Full Response: So it's not that you don't like it. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0002 | ROUGE-1: 0.1695 | METEOR: 0.0445


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  42%|████▏     | 6201/14642 [13:06<21:04,  6.67it/s]


ESCONV LIVE SAMPLE 6200/14642
Input: <problem>breakup with partner <user>yes i have faith in people very fast seriously its like your seeing my mind <user_em...
Reference: Maybe this could be a learning experience? I always try to see the best in people, but have learned ...
Generated: What do you mean, what do i see my mind?...
Full Response: What do you mean, what do i see my mind? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0031 | ROUGE-1: 0.1463 | METEOR: 0.0519


Evaluating ESConv - Sample 6300: BLEU=0.000, Emotion=✗:  43%|████▎     | 6300/14642 [13:19<16:21,  8.49it/s]


ESCONV LIVE SAMPLE 6300/14642
Input: <problem>academic pressure <user>Yes, that is true. I just can't quite seem to quiet my mind. (8) <user_emotion>anxiety ...
Reference: Well, I've tried meditating a few times before which doesn;t take it all away, it just takes the edg...
Generated: You said that you felt you were in a state of limbo,...
Full Response: You said that you felt you were in a state of limbo, <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0002 | ROUGE-1: 0.0732 | METEOR: 0.0248


Evaluating ESConv - Sample 6400: BLEU=0.020, Emotion=✗:  44%|████▎     | 6400/14642 [13:32<16:26,  8.36it/s]


ESCONV LIVE SAMPLE 6400/14642
Input: <problem>job crisis <user>It might, I am not sure but I don't think my boss will like the idea <user_emotion>anxiety <th...
Reference: Sometimes it's important to remember that part of your bosses job is having this kinds of conversati...
Generated: I think he might like the idea of you being angry with him....
Full Response: I think he might like the idea of you being angry with him. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0195 | ROUGE-1: 0.3077 | METEOR: 0.1874


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  44%|████▍     | 6502/14642 [13:46<12:34, 10.79it/s]


ESCONV LIVE SAMPLE 6500/14642
Input: <problem>ongoing depression <user>but he thinks he is always right and blah blah blah yes My real concern is for the boy...
Reference: I can tell this situation stresses you out and probably consumes a lot of your day to day energy. I ...
Generated: I'm not angry at you....
Full Response: I'm not angry at you. <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0952 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  45%|████▌     | 6602/14642 [13:58<14:00,  9.57it/s]


ESCONV LIVE SAMPLE 6600/14642
Input: <problem>ongoing depression <user>True That's a good idea. Thank you for your help I just have this as my mai n income r...
Reference: Okay, then one of you will have income and then he can find a job later....
Generated: ...
Full Response: <therapist_emotion> I'm sure that you appreciate it. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: i'm | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  46%|████▌     | 6701/14642 [14:12<14:44,  8.98it/s]


ESCONV LIVE SAMPLE 6700/14642
Input: <problem>academic pressure <user>i know thats why im so stress. stressed bye <user_emotion>depression <therapist>...
Reference: Perhaps "teacher" is the right path, buth "math" is not. good Luck to you...
Generated: It's a little bit of a shock to hear that....
Full Response: It's a little bit of a shock to hear that. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0141 | ROUGE-1: 0.0800 | METEOR: 0.0368


Evaluating ESConv - Sample 6800: BLEU=0.000, Emotion=✗:  46%|████▋     | 6800/14642 [14:24<12:47, 10.22it/s]


ESCONV LIVE SAMPLE 6800/14642
Input: <problem>problems with friends <user>Hi, I need to vent about an ongoing situation. Can I talk to you? <user_emotion>ang...
Reference: Of course you can talk to me, What would you like to vent about?...
Generated: Of course....
Full Response: Of course. <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0004 | ROUGE-1: 0.2500 | METEOR: 0.0391


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  47%|████▋     | 6901/14642 [14:36<13:53,  9.29it/s]


ESCONV LIVE SAMPLE 6900/14642
Input: <problem>ongoing depression <user>I do. My other sister is always a big help. Maybe I should text her more often? <user_...
Reference: yes! that would always be a good idea. its always comforting to hear from a loved one....
Generated: Sure....
Full Response: Sure. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  48%|████▊     | 7001/14642 [14:49<15:45,  8.08it/s]


ESCONV LIVE SAMPLE 7000/14642
Input: <problem>breakup with partner <user>My breakup was a few months ago too. I'm in the Poconos. :) <user_emotion>depression...
Reference: I've always wanted to go to Las Vegas. My brother and his girlfriend just took a trip out there not ...
Generated: You mean,...
Full Response: You mean, <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.010:  48%|████▊     | 7101/14642 [15:00<18:03,  6.96it/s]


ESCONV LIVE SAMPLE 7100/14642
Input: <problem>academic pressure <user>Yeah, it can be very stressful when I don't understand though. I'm normally a very good...
Reference: Yes, I had a 100% average almost in high school and still struggled with the class. I kind of wish t...
Generated: cause it's not uncommon for you to have a hard time coping with the stress....
Full Response: cause it's not uncommon for you to have a hard time coping with the stress. <therapist_emotion> How do you feel about that? <therapist_emotion> neutra...
Ref Emotion: anxiety | Pred Emotion: how | Match: ✗
BLEU: 0.0007 | ROUGE-1: 0.1111 | METEOR: 0.0367


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  49%|████▉     | 7201/14642 [15:14<17:32,  7.07it/s]


ESCONV LIVE SAMPLE 7200/14642
Input: <problem>academic pressure <user>To be honest I have been a little afraid to approach them about the subject because the...
Reference: Just from chatting to you, I can sense that you are anxious, however, as many students feel this way...
Generated: You're right....
Full Response: You're right. <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0606 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  50%|████▉     | 7302/14642 [15:30<15:59,  7.65it/s]


ESCONV LIVE SAMPLE 7300/14642
Input: <problem>breakup with partner <user>I m not sure. why would my friend let it happen, and why would my boyfriend choose t...
Reference: you've had some time to sit with this, i understand completely that you are disgusted, but how have ...
Generated: ?...
Full Response: ? <therapist_emotion> neutral<eos>...
Ref Emotion: disgust | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  51%|█████     | 7400/14642 [15:43<14:48,  8.15it/s]


ESCONV LIVE SAMPLE 7400/14642
Input: <problem>ongoing depression <user>That must have been difficult to deal with. Mental health affects so many people in so...
Reference: It was. Luckily I had my doctor to help me. Family members meant well but chalked it up to "that's w...
Generated: I think that's a good thing....
Full Response: I think that's a good thing. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0005 | ROUGE-1: 0.1951 | METEOR: 0.0495


Evaluating ESConv - Sample 7500: BLEU=0.000, Emotion=✗:  51%|█████     | 7499/14642 [15:56<16:38,  7.15it/s] 


ESCONV LIVE SAMPLE 7500/14642
Input: <problem>problems with friends <user>feeling very shame <user_emotion>shame <therapist>...
Reference: Oh my god ! What happened?...
Generated: how did you feel?...
Full Response: how did you feel? <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Sample 7600: BLEU=0.000, Emotion=✗:  52%|█████▏    | 7600/14642 [16:10<15:12,  7.72it/s]


ESCONV LIVE SAMPLE 7600/14642
Input: <problem>ongoing depression <user>Yes I do hope so. <user_emotion>depression <therapist>...
Reference: I love AMC/Holiday movies as well!...
Generated: What I'm asking you is if there is something...
Full Response: What I'm asking you is if there is something  <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1176 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  53%|█████▎    | 7702/14642 [16:24<12:33,  9.20it/s]


ESCONV LIVE SAMPLE 7700/14642
Input: <problem>ongoing depression <user>I've been better. Just stressed really bad. but I think everybody just about is at thi...
Reference: it is a stressful time. what's bothering you the most?...
Generated: Well, what's the point?...
Full Response: Well, what's the point? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0213 | ROUGE-1: 0.3750 | METEOR: 0.1064


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  53%|█████▎    | 7801/14642 [16:36<12:13,  9.33it/s]


ESCONV LIVE SAMPLE 7800/14642
Input: <problem>ongoing depression <user>Yeah maybe. I suppose nobody really understands but I could at least speak to them abo...
Reference: everyone has problems and it is good to talk about them because a problem shared is a problem halved...
Generated: I think you should talk to your parents....
Full Response: I think you should talk to your parents. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0005 | ROUGE-1: 0.1154 | METEOR: 0.0506


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  54%|█████▍    | 7901/14642 [16:49<14:15,  7.88it/s]


ESCONV LIVE SAMPLE 7900/14642
Input: <problem>breakup with partner <user>yes <user_emotion>anxiety <therapist>...
Reference: You are definitely not the only person to feel this way!...
Generated: What's your mood?...
Full Response: What's your mood? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  55%|█████▍    | 8002/14642 [17:02<11:10,  9.91it/s]


ESCONV LIVE SAMPLE 8000/14642
Input: <problem>ongoing depression <user>yea, thats what my family is telling me to not rush into thigns <user_emotion>depressi...
Reference: You have to do what's right for you...
Generated: I don't think that's what your mother told you....
Full Response: I don't think that's what your mother told you. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.3000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  55%|█████▌    | 8100/14642 [17:16<11:57,  9.12it/s]


ESCONV LIVE SAMPLE 8100/14642
Input: <problem>ongoing depression <user>but anyhow, I have been conferring with my friend pretty much daily on the phone and t...
Reference: that's great. having a supportive friend/family is so much a blessing during this pandemic...
Generated: if you can do that....
Full Response: if you can do that. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0952 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  56%|█████▌    | 8202/14642 [17:30<12:46,  8.41it/s]


ESCONV LIVE SAMPLE 8200/14642
Input: <problem>ongoing depression <user>Hi <user_emotion>depression <therapist>...
Reference: What can I help you with today?...
Generated: And you were just talking about the birth control pill,...
Full Response: And you were just talking about the birth control pill, <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0211 | ROUGE-1: 0.1176 | METEOR: 0.0685


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  57%|█████▋    | 8301/14642 [17:43<10:50,  9.74it/s]


ESCONV LIVE SAMPLE 8300/14642
Input: <problem>problems with friends <user>I'm just having a hard time dealing with hurtful things people have said to me. I f...
Reference: that is really hard. Are you able to tell these people how their words are making you feel?...
Generated: What?...
Full Response: What? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  57%|█████▋    | 8401/14642 [17:57<17:19,  6.00it/s]


ESCONV LIVE SAMPLE 8400/14642
Input: <problem>job crisis <user>I don't want to. If there is a vaccine in the first quarter of 2021 then the company I worked ...
Reference: you have determination! There is fire within! Tell me, what brings you joy?...
Generated: I'm not sure that's the right word....
Full Response: I'm not sure that's the right word. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  58%|█████▊    | 8501/14642 [18:13<13:51,  7.38it/s]


ESCONV LIVE SAMPLE 8500/14642
Input: <problem>ongoing depression <user>My oldest is sad my youngest 7 couldnt care less. He loves that he can just play roblo...
Reference: To be fair Roblox is pretty fun. Have you seen those videos on Facebook where family members visit t...
Generated: Maybe he can......
Full Response: Maybe he can... <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  59%|█████▊    | 8601/14642 [18:26<13:36,  7.40it/s]


ESCONV LIVE SAMPLE 8600/14642
Input: <problem>ongoing depression <user>I have practiced breathing techniques and meditation before and think that is a great ...
Reference: Absolutely, and it's completely understandable, but it's very hard to wait. I hope that you are able...
Generated: What would make you feel better?...
Full Response: What would make you feel better? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0339 | METEOR: 0.0219


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  59%|█████▉    | 8701/14642 [18:40<10:39,  9.29it/s]


ESCONV LIVE SAMPLE 8700/14642
Input: <problem>problems with friends <user>I am not good . What about you? <user_emotion>depression <therapist>...
Reference: i am so sorry to hear that. what seems to be worrying you today?...
Generated: You're not good....
Full Response: You're not good. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1111 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  60%|██████    | 8801/14642 [18:53<11:19,  8.59it/s]


ESCONV LIVE SAMPLE 8800/14642
Input: <problem>ongoing depression <user>Yeah, the anxiety of being trapped in my apartment has kind of gotten to me but I am d...
Reference: Yeah, socially distancing is quite the isolating experience so I can relate to how you are feeling....
Generated: Maybe you could talk to me....
Full Response: Maybe you could talk to me. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0078 | ROUGE-1: 0.1739 | METEOR: 0.0629


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  61%|██████    | 8901/14642 [19:05<12:35,  7.60it/s]


ESCONV LIVE SAMPLE 8900/14642
Input: <problem>breakup with partner <user>It's good that you have had a similar experience and understand what I'm going throu...
Reference: Oh I see. That's a huge step to make in a relationship. So you are hesitant to break up with her bec...
Generated: You know, she's not a good mother....
Full Response: You know, she's not a good mother. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0014 | ROUGE-1: 0.2105 | METEOR: 0.0560


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  61%|██████▏   | 9001/14642 [19:18<09:16, 10.13it/s]


ESCONV LIVE SAMPLE 9000/14642
Input: <problem>ongoing depression <user>No! I keep hearing the negative cruel remarks being repeated in my head over and over ...
Reference: WHAT DO YOU HEAR?...
Generated: ,...
Full Response: , <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  62%|██████▏   | 9101/14642 [19:30<12:15,  7.53it/s]


ESCONV LIVE SAMPLE 9100/14642
Input: <problem>job crisis <user>Yes, sure. I work in an industry heavily impacted by COVID-19, and my company has announced pa...
Reference: Woah. Blessings to you for existing in what sounds like a stressful situation. How is self care goin...
Generated: I think that's true....
Full Response: I think that's true. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  63%|██████▎   | 9201/14642 [19:43<09:28,  9.58it/s]


ESCONV LIVE SAMPLE 9200/14642
Input: <problem>job crisis <user>Hi <user_emotion>sadness <therapist>...
Reference: Hi there, how are you today?...
Generated: What are you waiting for?...
Full Response: What are you waiting for? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0930 | ROUGE-1: 0.3636 | METEOR: 0.3178


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  64%|██████▎   | 9301/14642 [19:56<12:07,  7.34it/s]


ESCONV LIVE SAMPLE 9300/14642
Input: <problem>job crisis <user>I thought I had everything set up for one of my resident's at a new placement and I called tod...
Reference: I see. It seems as though you've done your best, but I understand your feelings. Is it possible you ...
Generated: Do you feel you have to be there for her?...
Full Response: Do you feel you have to be there for her? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0035 | ROUGE-1: 0.1538 | METEOR: 0.0191


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  64%|██████▍   | 9401/14642 [20:09<07:53, 11.07it/s]


ESCONV LIVE SAMPLE 9400/14642
Input: <problem>breakup with partner <user>I don't know if it was good. Maybe he was just being mean. It's nice of you to liste...
Reference: I know it's useless to tell you to not be sad. Go ahead and let it all out by crying or screaming bu...
Generated: I'm sorry?...
Full Response: I'm sorry? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0571 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  65%|██████▍   | 9501/14642 [20:21<08:18, 10.32it/s]


ESCONV LIVE SAMPLE 9500/14642
Input: <problem>ongoing depression <user>thanks. I hope you best of luck then with all your endeavours. I know we can all get t...
Reference: Right on, same to you. It's okay to feel anxious right now. It's okay to feel unproductive right now...
Generated: April,...
Full Response: April, <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  66%|██████▌   | 9601/14642 [20:33<07:52, 10.67it/s]


ESCONV LIVE SAMPLE 9600/14642
Input: <problem>ongoing depression <user>She did but not so great anymore. I sure hope so. It's been hard on everybody. especia...
Reference: Maybe she just needs some time to decompress. Being a teenager is a stressful time, especially this ...
Generated: And how did she feel about this?...
Full Response: And how did she feel about this? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0009 | ROUGE-1: 0.1000 | METEOR: 0.0508


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  66%|██████▋   | 9701/14642 [20:47<09:53,  8.33it/s]


ESCONV LIVE SAMPLE 9700/14642
Input: <problem>breakup with partner <user>I am not feeling very positive tonight but I think you are right. I think I reached ...
Reference: Would you feel comfortable giving me a little more detail as far as you problem with self abuse? wha...
Generated: I think you are right....
Full Response: I think you are right. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0010 | ROUGE-1: 0.0667 | METEOR: 0.0217


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  67%|██████▋   | 9801/14642 [21:00<09:56,  8.12it/s]


ESCONV LIVE SAMPLE 9800/14642
Input: <problem>ongoing depression <user>It's fine, no need to apologize <user_emotion>anxiety <therapist>...
Reference: Have you had any chance for ME Time where husband friend or family could watch your kids and give mo...
Generated: I just wanted to......
Full Response: I just wanted to... <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0488 | METEOR: 0.0305


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  68%|██████▊   | 9901/14642 [21:13<09:08,  8.64it/s]


ESCONV LIVE SAMPLE 9900/14642
Input: <problem>ongoing depression <user>But when i see social media and News every day it increasing my stress level <user_emo...
Reference: I live alone as well and also need to take care of myself. It can be so hard at times....
Generated: you think that's because i'm......
Full Response: you think that's because i'm... <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0741 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  68%|██████▊   | 10002/14642 [21:25<07:07, 10.84it/s]


ESCONV LIVE SAMPLE 10000/14642
Input: <problem>ongoing depression <user>I feel somewhat better just thin I feel a little better now. I look forward to talking...
Reference: Be creative! Get silly! most important have fun! this is a hard year but its still a chance to make ...
Generated: You mean the movie?...
Full Response: You mean the movie? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  69%|██████▉   | 10101/14642 [21:38<08:50,  8.56it/s]


ESCONV LIVE SAMPLE 10100/14642
Input: <problem>breakup with partner <user>I just need to schedule an appointment with someone and go in and meet them face to ...
Reference: It is great that you can identify what needs to be done in this situation. It sounds like you are re...
Generated: And what are you going to do with her?...
Full Response: And what are you going to do with her? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0010 | ROUGE-1: 0.2400 | METEOR: 0.0926


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  70%|██████▉   | 10201/14642 [21:52<11:27,  6.46it/s]


ESCONV LIVE SAMPLE 10200/14642
Input: <problem>ongoing depression <user>I'm so sorry that it's a situation that you can relate to, its such a hard situation w...
Reference: Wow, that really is a lot to deal with. It seems as though you have a good understanding of what you...
Generated: And yet, you feel that you've been given a choice,...
Full Response: And yet, you feel that you've been given a choice, <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1573 | METEOR: 0.0498


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  70%|███████   | 10302/14642 [22:05<08:15,  8.76it/s]


ESCONV LIVE SAMPLE 10300/14642
Input: <problem>breakup with partner <user>Thanks' so much , you lifted my mood , you being really helpful <user_emotion>depres...
Reference: No problem at all! Sending positive vibes your way!! You can get through this!!...
Generated: I'm glad you liked it....
Full Response: I'm glad you liked it. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1000 | METEOR: 0.0382


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  71%|███████   | 10402/14642 [22:17<07:30,  9.42it/s]


ESCONV LIVE SAMPLE 10400/14642
Input: <problem>ongoing depression <user>Because prior to the pandemic, I was pretty regimented with set schedules and can clea...
Reference: Is there anything else I can help you with?...
Generated: You seem like you've been struggling with anxiety....
Full Response: You seem like you've been struggling with anxiety. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.2222 | METEOR: 0.0562


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  72%|███████▏  | 10501/14642 [22:29<08:27,  8.16it/s]


ESCONV LIVE SAMPLE 10500/14642
Input: <problem>academic pressure <user>I think that's pretty good advice. I know which room I can do this, but just away from ...
Reference: Of course! Is anything else on your mind?...
Generated: So what do you think about this?...
Full Response: So what do you think about this? <therapist_emotion> neutral<eos>...
Ref Emotion: nervousness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  72%|███████▏  | 10601/14642 [22:43<09:45,  6.91it/s]


ESCONV LIVE SAMPLE 10600/14642
Input: <problem>ongoing depression <user>Well the boyfriend for awhile the other thing for a day or two I got a hold of a thera...
Reference: that's very good that you are going to talk to a therapist. you showed great strength by reaching ou...
Generated: And they don't talk to you for two weeks?...
Full Response: And they don't talk to you for two weeks? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0185 | ROUGE-1: 0.2000 | METEOR: 0.1420


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  73%|███████▎  | 10701/14642 [22:56<06:49,  9.63it/s]


ESCONV LIVE SAMPLE 10700/14642
Input: <problem>Alcohol Abuse <user>Yes, I sure hope so. Thank you have a good day! <user_emotion>shame <therapist>...
Reference: You as well!...
Generated: I'm sorry....
Full Response: I'm sorry. <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  74%|███████▍  | 10800/14642 [23:09<07:38,  8.39it/s]


ESCONV LIVE SAMPLE 10800/14642
Input: <problem>Sleep Problems <user>I'm looking forward to start a own business. Yes i hope so. i'll start planning <user_emot...
Reference: Oh that's exciting! What sort of business?...
Generated: So you want to start a company?...
Full Response: So you want to start a company? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Sample 10900: BLEU=0.016, Emotion=✗:  74%|███████▍  | 10900/14642 [23:21<07:36,  8.20it/s]


ESCONV LIVE SAMPLE 10900/14642
Input: <problem>Procrastination <user>OK, so with, well, kinf0f like baby steps....right <user_emotion>anxiety <therapist>...
Reference: It is about making small adjustments to feed a good habit....
Generated: we don't talk about kinf0f......
Full Response: we don't talk about kinf0f... <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0162 | ROUGE-1: 0.1176 | METEOR: 0.0481


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  75%|███████▌  | 11002/14642 [23:34<07:26,  8.15it/s]


ESCONV LIVE SAMPLE 11000/14642
Input: <problem>Sleep Problems <user>Bye <user_emotion>anxiety <therapist>...
Reference: (please click the quit button)...
Generated: It's not like you're afraid of what you might find....
Full Response: It's not like you're afraid of what you might find. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  76%|███████▌  | 11101/14642 [23:46<08:48,  6.70it/s]


ESCONV LIVE SAMPLE 11100/14642
Input: <problem>Alcohol Abuse <user>Work has been stressful, but I feel like was before things got bad. Most nights I come home...
Reference: What do you do for work?...
Generated: It's hard for you to imagine that you and your family would be feeling this way...
Full Response: It's hard for you to imagine that you and your family would be feeling this way <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0146 | ROUGE-1: 0.1739 | METEOR: 0.1429


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  76%|███████▋  | 11201/14642 [23:58<06:51,  8.36it/s]


ESCONV LIVE SAMPLE 11200/14642
Input: <problem>Sleep Problems <user>That's a good idea. I could try go out for a walk either before or after work as a startin...
Reference: That sounds like a good plan! It also helps to make sure that it’s dark and very quiet. Surprisingly...
Generated: Would you mind telling me what it would do to your body?...
Full Response: Would you mind telling me what it would do to your body? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0018 | ROUGE-1: 0.0769 | METEOR: 0.0413


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  77%|███████▋  | 11301/14642 [24:11<06:05,  9.15it/s]


ESCONV LIVE SAMPLE 11300/14642
Input: <problem>Sleep Problems <user>I'm looking forward to a fulls night rest. <user_emotion>anxiety <therapist>...
Reference: That is good, lavender baths or showers will also ease and relax the troubled mind....
Generated: You're feeling good....
Full Response: You're feeling good. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1053 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  78%|███████▊  | 11401/14642 [24:25<06:52,  7.85it/s]


ESCONV LIVE SAMPLE 11400/14642
Input: <problem>Appearance Anxiety <user>mind u i was 125 and fit for the 10 years we were together before we had kids like a d...
Reference: I talked to a nutritionist and it really helped. They are professionals who can look into your diet ...
Generated: What do you mean?...
Full Response: What do you mean? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  79%|███████▊  | 11501/14642 [24:38<06:01,  8.70it/s]


ESCONV LIVE SAMPLE 11500/14642
Input: <problem>Appearance Anxiety <user>Yes. I mentioned it to them at my 6 week pp check up. I was prescribed to a med to hel...
Reference: Antidepressants can take a couple of weeks to kick in, I am on them too; I hope you will find them h...
Generated: You seem very anxious,...
Full Response: You seem very anxious, <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0588 | METEOR: 0.0189


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  79%|███████▉  | 11601/14642 [24:50<06:01,  8.41it/s]


ESCONV LIVE SAMPLE 11600/14642
Input: <problem>Appearance Anxiety <user>I hate everything when I see my reflection. I just feel fat and ugly <user_emotion>dis...
Reference: Well, I have been overweight and I know it sucks. Perhaps you could join a weight loss group?...
Generated: Maybe it's time for you to get over your...
Full Response: Maybe it's time for you to get over your  <therapist_emotion> ... <therapist_emotion> neutral<eos>...
Ref Emotion: disgust | Pred Emotion: ... | Match: ✗
BLEU: 0.0088 | ROUGE-1: 0.1429 | METEOR: 0.0585


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  80%|███████▉  | 11701/14642 [25:04<05:15,  9.31it/s]


ESCONV LIVE SAMPLE 11700/14642
Input: <problem>job crisis <user>Yes that is a very good summary of what is going on. And I am glad to see you can relate. Whil...
Reference: Are there other jobs available in your area that you could apply for that are more up your alley?...
Generated: I'm sorry....
Full Response: I'm sorry. <therapist_emotion> neutral<eos>...
Ref Emotion: shame | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  81%|████████  | 11801/14642 [25:17<06:33,  7.21it/s]


ESCONV LIVE SAMPLE 11800/14642
Input: <problem>job crisis <user>SOMEWHERE SCARED AND CRYING IN THE RAIN <user_emotion>fear <therapist>...
Reference: Is it cold up there?...
Generated: What?...
Full Response: What? <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  81%|████████▏ | 11901/14642 [25:30<05:33,  8.22it/s]


ESCONV LIVE SAMPLE 11900/14642
Input: <problem>problems with friends <user>i was having an affair and i confided in my best friend about it. i found out that ...
Reference: Your friend violated a confidence. That sounds difficult....
Generated: you're a therapist?...
Full Response: you're a therapist? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0215 | ROUGE-1: 0.1667 | METEOR: 0.0667


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  82%|████████▏ | 12001/14642 [25:42<04:51,  9.05it/s]


ESCONV LIVE SAMPLE 12000/14642
Input: <problem>ongoing depression <user>that sounds weird not what you said I referring to what I said I think it was weird ri...
Reference: What do you think sounds weird?...
Generated: I think that's a little extreme,...
Full Response: I think that's a little extreme, <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0408 | ROUGE-1: 0.1538 | METEOR: 0.0833


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  83%|████████▎ | 12101/14642 [25:56<07:35,  5.58it/s]


ESCONV LIVE SAMPLE 12100/14642
Input: <problem>problems with friends <user>Today those things have gone and all are fine. When we happen to think of the old e...
Reference: So you are angry about what happen in the past...
Generated: And what is it that you feel is anger?...
Full Response: And what is it that you feel is anger? <therapist_emotion> neutral<eos>...
Ref Emotion: anger | Pred Emotion: neutral | Match: ✗
BLEU: 0.0255 | ROUGE-1: 0.2105 | METEOR: 0.1010


Evaluating ESConv - Sample 12200: BLEU=0.009, Emotion=✗:  83%|████████▎ | 12200/14642 [26:10<10:04,  4.04it/s]


ESCONV LIVE SAMPLE 12200/14642
Input: <problem>job crisis <user>thanks for trying to understand it just seems pretty hopeless <user_emotion>anxiety <therapist...
Reference: No problem. I am encountering a similar situation, looking for a well suited job....
Generated: I don't think it's hopeless. I think it's just that I think it's just a lot easier to ignore the thi...
Full Response: I don't think it's hopeless. I think it's just that I think it's just a lot easier to ignore the things that really matter  <therapist_emotion> What r...
Ref Emotion: anxiety | Pred Emotion: what | Match: ✗
BLEU: 0.0094 | ROUGE-1: 0.0952 | METEOR: 0.0667


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  84%|████████▍ | 12301/14642 [26:23<05:58,  6.53it/s]


ESCONV LIVE SAMPLE 12300/14642
Input: <problem>breakup with partner <user>Thank you for listening and giving good advice. You are really good at giving suppor...
Reference: I am glad that you find it helpful. I wish I can be around to hear more about it....
Generated: You know, sometimes it's not so easy to talk about your problems,...
Full Response: You know, sometimes it's not so easy to talk about your problems, <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0113 | ROUGE-1: 0.2500 | METEOR: 0.0820


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  85%|████████▍ | 12401/14642 [26:37<07:01,  5.32it/s]


ESCONV LIVE SAMPLE 12400/14642
Input: <problem>academic pressure <user>I just don't feel motivated to study <user_emotion>shame <therapist>...
Reference: from today please feel yourself that you know everything about your studies , i think then you fell ...
Generated: I don't think that's true. I think it's true. I think it's true that you have...
Full Response: I don't think that's true. I think it's true. I think it's true that you have  <therapist_emotion> anger <therapist_emotion> anger <therapist_emotion>...
Ref Emotion: shame | Pred Emotion: anger | Match: ✗
BLEU: 0.0239 | ROUGE-1: 0.2105 | METEOR: 0.1688


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  85%|████████▌ | 12502/14642 [26:49<03:51,  9.24it/s]


ESCONV LIVE SAMPLE 12500/14642
Input: <problem>breakup with partner <user>We met on AIM, Her ex is a registered sex offender. and it makes me sad that she lef...
Reference: Sometimes we just have to learn to let go *nods sententiously*...
Generated: You know, you're a lawyer....
Full Response: You know, you're a lawyer. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  86%|████████▌ | 12601/14642 [27:02<03:49,  8.88it/s]


ESCONV LIVE SAMPLE 12600/14642
Input: <problem>job crisis <user>We, I and my wife, also think the same way. Even if takes some time, better to wait. But we we...
Reference: That's great! I'm glad I was able to help....
Generated: So you feel...
Full Response: So you feel <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  87%|████████▋ | 12700/14642 [27:15<06:09,  5.25it/s]


ESCONV LIVE SAMPLE 12700/14642
Input: <problem>breakup with partner <user>sure, you too <user_emotion>anxiety <therapist>...
Reference: thank you...
Generated: but you can't help but feel...
Full Response: but you can't help but feel  <therapist_emotion> joy  <therapist_emotion> joy  <therapist_emotion> joy,  <therapist_emotion> joy.  <therapist_emotion>...
Ref Emotion: anxiety | Pred Emotion: joy | Match: ✗
BLEU: 0.0408 | ROUGE-1: 0.2222 | METEOR: 0.2083


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  87%|████████▋ | 12800/14642 [27:29<04:35,  6.68it/s]


ESCONV LIVE SAMPLE 12800/14642
Input: <problem>breakup with partner <user>its hard to plan a future when i am grieving....i don't know how to make myself happ...
Reference: don't be sad because of past...
Generated: You're very much a part of...
Full Response: You're very much a part of  <therapist_emotion> <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: <therapist_emotion> | Match: ✗
BLEU: 0.0408 | ROUGE-1: 0.1429 | METEOR: 0.0833


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  88%|████████▊ | 12901/14642 [27:42<02:54,  9.99it/s]


ESCONV LIVE SAMPLE 12900/14642
Input: <problem>job crisis <user>No I haven't yet. But I was thinking about it actually. Do you think that would be weird to do...
Reference: I think that would be a good next step. Communication is always helpful. I think your work will appr...
Generated: Maybe....
Full Response: Maybe. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.009:  89%|████████▉ | 13001/14642 [27:57<04:21,  6.27it/s]


ESCONV LIVE SAMPLE 13000/14642
Input: <problem>academic pressure <user>Thanks, that will be helpful to decide on the programs I want to apply to Are you based...
Reference: You welcome. and yes...
Generated: I'm sure you're aware that there are a lot of people who would rather live in the US than in Canada....
Full Response: I'm sure you're aware that there are a lot of people who would rather live in the US than in Canada. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0741 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  89%|████████▉ | 13101/14642 [28:09<04:14,  6.05it/s]


ESCONV LIVE SAMPLE 13100/14642
Input: <problem>academic pressure <user>What kind of advise should I be seeking from the counsellor? <user_emotion>anxiety <the...
Reference: Well, perhaps they can at least alleviate your anxiety somewhat. Or maybe even get you a sick note t...
Generated: You know, I've had the best of luck in therapy. I'm sure you can work through whatever it is that yo...
Full Response: You know, I've had the best of luck in therapy. I'm sure you can work through whatever it is that you feel I have to offer. <therapist_emotion> neutra...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0092 | ROUGE-1: 0.1379 | METEOR: 0.1045


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  90%|█████████ | 13201/14642 [28:21<03:33,  6.76it/s]


ESCONV LIVE SAMPLE 13200/14642
Input: <problem>ongoing depression <user>You met with your friends? I've tried meeting with my friends. I feel ok when I'm with...
Reference: yes, you have to be positive at all time and always do something that makes you happy...
Generated: You think that you're not happy when you're alone?...
Full Response: You think that you're not happy when you're alone? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0117 | ROUGE-1: 0.2857 | METEOR: 0.0926


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  91%|█████████ | 13301/14642 [28:36<02:33,  8.74it/s]


ESCONV LIVE SAMPLE 13300/14642
Input: <problem>breakup with partner <user>Thanks! You’ve been really helpful today. I can’t thank you enough for your encourag...
Reference: It was no problem! Thank you for sharing and talking it out!...
Generated: I'm sorry, Paul....
Full Response: I'm sorry, Paul. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  92%|█████████▏| 13402/14642 [28:50<02:36,  7.91it/s]


ESCONV LIVE SAMPLE 13400/14642
Input: <problem>ongoing depression <user>I actually do yoga every evening. She hates it. I'm living in her home without a lease...
Reference: As long as you have property there she would have to formally evict you. That is the good news....
Generated: You know, sometimes you feel guilty about something....
Full Response: You know, sometimes you feel guilty about something. <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0070 | ROUGE-1: 0.1481 | METEOR: 0.0279


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  92%|█████████▏| 13501/14642 [29:02<02:53,  6.57it/s]


ESCONV LIVE SAMPLE 13500/14642
Input: <problem>academic pressure <user>should i go to my school councellor <user_emotion>anxiety <therapist>...
Reference: Yes, that is a great idea. Have you ever joined a study group? A group of peers can also be helpful....
Generated: you don't know how much pressure it's gonna put on you....
Full Response: you don't know how much pressure it's gonna put on you. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0076 | ROUGE-1: 0.0588 | METEOR: 0.0250


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  93%|█████████▎| 13601/14642 [29:14<01:56,  8.95it/s]


ESCONV LIVE SAMPLE 13600/14642
Input: <problem>job crisis <user>At this point, I don't know. I have thought about doing art commissions and selling them onlin...
Reference: That sounds like a fantastic idea, and even if there is competition, there are still a lot of people...
Generated: What's that?...
Full Response: What's that? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0645 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  94%|█████████▎| 13701/14642 [29:28<01:34,  9.96it/s]


ESCONV LIVE SAMPLE 13700/14642
Input: <problem>ongoing depression <user>Yes you are correct the problems with the bank how do i solve not to sink into depress...
Reference: So, I have also struggled with that in the past. The way that really helped me put things into persp...
Generated: ?...
Full Response: ? <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  94%|█████████▍| 13801/14642 [29:41<01:50,  7.58it/s]


ESCONV LIVE SAMPLE 13800/14642
Input: <problem>academic pressure <user>Kind of, but the schedule is a bit weird, like every other day I only have an afternoon...
Reference: it sounds like your schedule is different almost everyday. I know I find schedule changes like that ...
Generated: I'm not sure how you mean it is....
Full Response: I'm not sure how you mean it is. <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0048 | ROUGE-1: 0.1935 | METEOR: 0.0243


Evaluating ESConv - Sample 13900: BLEU=0.013, Emotion=✗:  95%|█████████▍| 13900/14642 [29:53<01:29,  8.26it/s]


ESCONV LIVE SAMPLE 13900/14642
Input: <problem>breakup with partner <user>We talked about it briefly, but havent spoke about in great detail. I'm just scared,...
Reference: How much of this distance between you do you think is related to COVID measures?...
Generated: I don't think it is worth it....
Full Response: I don't think it is worth it. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0125 | ROUGE-1: 0.1739 | METEOR: 0.0704


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  96%|█████████▌| 14001/14642 [30:07<01:25,  7.50it/s]


ESCONV LIVE SAMPLE 14000/14642
Input: <problem>job crisis <user>Yes. i plan on doing that this afternoon. Thank you so much. I am feeling way more at ease. I ...
Reference: I hope your day goes great as well...
Generated: I'm honored to be here today....
Full Response: I'm honored to be here today. <therapist_emotion> neutral<eos>...
Ref Emotion: fear | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.1333 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  96%|█████████▋| 14101/14642 [30:19<00:55,  9.82it/s]


ESCONV LIVE SAMPLE 14100/14642
Input: <problem>breakup with partner <user>its gone, she left me <user_emotion>sadness <therapist>...
Reference: That's awful. I'm so sorry to hear that. How did it happen?...
Generated: what do you mean?...
Full Response: what do you mean? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0000 | METEOR: 0.0000


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  97%|█████████▋| 14201/14642 [30:32<00:57,  7.67it/s]


ESCONV LIVE SAMPLE 14200/14642
Input: <problem>academic pressure <user>Thank you so much for your insight. I will be praying for a good outcome. <user_emotion...
Reference: you will get it for sure! is there anything else I can help you with?...
Generated: So what do you think is the most important outcome?...
Full Response: So what do you think is the most important outcome? <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0152 | ROUGE-1: 0.1600 | METEOR: 0.0690


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  98%|█████████▊| 14301/14642 [30:43<00:39,  8.60it/s]


ESCONV LIVE SAMPLE 14300/14642
Input: <problem>problems with friends <user>That's something you would have to ask him, but I feel like I am at peace for now. ...
Reference: Yeah that a good way to look at it. Maybe take your mind of it by occupying yourself...
Generated: I don't think that's what I'm doing....
Full Response: I don't think that's what I'm doing. <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0000 | ROUGE-1: 0.0714 | METEOR: 0.0000


Evaluating ESConv - Sample 14400: BLEU=0.011, Emotion=✗:  98%|█████████▊| 14399/14642 [30:56<00:27,  8.90it/s]


ESCONV LIVE SAMPLE 14400/14642
Input: <problem>academic pressure <user>I'm feeling really anxious these days. I'm finding the COVID online learning experience...
Reference: I understand your frustration. All of us are challenged due to COVID....
Generated: I think that's true....
Full Response: I think that's true. <therapist_emotion> neutral<eos>...
Ref Emotion: anxiety | Pred Emotion: neutral | Match: ✗
BLEU: 0.0109 | ROUGE-1: 0.1176 | METEOR: 0.0446


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008:  99%|█████████▉| 14500/14642 [31:10<00:18,  7.50it/s]


ESCONV LIVE SAMPLE 14500/14642
Input: <problem>breakup with partner <user>Yeah she was a little violent. I was honestly a little scared for my life for a seco...
Reference: Have you talked since then?...
Generated: And how did you react when she did threaten to kill you?...
Full Response: And how did you react when she did threaten to kill you? <therapist_emotion> neutral<eos>...
Ref Emotion: sadness | Pred Emotion: neutral | Match: ✗
BLEU: 0.0170 | ROUGE-1: 0.1176 | METEOR: 0.0877


Evaluating ESConv - Sample 14600: BLEU=0.001, Emotion=✗: 100%|█████████▉| 14600/14642 [31:24<00:04,  9.65it/s]


ESCONV LIVE SAMPLE 14600/14642
Input: <problem>job crisis <user>Yes It is frustrating. When I think about the financial issues I feel depressed <user_emotion>...
Reference: I compleyely understand I have also lost my career due to covid. Dont give up, staying positive and ...
Generated: I can imagine that you feel...
Full Response: I can imagine that you feel  <therapist_emotion> neutral<eos>...
Ref Emotion: depression | Pred Emotion: neutral | Match: ✗
BLEU: 0.0012 | ROUGE-1: 0.1176 | METEOR: 0.0991


Evaluating ESConv - Avg BLEU: 0.005, Emotion Acc: 0.008: 100%|██████████| 14642/14642 [31:29<00:00,  7.75it/s]



ESCONV EVALUATION COMPLETED!
Processed 14642 samples
Average BLEU: 0.0050
Average ROUGE-1: 0.1046
Emotion Accuracy: 0.0080
Emotion Tag Coverage: 0.9979

ESCONV THERAPY MODEL EVALUATION RESULTS
Model Type: SFT
Output Format: STANDARD

Text Generation Metrics:
  BLEU Score:     0.0050
  ROUGE-1:        0.1046
  ROUGE-2:        0.0089
  ROUGE-L:        0.0925
  METEOR:         0.0354

Emotion Prediction:
  Emotion Accuracy:    0.0080
  Emotion Tag Coverage: 0.9979

Dataset Info:
  Total Samples:  14642

Emotion Confusion Matrix:
  anxiety:
    -> neutral: 3359
    -> but: 24
    -> <therapist_emotion>: 70
    -> i: 66
    -> tell: 1
    -> i'm: 49
    -> what: 23
    -> i'm...: 2
    -> who: 11
    -> well,: 10
    -> that: 4
    -> i'll: 2
    -> anger?: 11
    -> and: 13
    -> unwell.: 1
    -> okay,: 2
    -> get: 2
    -> you: 29
    -> cause: 10
    -> unrelated?: 1
    -> you're: 18
    -> you've: 6
    -> ?: 7
    -> anger.: 11
    -> so: 14
    -> but...: 2
    -> .: 6
    -> ap