In [116]:
conversation_example_1 = """
Customer: Hi, I ordered a laptop from your store last week, but I received the wrong color.

Agent: I apologize for this mistake. Could you please provide your order number?

Customer: Sure, it's #ORDER45678

Agent: Thank you. I can see the error in our system. I can arrange for a replacement in your preferred color, and we'll send you a prepaid return label for the current laptop. Also, I'll add a 10% discount on your next purchase as compensation for the inconvenience.

Customer: That's really great! Thank you so much for handling this so quickly. I appreciate the discount too!

Agent: You're welcome! The return label will be emailed to you within the next hour, and your replacement laptop will ship tomorrow with priority delivery.

Customer: Perfect! Have a great day! 
"""


conversation_example_2 = """
Customer: This is the third time I'm contacting you about my broken washing machine. No one has shown up for the repair despite scheduling 3 appointments!

Agent: I can see your previous contacts. Could you please provide your reference number again?

Customer: Are you kidding me? I've given that number THREE times already! It's WR789012. This is absolutely ridiculous.

Agent: I understand your frustration. Let me check the status.

Customer: I've been waiting for 20 minutes now. I'm paying for a premium warranty and this is the service I get? I want to speak to a supervisor immediately.

Agent: I apologize, but all our supervisors are currently unavailable. I can have one call you back within 24 hours.

Customer: This is unacceptable! I'm going to post about this terrible service on social media and file a formal complaint. You've wasted my time and money. I'll never buy from your company again.

Agent: I apologize for your experience. I'll escalate this case...

Customer: Don't bother. I'm done with this company. 
"""


In [117]:
sentiment_eval_prompt = """
Analyze the sentiment of the following conversation and classify it as either "positive" "negative" "neutral" or "mixed" 

Analyze the conversation below and provide the sentiment analysis in JSON format with two fields:
- sentiment: (positive/negative/neutral/mixed)
- reason: (detailed explanation for the classification)

<conversation>
{{conversation}}
</conversation>

Response:
{
    "sentiment": "",
    "reason": ""
}
"""

In [118]:
import boto3
import json

def bedrock_chat(prompt, system_prompt, model_id):
    client = boto3.client("bedrock-runtime")

    system = [{"text": system_prompt}]
    messages = [{"role": "user", "content": [{"text": prompt}]}]
    
    inf_params = {
        "maxTokens": 2048,
        "topP": 0.1,
        "temperature": 0.3
    }

    additionalModelRequestFields = {
        "inferenceConfig": {"topK": 20}
    }

    model_response = client.converse(
        modelId=model_id,
        messages=messages,
        system=system,
        inferenceConfig=inf_params,
        additionalModelRequestFields=additionalModelRequestFields
    )

    return model_response["output"]["message"]["content"][0]["text"]

def prepare_prompt(template, **kwargs):
    for key, value in kwargs.items():
        template = template.replace(f"{{{{{key}}}}}", value)
    return template


In [119]:
# Example usage
conversation_examples = [
    conversation_example_1,
    conversation_example_2
]

ground_truths = [
    {
        "sentiment": "positive",
    },
    {
        "sentiment": "negative",
    }
]

In [120]:
def evaluate_conversation(conversation_examples, current_prompt):
    results = []
    
    for conversation in conversation_examples:
        sentiment_prompt = prepare_prompt(current_prompt, conversation=conversation)
        system_prompt_evaluation = "You are a helpful assistant. Always respond with clean JSON format without markdown or escape characters."
        model_id = "us.amazon.nova-pro-v1:0"
        sentiment_response = bedrock_chat(sentiment_prompt,system_prompt_evaluation,model_id)
        
        try:
            result = json.loads(sentiment_response)
            results.append(result)
        except json.JSONDecodeError:
            results.append({
                "error": "Failed to parse API response",
                "sentiment_raw": sentiment_response
            })
    
    return results

In [121]:
evaluation_llm = evaluate_conversation(conversation_examples,sentiment_eval_prompt)

In [122]:
def evaluate_with_ground_truth(model_outputs, ground_truths):
    combined_results = []
    correct_predictions = 0
    total_predictions = len(ground_truths)
    
    for model_output, ground_truth in zip(model_outputs, ground_truths):
        sentiment_match = model_output.get("sentiment", "").lower() == ground_truth.get("sentiment", "").lower()
        if sentiment_match:
            correct_predictions += 1
        
        result = {
            "model_sentiment": model_output.get("sentiment", "").lower(),
            "model_reasoning": model_output.get("reason", ""),
            "ground_truth_sentiment": ground_truth.get("sentiment", "").lower(),
            "is_correct": sentiment_match
        }
        
        combined_results.append(result)
    
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    
    return {
        "accuracy": round(accuracy, 2),
        "correct_predictions": correct_predictions,
        "total_predictions": total_predictions,
        "detailed_results": combined_results
    }

In [123]:
evaluation_results = evaluate_with_ground_truth(evaluation_llm, ground_truths)

In [124]:
evaluation_results

{'accuracy': 1.0,
 'correct_predictions': 2,
 'total_predictions': 2,
 'detailed_results': [{'model_sentiment': 'positive',
   'model_reasoning': 'The conversation starts with a customer complaint about receiving the wrong color laptop. However, the agent responds promptly with an apology, requests the order number, and takes immediate action to resolve the issue by arranging a replacement and offering a discount. The customer expresses appreciation for the quick resolution and the compensation. The agent confirms the actions to be taken and ensures the customer that the issue will be resolved promptly. Overall, the interaction ends on a positive note with both parties satisfied.',
   'ground_truth_sentiment': 'positive',
   'is_correct': True},
  {'model_sentiment': 'negative',
   'model_reasoning': "The conversation is predominantly negative. The customer expresses strong dissatisfaction and frustration multiple times, mentioning repeated failed appointments, the need to provide the 

In [125]:
critique_prompt_template = """
Analyze the classification performance and provide detailed reasoning for prompt improvements:

Current Template:
{{input_current_template}}

Evaluation Results:
{{evaluation_results}}

Follow these steps in order:

1. STEP 1 - Error Pattern Analysis:
   - List each misclassified case
   - Group similar errors
   - Focus on how the prompt's instructions led to these errors
   YOUR ANALYSIS:

2. STEP 2 - Prompt-Specific Root Cause Investigation:
   For each error pattern identified above, analyze:
   - Which parts of the current prompt led to misinterpretation?
   - Are there ambiguous or missing instructions?
   - Are the classification criteria clearly defined?
   - Is the format/structure of the prompt causing confusion?
   YOUR ANALYSIS:

3. STEP 3 - Historical Context:
   Previous Suggestions: {{suggestion_history}}
   Analyze only prompt-related changes:
   - Which prompt modifications were effective/ineffective?
   - Which instruction clarity issues persist?
   - What prompt elements still need refinement?
   YOUR ANALYSIS:

4. STEP 4 - Prompt Improvement Ideas:
   Suggest only changes to prompt instructions and structure:
   - Clearer classification criteria
   - Better examples or explanations
   - More precise instructions
   - Better prompt structure or organization
   - Specific wording improvements
   
   AVOID suggesting:
   Adding more training data
   Modifying the model
   Changes to the underlying AI system
   Adding new model capabilities
   
   YOUR IDEAS:

Remember: Focus solely on how to improve the prompt instructions, format, and clarity. All suggestions must be implementable through prompt engineering alone.
"""

In [126]:
format_prompt = """
Based on the analysis below, structure the recommendations in the following format and create a new improved template:

Current Template:
{{input_current_template}}

Analysis:
{{critique_feedbacks}}

Instructions for improved template:
1. Take the current template as a base
2. Apply the high-priority recommendations to create a new template
3. Incorporate specific improvements identified in the analysis
4. Ensure the new template maintains the basic structure but addresses the identified issues
5. The improved template should be a complete, ready-to-use prompt

Return your response in this exact JSON format:
{
    "analysis": {
        "error_patterns": [
            "List specific error patterns found"
        ],
        "root_causes": [
            "List root causes of errors"
        ]
    },
    "recommendations": {
        "high_priority": [
            "List specific changes that should be made immediately"
        ],
        "medium_priority": [
            "List important but not critical changes"
        ],
        "low_priority": [
            "List minor optimization suggestions"
        ]
    },
    "improved_template": "Provide the complete new template here with all recommended changes incorporated. This should be a fully functional template ready for the next iteration."
}

IMPORTANT: The improved_template must be different from the current template and incorporate the recommended changes. Do not simply return the original template.
"""

In [127]:
def critique_performance(critique_prompt_template, current_prompt, eval_results, feedback_history):
    """
    Analyze classification performance and generate critique
    
    Args:
        current_prompt (str): Current prompt template
        eval_results (dict): Evaluation results including accuracy and detailed results
        feedback_history (list): History of previous feedback and optimizations
    
    Returns:
        dict: Analysis results including error patterns and root causes
    """
    analysis_prompt = prepare_prompt(
        critique_prompt_template,
        input_current_template=current_prompt,
        evaluation_results=json.dumps(eval_results, indent=2),
        suggestion_history=json.dumps(feedback_history, indent=2)
    )
    
    system_prompt = "You are a helpful assistant focused on detailed analysis. Follow the steps carefully."
    model_id = "us.amazon.nova-pro-v1:0"
    analysis_response = bedrock_chat(analysis_prompt, system_prompt, model_id)
    return analysis_response 


In [128]:
def optimize_prompt(format_prompt, current_prompt, critique_feedback):
    """
    Generate optimized prompt based on critique results
    
    Args:
        format_prompt (str): Template for formatting optimization output
        current_prompt (str): Current prompt template
        critique_feedback (str): Detailed analysis from critique_performance function
    
    Returns:
        dict: Structured optimization results including analysis and recommendations
    """
    optimization_prompt = prepare_prompt(
        format_prompt,
        input_current_template=current_prompt,
        critique_feedbacks=critique_feedback
    )
    
    system_prompt = "You are a helpful assistant. Always respond with clean JSON format without markdown or escape characters."
    model_id = "us.amazon.nova-pro-v1:0"
    optimization_response = bedrock_chat(optimization_prompt, system_prompt, model_id)
    
    try:
        optimization = json.loads(optimization_response)
        return optimization
    except json.JSONDecodeError:
        return {
            "error": "Failed to parse optimization response",
            "raw_response": optimization_response
        }


In [129]:
def run_feedback_loop(conversation_examples, ground_truths, iterations=3):
    current_prompt = sentiment_eval_prompt
    feedback_history = []
    accuracy_history = []
    
    for i in range(iterations):        
        print(f"\nIteration {i+1}")
        print("="*50)
        
        # Get model predictions and evaluate
        print("Current Prompt ######")
        print(current_prompt)
        predictions = evaluate_conversation(conversation_examples, current_prompt)
        eval_results = evaluate_with_ground_truth(predictions, ground_truths)
        
        # Track accuracy
        current_accuracy = eval_results["accuracy"]
        accuracy_history.append(current_accuracy)
        
        print(f"Accuracy: {current_accuracy*100:.2f}%")
        print(f"Correct: {eval_results['correct_predictions']}/{eval_results['total_predictions']}")
        
        # Show detailed results
        for idx, result in enumerate(eval_results["detailed_results"]):
            print(f"\nConversation {idx + 1}:")
            print(f"Predicted: {result['model_sentiment']}")
            print(f"Actual: {result['ground_truth_sentiment']}")
            print(f"Correct: {'✓' if result['is_correct'] else '✗'}")
        
        # Get critique and optimization
        critique_feedback = critique_performance(
            critique_prompt_template, 
            current_prompt, 
            eval_results, 
            feedback_history
        )

        print("FEEDBACK FROM CRITIQUE LLM ######")
        print(critique_feedback)
        
        optimization_results = optimize_prompt(
            format_prompt,
            current_prompt,
            critique_feedback
        )

        print("IMPROVED OPTIMIZATION ######")
        print(optimization_results)

        # Store feedback
        feedback_history.append({
            "iteration": i + 1,
            "current_prompt": current_prompt,
            "critique_feedback": critique_feedback,
            "optimization_results": optimization_results,
            "accuracy": current_accuracy
        })

        # Update prompt for next iteration
        if isinstance(optimization_results, dict) and "improved_template" in optimization_results:
            current_prompt = optimization_results["improved_template"]


    return {
        "final_prompt": current_prompt,
        "feedback_history": feedback_history,
        "accuracy_history": accuracy_history,
        "final_accuracy": accuracy_history[-1] if accuracy_history else 0
    }

In [130]:
conversation_examples = [
    # Conversation 1: Sarcastic positive words but negative sentiment
    """
    Customer: Oh 'wonderful', another delayed delivery. Just 'perfect'!
    Agent: I apologize for the delay. I can check the status for you.
    Customer: That would be just 'fantastic', like the last three times I contacted support.
    Agent: I understand your frustration. Let me expedite this for you.
    Customer: Thanks SO much for your 'amazing' service! You're really 'exceptional' at this!
    """,
    
    # Conversation 2: Mixed signals with subtle final sentiment
    """
    Customer: My order arrived damaged, but your delivery was quick.
    Agent: I'm sorry about the damage. We'll send a replacement immediately.
    Customer: I appreciate the fast response, though this is the second time it happened.
    Agent: We'll add extra packaging protection and include a small gift for the inconvenience.
    Customer: Well, at least you're trying to improve. We'll see how it goes.
    """,
    
    # Conversation 3: Polite words masking negative sentiment
    """
    Customer: I hate to bother you, but this is my fourth attempt to resolve this issue.
    Agent: I'll be happy to help you today.
    Customer: Thank you, I know it's not your fault personally, but I've spent hours on this.
    Agent: I understand, let me review your case history.
    Customer: I appreciate your time, but I might need to explore other options if this continues.
    """,
    
    # Conversation 4: Complex emotional progression
    """
    Customer: This product is completely useless! I want a refund!
    Agent: I sincerely apologize. I can process your refund and offer a 30% discount on your next purchase.
    Customer: I don't want discounts, I've wasted enough money already!
    Agent: I understand. What if I send you our premium version as a replacement at no cost?
    Customer: Hmm... well, I've heard good things about that model. Fine, I'll give it a try.
    Agent: Excellent! I'll expedite that shipping for you.
    Customer: Thanks, I guess. But it better work this time.
    """
]

ground_truths = [
    {
        "sentiment": "negative"
    },
    {
        "sentiment": "neutral"
    },
    {
        "sentiment": "negative"
    },
    {
        "sentiment": "mixed"
    }
]

In [131]:
# Usage
results = run_feedback_loop(conversation_examples, ground_truths, iterations=5)


Iteration 1
Current Prompt ######

Analyze the sentiment of the following conversation and classify it as either "positive" "negative" "neutral" or "mixed" 

Analyze the conversation below and provide the sentiment analysis in JSON format with two fields:
- sentiment: (positive/negative/neutral/mixed)
- reason: (detailed explanation for the classification)

<conversation>
{{conversation}}
</conversation>

Response:
{
    "sentiment": "",
    "reason": ""
}

Accuracy: 50.00%
Correct: 2/4

Conversation 1:
Predicted: negative
Actual: negative
Correct: ✓

Conversation 2:
Predicted: mixed
Actual: neutral
Correct: ✗

Conversation 3:
Predicted: mixed
Actual: negative
Correct: ✗

Conversation 4:
Predicted: mixed
Actual: mixed
Correct: ✓
FEEDBACK FROM CRITIQUE LLM ######
### STEP 1 - Error Pattern Analysis:

**Misclassified Cases:**
1. **Case 2:**
   - **Model Sentiment:** Mixed
   - **Ground Truth Sentiment:** Neutral
   - **Reasoning:** The model identified both positive and negative element

In [88]:
results

{'final_prompt': 'Updated template with standardized terminology, clear instructions, and integrated spell-check and grammar-check features.',
 'feedback_history': [{'iteration': 1,
   'current_prompt': '\nAnalyze the sentiment of the following conversation and classify it as either "positive" "negative" "neutral" or "mixed" \n\nAnalyze the conversation below and provide the sentiment analysis in JSON format with two fields:\n- sentiment: (positive/negative/neutral/mixed)\n- reason: (detailed explanation for the classification)\n\n<conversation>\n{{conversation}}\n</conversation>\n\nResponse:\n{\n    "sentiment": "",\n    "reason": ""\n}\n',
   'critique_feedback': "Certainly! Let's go through the steps to analyze the classification performance.\n\n### Current Template:\n{input_current_template}\n\n### Evaluation Results:\n{evaluation_results}\n\n---\n\n### STEP 1 - Error Pattern Analysis:\n**List each misclassification:**\n1. Class A predicted as Class B\n2. Class C predicted as Class

In [None]:
def run_feedback_loop(conversation_examples, ground_truths, iterations=3):
    current_prompt = sentiment_eval_prompt
    feedback_history = []
    accuracy_history = []
    
    for i in range(iterations):        
        print(f"\nIteration {i+1}")
        print("="*50)
        
        # Get model predictions and evaluate
        print("Current Prompt ######")
        print(current_prompt)
        predictions = evaluate_conversation(conversation_examples, current_prompt)
        eval_results = evaluate_with_ground_truth(predictions, ground_truths)
        
        # Track accuracy
        current_accuracy = eval_results["accuracy"]
        accuracy_history.append(current_accuracy)
        
        print(f"Accuracy: {current_accuracy*100:.2f}%")
        print(f"Correct: {eval_results['correct_predictions']}/{eval_results['total_predictions']}")
        
        # Show results for each conversation
        for idx, result in enumerate(eval_results["detailed_results"]):
            print(f"\nConversation {idx + 1}:")
            print(f"Predicted: {result['model_sentiment']}")
            print(f"Actual: {result['ground_truth_sentiment']}")
            print(f"Correct: {'✓' if result['is_correct'] else '✗'}")
        
        # Get optimization suggestions
        optimization_prompt = prepare_prompt(
            optimize_prompt,
            input_current_template=current_prompt,
            evaluation_results=json.dumps(eval_results, indent=2)
        )
        
            
        # Get optimization feedback
        optimization_response = bedrock_chat(optimization_prompt)
        optimization = json.loads(optimization_response)

        # Store feedback
        feedback_history.append({
            "iteration": i + 1,
            "current_prompt": current_prompt,
            "error_analysis": optimization["error_analysis"],
            "optimization_suggestions": optimization["optimization_suggestions"],
            "accuracy": current_accuracy
        })

        # Generate new instructions based on feedback history
        rewrite_prompt_formatted = prepare_prompt(
            rewrite_prompt,
            feedback_history=json.dumps(feedback_history, indent=2),
        )
        
        print("Optimization guidance ######")
        print(rewrite_prompt_formatted)

        new_instruction_response = bedrock_chat(rewrite_prompt_formatted)

        current_prompt =  json.loads(new_instruction_response)["new_template"]


    final_accuracy = accuracy_history[-1] if accuracy_history else 0
    
    return {
        "final_prompt": current_prompt,
        "feedback_history": feedback_history,
        "accuracy_history": accuracy_history,
        "final_accuracy": final_accuracy
    }