---
## Setup & Configuration

In [None]:
# Import required libraries
import sys
import asyncio
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add backend to path so the `app.*` imports in the next cell resolve.
# When the working directory path contains 'research_evaluation', the backend
# root is taken to be its parent; otherwise the working directory itself.
backend_path = Path.cwd().parent if 'research_evaluation' in str(Path.cwd()) else Path.cwd()
# Idempotent insert: re-running this cell must not stack duplicate entries.
# Any existing occurrence is dropped and the backend is placed first so it
# keeps import priority.
sys.path[:] = [str(backend_path)] + [p for p in sys.path if p != str(backend_path)]

print(f"‚úÖ Backend path: {backend_path}")
print(f"‚úÖ Working directory: {Path.cwd()}")

In [None]:
# Import app modules
from app.database import AsyncSessionLocal
from app.agents.sta.gemini_classifier import GeminiSTAClassifier
from app.agents.ia.service import InsightsAgentService
from app.agents.ia.schemas import IAQueryRequest, IAQueryParams

# Alias for easier usage: the RQ4 cells open database sessions with
# `async with async_session_maker() as db`.
async_session_maker = AsyncSessionLocal

print("‚úÖ App modules imported successfully")

---
## RQ1: Crisis Detection Accuracy (STA)

**Hypothesis**: STA can accurately classify crisis vs non-crisis messages with ≥90% accuracy.

**Method**: Test 50 synthetic scenarios (25 crisis, 25 non-crisis) and calculate:
- Sensitivity (True Positive Rate)
- Specificity (True Negative Rate)
- Accuracy
- Precision
- F1 Score

In [None]:
# Load crisis scenarios from RQ1
import importlib.util
# Load the RQ1 scenario definitions directly from their file location
# (relative to the current working directory).
spec = importlib.util.spec_from_file_location(
    "crisis_scenarios",
    Path.cwd() / 'rq1_crisis_detection' / 'crisis_scenarios.py'
)
# spec_from_file_location() builds a spec for any *.py path without checking
# that the file exists, so the file is tested explicitly here. Without this
# check a missing file surfaces as a FileNotFoundError from exec_module() and
# the failure branch below is never reached.
if spec and spec.loader and spec.origin and Path(spec.origin).is_file():
    crisis_scenarios_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(crisis_scenarios_module)

    # Expose the two scenario collections to the rest of the notebook.
    CRISIS_SCENARIOS = crisis_scenarios_module.CRISIS_SCENARIOS
    NON_CRISIS_SCENARIOS = crisis_scenarios_module.NON_CRISIS_SCENARIOS

    print(f"‚úÖ Loaded {len(CRISIS_SCENARIOS)} crisis scenarios")
    print(f"‚úÖ Loaded {len(NON_CRISIS_SCENARIOS)} non-crisis scenarios")
    print(f"üìä Total scenarios: {len(CRISIS_SCENARIOS) + len(NON_CRISIS_SCENARIOS)}")
else:
    print("‚ùå Failed to load crisis_scenarios module")
    # Tell the reader which path was tried so the working directory can be fixed.
    print(f"   Expected file: {spec.origin if spec else 'unknown'}")

In [None]:
# Preview crisis scenarios
# One row per scenario for the first five crisis cases; the text is cut to
# 80 characters and always suffixed with an ellipsis.
crisis_df = pd.DataFrame([
    {
        'ID': scenario.id,
        'Category': scenario.category,
        'Severity': scenario.severity_if_crisis,
        'Language': scenario.language,
        'Text Preview': f"{scenario.text[:80]}...",
    }
    for scenario in CRISIS_SCENARIOS[:5]
])

print("\nüî¥ Crisis Scenarios (First 5):")
display(crisis_df)

In [None]:
# Preview non-crisis scenarios
# One row per scenario for the first five non-crisis cases; the text is cut
# to 80 characters and always suffixed with an ellipsis.
non_crisis_df = pd.DataFrame([
    {
        'ID': scenario.id,
        'Category': scenario.category,
        'Language': scenario.language,
        'Text Preview': f"{scenario.text[:80]}...",
    }
    for scenario in NON_CRISIS_SCENARIOS[:5]
])

print("\nüü¢ Non-Crisis Scenarios (First 5):")
display(non_crisis_df)

In [None]:
# RQ1: Run STA evaluation
async def evaluate_sta(crisis_threshold=2):
    """Evaluate STA crisis detection on every loaded RQ1 scenario.

    Each scenario in ``CRISIS_SCENARIOS`` and then ``NON_CRISIS_SCENARIOS`` is
    sent through ``GeminiSTAClassifier.classify`` and the predicted label is
    compared with the scenario's ``true_label``.

    Args:
        crisis_threshold: Minimum ``risk_level`` that counts as a "crisis"
            prediction. Defaults to 2, the cut-off this evaluation used before
            the value became a parameter.

    Returns:
        pandas.DataFrame with one row per successfully classified scenario and
        the columns ``id``, ``true_label``, ``predicted_label``, ``risk_level``,
        ``correct`` and ``category``. The columns are present even when no
        scenario could be classified. Scenarios whose classification raised an
        exception are reported on stdout and left out of the frame, so they are
        also absent from any metric computed from it.
    """
    # Imported once up front instead of on every loop iteration; a broken
    # import now fails immediately rather than as one error line per scenario.
    from app.agents.sta.schemas import STAClassifyRequest

    result_columns = ['id', 'true_label', 'predicted_label', 'risk_level', 'correct', 'category']
    classifier = GeminiSTAClassifier()
    results = []
    failed_ids = []

    async def _evaluate_group(scenarios, heading):
        """Classify every scenario of one group and append a row per success."""
        total = len(scenarios)
        print(f"\n{heading} (n={total})...")
        for i, scenario in enumerate(scenarios, 1):
            print(f"   [{i}/{total}] {scenario.id}...", end=' ')
            try:
                # Create request payload with session_id
                request = STAClassifyRequest(
                    text=scenario.text,
                    session_id=f"eval_{scenario.id}"
                )

                # Use classify method
                assessment = await classifier.classify(request)
                predicted = "crisis" if assessment.risk_level >= crisis_threshold else "non-crisis"
                correct = predicted == scenario.true_label

                results.append({
                    'id': scenario.id,
                    'true_label': scenario.true_label,
                    'predicted_label': predicted,
                    'risk_level': assessment.risk_level,
                    'correct': correct,
                    'category': scenario.category
                })

                print("‚úÖ" if correct else "‚ùå")
            except Exception as e:
                # Best effort: one failing scenario must not abort the whole
                # run, but it is remembered so the gap is reported at the end.
                failed_ids.append(scenario.id)
                print(f"‚ùå Error: {e}")

    print("\n" + "="*80)
    print("üß™ Running RQ1: STA Crisis Detection Evaluation")
    print("="*80)

    await _evaluate_group(CRISIS_SCENARIOS, "üî¥ Testing Crisis Scenarios")
    await _evaluate_group(NON_CRISIS_SCENARIOS, "üü¢ Testing Non-Crisis Scenarios")

    if failed_ids:
        # Metrics computed from the returned frame silently cover fewer
        # scenarios than were attempted, so say so explicitly.
        print(
            f"\n‚ö†Ô∏è {len(failed_ids)} scenario(s) raised an error and are excluded from the results: "
            f"{', '.join(str(scenario_id) for scenario_id in failed_ids)}"
        )

    return pd.DataFrame(results, columns=result_columns)

# Run evaluation. The coroutine is awaited directly at cell level; the result
# frame is kept in `rq1_results` for the metrics and export cells.
rq1_results = await evaluate_sta()
print("\n‚úÖ RQ1 evaluation complete!")

In [None]:
# Calculate RQ1 metrics
def calculate_metrics(df):
    """Calculate confusion-matrix counts and performance metrics.

    ``'crisis'`` is treated as the positive class.

    Args:
        df: DataFrame with ``true_label`` and ``predicted_label`` columns
            holding ``'crisis'`` / ``'non-crisis'``.

    Returns:
        dict with ``confusion_matrix`` (``TP``/``TN``/``FP``/``FN`` as plain
        Python ints) plus ``sensitivity``, ``specificity``, ``accuracy``,
        ``precision`` and ``f1_score`` as fractions. A metric whose denominator
        is zero is reported as 0; an empty frame therefore yields all zeros
        instead of raising.
    """
    def _ratio(numerator, denominator):
        """Divide, returning 0 when the denominator is not positive."""
        return numerator / denominator if denominator > 0 else 0

    if len(df) == 0:
        # No classified scenarios (e.g. every classification failed). The frame
        # may not even carry the label columns, so they are not touched here.
        tp = tn = fp = fn = 0
    else:
        actual_crisis = df['true_label'] == 'crisis'
        actual_non_crisis = df['true_label'] == 'non-crisis'
        predicted_crisis = df['predicted_label'] == 'crisis'
        predicted_non_crisis = df['predicted_label'] == 'non-crisis'

        # int() keeps the counts JSON-serialisable (no numpy integer types).
        tp = int((actual_crisis & predicted_crisis).sum())
        tn = int((actual_non_crisis & predicted_non_crisis).sum())
        fp = int((actual_non_crisis & predicted_crisis).sum())
        fn = int((actual_crisis & predicted_non_crisis).sum())

    # Metrics
    sensitivity = _ratio(tp, tp + fn)
    specificity = _ratio(tn, tn + fp)
    accuracy = _ratio(tp + tn, len(df))
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * (precision * sensitivity), precision + sensitivity)

    return {
        'confusion_matrix': {'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn},
        'sensitivity': sensitivity,
        'specificity': specificity,
        'accuracy': accuracy,
        'precision': precision,
        'f1_score': f1
    }

# Compute the RQ1 metrics once; the plotting and export cells read `metrics`.
metrics = calculate_metrics(rq1_results)

print("\n" + "=" * 80)
print("üìä RQ1 Results: STA Crisis Detection Performance")
print("=" * 80)

# Raw confusion-matrix counts
print("\nüéØ Confusion Matrix:")
print("   True Positives (TP):  {}".format(metrics['confusion_matrix']['TP']))
print("   True Negatives (TN):  {}".format(metrics['confusion_matrix']['TN']))
print("   False Positives (FP): {}".format(metrics['confusion_matrix']['FP']))
print("   False Negatives (FN): {}".format(metrics['confusion_matrix']['FN']))

# Rates are stored as fractions and shown as percentages; F1 stays a fraction.
print("\nüìà Performance Metrics:")
print("   Sensitivity (Recall): {:.2f}%".format(metrics['sensitivity'] * 100))
print("   Specificity:          {:.2f}%".format(metrics['specificity'] * 100))
print("   Accuracy:             {:.2f}%".format(metrics['accuracy'] * 100))
print("   Precision:            {:.2f}%".format(metrics['precision'] * 100))
print("   F1 Score:             {:.4f}".format(metrics['f1_score']))

# The RQ1 hypothesis is judged on accuracy alone.
hypothesis_met = metrics['accuracy'] >= 0.90
print("\nüéì Hypothesis (Accuracy ‚â• 90%): {}".format('‚úÖ MET' if hypothesis_met else '‚ùå NOT MET'))

In [None]:
# Visualize results
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn grid look, then one wide figure that holds both panels
sns.set_style('whitegrid')
plt.figure(figsize=(12, 5))

# Left panel: confusion matrix (rows = actual class, columns = predicted class)
plt.subplot(1, 2, 1)
cm = np.array(
    [metrics['confusion_matrix'][cell] for cell in ('TP', 'FN', 'FP', 'TN')]
).reshape(2, 2)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Crisis', 'Predicted Non-Crisis'],
    yticklabels=['Actual Crisis', 'Actual Non-Crisis'],
)
plt.title('Confusion Matrix')

# Right panel: every metric as a percentage, coloured by the 90% threshold
plt.subplot(1, 2, 2)
metric_names = ['Sensitivity', 'Specificity', 'Accuracy', 'Precision', 'F1 Score']
metric_values = [
    metrics[key] * 100
    for key in ('sensitivity', 'specificity', 'accuracy', 'precision', 'f1_score')
]
colors = ['#FF6B6B' if score < 90 else '#51CF66' for score in metric_values]
plt.bar(metric_names, metric_values, color=colors, alpha=0.7)
plt.axhline(y=90, color='r', linestyle='--', label='90% Threshold')
plt.ylabel('Score (%)')
plt.title('Performance Metrics')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

print("‚úÖ Visualizations generated")

---
## RQ2: Orchestration Flow Correctness (Aika)

**Hypothesis**: Aika meta-agent correctly orchestrates sub-agents (STA, SCA, SDA, IA) based on conversation context.

**Method**: Test 10 representative conversation flows (F1-F10) and validate:
- Correct agent sequence execution
- Proper state transitions
- Langfuse trace completeness

In [None]:
# Load orchestration flows from RQ2
import importlib.util
# Load the RQ2 flow definitions directly from their file location
# (relative to the current working directory).
spec = importlib.util.spec_from_file_location(
    "orchestration_flows",
    Path.cwd() / 'rq2_orchestration' / 'orchestration_flows.py'
)
# spec_from_file_location() builds a spec for any *.py path without checking
# that the file exists, so the file is tested explicitly here. Without this
# check a missing file surfaces as a FileNotFoundError from exec_module() and
# the failure branch below is never reached.
if spec and spec.loader and spec.origin and Path(spec.origin).is_file():
    orchestration_flows_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(orchestration_flows_module)

    ORCHESTRATION_FLOWS = orchestration_flows_module.ORCHESTRATION_FLOWS

    print(f"‚úÖ Loaded {len(ORCHESTRATION_FLOWS)} orchestration flows")

    # Preview flows: one row per flow, with the expected agent order joined
    # into a single readable string.
    flows_df = pd.DataFrame([{
        'ID': f.id,
        'Name': f.name,
        'Expected Sequence': ' ‚Üí '.join(f.expected_agent_sequence),
        'Validation Criteria': len(f.validation_criteria)
    } for f in ORCHESTRATION_FLOWS])

    print("\nüîÑ Orchestration Flows:")
    display(flows_df)
else:
    print("‚ùå Failed to load orchestration_flows module")
    # Tell the reader which path was tried so the working directory can be fixed.
    print(f"   Expected file: {spec.origin if spec else 'unknown'}")

In [None]:
# RQ2: Manual testing instructions
# Banner
print("=" * 80)
print("üìù RQ2: Orchestration Flow Testing (Manual + Langfuse)")
print("=" * 80)
print("\n‚ö†Ô∏è  This test requires manual execution via API and Langfuse trace validation.\n")

# Step-by-step procedure, emitted as one block of text
print("\n".join([
    "Instructions:",
    "1. Start the backend server: cd backend && ./dev.sh",
    "2. Open Langfuse dashboard: http://localhost:3000",
    "3. For each flow (F1-F10), send the user_messages to /api/v1/aika",
    "4. Capture the Langfuse trace ID",
    "5. Validate against expected_agent_sequence",
    "6. Record results in the table below\n",
]))

# Results template: the expected sequence is pre-filled from the flow
# definitions, the remaining columns are placeholders to be filled by hand.
rq2_results_template = pd.DataFrame([
    {
        'Flow ID': flow.id,
        'Name': flow.name,
        'Expected Sequence': ' ‚Üí '.join(flow.expected_agent_sequence),
        'Actual Sequence': '',  # Fill manually
        'Langfuse Trace ID': '',  # Fill manually
        'Match': False,  # Fill manually
        'Notes': '',
    }
    for flow in ORCHESTRATION_FLOWS
])

print("üìä Results Template (Fill after testing):")
display(rq2_results_template)

print("\nüí° Tip: Export results to CSV after filling: rq2_results.to_csv('rq2_results.csv', index=False)")

---
## RQ3: Coaching Quality Assessment (SCA)

**Hypothesis**: SCA interventions meet quality standards for empathy, CBT techniques, and cultural appropriateness.

**Method**: Dual-rater assessment (researcher + GPT-4) using 5-point Likert rubric:
- Empathy & Validation (1-5)
- CBT Technique Application (1-5)
- Cultural Appropriateness (1-5)
- Boundary Respect (1-5)
- Resource Usefulness (1-5)

In [None]:
# Load coaching scenarios from RQ3
import importlib.util
# Load the RQ3 coaching scenarios directly from their file location
# (relative to the current working directory).
spec = importlib.util.spec_from_file_location(
    "coaching_scenarios",
    Path.cwd() / 'rq3_coaching_quality' / 'coaching_scenarios.py'
)
# spec_from_file_location() builds a spec for any *.py path without checking
# that the file exists, so the file is tested explicitly here. Without this
# check a missing file surfaces as a FileNotFoundError from exec_module() and
# the failure branch below is never reached.
if spec and spec.loader and spec.origin and Path(spec.origin).is_file():
    coaching_scenarios_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(coaching_scenarios_module)

    COACHING_SCENARIOS = coaching_scenarios_module.COACHING_SCENARIOS

    print(f"‚úÖ Loaded {len(COACHING_SCENARIOS)} coaching scenarios")

    # Preview scenarios: the user message is cut to 80 characters and always
    # suffixed with an ellipsis.
    coaching_df = pd.DataFrame([{
        'ID': s.id,
        'Category': s.category,
        'Expected Intervention': s.expected_intervention_type,
        'Focus': s.evaluation_focus,
        'Message Preview': s.user_message[:80] + '...'
    } for s in COACHING_SCENARIOS])

    print("\nüéì Coaching Scenarios:")
    display(coaching_df)
else:
    print("‚ùå Failed to load coaching_scenarios module")
    # Tell the reader which path was tried so the working directory can be fixed.
    print(f"   Expected file: {spec.origin if spec else 'unknown'}")

In [None]:
# RQ3: Manual evaluation instructions
# Banner
print("=" * 80)
print("üìù RQ3: Coaching Quality Assessment (Manual Dual-Rating)")
print("=" * 80)
print("\n‚ö†Ô∏è  This test requires manual dual-rater assessment using the rubric.\n")

# Step-by-step procedure, emitted as one block of text
print("\n".join([
    "Instructions:",
    "1. For each scenario, send user_message to SCA via /api/v1/agents/graph/sca/execute",
    "2. Review the intervention plan generated by SCA",
    "3. Rate on 5-point scale (1=Poor, 5=Excellent) for each dimension:",
    "   - Empathy & Validation",
    "   - CBT Technique Application",
    "   - Cultural Appropriateness",
    "   - Boundary Respect",
    "   - Resource Usefulness",
    "4. Repeat with GPT-4 as second rater",
    "5. Calculate inter-rater reliability (Cohen's Kappa)\n",
]))

# Rubric template: one row per scenario and one score column per
# rater/dimension pair, all initialised to 0 (meaning "not rated yet").
rq3_rubric_template = pd.DataFrame([
    {
        'Scenario ID': scenario.id,
        'Category': scenario.category,
        **{
            f'{rater}_{dimension}': 0
            for rater in ('Researcher', 'GPT4')
            for dimension in ('Empathy', 'CBT', 'Culture', 'Boundary', 'Resources')
        },
        'Notes': '',
    }
    for scenario in COACHING_SCENARIOS
])

print("üìä Rubric Template (Fill after testing):")
display(rq3_rubric_template)

print("\nüí° Tip: Calculate average scores and Cohen's Kappa after completing ratings")

In [None]:
# Helper: Calculate Cohen's Kappa for inter-rater reliability
from sklearn.metrics import cohen_kappa_score

def calculate_inter_rater_reliability(df):
    """Compute Cohen's Kappa between the two raters, one value per dimension.

    For each rubric dimension the ``Researcher_<dim>`` and ``GPT4_<dim>``
    columns are compared on the rows where both scores are above zero (zero
    marks an unfilled cell). A dimension is left out of the result when either
    column is missing or when no row has both scores filled in.

    Returns:
        dict mapping the dimension name to its kappa value.
    """
    kappas = {}

    for dim in ('Empathy', 'CBT', 'Culture', 'Boundary', 'Resources'):
        human_col, model_col = f'Researcher_{dim}', f'GPT4_{dim}'
        if human_col not in df.columns or model_col not in df.columns:
            continue

        human_scores = df[human_col].values
        model_scores = df[model_col].values

        # Keep only the rows that both raters have actually scored
        both_rated = (human_scores > 0) & (model_scores > 0)
        if not both_rated.sum() > 0:
            continue

        kappas[dim] = cohen_kappa_score(human_scores[both_rated], model_scores[both_rated])

    return kappas

print("‚úÖ Inter-rater reliability calculator ready")
# Usage hint followed by the agreement bands used to read a kappa value
print("\n".join([
    "\nüí° Usage: kappas = calculate_inter_rater_reliability(rq3_results)",
    "   Interpretation:",
    "   - Œ∫ < 0.20: Slight agreement",
    "   - 0.21-0.40: Fair agreement",
    "   - 0.41-0.60: Moderate agreement",
    "   - 0.61-0.80: Substantial agreement",
    "   - 0.81-1.00: Almost perfect agreement",
]))

---
## RQ4: Privacy Preservation (k-Anonymity)

**Hypothesis**: Insights Agent (IA) enforces k-anonymity (k≥5) across all analytics queries.

**Method**: Unit tests validating:
1. Small cohort suppression (n<5 should be filtered)
2. Compliant publication (n≥5 should pass)
3. Individual query blocking (no individual-level data)

In [None]:
# RQ4: k-Anonymity validation
from app.agents.ia.queries import ALLOWED_QUERIES

import re

# Smallest cohort size a published group may have.
K_ANONYMITY_MIN = 5

# Matches a cohort-size filter such as `HAVING COUNT(*) >= 5`,
# `HAVING COUNT(*)>=5` or `HAVING COUNT(DISTINCT user_id) > 4`, capturing the
# comparison operator and the numeric threshold. This is a static text
# heuristic: a COUNT that is not the first HAVING condition, a COUNT with
# nested parentheses, or a threshold passed as a bind parameter is not
# recognised and is reported as not enforced.
_HAVING_COUNT_RE = re.compile(
    r'HAVING\s+COUNT\s*\([^)]*\)\s*(>=|>)\s*(\d+)',
    re.IGNORECASE,
)
_HAVING_CLAUSE_RE = re.compile(r'HAVING\s+COUNT\s*\(', re.IGNORECASE)

print("="*80)
print("üîí RQ4: k-Anonymity Enforcement Validation")
print("="*80)

# Report the real number of registered queries instead of a hard-coded count.
print(f"\nüìã IA Analytics Queries (n={len(ALLOWED_QUERIES)}):")
for i, (question_id, query) in enumerate(ALLOWED_QUERIES.items(), 1):
    print(f"   {i}. {question_id}")

print("\nüîç Checking k-anonymity enforcement in SQL queries...")

k_anonymity_check = []
for question_id, query in ALLOWED_QUERIES.items():
    sql = str(query)
    has_having_clause = _HAVING_CLAUSE_RE.search(sql) is not None
    # Effective minimum group size of each HAVING COUNT filter:
    # `>= n` admits groups of n, `> n` admits groups of n + 1.
    thresholds = [
        int(value) + (1 if operator == '>' else 0)
        for operator, value in _HAVING_COUNT_RE.findall(sql)
    ]
    # The threshold must belong to the HAVING COUNT filter itself; an
    # unrelated `>= 5` elsewhere in the query no longer satisfies the check.
    has_k5 = bool(thresholds) and min(thresholds) >= K_ANONYMITY_MIN

    k_anonymity_check.append({
        'Query': question_id,
        'Has HAVING Clause': has_having_clause,
        'Has k‚â•5 Constraint': has_k5,
        'k-Anonymity Enforced': has_having_clause and has_k5
    })

# Explicit columns keep the summary below working even with no queries.
k_check_df = pd.DataFrame(
    k_anonymity_check,
    columns=['Query', 'Has HAVING Clause', 'Has k‚â•5 Constraint', 'k-Anonymity Enforced'],
)
display(k_check_df)

all_enforced = k_check_df['k-Anonymity Enforced'].all()
print(f"\n‚úÖ k-Anonymity Enforcement: {'PASS' if all_enforced else 'FAIL'}")
print(f"   {k_check_df['k-Anonymity Enforced'].sum()}/{len(k_check_df)} queries enforce k‚â•5")

In [None]:
# RQ4: Test small cohort suppression
async def test_small_cohort_suppression():
    """Check that the ``crisis_trend`` query returns no group smaller than 5.

    Runs the query over the last 7 days through ``InsightsAgentService`` and
    inspects the ``crisis_count`` value of every returned row. The outcome is
    printed, nothing is returned:

    * PASS - no rows, or every ``crisis_count`` is at least 5
    * FAIL - some ``crisis_count`` is below 5
    * UNVERIFIED - at least one row carries no ``crisis_count`` value, so the
      group size cannot be checked
    * Error - the query raised; the exception text is printed
    """
    print("\nüß™ Test 1: Small Cohort Suppression (n<5)")
    print("-" * 80)

    async with async_session_maker() as db:
        ia_service = InsightsAgentService(db)

        # Test with crisis_trend query.
        # Both window bounds derive from a single "now". model_validate is used
        # because the "from" key is a Python keyword and cannot be passed as a
        # keyword argument.
        now = datetime.now()
        query_params = IAQueryParams.model_validate({
            "from": now - timedelta(days=7),
            "to": now
        })

        request = IAQueryRequest(
            question_id="crisis_trend",
            params=query_params
        )

        try:
            response = await ia_service.query(request)

            if len(response.table) > 0:
                print(f"   üìä Query returned {len(response.table)} groups")
                counts = [row.get('crisis_count') for row in response.table]
                if any(count is None for count in counts):
                    # A placeholder default here would make a missing column
                    # look like a very large group and report a false PASS.
                    print("   ‚ö†Ô∏è k-Anonymity: UNVERIFIED (rows without a 'crisis_count' value)")
                else:
                    min_count = min(counts)
                    passed = min_count >= 5
                    print(f"   üìè Minimum group size: {min_count}")
                    # The marker follows the verdict, so a FAIL is not shown
                    # with a check mark.
                    print(f"   {'‚úÖ' if passed else '‚ùå'} k-Anonymity: {'PASS' if passed else 'FAIL'}")
            else:
                print(f"   üìä Query returned 0 groups (all suppressed)")
                print(f"   ‚úÖ k-Anonymity: PASS (no data below threshold)")

        except Exception as e:
            print(f"   ‚ùå Error: {e}")

# Run test: the coroutine is awaited directly at cell level and reports its
# verdict on stdout.
await test_small_cohort_suppression()

In [None]:
# RQ4: Test all IA queries for k-anonymity
async def test_all_ia_queries():
    """Run every query in ``ALLOWED_QUERIES`` and check the returned group sizes.

    Each query is executed over the last 30 days through
    ``InsightsAgentService``. For queries that return rows, every column whose
    name contains "count" (taken from the first row) is inspected and the
    smallest value must be at least 5. Per-query status:

    * PASS - no rows, or the smallest count is at least 5
    * FAIL - the smallest count is below 5
    * UNVERIFIED - rows were returned but there is no count column, or a count
      value is missing, so the group size cannot be checked
    * ERROR - the query raised

    A results table and the number of passing queries are printed; nothing is
    returned.
    """
    unverified = '‚ö†Ô∏è UNVERIFIED'

    print("\nüß™ Test 2: All IA Queries k-Anonymity Check")
    print("-" * 80)

    async with async_session_maker() as db:
        ia_service = InsightsAgentService(db)
        results = []

        # One reference instant so every query is checked over the same window.
        now = datetime.now()

        for question_id in ALLOWED_QUERIES.keys():
            print(f"\n   Testing: {question_id}")

            # model_validate is used because the "from" key is a Python keyword
            # and cannot be passed as a keyword argument.
            query_params = IAQueryParams.model_validate({
                "from": now - timedelta(days=30),
                "to": now
            })

            request = IAQueryRequest(
                question_id=question_id,
                params=query_params
            )

            try:
                response = await ia_service.query(request)

                has_data = len(response.table) > 0
                status = '‚úÖ PASS'

                if has_data:
                    # Check minimum count in any relevant column
                    count_columns = [col for col in response.table[0].keys() if 'count' in col.lower()]
                    counts = [row.get(col) for row in response.table for col in count_columns]
                    if not count_columns:
                        # Nothing to measure: this is not evidence of compliance.
                        status = unverified
                        print(f"      Groups: {len(response.table)}, No count column found")
                    elif any(count is None for count in counts):
                        # A placeholder default would hide the gap behind a PASS.
                        status = unverified
                        print(f"      Groups: {len(response.table)}, Missing count value in some rows")
                    else:
                        min_count = min(counts)
                        if min_count < 5:
                            status = '‚ùå FAIL'
                        print(f"      Groups: {len(response.table)}, Min count: {min_count}")
                else:
                    print(f"      No data (all suppressed or no matches)")

                results.append({
                    'Query': question_id,
                    'Has Data': has_data,
                    'k-Anonymity': status
                })

            except Exception as e:
                print(f"      ‚ùå Error: {e}")
                results.append({
                    'Query': question_id,
                    'Has Data': False,
                    'k-Anonymity': '‚ö†Ô∏è ERROR'
                })

        print("\n" + "="*80)
        print("üìä k-Anonymity Test Results")
        print("="*80)
        display(pd.DataFrame(results))

        # Only explicit PASS rows count; UNVERIFIED and ERROR rows do not.
        pass_count = sum(1 for r in results if 'PASS' in r['k-Anonymity'])
        print(f"\n‚úÖ Overall: {pass_count}/{len(results)} queries enforce k-anonymity correctly")

# Run test: the coroutine is awaited directly at cell level and prints a
# per-query results table plus the overall pass count.
await test_all_ia_queries()

---
## Export Results

Export all test results for thesis documentation.

In [None]:
# Create results directory next to the notebook's working directory
results_dir = Path.cwd() / 'results'
results_dir.mkdir(exist_ok=True)

# One timestamp shared by every file written in this run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

print("üíæ Exporting results...\n")

# Every artifact is optional: each one is exported only if the cell that
# creates it has been run, so a partial notebook run does not end in a NameError.

# Export RQ1 results
if 'rq1_results' in globals():
    rq1_path = results_dir / f'rq1_sta_results_{timestamp}.csv'
    rq1_results.to_csv(rq1_path, index=False)
    print(f"‚úÖ RQ1 results exported: {rq1_path}")

    # Export metrics (created by a separate cell, so guarded separately)
    if 'metrics' in globals():
        metrics_path = results_dir / f'rq1_metrics_{timestamp}.json'
        with open(metrics_path, 'w', encoding='utf-8') as f:
            json.dump(metrics, f, indent=2)
        print(f"‚úÖ RQ1 metrics exported: {metrics_path}")
    else:
        print("‚ö†Ô∏è RQ1 metrics not found - run the RQ1 metrics cell first")
else:
    print("‚ö†Ô∏è RQ1 results not found - run the RQ1 evaluation cell first")

# Export templates for manual tests
if 'rq2_results_template' in globals():
    rq2_template_path = results_dir / f'rq2_orchestration_template_{timestamp}.csv'
    rq2_results_template.to_csv(rq2_template_path, index=False)
    print(f"‚úÖ RQ2 template exported: {rq2_template_path}")
else:
    print("‚ö†Ô∏è RQ2 template not found - run the RQ2 cells first")

if 'rq3_rubric_template' in globals():
    rq3_template_path = results_dir / f'rq3_coaching_rubric_{timestamp}.csv'
    rq3_rubric_template.to_csv(rq3_template_path, index=False)
    print(f"‚úÖ RQ3 template exported: {rq3_template_path}")
else:
    print("‚ö†Ô∏è RQ3 template not found - run the RQ3 cells first")

print(f"\nüìÅ All results saved to: {results_dir}")
print("\nüéì Ready for thesis documentation!")

---
## Summary

This notebook consolidates all 4 research question evaluations:

- ✅ **RQ1**: STA crisis detection tested with 50 scenarios
- ✅ **RQ2**: Orchestration flows template ready for manual testing
- ✅ **RQ3**: Coaching quality rubric template ready for dual-rating
- ✅ **RQ4**: k-Anonymity enforcement validated across all IA queries

**Next Steps**:
1. Complete RQ2 manual testing with Langfuse traces
2. Complete RQ3 dual-rater assessment
3. Compile all results for thesis Chapter 4 (Results)
4. Calculate final statistics and create visualizations