## 1. Setup

In [None]:
import sys
sys.path.append('src')

import torch
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Report the runtime environment and pick the compute device used by the rest
# of the notebook. CUDA availability is queried once and reused.
has_cuda = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")

if has_cuda:
    # Show the name of GPU index 0.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f"\nUsing device: {device}")

## 2. 모델 경로 확인

In [None]:
# List the saved model checkpoints (name and size in MB) so the paths used in
# the evaluation config can be checked against what is actually on disk.
checkpoint_dir = Path('checkpoints')

print("Available checkpoints:")
# is_dir() rather than exists(): a regular file named 'checkpoints' is not a
# usable checkpoint directory.
if checkpoint_dir.is_dir():
    checkpoints = sorted(checkpoint_dir.glob('*.pt'))
    for ckpt in checkpoints:
        size_mb = ckpt.stat().st_size / (1024 * 1024)
        print(f"  - {ckpt.name:40s} ({size_mb:.1f} MB)")
    if not checkpoints:
        # Without this, an existing but empty directory prints only the header.
        print(f"  No .pt files found in {checkpoint_dir}/")
else:
    print("  No checkpoints directory found!")
    print("  Please train models first using 2_train_baseline.py and 4_train_multimodal.py")

## 3. 평가 설정

In [None]:
# Evaluation settings for this notebook.
CONFIG = dict(
    # Model checkpoint paths (edit these to match the checkpoints listed above)
    baseline_model='checkpoints/baseline_best.pt',
    multimodal_model='checkpoints/multimodal_best.pt',

    # Evaluation
    n_games=1000,  # number of games to simulate
    aggression_level=0.5,  # rule-based agent aggression (0-1)
    random_seed=42,

    # Output
    output_dir='outputs',
    verbose=True,  # print detailed logs

    # Multimodal dialogue generation: set to True to generate dialogue with a
    # live LLM (slow)
    use_llm_dialogue=False,
)

# Echo the settings, one "key: value" line each, in definition order.
print("Configuration:")
for name in CONFIG:
    print(f"  {name:20s}: {CONFIG[name]}")

## 4. Baseline 모델 평가

In [None]:
print("="*60)
print("BASELINE MODEL EVALUATION")
print("="*60)

# Run the evaluation script as a child process and report whether it exited
# cleanly.
import subprocess
import sys

# Launch the script with the interpreter that is running this notebook. A bare
# 'python' is looked up on PATH and may be a different environment (without the
# project's dependencies) or missing entirely.
python_exe = sys.executable or 'python'

# NOTE(review): CONFIG['aggression_level'], CONFIG['random_seed'] and
# CONFIG['use_llm_dialogue'] are not forwarded to the script here; confirm
# whether the script accepts matching flags.
cmd = [
    python_exe, '5_evaluate_vs_rule_based.py',
    '--model_type', 'baseline',
    '--model_path', CONFIG['baseline_model'],
    '--n_games', str(CONFIG['n_games']),
    '--output_dir', CONFIG['output_dir'],
    '--device', str(device),
]

if CONFIG['verbose']:
    cmd.append('--verbose')

# Output is not captured, so only the return code is inspected below.
result = subprocess.run(cmd, capture_output=False, text=True)

if result.returncode == 0:
    print("\n✓ Baseline evaluation completed successfully")
else:
    print("\n✗ Baseline evaluation failed")
    print(f"  (exit code {result.returncode})")

## 5. Multimodal 모델 평가

In [None]:
print("="*60)
print("MULTIMODAL MODEL EVALUATION")
print("="*60)

# Run the evaluation script as a child process and report whether it exited
# cleanly. Imported here so this cell does not depend on another cell having
# imported these modules first.
import subprocess
import sys

# Launch the script with the interpreter that is running this notebook. A bare
# 'python' is looked up on PATH and may be a different environment (without the
# project's dependencies) or missing entirely.
python_exe = sys.executable or 'python'

# NOTE(review): CONFIG['aggression_level'], CONFIG['random_seed'] and
# CONFIG['use_llm_dialogue'] are not forwarded to the script here; confirm
# whether the script accepts matching flags.
cmd = [
    python_exe, '5_evaluate_vs_rule_based.py',
    '--model_type', 'multimodal',
    '--model_path', CONFIG['multimodal_model'],
    '--n_games', str(CONFIG['n_games']),
    '--output_dir', CONFIG['output_dir'],
    '--device', str(device),
]

if CONFIG['verbose']:
    cmd.append('--verbose')

# Output is not captured, so only the return code is inspected below.
result = subprocess.run(cmd, capture_output=False, text=True)

if result.returncode == 0:
    print("\n✓ Multimodal evaluation completed successfully")
else:
    print("\n✗ Multimodal evaluation failed")
    print(f"  (exit code {result.returncode})")

## 6. 결과 비교 및 시각화

In [None]:
# Load the JSON result files for both models from the configured output
# directory. A model whose file is missing keeps a result of None.
def _read_results(path, label):
    """Return the parsed JSON at *path*, or None (with a message) if it is absent."""
    if not path.exists():
        print(f"✗ {label.capitalize()} results not found: {path}")
        return None
    with open(path, 'r') as fh:
        data = json.load(fh)
    print(f"✓ Loaded {label} results from {path}")
    return data


output_dir = Path(CONFIG['output_dir'])

# Both names are bound up front so they exist even if a load below raises.
baseline_results = None
multimodal_results = None

baseline_file = output_dir / 'baseline_vs_rule_based.json'
multimodal_file = output_dir / 'multimodal_vs_rule_based.json'

baseline_results = _read_results(baseline_file, 'baseline')
multimodal_results = _read_results(multimodal_file, 'multimodal')

In [None]:
# Side-by-side bar charts of action distributions: each panel shows one model
# next to the rule-based agent, with stored frequencies multiplied by 100 for
# display.
ACTION_NAMES = ['Fold', 'Check/Call', 'Raise Small', 'Raise Medium', 'Raise Large', 'All-in']

if baseline_results and multimodal_results:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    x = np.arange(len(ACTION_NAMES))
    width = 0.35

    # Left panel: baseline, right panel: multimodal.
    panels = (
        ('Baseline', baseline_results),
        ('Multimodal', multimodal_results),
    )

    for panel_ax, (model_name, results) in zip(axes, panels):
        model_pct = np.array(results['agent_action_distribution']) * 100
        rule_pct = np.array(results['opponent_action_distribution']) * 100

        # Paired bars: the model left of each tick, the rule-based agent right.
        panel_ax.bar(x - width/2, model_pct, width, label=f'{model_name} Model', alpha=0.8)
        panel_ax.bar(x + width/2, rule_pct, width, label='Rule-based Agent', alpha=0.8)

        panel_ax.set_xlabel('Action')
        panel_ax.set_ylabel('Frequency (%)')
        panel_ax.set_title(f'{model_name} Model: Action Distribution')
        panel_ax.set_xticks(x)
        panel_ax.set_xticklabels(ACTION_NAMES, rotation=45, ha='right')
        panel_ax.legend()
        panel_ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plot_path = output_dir / 'action_distribution_comparison.png'
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    plt.show()

    print(f"\n✓ Plot saved to {plot_path}")

In [None]:
# Bar chart comparing each model's agreement rate with the rule-based agent.
# The rates are labelled with a '%' suffix without any scaling, so they are
# presumably stored as percentages already -- TODO confirm against the
# evaluation script.
if baseline_results and multimodal_results:
    fig, ax = plt.subplots(figsize=(8, 6))

    models = ['Baseline', 'Multimodal']
    agreement_rates = [
        baseline_results['agreement_rate'],
        multimodal_results['agreement_rate']
    ]

    colors = ['#3498db', '#e74c3c']
    bars = ax.bar(models, agreement_rates, color=colors, alpha=0.7, edgecolor='black')

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.2f}%',
                ha='center', va='bottom', fontsize=12, fontweight='bold')

    ax.set_ylabel('Agreement Rate (%)', fontsize=12)
    ax.set_title('Agreement Rate with Rule-Based Agent', fontsize=14, fontweight='bold')

    # Leave 20% headroom above the tallest bar for its label. When every rate
    # is 0 the limits would be identical (0, 0), which makes matplotlib warn and
    # expand them itself, so the default autoscaling is kept in that case.
    y_top = max(agreement_rates) * 1.2
    if y_top > 0:
        ax.set_ylim(0, y_top)
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_dir / 'agreement_rate_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()

    print(f"\n✓ Plot saved to {output_dir / 'agreement_rate_comparison.png'}")

In [None]:
# Plain-text comparison of the two models: agreement rate, games played,
# per-action frequencies, and up to five sample dialogues when the multimodal
# results include them.
if baseline_results and multimodal_results:
    heavy_rule = "="*60
    light_rule = "-" * 60

    print("\n" + heavy_rule)
    print("COMPARISON SUMMARY")
    print(heavy_rule)

    print(f"\n{'Metric':<30s} {'Baseline':<15s} {'Multimodal':<15s}")
    print(light_rule)

    base_rate = baseline_results['agreement_rate']
    multi_rate = multimodal_results['agreement_rate']
    print(f"{'Agreement Rate':<30s} {base_rate:>14.2f}% {multi_rate:>14.2f}%")

    base_games = baseline_results['n_games']
    multi_games = multimodal_results['n_games']
    print(f"{'Games Played':<30s} {base_games:>14d} {multi_games:>14d}")

    print("\n" + light_rule)
    print("Action Distribution:")
    print(light_rule)

    # Stored frequencies are multiplied by 100 for display.
    baseline_dist = np.array(baseline_results['agent_action_distribution']) * 100
    multimodal_dist = np.array(multimodal_results['agent_action_distribution']) * 100

    for idx, action_label in enumerate(ACTION_NAMES):
        base_pct = baseline_dist[idx]
        multi_pct = multimodal_dist[idx]
        print(f"{action_label:<30s} {base_pct:>14.2f}% {multi_pct:>14.2f}%")

    # Sample dialogues are optional in the multimodal results.
    if 'sample_dialogues' in multimodal_results:
        print("\n" + heavy_rule)
        print("SAMPLE DIALOGUES (Multimodal Model)")
        print(heavy_rule)
        first_five = multimodal_results['sample_dialogues'][:5]
        for number, line in enumerate(first_five, 1):
            print(f'{number}. "{line}"')

## 7. 추가 분석 (선택사항)

In [None]:
# Confusion-matrix style heatmap of baseline model actions (columns) against
# rule-based agent actions (rows), saved as a PNG in the output directory.
if baseline_results:
    # This analysis is optional, so the action lists are read with .get():
    # a results file without them skips the plot instead of raising KeyError.
    agent_actions = baseline_results.get('agent_actions')
    opponent_actions = baseline_results.get('opponent_actions')

    if not agent_actions or not opponent_actions:
        print("✗ 'agent_actions' / 'opponent_actions' missing or empty in baseline results; "
              "skipping confusion matrix")
    else:
        from sklearn.metrics import confusion_matrix

        # Derive the label ids from ACTION_NAMES so the matrix size always
        # matches the tick labels passed to the heatmap.
        action_ids = list(range(len(ACTION_NAMES)))
        cm = confusion_matrix(opponent_actions, agent_actions, labels=action_ids)

        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=ACTION_NAMES, yticklabels=ACTION_NAMES)
        plt.ylabel('Rule-based Agent Action')
        plt.xlabel('Baseline Model Action')
        plt.title('Action Agreement Matrix: Baseline vs Rule-based')
        plt.tight_layout()
        plt.savefig(output_dir / 'baseline_confusion_matrix.png', dpi=150, bbox_inches='tight')
        plt.show()

        print(f"\n✓ Confusion matrix saved to {output_dir / 'baseline_confusion_matrix.png'}")

## 완료!

평가가 완료되었습니다. 결과 파일들은 `CONFIG['output_dir']`에 지정된 디렉토리(기본값 `outputs/`)에 저장되었습니다:
- `baseline_vs_rule_based.json` - Baseline 모델 평가 결과
- `multimodal_vs_rule_based.json` - Multimodal 모델 평가 결과
- `*.png` - 시각화 그래프들