In [None]:
# ============================================================================
# Part 1: Install and Import Dependencies
# ============================================================================
import subprocess
import sys

def install_if_needed(package, import_name=None):
    """Ensure a package is importable, pip-installing it when the import fails.

    Args:
        package: Distribution name passed to ``pip install``.
        import_name: Module name used for the import probe. Falls back to
            ``package`` when omitted (or otherwise falsy).
    """
    # The pip name and the module name can differ
    # (e.g. "sentence-transformers" vs "sentence_transformers").
    module_name = import_name if import_name else package
    try:
        __import__(module_name)
    except ImportError:
        print(f"Installing {package}...")
        # Invoke pip through the running interpreter so the install lands in
        # the same environment this code executes in.
        pip_command = [sys.executable, "-m", "pip", "install", package, "-q"]
        subprocess.check_call(pip_command)

# Install required packages
# The second argument is only given where the importable module name differs
# from the pip distribution name.
# NOTE(review): tqdm and torch are imported by later cells but are not checked
# here — presumably they arrive as dependencies of sentence-transformers; confirm.
install_if_needed("sentence-transformers", "sentence_transformers")
install_if_needed("pandas")
install_if_needed("numpy")
install_if_needed("matplotlib")
install_if_needed("seaborn")
install_if_needed("scipy")

print("‚úÖ All dependencies installed")

In [None]:
# ============================================================================
# Part 2: Import Libraries
# ============================================================================
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import List, Dict, Tuple
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from
# numpy/scipy/matplotlib — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style
# Plot style and colour palette applied globally to all figures created afterwards.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("‚úÖ Libraries imported successfully")

In [None]:
# ============================================================================
# Part 3: Load BERT Model (Using lightweight all-MiniLM-L6-v2 for speed)
# ============================================================================
print("Loading Sentence-BERT model...")
print("Using 'all-MiniLM-L6-v2' - fast and efficient (80MB, 5x faster than BERT-base)")

# all-MiniLM-L6-v2 is optimized for semantic similarity tasks
# It's ~5x faster than BERT-base while maintaining good quality
# `model` is a module-level global: compute_bert_similarity and
# compute_batch_embeddings read it directly rather than taking it as an argument.
# NOTE(review): presumably this fetches the weights on first use — confirm
# network access is available where the notebook runs.
model = SentenceTransformer('all-MiniLM-L6-v2')

print(f"‚úÖ Model loaded: {model.get_sentence_embedding_dimension()}-dim embeddings")

In [None]:
# ============================================================================
# Part 4: Define Data Paths
# ============================================================================
import os

# Directory containing the QA CSV files; figures and reports are written here too.
# Defaults to the original location. Set the VQA_DATA_DIR environment variable
# to point the notebook at another copy of the data without editing this cell.
DATA_DIR = Path(os.environ.get(
    "VQA_DATA_DIR",
    "/home/24068286g/UString/VRU/src/option_generate/data",
))

# Three versions of VQA datasets
# Keys follow the "<N>_options" pattern; the loading cell parses N (the total
# number of answer options per question) from the key.
csv_files = {
    "3_options": DATA_DIR / "QA_pair_v1_3options.csv",
    "4_options": DATA_DIR / "QA_pair_v2_4options.csv",
    "5_options": DATA_DIR / "QA_pair_v3_5options.csv"
}

# Check file existence
# Only reports status; missing files are skipped later by the loading loop.
for name, path in csv_files.items():
    status = "‚úÖ" if path.exists() else "‚ùå"
    print(f"{status} {name}: {path.name}")

In [None]:
# ============================================================================
# Part 5: Load and Parse CSV Data
# ============================================================================
def load_qa_data(csv_path: Path, num_options: int) -> pd.DataFrame:
    """
    Flatten a wide QA CSV into one record per usable question.

    Every CSV row supplies a 'video_number' and up to six question slots
    (columns prefixed q1_ ... q6_). A slot is kept only when it has question
    text, a correct answer, and at least one non-empty wrong answer among
    q{n}_ans_wrong1 .. q{n}_ans_wrong{num_options - 1}.

    Args:
        csv_path: CSV file to read.
        num_options: Total options per question; num_options - 1 wrong-answer
            columns are probed for each slot.

    Returns:
        DataFrame with columns video_number, q_id, question, correct_answer,
        wrong_options (list of stripped strings) and num_options (number of
        wrong options actually found, plus one for the correct answer).
        The frame has no columns when no question qualifies.
    """
    table = pd.read_csv(csv_path)
    question_slots = range(1, 7)             # q1 .. q6
    distractor_slots = range(1, num_options)  # wrong1 .. wrong{num_options-1}

    flattened = []
    for _, csv_row in table.iterrows():
        video_number = csv_row['video_number']

        for slot in question_slots:
            prefix = f'q{slot}_'

            # Skip slots with missing or empty question text.
            q_text = csv_row.get(prefix + 'text', None)
            if pd.isna(q_text) or not q_text:
                continue

            # Skip slots that have no correct answer recorded.
            correct_answer = csv_row.get(prefix + 'ans_correct', None)
            if pd.isna(correct_answer):
                continue

            candidates = (
                csv_row.get(f'{prefix}ans_wrong{k}', None) for k in distractor_slots
            )
            wrong_options = [
                str(candidate).strip()
                for candidate in candidates
                if not pd.isna(candidate) and candidate
            ]

            # A question without any distractor cannot be analysed.
            if not wrong_options:
                continue

            flattened.append({
                'video_number': video_number,
                'q_id': slot,
                'question': str(q_text).strip(),
                'correct_answer': str(correct_answer).strip(),
                'wrong_options': wrong_options,
                'num_options': len(wrong_options) + 1  # distractors + correct answer
            })

    return pd.DataFrame(flattened)

# Load all three datasets
# `datasets` maps the csv_files key to its flattened question DataFrame.
# Files that do not exist are skipped without an error, so `datasets` may hold
# fewer than three entries.
datasets = {}
for name, path in csv_files.items():
    if path.exists():
        # Keys look like "3_options": the leading integer is the option count.
        num_opts = int(name.split('_')[0])
        datasets[name] = load_qa_data(path, num_opts)
        print(f"üìä {name}: {len(datasets[name])} questions loaded")

print(f"\n‚úÖ Total datasets loaded: {len(datasets)}")

In [None]:
# ============================================================================
# Part 6: BERT-based Similarity Functions
# ============================================================================
def compute_bert_similarity(text1: str, text2: str) -> float:
    """
    Compute cosine similarity between two texts using sentence embeddings.

    Both texts are encoded in a single call to the module-level ``model`` and
    the similarity is derived from SciPy's cosine distance.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A plain Python float in [0, 1], where 1 means identical direction.
        Negative similarities are clamped to 0, and floating-point overshoot
        above 1 is clamped to 1.
    """
    embeddings = model.encode([text1, text2], convert_to_numpy=True)
    similarity = 1 - cosine(embeddings[0], embeddings[1])
    # Clamp both ends so the documented [0, 1] range always holds, and convert
    # so callers get a builtin float rather than a NumPy scalar or the int 0.
    return float(min(1.0, max(0.0, similarity)))

def compute_batch_embeddings(texts: List[str]) -> np.ndarray:
    """Encode all texts in one model call (no progress bar) and return the result as NumPy."""
    encode_options = {"convert_to_numpy": True, "show_progress_bar": False}
    return model.encode(texts, **encode_options)

def cosine_similarity_matrix(embeddings: np.ndarray) -> np.ndarray:
    """Compute the pairwise cosine similarity matrix of row-vector embeddings.

    Args:
        embeddings: 2-D array with one embedding per row.

    Returns:
        Square array whose entry (i, j) is the cosine similarity between
        rows i and j. A zero-norm row has no direction; its similarity with
        every row (including itself) is reported as 0.0 rather than NaN.
    """
    # Normalize embeddings
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Replace zero norms by 1 so the division leaves all-zero rows untouched
    # instead of producing NaN that would poison every downstream mean.
    safe_norms = norms.copy()
    safe_norms[safe_norms == 0] = 1
    normalized = embeddings / safe_norms
    # Compute similarity matrix
    return np.dot(normalized, normalized.T)

# Test the functions
# Smoke test: a paraphrase pair followed by an unrelated pair. The two printed
# scores are meant to be compared by eye; nothing is asserted.
test_sim = compute_bert_similarity(
    "The car crashed into the pedestrian",
    "A vehicle hit a person walking"
)
print(f"Test similarity (semantically similar texts): {test_sim:.4f}")

test_sim2 = compute_bert_similarity(
    "The car crashed into the pedestrian",
    "The weather is sunny today"
)
print(f"Test similarity (unrelated texts): {test_sim2:.4f}")

In [None]:
# ============================================================================
# Part 7: Quality Metrics Computation
# ============================================================================
def analyze_question_quality(row: pd.Series) -> Dict:
    """
    Score one question's answer options using embedding cosine similarities.

    Args:
        row: Record providing 'question', 'correct_answer' and
            'wrong_options' (a list of strings).

    Returns:
        Dict of floats:
        - option_diversity: 1 - mean pairwise similarity among the wrong
          options; fixed at 0.5 when there are fewer than two wrong options
        - distractor_quality: 1 - 2 * |avg_distractor_similarity - 0.45|,
          so it peaks when the average similarity to the correct answer is 0.45
        - question_relevance: mean similarity between the question and every
          option (correct answer included)
        - separability: 1 - similarity of the wrong option closest to the
          correct answer
        - avg_distractor_similarity: mean similarity of the wrong options to
          the correct answer
        - min_distractor_distance: same value as separability
    """
    def unit_rows(matrix):
        # Scale each row to unit length.
        return matrix / np.linalg.norm(matrix, axis=1, keepdims=True)

    distractor_texts = row['wrong_options']

    # One encoding call for everything: question first, then the correct
    # answer, then the distractors.
    vectors = compute_batch_embeddings([row['question'], row['correct_answer']] + distractor_texts)
    question_vec, answer_vec, distractor_vecs = vectors[0], vectors[1], vectors[2:]

    # 1. Option diversity among the distractors.
    distractor_count = len(distractor_vecs)
    if distractor_count <= 1:
        option_diversity = 0.5  # placeholder: no pair to compare
    else:
        pairwise = cosine_similarity_matrix(distractor_vecs)
        # Strict upper triangle = each unordered pair exactly once.
        pair_rows, pair_cols = np.triu_indices(distractor_count, k=1)
        option_diversity = 1 - np.mean(pairwise[pair_rows, pair_cols])

    # 2. Distractor quality: similarity of each distractor to the correct answer.
    answer_unit = answer_vec / np.linalg.norm(answer_vec)
    distractor_sims = np.dot(unit_rows(distractor_vecs), answer_unit)
    avg_distractor_sim = np.mean(distractor_sims)
    distractor_quality = 1 - abs(avg_distractor_sim - 0.45) * 2  # peak at 0.45

    # 3. Question relevance across all options.
    question_unit = question_vec / np.linalg.norm(question_vec)
    option_vecs = np.vstack([answer_vec, distractor_vecs])
    question_relevance = np.mean(np.dot(unit_rows(option_vecs), question_unit))

    # 4. Separability: the most similar distractor is the closest one.
    closest_distractor_sim = np.max(distractor_sims)
    separability = 1 - closest_distractor_sim

    return {
        'option_diversity': float(option_diversity),
        'distractor_quality': float(distractor_quality),
        'question_relevance': float(question_relevance),
        'separability': float(separability),
        'avg_distractor_similarity': float(avg_distractor_sim),
        'min_distractor_distance': float(1 - closest_distractor_sim)
    }

# Test with first question from 3_options dataset
# Runs only if that dataset was loaded and is non-empty; prints truncated
# question/answer text followed by every metric for a visual sanity check.
if '3_options' in datasets and len(datasets['3_options']) > 0:
    test_row = datasets['3_options'].iloc[0]
    print(f"Question: {test_row['question'][:80]}...")
    print(f"Correct: {test_row['correct_answer'][:50]}...")
    print(f"Wrong options: {len(test_row['wrong_options'])}")
    
    metrics = analyze_question_quality(test_row)
    print("\nüìä Quality Metrics:")
    for k, v in metrics.items():
        print(f"  {k}: {v:.4f}")

In [None]:
# ============================================================================
# Part 8: Batch Analysis for All Datasets
# ============================================================================
from tqdm import tqdm

def analyze_dataset(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """
    Compute quality metrics for every question of one dataset.

    Args:
        df: Questions with 'video_number', 'q_id', 'num_options' and the
            columns analyze_question_quality reads.
        name: Dataset label, stored in the 'dataset' column of the result and
            used in progress/log output.

    Returns:
        DataFrame with one row per successfully analysed question. Rows that
        raise are reported on stdout and left out.
    """
    print(f"\nüîÑ Analyzing {name} ({len(df)} questions)...")

    collected = []
    progress = tqdm(df.iterrows(), total=len(df), desc=f"Processing {name}")
    for idx, row in progress:
        try:
            record = analyze_question_quality(row)
            # Attach identifying columns after the metric columns.
            record.update(
                video_number=row['video_number'],
                q_id=row['q_id'],
                num_options=row['num_options'],
                dataset=name,
            )
        except Exception as e:
            # Best effort: one bad row must not abort the whole dataset.
            print(f"Error processing row {idx}: {e}")
        else:
            collected.append(record)

    return pd.DataFrame(collected)

# Analyze all datasets
# One metrics frame per loaded dataset, in the iteration order of `datasets`.
all_results = []
for name, df in datasets.items():
    result_df = analyze_dataset(df, name)
    all_results.append(result_df)
    print(f"‚úÖ {name}: {len(result_df)} questions analyzed")

# Combine all results
# NOTE(review): pd.concat is called on whatever was collected; if no dataset
# was loaded the list is empty — confirm at least one CSV exists before running.
combined_results = pd.concat(all_results, ignore_index=True)
print(f"\nüìä Total analyzed: {len(combined_results)} question instances")

In [None]:
# ============================================================================
# Part 9: Statistical Comparison
# ============================================================================
def compute_statistics(combined_df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise each dataset version with mean, std and median per metric.

    Args:
        combined_df: Per-question metrics with a 'dataset' column.

    Returns:
        DataFrame with one row per dataset (in order of first appearance),
        columns '<metric>_mean', '<metric>_std', '<metric>_median' for each
        metric, and a final 'count' column with the number of questions.
    """
    metric_columns = (
        'option_diversity',
        'distractor_quality',
        'question_relevance',
        'separability',
        'avg_distractor_similarity',
    )

    summary_rows = []
    for dataset_name in combined_df['dataset'].unique():
        in_dataset = combined_df.loc[combined_df['dataset'] == dataset_name]

        summary = {'dataset': dataset_name}
        for column in metric_columns:
            series = in_dataset[column]
            summary.update({
                f'{column}_mean': series.mean(),
                f'{column}_std': series.std(),
                f'{column}_median': series.median(),
            })
        summary['count'] = len(in_dataset)

        summary_rows.append(summary)

    return pd.DataFrame(summary_rows)

# One row per dataset; reused later by the radar chart, the final bar chart,
# the saved summary CSV and the closing report.
summary_stats = compute_statistics(combined_results)

print("\n" + "="*80)
print("üìä BERT-based Quality Analysis Summary")
print("="*80)

# Display key metrics
# Only the means are printed here; std and median stay available in summary_stats.
display_cols = ['dataset', 'count', 
                'option_diversity_mean', 'distractor_quality_mean', 
                'question_relevance_mean', 'separability_mean']

print("\nüìà Mean Quality Scores by Dataset:")
print(summary_stats[display_cols].to_string(index=False))

In [None]:
# ============================================================================
# Part 10: Statistical Significance Tests
# ============================================================================
def perform_statistical_tests(combined_df: pd.DataFrame) -> Dict:
    """
    Compare dataset versions metric by metric with non-parametric tests.

    For each of the four quality metrics this prints and records a
    Kruskal-Wallis H-test across all datasets, then a two-sided Mann-Whitney U
    test for every unordered pair of datasets (in sorted name order) with the
    effect size 1 - 2U / (n1 * n2).

    Args:
        combined_df: Per-question metrics with a 'dataset' column.

    Returns:
        Dict keyed by metric name. Each value holds 'kruskal_wallis'
        ({'h_statistic', 'p_value'}) and 'pairwise', a dict keyed
        '<d1>_vs_<d2>' with 'u_statistic', 'p_value', 'effect_size' and
        'significant' (p < 0.05).
    """
    metric_names = ['option_diversity', 'distractor_quality', 'question_relevance', 'separability']
    dataset_names = sorted(combined_df['dataset'].unique())
    dataset_count = len(dataset_names)

    def values_for(dataset_name, metric_name):
        # Raw metric values of a single dataset.
        return combined_df[combined_df['dataset'] == dataset_name][metric_name].values

    report = {}
    for metric in metric_names:
        banner = '=' * 60
        print(f"\n{banner}")
        print(f"üìä Statistical Tests for: {metric}")
        print(f"{banner}")

        # One sample per dataset, aligned with dataset_names.
        samples = [values_for(d, metric) for d in dataset_names]

        # Omnibus test across all datasets.
        h_stat, p_value = stats.kruskal(*samples)
        verdict = '‚úÖ YES' if p_value < 0.05 else '‚ùå NO'
        print(f"\nKruskal-Wallis H-test:")
        print(f"  H-statistic: {h_stat:.4f}")
        print(f"  p-value: {p_value:.6f}")
        print(f"  Significant (p<0.05): {verdict}")

        # Follow-up comparison of every unordered pair.
        print(f"\nPairwise Mann-Whitney U tests:")
        pairwise = {}
        for first in range(dataset_count):
            for second in range(first + 1, dataset_count):
                d1, d2 = dataset_names[first], dataset_names[second]
                g1, g2 = samples[first], samples[second]
                u_stat, p_val = stats.mannwhitneyu(g1, g2, alternative='two-sided')

                # Effect size derived from U and the two sample sizes.
                effect_size = 1 - (2*u_stat)/(len(g1)*len(g2))

                is_significant = p_val < 0.05
                marker = '‚úÖ' if is_significant else ''
                print(f"  {d1} vs {d2}: U={u_stat:.0f}, p={p_val:.4f} {marker}, effect={effect_size:.3f}")
                pairwise[f"{d1}_vs_{d2}"] = {
                    'u_statistic': u_stat,
                    'p_value': p_val,
                    'effect_size': effect_size,
                    'significant': is_significant
                }

        report[metric] = {
            'kruskal_wallis': {'h_statistic': h_stat, 'p_value': p_value},
            'pairwise': pairwise
        }

    return report

# Runs (and prints) the tests for every metric; the nested result dict is kept
# for later inspection.
stat_test_results = perform_statistical_tests(combined_results)

In [None]:
# ============================================================================
# Part 11: Visualization - Box Plots
# ============================================================================
def plot_quality_comparison(combined_df: pd.DataFrame):
    """
    Draw a 2x2 grid of box plots, one panel per quality metric, with one box
    per dataset found in ``combined_df``.

    Boxes, tick labels, colours and mean markers are all derived from the
    datasets actually present, so the figure still renders (and stays
    correctly coloured) when only some of the CSV files were loaded.
    The figure is saved to DATA_DIR / "bert_quality_analysis_boxplot.png"
    and then shown.

    Args:
        combined_df: Per-question metrics with a 'dataset' column plus
            'option_diversity', 'distractor_quality', 'question_relevance'
            and 'separability'.
    """
    dataset_names = sorted(combined_df['dataset'].unique())
    positions = list(range(1, len(dataset_names) + 1))
    # "3_options" -> "3 Options"
    tick_labels = [name.replace('_', ' ').title() for name in dataset_names]

    colors = {'3_options': '#3498db', '4_options': '#2ecc71', '5_options': '#e74c3c'}
    fallback_color = '#95a5a6'  # grey for a dataset name without an assigned colour
    # Look colours up by name so a missing dataset cannot shift the mapping.
    box_colors = [colors.get(name, fallback_color) for name in dataset_names]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('VQA Dataset Quality Comparison (BERT-based Analysis)', 
                 fontsize=16, fontweight='bold')
    
    metrics = [
        ('option_diversity', 'Option Diversity\n(Higher = More diverse distractors)', axes[0, 0]),
        ('distractor_quality', 'Distractor Quality\n(Optimal: ~0.45 similarity)', axes[0, 1]),
        ('question_relevance', 'Question-Option Relevance\n(Higher = More topically related)', axes[1, 0]),
        ('separability', 'Answer Separability\n(Higher = Easier to distinguish)', axes[1, 1])
    ]
    
    for metric, title, ax in metrics:
        samples = [combined_df[combined_df['dataset'] == d][metric] for d in dataset_names]

        # Box plot. Tick labels are set on the axis afterwards rather than via
        # the boxplot `labels` keyword, which newer matplotlib deprecates.
        bp = ax.boxplot(
            [sample.values for sample in samples],
            positions=positions,
            patch_artist=True
        )
        ax.set_xticks(positions)
        ax.set_xticklabels(tick_labels)
        
        # Color the boxes
        for patch, color in zip(bp['boxes'], box_colors):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)
        
        ax.set_title(title, fontsize=11, fontweight='bold')
        ax.set_ylabel('Score', fontsize=10)
        ax.grid(True, alpha=0.3)
        
        # Add mean markers at the same x positions as the boxes
        means = [sample.mean() for sample in samples]
        ax.scatter(positions, means, color='red', marker='D', s=50, zorder=3, label='Mean')
        
        # Add mean values as text
        for position, m in zip(positions, means):
            ax.annotate(f'{m:.3f}', (position, m), textcoords="offset points", 
                       xytext=(10, 5), fontsize=9, color='red')
    
    plt.tight_layout()
    
    # Save figure
    output_path = DATA_DIR / "bert_quality_analysis_boxplot.png"
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    print(f"\n‚úÖ Figure saved: {output_path}")
    
    plt.show()

plot_quality_comparison(combined_results)

In [None]:
# ============================================================================
# Part 12: Visualization - Distribution Plots
# ============================================================================
def plot_distributions(combined_df: pd.DataFrame):
    """
    Draw filled kernel density estimates of four metrics, one curve per dataset.

    Panels (left to right, top to bottom): option diversity, distractor
    quality, question-option relevance, and average distractor-to-correct
    similarity. The figure is saved to
    DATA_DIR / "bert_quality_analysis_kde.png" and then shown.

    Args:
        combined_df: Per-question metrics with a 'dataset' column whose values
            are among '3_options', '4_options' and '5_options'.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Quality Metric Distributions by Option Count (BERT-based)', 
                 fontsize=16, fontweight='bold')

    # metric column -> panel title, in panel order
    panel_titles = {
        'option_diversity': 'Option Diversity',
        'distractor_quality': 'Distractor Quality',
        'question_relevance': 'Question-Option Relevance',
        'avg_distractor_similarity': 'Avg Distractor-Correct Similarity',
    }

    colors = {'3_options': '#3498db', '4_options': '#2ecc71', '5_options': '#e74c3c'}
    labels = {'3_options': '3 Options', '4_options': '4 Options', '5_options': '5 Options'}

    # axes.flat walks the grid row by row, matching the panel order above.
    for ax, (metric, title) in zip(axes.flat, panel_titles.items()):
        for dataset_name in sorted(combined_df['dataset'].unique()):
            belongs = combined_df['dataset'] == dataset_name
            sns.kdeplot(
                combined_df.loc[belongs, metric],
                ax=ax,
                label=labels[dataset_name],
                color=colors[dataset_name],
                linewidth=2,
                fill=True,
                alpha=0.3,
            )

        ax.set_title(title, fontsize=11, fontweight='bold')
        ax.set_xlabel('Score', fontsize=10)
        ax.set_ylabel('Density', fontsize=10)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save figure
    output_path = DATA_DIR / "bert_quality_analysis_kde.png"
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    print(f"\n‚úÖ Figure saved: {output_path}")

    plt.show()

plot_distributions(combined_results)

In [None]:
# ============================================================================
# Part 13: Radar Chart Comparison
# ============================================================================
def plot_radar_comparison(summary_df: pd.DataFrame):
    """
    Draw a radar (polar) chart of the four mean metrics, one polygon per dataset.

    The figure is saved to DATA_DIR / "bert_quality_analysis_radar.png" and
    then shown.

    Args:
        summary_df: One row per dataset with a 'dataset' column (values among
            '3_options', '4_options', '5_options') and the '<metric>_mean'
            columns for option diversity, distractor quality, question
            relevance and separability.
    """
    metrics = ['option_diversity_mean', 'distractor_quality_mean', 
               'question_relevance_mean', 'separability_mean']
    metric_labels = ['Option\nDiversity', 'Distractor\nQuality', 
                    'Question\nRelevance', 'Separability']

    # One evenly spaced spoke per metric; repeating the first angle closes
    # the polygon.
    spoke_angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False).tolist()
    closed_angles = spoke_angles + spoke_angles[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

    colors = {'3_options': '#3498db', '4_options': '#2ecc71', '5_options': '#e74c3c'}
    labels = {'3_options': '3 Options', '4_options': '4 Options', '5_options': '5 Options'}

    for _, summary_row in summary_df.iterrows():
        dataset = summary_row['dataset']
        scores = [summary_row[m] for m in metrics]
        closed_scores = scores + scores[:1]  # repeat first point to close the shape

        ax.plot(closed_angles, closed_scores, 'o-', linewidth=2,
                label=labels[dataset], color=colors[dataset])
        ax.fill(closed_angles, closed_scores, alpha=0.25, color=colors[dataset])

    ax.set_xticks(spoke_angles)
    ax.set_xticklabels(metric_labels, fontsize=10)
    ax.set_ylim(0, 1)
    ax.set_title('VQA Dataset Quality Comparison\n(BERT-based Metrics)', 
                 fontsize=14, fontweight='bold', pad=20)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
    ax.grid(True)

    plt.tight_layout()

    # Save figure
    output_path = DATA_DIR / "bert_quality_analysis_radar.png"
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    print(f"\n‚úÖ Figure saved: {output_path}")

    plt.show()

plot_radar_comparison(summary_stats)

In [None]:
# ============================================================================
# Part 14: Composite Quality Score
# ============================================================================
def compute_composite_score(combined_df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute a composite quality score combining all metrics.

    Weights:
    - Option Diversity: 0.25 (important for challenging distractors)
    - Distractor Quality: 0.30 (most important - right difficulty level)
    - Question Relevance: 0.20 (options should be topically relevant)
    - Separability: 0.25 (correct answer should be distinguishable)

    Args:
        combined_df: Per-question metrics containing the four weighted columns.

    Returns:
        A new DataFrame equal to ``combined_df`` plus a 'composite_score'
        column (replaced if it already exists). The input frame is not
        modified, so re-running the calling cell is safe and earlier views of
        the data stay unchanged.
    """
    weights = {
        'option_diversity': 0.25,
        'distractor_quality': 0.30,
        'question_relevance': 0.20,
        'separability': 0.25
    }

    # Weighted sum accumulated in the order the weights are listed.
    composite = sum(combined_df[metric] * weight for metric, weight in weights.items())

    # assign() returns a copy instead of writing into the caller's frame.
    return combined_df.assign(composite_score=composite)

# Rebind combined_results to the frame that carries the 'composite_score' column.
combined_results = compute_composite_score(combined_results)

# Summary by dataset
print("\n" + "="*80)
print("üìä COMPOSITE QUALITY SCORE SUMMARY")
print("="*80)
print("\nWeights: Diversity=0.25, Distractor Quality=0.30, Relevance=0.20, Separability=0.25")
print("\n")

# Mean / std / median of the composite score per dataset, rounded for display.
composite_summary = combined_results.groupby('dataset')['composite_score'].agg(['mean', 'std', 'median'])
composite_summary = composite_summary.round(4)
print(composite_summary)

# Best dataset
# Chosen by highest mean composite score, taken from the rounded summary.
best_dataset = composite_summary['mean'].idxmax()
print(f"\nüèÜ Best Overall Quality: {best_dataset} (mean composite score: {composite_summary.loc[best_dataset, 'mean']:.4f})")

In [None]:
# ============================================================================
# Part 15: Final Comparison Bar Chart
# ============================================================================
def plot_final_comparison(combined_df: pd.DataFrame, summary_df: pd.DataFrame):
    """
    Create the final two-panel bar chart.

    Left panel: mean composite score per dataset with standard-deviation error
    bars. Right panel: grouped bars of the four mean metrics per dataset.
    Bars, labels and colours are derived from the datasets actually present in
    the inputs, so the chart still renders when fewer than three datasets were
    loaded. The figure is saved to DATA_DIR / "bert_quality_analysis_final.png"
    and then shown.

    Args:
        combined_df: Per-question metrics with 'dataset' and 'composite_score'.
        summary_df: One row per dataset with 'dataset' and the '<metric>_mean'
            columns.
    """
    palette = {'3_options': '#3498db', '4_options': '#2ecc71', '5_options': '#e74c3c'}
    fallback_color = '#95a5a6'  # grey for a dataset name without an assigned colour

    def display_label(dataset_name):
        # "3_options" -> "3 Options"
        return f'{dataset_name.split("_")[0]} Options'

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Left: Composite score comparison
    ax = axes[0]
    grouped = combined_df.groupby('dataset')['composite_score']
    composite_means = grouped.mean().sort_index()
    composite_stds = grouped.std().sort_index()

    dataset_names = list(composite_means.index)
    x_labels = [display_label(name) for name in dataset_names]
    bar_colors = [palette.get(name, fallback_color) for name in dataset_names]
    
    bars = ax.bar(x_labels, composite_means.values, yerr=composite_stds.values,
                  color=bar_colors, alpha=0.8, capsize=5, edgecolor='black')
    
    ax.set_ylabel('Composite Quality Score', fontsize=11)
    ax.set_title('Overall VQA Quality by Option Count\n(BERT-based Analysis)', 
                 fontsize=12, fontweight='bold')
    ax.set_ylim(0, 1)
    ax.grid(True, alpha=0.3, axis='y')
    
    # Add value labels
    for bar, val in zip(bars, composite_means.values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                f'{val:.3f}', ha='center', fontsize=11, fontweight='bold')
    
    # Right: Individual metrics comparison
    ax = axes[1]
    metrics = ['option_diversity_mean', 'distractor_quality_mean', 
               'question_relevance_mean', 'separability_mean']
    metric_short = ['Diversity', 'Distractor\nQuality', 'Relevance', 'Separability']
    
    x = np.arange(len(metrics))
    summary_names = sorted(summary_df['dataset'].unique())
    group_size = len(summary_names)
    # 0.25 is the original bar width for three datasets; shrink it only when
    # more groups would otherwise overlap the neighbouring metric.
    width = min(0.25, 0.8 / max(group_size, 1))
    
    for i, dataset in enumerate(summary_names):
        row = summary_df[summary_df['dataset'] == dataset].iloc[0]
        values = [row[m] for m in metrics]
        # Centre the group of bars on each metric's tick.
        offset = (i - (group_size - 1) / 2) * width
        ax.bar(x + offset, values, width, label=display_label(dataset),
               color=palette.get(dataset, fallback_color), alpha=0.8)
    
    ax.set_xticks(x)
    ax.set_xticklabels(metric_short, fontsize=10)
    ax.set_ylabel('Score', fontsize=11)
    ax.set_title('Individual Metric Comparison', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim(0, 1)
    
    plt.tight_layout()
    
    # Save figure
    output_path = DATA_DIR / "bert_quality_analysis_final.png"
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    print(f"\n‚úÖ Figure saved: {output_path}")
    
    plt.show()

plot_final_comparison(combined_results, summary_stats)

In [None]:
# ============================================================================
# Part 16: Save Results to CSV
# ============================================================================
from datetime import datetime

# A timestamp in every output name keeps earlier runs from being overwritten.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save detailed results
# One row per analysed question, including the composite score column.
detail_path = DATA_DIR / f"bert_analysis_detailed_{timestamp}.csv"
combined_results.to_csv(detail_path, index=False)
print(f"‚úÖ Detailed results saved: {detail_path}")

# Save summary statistics
# One row per dataset (mean / std / median per metric plus count).
summary_path = DATA_DIR / f"bert_analysis_summary_{timestamp}.csv"
summary_stats.to_csv(summary_path, index=False)
print(f"‚úÖ Summary statistics saved: {summary_path}")

# Create a comprehensive report
# Plain-text report: method, metric definitions, summary table, ranking and
# a one-line conclusion.
report_path = DATA_DIR / f"bert_analysis_report_{timestamp}.txt"
with open(report_path, 'w') as f:
    f.write("=" * 80 + "\n")
    f.write("VQA Dataset Quality Analysis Report (BERT-based)\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("## Method\n")
    f.write("Using Sentence-BERT (all-MiniLM-L6-v2) for semantic similarity analysis.\n")
    f.write("This provides deeper semantic understanding compared to TF-IDF.\n\n")
    
    f.write("## Metrics Explained\n")
    f.write("1. Option Diversity: Semantic variety among wrong options (higher = better)\n")
    f.write("2. Distractor Quality: Optimal similarity to correct answer (~0.45 is ideal)\n")
    f.write("3. Question Relevance: How topically related options are to the question\n")
    f.write("4. Separability: How distinguishable the correct answer is\n\n")
    
    f.write("## Summary Statistics\n")
    f.write(summary_stats.to_string() + "\n\n")
    
    f.write("## Composite Score Rankings\n")
    # Rebinds the module-level `composite_summary` (this version has no
    # 'median' column and is not rounded).
    composite_summary = combined_results.groupby('dataset')['composite_score'].agg(['mean', 'std'])
    f.write(composite_summary.sort_values('mean', ascending=False).to_string() + "\n\n")
    
    f.write("## Conclusion\n")
    # Dataset with the highest mean composite score.
    best = composite_summary['mean'].idxmax()
    f.write(f"Based on BERT-based semantic analysis, '{best}' shows the best overall quality.\n")

print(f"‚úÖ Report saved: {report_path}")

In [None]:
# ============================================================================
# Part 17: Summary and Conclusions
# ============================================================================
# Closing console summary: best dataset per metric, composite ranking, and a
# short reading guide for the metrics.
print("\n" + "="*80)
print("üìã ANALYSIS COMPLETE - BERT-based VQA Quality Assessment")
print("="*80)

print("\nüìä Key Findings:")
print("-" * 40)

# Find best for each metric
# "Best" here means the highest mean value of that metric.
for metric in ['option_diversity_mean', 'distractor_quality_mean', 
               'question_relevance_mean', 'separability_mean']:
    best = summary_stats.loc[summary_stats[metric].idxmax(), 'dataset']
    value = summary_stats[metric].max()
    # "option_diversity_mean" -> "Option Diversity"
    metric_name = metric.replace('_mean', '').replace('_', ' ').title()
    print(f"‚Ä¢ Best {metric_name}: {best} ({value:.4f})")

print("\nüèÜ Overall Quality Ranking (Composite Score):")
print("-" * 40)
# Datasets ordered from highest to lowest mean composite score.
composite_ranking = combined_results.groupby('dataset')['composite_score'].mean().sort_values(ascending=False)
for i, (dataset, score) in enumerate(composite_ranking.items(), 1):
    # The first three ranks get a medal string; later ranks fall back to "<rank>.".
    medal = ['ü•á', 'ü•à', 'ü•â'][i-1] if i <= 3 else f'{i}.'
    print(f"{medal} {dataset}: {score:.4f}")

print("\nüí° Interpretation:")
print("-" * 40)
print("‚Ä¢ Higher option diversity = more challenging distractors")
print("‚Ä¢ Optimal distractor quality (0.45) = similar enough to be confusing, different enough to be wrong")
print("‚Ä¢ Higher question relevance = options are on-topic")
print("‚Ä¢ Higher separability = correct answer is more distinguishable")

print("\n‚úÖ BERT-based analysis provides deeper semantic understanding than TF-IDF")
print("   by capturing contextual meaning and handling synonyms/paraphrases better.")

# Part B: Video-Text Similarity Analysis using CLIP

## 跨模态相似度评估

使用 OpenAI 的 CLIP 模型计算视频帧与选项文本之间的语义相似度。

**评估指标：**
1. **Correct Answer Alignment** - 正确答案与视频内容的匹配程度
2. **Distractor Plausibility** - 错误选项与视频的合理相关性
3. **Answer Discriminability** - 正确答案是否比错误选项更匹配视频

In [None]:
# ============================================================================
# Part 18: Install CLIP Dependencies
# ============================================================================
import subprocess
import sys

def install_clip_deps():
    """Make sure the packages needed for the CLIP part are importable.

    Each entry is probed with an import and pip-installed (quietly, through
    the running interpreter) only when that import fails.
    """
    # pip distribution name -> module name used for the import probe
    required = {
        "transformers": "transformers",
        "Pillow": "PIL",
        "opencv-python": "cv2",
    }

    for package, import_name in required.items():
        try:
            __import__(import_name)
        except ImportError:
            print(f"Installing {package}...")
            pip_command = [sys.executable, "-m", "pip", "install", package, "-q"]
            subprocess.check_call(pip_command)

    # CLIP itself is loaded through transformers, so no separate clip package
    # is installed here.
    print("‚úÖ CLIP dependencies ready")

install_clip_deps()

In [None]:
# ============================================================================
# Part 19: Load CLIP Model
# ============================================================================
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import cv2
import os

# Use GPU if available
# `device` is a plain string ("cuda" or "cpu") that the model is moved to below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load CLIP model (ViT-B/32 is fast and efficient)
print("Loading CLIP model (openai/clip-vit-base-patch32)...")
# Model and processor are loaded from the same checkpoint name; only the model
# is moved to `device`.
# NOTE(review): presumably the checkpoint is fetched on first use — confirm
# network access is available where the notebook runs.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

print(f"‚úÖ CLIP model loaded on {device}")